In [20]:
# Imports 
import pandas as pd
import numpy as np
import time
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import operator

In [21]:
# load the data, keep only selected columns
# Read the file using super fast Pandas.read_csv
def load_required_data(path, required_columns):
    dataframe = pd.read_json(path, lines=True)
    dataframe = dataframe[required_columns]
    return dataframe, dataframe.values

In [22]:
dataframe, values = load_required_data('train_15000.json', ["asin", "reviewerID", "overall"])

In [23]:
dataframe[:10]

Unnamed: 0,asin,reviewerID,overall
0,B00ANT8OF6,A00100742Q4O8VH0YMUBZ,4
1,B00AMR1HZ8,A00100742Q4O8VH0YMUBZ,4
2,B004WGGQPQ,A00100742Q4O8VH0YMUBZ,5
3,B006C1ZSO4,A00100742Q4O8VH0YMUBZ,5
4,B008LY1B32,A00100742Q4O8VH0YMUBZ,4
5,B007KPT2N4,A00100742Q4O8VH0YMUBZ,1
6,B0080JJLBW,A00100742Q4O8VH0YMUBZ,5
7,B009P8EMCK,A00100742Q4O8VH0YMUBZ,5
8,B00I8Q77Y0,A001619027H9L9EG4UVRB,5
9,B00H0BGCJK,A001619027H9L9EG4UVRB,5


In [24]:
dataframe.shape

(15000, 3)

In [25]:
# Create user-item matrix given a matrix and mapping from original matrix columns to new matrix rows/columns
def create_user_item_matrix(data, rowMapping, columnMapping):
    # Create table of users to items by taking row and column pivots
    # Rows of user-item table are users
    rows, row_pos = np.unique(data[:, rowMapping], return_inverse=True)
    # Columns of user-item table are movies
    cols, col_pos = np.unique(data[:, columnMapping], return_inverse=True)

    pivot_table = np.zeros((len(rows), len(cols)), dtype=data.dtype)
    pivot_table[row_pos, col_pos] = data[:, 2]
    return pivot_table, rows, cols

In [26]:
ratings_matrix, rows, cols = create_user_item_matrix(values, rowMapping=1, columnMapping=0)
print (ratings_matrix.shape)

(3139, 1842)


In [27]:
pd.DataFrame(ratings_matrix[:10], index = rows[:10], columns= cols)

Unnamed: 0,B004ALVL6W,B004ANMWPY,B004DLNC4I,B004DLPXAO,B004DM1OAQ,B004DM1ZQY,B004DPBGCO,B004DPC5Y2,B004DPCSKI,B004DPIEF6,...,B00K7WGUKA,B00KFNXUY0,B00KGCNRAM,B00KI5Q8X0,B00KMX5V8G,B00KOEHQCW,B00KQHVWWC,B00KSOQ66K,B00KWVZ750,B00L3MNCNQ
A00100742Q4O8VH0YMUBZ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A001619027H9L9EG4UVRB,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A002359833QJM7OQHCXWY,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0037670NPLI11RBWYFA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A003841815JTX0JFLR8B1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0038872349TB5N0JHQQW,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A00455683H6M1GQZMPQPV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5,0
A0047670XAAJD587LXS7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A00501041JRAPWYLPQ4TE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A00507662MEMHI1YMGQ15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
def get_index_map(arr):
    index_map = {}
    for el in arr:
        index_map[el] = len(index_map)
    return index_map

In [29]:
# Get unique movies and users
unique_items = np.unique(values[:,0])
unique_users = np.unique(values[:,1])

In [30]:
unique_items.shape

(1842,)

In [31]:
unique_users.shape

(3139,)

In [32]:
# Create mapping from unique list to 
item_idx_map = get_index_map(unique_items)
user_idx_map = get_index_map(unique_users)

In [33]:
mean_users = np.true_divide(ratings_matrix.sum(1), (ratings_matrix!=0).sum(1))
mean_items = np.true_divide(ratings_matrix.sum(0),(ratings_matrix!=0).sum(0))
mean_overall = np.true_divide(ratings_matrix.sum(),(ratings_matrix!=0).sum())

In [39]:
def get_users_who_rated_item(item_id):
    return np.where(ratings_matrix[:,item_idx_map[item_id]] > 0)[0]

def get_users_who_rated_item_by_index(item_idx):
    return np.where(ratings_matrix[:,item_idx] > 0)

def predict(user_id, item_id):
    item_idx = item_idx_map[item_id]
    user_idx = user_idx_map[user_id]
    bias_user = mean_users[user_idx] - mean_overall
    bias_item = mean_items[item_idx] - mean_overall
    return mean_overall + bias_user + bias_item

In [40]:
test_dataframe, test_values = load_required_data('test_5000.json', ["asin", "reviewerID", "overall"])

In [41]:
test_dataframe['reviewerID'].unique().shape

(2507,)

In [42]:
# Returns DF containing predictions, MAE, RMSE
def get_predictions(dataframe):
    df = dataframe.copy()
    print ("Predicting ratings..")
    start_time = time.time()
    df['Prediction'] = df.apply(lambda row: predict_row(row) , axis=1)
    MAE = round(mean_absolute_error(df['overall'], df['Prediction']), 2)
    RMSE = round(np.sqrt(mean_squared_error(df['overall'], df['Prediction'])), 2)
    print "Done Predicting in",  round((time.time() - start_time), 2), "seconds"
    return df, MAE, RMSE

i = 0
def predict_row(row):
    global i
    i+= 1
    prediction = predict(row['reviewerID'], row['asin'])
    print ("#", i, "Actual = ", row['overall'], "Predictioned = ", prediction)
    return prediction

In [43]:
df, MAE, RMSE = get_predictions(test_dataframe)

Predicting ratings..
('#', 1, 'Actual = ', 5, 'Predictioned = ', 5.066866666666667)
('#', 2, 'Actual = ', 2, 'Predictioned = ', 3.289088888888889)
('#', 3, 'Actual = ', 5, 'Predictioned = ', 4.2752)
('#', 4, 'Actual = ', 5, 'Predictioned = ', 5.7196444444444445)
('#', 5, 'Actual = ', 5, 'Predictioned = ', 4.591866666666667)
('#', 6, 'Actual = ', 3, 'Predictioned = ', 4.8977490196078435)
('#', 7, 'Actual = ', 5, 'Predictioned = ', 3.941866666666667)
('#', 8, 'Actual = ', 4, 'Predictioned = ', 3.073445614035088)
('#', 9, 'Actual = ', 5, 'Predictioned = ', 5.513295238095238)
('#', 10, 'Actual = ', 5, 'Predictioned = ', 5.605024561403509)
('#', 11, 'Actual = ', 1, 'Predictioned = ', 2.641866666666667)
('#', 12, 'Actual = ', 4, 'Predictioned = ', 4.2752)
('#', 13, 'Actual = ', 5, 'Predictioned = ', 4.908533333333334)
('#', 14, 'Actual = ', 5, 'Predictioned = ', 4.841866666666666)
('#', 15, 'Actual = ', 4, 'Predictioned = ', 4.312237037037037)
('#', 16, 'Actual = ', 1, 'Predictioned = ', 4.1

('#', 746, 'Actual = ', 3, 'Predictioned = ', 1.7085333333333335)
('#', 747, 'Actual = ', 3, 'Predictioned = ', 2.2752)
('#', 748, 'Actual = ', 5, 'Predictioned = ', 4.7752)
('#', 749, 'Actual = ', 4, 'Predictioned = ', 5.441866666666667)
('#', 750, 'Actual = ', 4, 'Predictioned = ', 3.7752)
('#', 751, 'Actual = ', 3, 'Predictioned = ', 3.760048484848485)
('#', 752, 'Actual = ', 4, 'Predictioned = ', 4.32570505050505)
('#', 753, 'Actual = ', 3, 'Predictioned = ', 4.423348148148148)
('#', 754, 'Actual = ', 3, 'Predictioned = ', 4.016866666666667)
('#', 755, 'Actual = ', 1, 'Predictioned = ', 3.3085333333333335)
('#', 756, 'Actual = ', 5, 'Predictioned = ', 3.2752)
('#', 757, 'Actual = ', 4, 'Predictioned = ', 2.541866666666667)
('#', 758, 'Actual = ', 5, 'Predictioned = ', 4.7752)
('#', 759, 'Actual = ', 5, 'Predictioned = ', 5.350957575757576)
('#', 760, 'Actual = ', 5, 'Predictioned = ', 5.179961904761905)
('#', 761, 'Actual = ', 4, 'Predictioned = ', 5.253393146417446)
('#', 762, 'Ac

('#', 1752, 'Actual = ', 1, 'Predictioned = ', 3.2752)
('#', 1753, 'Actual = ', 5, 'Predictioned = ', 3.691866666666667)
('#', 1754, 'Actual = ', 5, 'Predictioned = ', 2.4180571428571436)
('#', 1755, 'Actual = ', 1, 'Predictioned = ', 1.9418666666666669)
('#', 1756, 'Actual = ', 5, 'Predictioned = ', 5.941866666666667)
('#', 1757, 'Actual = ', 5, 'Predictioned = ', 5.941866666666667)
('#', 1758, 'Actual = ', 5, 'Predictioned = ', 5.341866666666667)
('#', 1759, 'Actual = ', 5, 'Predictioned = ', 5.499243715846995)
('#', 1760, 'Actual = ', 5, 'Predictioned = ', 5.941866666666667)
('#', 1761, 'Actual = ', 2, 'Predictioned = ', 3.941866666666667)
('#', 1762, 'Actual = ', 4, 'Predictioned = ', 4.339302564102565)
('#', 1763, 'Actual = ', 4, 'Predictioned = ', 5.031610256410257)
('#', 1764, 'Actual = ', 5, 'Predictioned = ', 3.4402992685475446)
('#', 1765, 'Actual = ', 5, 'Predictioned = ', 4.328230303030303)
('#', 1766, 'Actual = ', 5, 'Predictioned = ', 4.321866666666667)
('#', 1767, 'Actua

('#', 3064, 'Actual = ', 3, 'Predictioned = ', 3.791866666666667)
('#', 3065, 'Actual = ', 4, 'Predictioned = ', 3.541866666666667)
('#', 3066, 'Actual = ', 5, 'Predictioned = ', 4.762379487179488)
('#', 3067, 'Actual = ', 4, 'Predictioned = ', 5.108533333333334)
('#', 3068, 'Actual = ', 3, 'Predictioned = ', 4.6323428571428575)
('#', 3069, 'Actual = ', 5, 'Predictioned = ', 3.674009523809524)
('#', 3070, 'Actual = ', 4, 'Predictioned = ', 3.9418666666666664)
('#', 3071, 'Actual = ', 4, 'Predictioned = ', 4.141866666666667)
('#', 3072, 'Actual = ', 4, 'Predictioned = ', 3.6149435897435898)
('#', 3073, 'Actual = ', 5, 'Predictioned = ', 4.446968707482993)
('#', 3074, 'Actual = ', 5, 'Predictioned = ', 4.108533333333334)
('#', 3075, 'Actual = ', 5, 'Predictioned = ', 4.0018666666666665)
('#', 3076, 'Actual = ', 3, 'Predictioned = ', 3.808533333333334)
('#', 3077, 'Actual = ', 5, 'Predictioned = ', 4.988533333333334)
('#', 3078, 'Actual = ', 5, 'Predictioned = ', 4.187480701754387)
('#', 

('#', 4264, 'Actual = ', 4, 'Predictioned = ', 5.608533333333334)
('#', 4265, 'Actual = ', 5, 'Predictioned = ', 5.0700717948717955)
('#', 4266, 'Actual = ', 5, 'Predictioned = ', 4.741866666666667)
('#', 4267, 'Actual = ', 5, 'Predictioned = ', 4.745438095238095)
('#', 4268, 'Actual = ', 5, 'Predictioned = ', 3.4835333333333334)
('#', 4269, 'Actual = ', 1, 'Predictioned = ', 4.896412121212121)
('#', 4270, 'Actual = ', 2, 'Predictioned = ', 4.383043137254902)
('#', 4271, 'Actual = ', 4, 'Predictioned = ', 5.063078787878788)
('#', 4272, 'Actual = ', 5, 'Predictioned = ', 5.348533333333334)
('#', 4273, 'Actual = ', 4, 'Predictioned = ', 3.941866666666667)
('#', 4274, 'Actual = ', 5, 'Predictioned = ', 3.441866666666667)
('#', 4275, 'Actual = ', 5, 'Predictioned = ', 3.938630420711974)
('#', 4276, 'Actual = ', 4, 'Predictioned = ', 4.608533333333334)
('#', 4277, 'Actual = ', 4, 'Predictioned = ', 5.2918666666666665)
('#', 4278, 'Actual = ', 5, 'Predictioned = ', 5.0252)
('#', 4279, 'Actua

In [44]:
print "MAE=", MAE
print "RMSE=", RMSE

MAE= 1.0
RMSE= 1.37


In [45]:
sparse_ratings = csr_matrix(ratings_matrix, dtype='float')

In [46]:
model_knn = NearestNeighbors(metric = 'cosine', n_neighbors = 6)
model_knn.fit(sparse_ratings)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=6, p=2, radius=1.0)

In [47]:
def get_recommendations(user_index, n_reccomendations=5, n_neighbors=10):
    distances,indices = model_knn.kneighbors(ratings_matrix[user_index].reshape(1, -1), n_neighbors=n_neighbors)
    probable_items = list()
    for user in indices[0]:
        if user == user_index:
            print "Recommendations for user ", unique_users[user_index]
            continue
        # all other users
        for items in np.nonzero(ratings_matrix[user]):
            for item in items:
                if ratings_matrix[user_index][item] == 0:
                    probable_items.append(unique_items[item])

    predictions = {}
    for item in probable_items:
        predictions[item] = predict(unique_users[0], item)
        sorted_predictions = sorted(predictions.items(), key=operator.itemgetter(1))
    return [i[0] for i in sorted_predictions[:n_reccomendations]]

In [48]:
def get_user_items_rated_df(user_id):
    df = pd.DataFrame()
    np.nonzero(ratings_matrix[user_id])
    for i in np.nonzero(ratings_matrix[user_id]):
        x = pd.DataFrame({"item":unique_items[i], "rating":ratings_matrix[user_id][i]})
        df = df.append(x)
        return df

def print_items_rated(user_id):
    np.nonzero(ratings_matrix[user_id])
    for i in unique_items[np.nonzero(ratings_matrix[user_id])]:
        print i

In [49]:
get_user_items_rated_df(0)

Unnamed: 0,item,rating
0,B004WGGQPQ,5
1,B006C1ZSO4,5
2,B007KPT2N4,1
3,B0080JJLBW,5
4,B008LY1B32,4
5,B009P8EMCK,5
6,B00AMR1HZ8,4
7,B00ANT8OF6,4


In [50]:
for i in get_recommendations(user_index=1):
    print i

Recommendations for user  A001619027H9L9EG4UVRB
B00529IOXO
B00G5LQ5MU
B00CZDSK7K
B008Y7SMQU
B00992CF6W
