In [38]:
# Imports 
import pandas as pd
import numpy as np
import time
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import operator
from collections import defaultdict

In [3]:
# load the data, keep only selected columns
# Read the file using super fast Pandas.read_csv
def load_required_data(path, required_columns):
    dataframe = pd.read_json(path, lines=True)
    dataframe = dataframe[required_columns]
    return dataframe, dataframe.values

In [4]:
dataframe, values = load_required_data('train_15000.json', ["asin", "reviewerID", "overall"])

In [5]:
dataframe[:10]

Unnamed: 0,asin,reviewerID,overall
0,B00ANT8OF6,A00100742Q4O8VH0YMUBZ,4
1,B00AMR1HZ8,A00100742Q4O8VH0YMUBZ,4
2,B004WGGQPQ,A00100742Q4O8VH0YMUBZ,5
3,B006C1ZSO4,A00100742Q4O8VH0YMUBZ,5
4,B008LY1B32,A00100742Q4O8VH0YMUBZ,4
5,B007KPT2N4,A00100742Q4O8VH0YMUBZ,1
6,B0080JJLBW,A00100742Q4O8VH0YMUBZ,5
7,B009P8EMCK,A00100742Q4O8VH0YMUBZ,5
8,B00I8Q77Y0,A001619027H9L9EG4UVRB,5
9,B00H0BGCJK,A001619027H9L9EG4UVRB,5


In [6]:
dataframe.shape

(15000, 3)

In [7]:
# Create user-item matrix given a matrix and mapping from original matrix columns to new matrix rows/columns
def create_user_item_matrix(data, rowMapping, columnMapping):
    # Create table of users to items by taking row and column pivots
    # Rows of user-item table are users
    rows, row_pos = np.unique(data[:, rowMapping], return_inverse=True)
    # Columns of user-item table are movies
    cols, col_pos = np.unique(data[:, columnMapping], return_inverse=True)

    pivot_table = np.zeros((len(rows), len(cols)), dtype=data.dtype)
    pivot_table[row_pos, col_pos] = data[:, 2]
    return pivot_table, rows, cols

In [8]:
ratings_matrix, rows, cols = create_user_item_matrix(values, rowMapping=1, columnMapping=0)
print (ratings_matrix.shape)

(3139, 1842)


In [9]:
pd.DataFrame(ratings_matrix[:10], index = rows[:10], columns= cols)

Unnamed: 0,B004ALVL6W,B004ANMWPY,B004DLNC4I,B004DLPXAO,B004DM1OAQ,B004DM1ZQY,B004DPBGCO,B004DPC5Y2,B004DPCSKI,B004DPIEF6,...,B00K7WGUKA,B00KFNXUY0,B00KGCNRAM,B00KI5Q8X0,B00KMX5V8G,B00KOEHQCW,B00KQHVWWC,B00KSOQ66K,B00KWVZ750,B00L3MNCNQ
A00100742Q4O8VH0YMUBZ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A001619027H9L9EG4UVRB,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A002359833QJM7OQHCXWY,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0037670NPLI11RBWYFA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A003841815JTX0JFLR8B1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0038872349TB5N0JHQQW,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A00455683H6M1GQZMPQPV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5,0
A0047670XAAJD587LXS7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A00501041JRAPWYLPQ4TE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A00507662MEMHI1YMGQ15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
def get_index_map(arr):
    index_map = {}
    for el in arr:
        index_map[el] = len(index_map)
    return index_map

In [11]:
# Get unique movies and users
unique_items = np.unique(values[:,0])
unique_users = np.unique(values[:,1])

In [14]:
# Create mapping from unique list to 
item_idx_map = get_index_map(unique_items)
user_idx_map = get_index_map(unique_users)

In [16]:
test_dataframe, test_values = load_required_data('test_5000.json', ["asin", "reviewerID", "overall"])

In [60]:
def get_user_items_rated_df(user_id):
    df = pd.DataFrame()
    np.nonzero(ratings_matrix[user_id])
    for i in np.nonzero(ratings_matrix[user_id]):
        x = pd.DataFrame({"item":unique_items[i], "rating":ratings_matrix[user_id][i]})
        df = df.append(x)
        return df

def print_items_rated(user_id):
    np.nonzero(ratings_matrix[user_id])
    for i in unique_items[np.nonzero(ratings_matrix[user_id])]:
        print i

In [45]:
from surprise import SVD
from surprise import accuracy
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(dataframe[['reviewerID', 'asin', 'overall']], reader)
data_train = data.build_full_trainset()

In [46]:
algo = SVD()
algo.fit(data_train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x107b78410>

In [59]:
data_test = Dataset.load_from_df(test_dataframe[['reviewerID', 'asin', 'overall']], reader)
testset = data_test.build_full_trainset().build_testset()

predictions = algo.test(testset)

In [60]:
accuracy.rmse(predictions)
accuracy.mae(predictions)

RMSE: 1.1818
MAE:  0.9075


0.9074699375192018

In [61]:
def get_top_n(user, predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        if uid == user:
            top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [57]:
get_top_n(u'A13Q9FMHS483MD, pred)

[Prediction(uid=u'A13Q9FMHS483MD', iid=u'B004MC8CA2', r_ui=5.0, est=4.669553694444572, details={u'was_impossible': False}),
 Prediction(uid=u'A11NOHYMLI0Y1L', iid=u'B0093QQAEM', r_ui=5.0, est=4.55999053363123, details={u'was_impossible': False}),
 Prediction(uid=u'A1316FBN61E6XS', iid=u'B009PCBEPE', r_ui=2.0, est=4.255684814787538, details={u'was_impossible': False}),
 Prediction(uid=u'A107O88X9VTJXJ', iid=u'B00JVZ0DQG', r_ui=4.0, est=4.401010769235086, details={u'was_impossible': False}),
 Prediction(uid=u'A12U6CX0KUGR8T', iid=u'B008L29K6E', r_ui=4.0, est=4.020879941260944, details={u'was_impossible': False}),
 Prediction(uid=u'A10RH6B7URE1B', iid=u'B009UWL580', r_ui=4.0, est=4.2051289197499, details={u'was_impossible': False}),
 Prediction(uid=u'A099280716ZEH5UPWAN4A', iid=u'B00HFTHXMC', r_ui=3.0, est=4.204530518653084, details={u'was_impossible': False}),
 Prediction(uid=u'A02658263UUAM9CGW5U4S', iid=u'B00A9JG6RS', r_ui=3.0, est=3.8230195680081196, details={u'was_impossible': False}