In [33]:
# Imports 
import string
import pandas as pd
import numpy as np
import time
from scipy.stats.stats import pearsonr 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [3]:
# load the data, keep only selected columns
# Read the line-delimited JSON file with pandas.read_json(lines=True)
def load_required_data(path, required_columns):
    """Read a line-delimited JSON file and keep only the requested columns.

    Parameters
    ----------
    path : str or file-like
        Anything ``pandas.read_json`` accepts; each line must hold one JSON record.
    required_columns : list
        Column labels to keep, in the order they should appear.

    Returns
    -------
    tuple
        ``(frame, frame.values)``: the column-restricted DataFrame and the
        same data as a NumPy array.
    """
    selected = pd.read_json(path, lines=True)[required_columns]
    return selected, selected.values

In [4]:
# Load the training reviews, keeping only the asin, reviewerID and overall columns.
# `values` is the same data as a NumPy array (column order: asin, reviewerID, overall).
dataframe, values = load_required_data('../train.json', ["asin", "reviewerID", "overall"])

In [5]:
# Preview the first ten rows of the training frame.
dataframe[:10]

Unnamed: 0,asin,reviewerID,overall
0,B00AMR1HZ8,A00100742Q4O8VH0YMUBZ,4
1,B006C1ZSO4,A00100742Q4O8VH0YMUBZ,5
2,B00ANT8OF6,A00100742Q4O8VH0YMUBZ,4
3,B008LY1B32,A00100742Q4O8VH0YMUBZ,4
4,B0080JJLBW,A00100742Q4O8VH0YMUBZ,5
5,B007KPT2N4,A00100742Q4O8VH0YMUBZ,1
6,B00EQ2TYMS,A00100742Q4O8VH0YMUBZ,4
7,B007PW1BRC,A00100742Q4O8VH0YMUBZ,2
8,B009P8EMCK,A00100742Q4O8VH0YMUBZ,5
9,B006YUVTK0,A00100742Q4O8VH0YMUBZ,5


In [6]:
# (rows, columns) of the training frame.
dataframe.shape

(665666, 3)

In [7]:
# Create user-item matrix given a matrix and mapping from original matrix columns to new matrix rows/columns
def create_user_item_matrix(data, rowMapping, columnMapping, valueMapping=2):
    """Pivot a long-format array into a dense (row x column) matrix.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with one observation per row.
    rowMapping : int
        Column of ``data`` whose distinct values become the matrix rows.
    columnMapping : int
        Column of ``data`` whose distinct values become the matrix columns.
    valueMapping : int, optional
        Column of ``data`` holding the cell values. Defaults to 2, the
        position that used to be hard-coded.

    Returns
    -------
    tuple
        ``(pivot_table, rows, cols)`` where ``rows`` / ``cols`` are the sorted
        unique labels and ``pivot_table[i, j]`` is the value observed for
        ``(rows[i], cols[j])``, or 0 when that pair never occurs in ``data``.
    """
    # return_inverse gives, for every observation, the position of its label
    # in the sorted unique array - i.e. its row / column in the pivot table.
    rows, row_pos = np.unique(data[:, rowMapping], return_inverse=True)
    cols, col_pos = np.unique(data[:, columnMapping], return_inverse=True)

    # Unobserved pairs keep the initial 0; the dtype follows the input array.
    pivot_table = np.zeros((len(rows), len(cols)), dtype=data.dtype)
    # If the same (row, column) pair occurs more than once, only one of the
    # duplicate values is kept.
    pivot_table[row_pos, col_pos] = data[:, valueMapping]
    return pivot_table, rows, cols

In [9]:
# Pivot the training data into a dense matrix: rows are reviewerID values
# (column 1 of `values`), columns are asin values (column 0); 0 means "no rating".
ratings_matrix, rows, cols = create_user_item_matrix(values, rowMapping=1, columnMapping=0)
print (ratings_matrix.shape)

(87271, 13209)


In [12]:
# Show the first ten rows of the matrix, labelled with the row ids and column ids.
pd.DataFrame(ratings_matrix[:10], index = rows[:10], columns= cols)

Unnamed: 0,B004A9SDD8,B004AFQAUA,B004AHBBPW,B004ALVL6W,B004AMAIZQ,B004AMDC86,B004ANC00Q,B004ANE2WU,B004ANMWPY,B004AZH4C8,...,B00LDNE3FG,B00LDNQHJ6,B00LEIT6K2,B00LFLOEYG,B00LIAMYCI,B00LITLOMK,B00LMLW8T2,B00LMLXTJK,B00LP1MVSW,B00LUEMK44
A00100742Q4O8VH0YMUBZ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A001619027H9L9EG4UVRB,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A002359833QJM7OQHCXWY,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0037670NPLI11RBWYFA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A003841815JTX0JFLR8B1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0038872349TB5N0JHQQW,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A00455683H6M1GQZMPQPV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0047670XAAJD587LXS7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A00501041JRAPWYLPQ4TE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A00507662MEMHI1YMGQ15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
def get_index_map(arr):
    """Map each distinct element of ``arr`` to a dense 0-based index.

    Indices are assigned in order of first appearance. Repeated elements keep
    the index of their first occurrence, so no two keys ever share an index
    and the indices always form the range ``0 .. len(result) - 1``.

    Parameters
    ----------
    arr : iterable of hashable values.

    Returns
    -------
    dict
        ``{element: index}``.
    """
    index_map = {}
    for el in arr:
        # setdefault leaves an existing entry untouched; a plain assignment
        # here would overwrite a repeated key with an index that a later new
        # key then receives as well.
        index_map.setdefault(el, len(index_map))
    return index_map

In [14]:
def pearson_coeff(target, others, matrix=None):
    """Pearson correlation between one matrix row and a set of other rows.

    Parameters
    ----------
    target : int
        Row index of the reference row.
    others : int or sequence of int
        Row index / indices to correlate against ``target``.
    matrix : numpy.ndarray, optional
        2-D array to take the rows from. Defaults to the notebook-level
        ``ratings_matrix``.

    Returns
    -------
    numpy.ndarray
        One coefficient per entry of ``others``, in the same order.
    """
    if matrix is None:
        # Fall back to the global ratings matrix built earlier in the notebook.
        matrix = ratings_matrix
    # corrcoef stacks the target row on top of the other rows; row 0 of the
    # result holds the target's correlations, and [1:] drops the leading
    # self-correlation.
    return np.corrcoef(matrix[target], matrix[others])[0][1:]

In [15]:
# Get the sorted unique item ids (column 0 of `values`) and user ids (column 1).
# These are the same np.unique calls create_user_item_matrix makes for its columns and rows.
unique_items = np.unique(values[:,0])
unique_users = np.unique(values[:,1])

In [16]:
# Number of distinct items in the training data.
unique_items.shape

(13209,)

In [17]:
# Number of distinct users in the training data.
unique_users.shape

(87271,)

In [18]:
# Create mappings from each unique item / user id to its position in the unique arrays.
# Both arrays come from np.unique (sorted), so the positions match the
# column / row order of ratings_matrix.
item_idx_map = get_index_map(unique_items)
user_idx_map = get_index_map(unique_users)

In [19]:
# Per-user mean rating: row sum divided by the number of non-zero cells, so
# unrated (0) entries do not drag the mean down. A row with no non-zero
# entries would divide by zero here.
mean_ratings = np.true_divide(ratings_matrix.sum(1), (ratings_matrix!=0).sum(1))

In [20]:
# Scaling factor that predict() applies to the correlation-weighted sum of
# the other users' rating deviations.
K = 0.1

In [29]:
def get_users_who_rated_item(item_id):
    """Return the row indices of users with a positive rating for ``item_id``.

    ``item_id`` is translated to a column of the global ``ratings_matrix``
    through ``item_idx_map``; the result is a 1-D array of row indices.
    """
    item_column = ratings_matrix[:, item_idx_map[item_id]]
    rated_rows = np.where(item_column > 0)
    return rated_rows[0]

def get_users_who_rated_item_by_index(item_idx):
    """Locate users with a positive rating in column ``item_idx``.

    Returns the raw ``np.where`` result, i.e. a one-element tuple whose only
    member is the array of matching row indices of the global
    ``ratings_matrix``.
    """
    has_rating = ratings_matrix[:, item_idx] > 0
    return np.where(has_rating)

def predict(user_id, item_id):
    """Predict the rating ``user_id`` would give ``item_id``.

    The estimate is the user's own mean rating plus ``K`` times the sum, over
    every other user who rated the item, of that user's Pearson correlation
    with the target user multiplied by how far their rating of the item
    deviates from their own mean.

    Parameters
    ----------
    user_id : key of ``user_idx_map``.
    item_id : key of ``item_idx_map``.

    Returns
    -------
    The predicted rating. The value is not clipped to the rating scale.

    Raises
    ------
    KeyError
        If ``user_id`` or ``item_id`` is missing from the index maps.
    """
    item_idx = item_idx_map[item_id]
    user_idx = user_idx_map[user_id]

    other_users_idx = get_users_who_rated_item(item_id)
    # Exclude the target user by value. np.delete is unsuitable here: it
    # removes by position within the array and returns a copy instead of
    # modifying its argument.
    other_users_idx = other_users_idx[other_users_idx != user_idx]

    # The target user's rating row is the same for every neighbour, so fetch it once.
    target_ratings = ratings_matrix[user_idx]
    pearson_correlation = np.array(
        [pearsonr(other_user, target_ratings)[0] for other_user in ratings_matrix[other_users_idx]]
    )

    mean_others = mean_ratings[other_users_idx]
    rating_others = ratings_matrix[other_users_idx, item_idx]
    # With no neighbours the sum is 0 and the prediction is the user's own mean.
    prediction = mean_ratings[user_idx] + K * np.sum(pearson_correlation * (rating_others - mean_others))
    return prediction

In [23]:
# Spot-check the predictor on a single (user, item) pair.
predict(u'A0698182HBSA2D2MEX40', 'B004K44R74')

mean user =  3.33333333333
mean other =  3.58868265059
pearson =  [0.0167888572734455, -0.0006497015732684542, -0.002077295180652037, -0.001123092923200311, -0.0007796779679517869, -0.0013973590945584792, -0.0008269890243092583, -0.0008588432060162333]


3.3358391387334074

In [24]:
# Average of the per-user mean ratings.
np.mean(mean_ratings)

4.007764456541003

In [26]:
# Load the test reviews with the same three columns as the training data.
test_dataframe, test_values = load_required_data('../test.json', ["asin", "reviewerID", "overall"])

In [27]:
# Number of distinct reviewers in the test frame.
test_dataframe['reviewerID'].unique().shape

(87271,)

In [34]:
# Returns DF containing predictions, MAE, RMSE
def get_predictions(dataframe):
    """Predict a rating for every row of ``dataframe`` and score the result.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the 'reviewerID', 'asin' and 'overall' columns read by
        ``predict_row`` and by the metrics below. It is not modified.

    Returns
    -------
    tuple
        ``(df, MAE, RMSE)``: a copy of the input with an added 'Prediction'
        column, and the mean absolute error and root mean squared error
        between 'overall' and 'Prediction', each rounded to two decimals.
    """
    df = dataframe.copy()
    print ("Predicting ratings..")
    start_time = time.time()
    df['Prediction'] = df.apply(predict_row, axis=1)
    MAE = round(mean_absolute_error(df['overall'], df['Prediction']), 2)
    RMSE = round(np.sqrt(mean_squared_error(df['overall'], df['Prediction'])), 2)
    elapsed = round((time.time() - start_time), 2)
    # A single formatted string prints identically under Python 2 and 3.
    print ("Done Predicting in {} seconds".format(elapsed))
    return df, MAE, RMSE

def predict_row(row):
    """Predict the rating for one review row and log it next to the true value.

    ``row`` must provide 'reviewerID', 'asin' and 'overall'. The prediction
    from ``predict`` is printed together with the actual rating and returned.
    """
    estimate = predict(row['reviewerID'], row['asin'])
    actual = row['overall']
    print ("Actual = ", actual, "Predictioned = ", estimate)
    return estimate

In [None]:
# Score every row of the test frame: returns the frame with a 'Prediction'
# column plus the MAE and RMSE. predict_row prints one line per row.
df, MAE, RMSE = get_predictions(test_dataframe)

Predicting ratings..
('Actual = ', 5, 'Predictioned = ', 4.013018801103645)
('Actual = ', 5, 'Predictioned = ', 5.084660540080265)
('Actual = ', 3, 'Predictioned = ', 4.438249332727059)
('Actual = ', 4, 'Predictioned = ', 3.7863946929984795)
('Actual = ', 5, 'Predictioned = ', 5.229468196676607)
('Actual = ', 5, 'Predictioned = ', 3.179725855269213)
('Actual = ', 5, 'Predictioned = ', 5.030768402130499)
('Actual = ', 3, 'Predictioned = ', 3.4117098961410965)
('Actual = ', 3, 'Predictioned = ', 3.99986259332239)
('Actual = ', 4, 'Predictioned = ', 3.971900906608292)
('Actual = ', 5, 'Predictioned = ', 5.016487908015378)
('Actual = ', 4, 'Predictioned = ', 4.219783278191484)
('Actual = ', 5, 'Predictioned = ', 3.9997574877840205)
('Actual = ', 5, 'Predictioned = ', 4.378683070387178)
('Actual = ', 1, 'Predictioned = ', 4.039080478566658)
('Actual = ', 1, 'Predictioned = ', 2.7229137371145704)
('Actual = ', 5, 'Predictioned = ', 3.8190208339595357)
('Actual = ', 5, 'Predictioned = ', 4.50

('Actual = ', 3, 'Predictioned = ', 4.360907911535661)
('Actual = ', 5, 'Predictioned = ', 4.098784537207565)
('Actual = ', 5, 'Predictioned = ', 4.6596501504133006)
('Actual = ', 5, 'Predictioned = ', 3.8899342763327027)
('Actual = ', 5, 'Predictioned = ', 4.859807104826084)
('Actual = ', 5, 'Predictioned = ', 5.006490257215074)
('Actual = ', 5, 'Predictioned = ', 5.285751037325175)
('Actual = ', 5, 'Predictioned = ', 4.6634438946663135)
('Actual = ', 4, 'Predictioned = ', 4.017314852208578)
('Actual = ', 5, 'Predictioned = ', 5.079303758232974)
('Actual = ', 5, 'Predictioned = ', 7.72415846570155)
('Actual = ', 4, 'Predictioned = ', 5.209131492646335)
('Actual = ', 5, 'Predictioned = ', 4.4594314234020285)
('Actual = ', 5, 'Predictioned = ', 3.7392597073370637)
('Actual = ', 1, 'Predictioned = ', 2.698521988877859)
('Actual = ', 1, 'Predictioned = ', 3.891248193716194)
('Actual = ', 1, 'Predictioned = ', 4.089813340121124)
('Actual = ', 4, 'Predictioned = ', 4.645027695604241)
('Actu

('Actual = ', 5, 'Predictioned = ', 4.116741636371142)
('Actual = ', 4, 'Predictioned = ', 3.4992466335523806)
('Actual = ', 5, 'Predictioned = ', 7.545280372001173)
('Actual = ', 5, 'Predictioned = ', 5.648420953833192)
('Actual = ', 5, 'Predictioned = ', 3.002392729399213)
('Actual = ', 5, 'Predictioned = ', 4.44779734538588)
('Actual = ', 5, 'Predictioned = ', 4.015525411541132)
('Actual = ', 1, 'Predictioned = ', 3.7961081042150355)
('Actual = ', 4, 'Predictioned = ', 3.280951239115884)
('Actual = ', 4, 'Predictioned = ', 4.54340586666185)
('Actual = ', 5, 'Predictioned = ', 4.5797122599803375)
('Actual = ', 1, 'Predictioned = ', 3.428499841964228)
('Actual = ', 5, 'Predictioned = ', 4.330138348342981)
('Actual = ', 5, 'Predictioned = ', 4.147587267935431)
('Actual = ', 2, 'Predictioned = ', 4.522133087526365)
('Actual = ', 5, 'Predictioned = ', 6.171227710109672)
('Actual = ', 5, 'Predictioned = ', 3.5747347534918914)
('Actual = ', 5, 'Predictioned = ', 4.946264722347891)
('Actual

('Actual = ', 1, 'Predictioned = ', 2.1467810709774637)
('Actual = ', 5, 'Predictioned = ', 4.262313466566806)
('Actual = ', 5, 'Predictioned = ', 4.987717623517629)
('Actual = ', 4, 'Predictioned = ', 4.7450228320826975)
('Actual = ', 5, 'Predictioned = ', 4.363314598674337)
('Actual = ', 4, 'Predictioned = ', 4.132399345810532)
('Actual = ', 1, 'Predictioned = ', 3.9680726943453792)
('Actual = ', 1, 'Predictioned = ', 4.693631087272766)
('Actual = ', 5, 'Predictioned = ', 4.793399648292735)
('Actual = ', 4, 'Predictioned = ', 3.425696474138412)
('Actual = ', 5, 'Predictioned = ', 3.0087914739132424)
('Actual = ', 5, 'Predictioned = ', 3.8017321543965257)
('Actual = ', 4, 'Predictioned = ', 5.157253759874128)
('Actual = ', 3, 'Predictioned = ', 3.2371314573077212)
('Actual = ', 5, 'Predictioned = ', 3.931857826342966)
('Actual = ', 4, 'Predictioned = ', 3.2148417070730897)
('Actual = ', 5, 'Predictioned = ', 4.455266742863776)
('Actual = ', 5, 'Predictioned = ', 5.002673287939445)
('A

In [None]:
# Mean absolute error over the test rows (rounded to two decimals by get_predictions).
MAE

In [None]:
# Root mean squared error over the test rows (rounded to two decimals by get_predictions).
RMSE