# SI 671: Collaborative Filtering 
### Name: Liangyi Murong
### Kaggle name: mdjy
## Based on Surprise: https://surprise.readthedocs.io
## Credit to Mengyuan Gao, who found that some "asin" values in the test set have only 9 characters; we believe they should have 10.

## 1) Load the gzip-compressed training file and turn it into a pandas DataFrame

In [62]:
import gzip
import json
import pandas as pd
import numpy as np
import time

In [63]:
# Read the whole gzip-compressed training file into memory as raw bytes ...
with gzip.open('reviews.training.json.gz', 'rb') as fin:
    json_bytes = fin.read()

# ... then decode those UTF-8 bytes into a single string of JSON lines.
json_str = json_bytes.decode('utf-8')

In [64]:
# Keep every non-blank line as one JSON record. Filtering on content, instead
# of unconditionally dropping the last element, keeps the final record even
# when the file does not end with a newline.
data_lst = [line for line in json_str.split('\n') if line.strip()]

In [65]:
# Collect the rating, item id and user id of every JSON review line into
# three parallel lists.
overall, asin, reviewerID = [], [], []
for line in data_lst:
    review = json.loads(line)
    for column, key in ((overall, 'overall'),
                        (asin, 'asin'),
                        (reviewerID, 'reviewerID')):
        column.append(review[key])

In [66]:
# Assemble the three parallel lists into one training DataFrame.
df = pd.DataFrame({'overall': overall, 'asin': asin, 'reviewerID': reviewerID})

In [67]:
# Put the columns in (user, item, rating) order.
df = df[['reviewerID', 'asin', 'overall']]
# Drop the first character of every asin so the ids match the 9-character
# form used for the test set.
df['asin'] = df['asin'].map(lambda code: code[1:])
df.head()

Unnamed: 0,reviewerID,asin,overall
0,AMFIPCYDYWGVT,0090SI56Y,4.0
1,A3G602Z4DWDZKS,00005JL99,5.0
2,A33BOYMVG3U58Y,00109KN0M,5.0
3,ANEDXRFDZDL18,00005JMPT,5.0
4,A1VN7IS16PY024,00005AAA9,4.0


In [68]:
# Show how many training reviews carry each rating value.
df.overall.value_counts()

5.0    725477
4.0    306313
3.0    161016
1.0     83287
2.0     81933
Name: overall, dtype: int64

## 2) Create a dummy class for converting a pandas DataFrame into a Surprise dataset

In [69]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
from surprise import dataset
from surprise import Reader
from surprise import evaluate, print_perf
from surprise import accuracy

In [70]:
# Reader describing (user, item, rating) records with ratings on a 1-5 scale.
reader = Reader(line_format='user item rating', rating_scale=(1, 5))

class MyDataset(dataset.DatasetAutoFolds):
    """Surprise dataset whose raw ratings come from a pandas DataFrame.

    The DataFrame must provide the columns ``reviewerID``, ``asin`` and
    ``overall``.
    """

    def __init__(self, df, reader):
        """Store the reader and build the raw rating tuples from ``df``."""
        self.reader = reader
        self.rating_scale = (1, 5)

        users, items, ratings = df['reviewerID'], df['asin'], df['overall']
        # Each raw rating is (user id, item id, rating, None).
        self.raw_ratings = [(user, item, rating, None)
                            for user, item, rating in zip(users, items, ratings)]

## 3) Load the gzip-compressed dev file and turn it into a pandas DataFrame

In [71]:
# Read the whole gzip-compressed dev file into memory as raw bytes ...
with gzip.open('reviews.dev.json.gz', 'rb') as fin:
    json_bytes = fin.read()

# ... then decode those UTF-8 bytes into a single string of JSON lines.
json_str = json_bytes.decode('utf-8')

In [72]:
# Keep every non-blank line as one JSON record. Filtering on content, instead
# of unconditionally dropping the last element, keeps the final record even
# when the file does not end with a newline.
data_lst = [line for line in json_str.split('\n') if line.strip()]

In [73]:
# Collect the rating, item id and user id of every JSON dev review line into
# three parallel lists.
overall, asin, reviewerID = [], [], []
for line in data_lst:
    review = json.loads(line)
    for column, key in ((overall, 'overall'),
                        (asin, 'asin'),
                        (reviewerID, 'reviewerID')):
        column.append(review[key])

In [74]:
# Assemble the three parallel lists into one dev DataFrame.
df_dev = pd.DataFrame({'overall': overall, 'asin': asin, 'reviewerID': reviewerID})

In [75]:
# Put the columns in (user, item, rating) order.
df_dev = df_dev[['reviewerID', 'asin', 'overall']]
# Drop the first character of every asin, exactly as done for the training
# frame, so ids are comparable across the data sets.
df_dev['asin'] = df_dev['asin'].map(lambda code: code[1:])
df_dev.head()

Unnamed: 0,reviewerID,asin,overall
0,A34DNO6UAH67Z0,000CDSS22,5.0
1,A3APW42N5MRVWT,305186774,2.0
2,A20D9VGCF3P13L,004LWZW24,5.0
3,A82LIVYSX6WZ9,00001U0DM,3.0
4,A3LRKDF5WU4ZDO,00005JOZI,3.0


## 4) Trying out different algorithms from Surprise
- Train with the whole training set and test with dev set
- Using default parameters

In [76]:
from surprise import SVD, SVDpp, NMF, BaselineOnly, SlopeOne, CoClustering
# Wrap the training DataFrame in MyDataset so the algorithms can train on it.
data = MyDataset(df, reader)

In [91]:
def build_model(mdata, params=None, model_name='SVD'):
    """Train one Surprise algorithm on the full training set and return it.

    Args:
        mdata: dataset object exposing ``build_full_trainset()`` (for
            example a MyDataset instance).
        params: optional dict of keyword arguments forwarded to the
            algorithm constructor. ``None`` (the default) means no extra
            arguments.
        model_name: one of 'SVD', 'SVDpp', 'SlopeOne', 'BaselineOnly',
            'CoClustering' or 'NMF'.

    Returns:
        The value returned by ``algorithm.fit(trainset)``.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """
    now = time.time()

    # A None default avoids sharing one mutable dict between calls.
    if params is None:
        params = {}

    # Resolve the algorithm before the (expensive) trainset construction so
    # that a misspelled name fails immediately with a clear message.
    if model_name == 'SVD':
        algorithm = SVD(**params)
    elif model_name == 'SVDpp':
        algorithm = SVDpp(**params)
    elif model_name == 'SlopeOne':
        algorithm = SlopeOne(**params)
    elif model_name == 'BaselineOnly':
        algorithm = BaselineOnly(**params)
    elif model_name == 'CoClustering':
        algorithm = CoClustering(**params)
    elif model_name == 'NMF':
        algorithm = NMF(**params)
    else:
        raise ValueError('Unknown model_name: %r' % (model_name,))

    trainset = mdata.build_full_trainset()
    algo = algorithm.fit(trainset)
    # The reported time covers trainset construction as well as fitting.
    print('Takes %.2f seconds to train the model' % (time.time() - now))

    return algo

## SVD (closest to what we learned in class)

In [82]:
# Fit SVD on the full training set with no extra constructor parameters.
svd = build_model(data, model_name='SVD')

Takes 92.40 seconds to train the SVDpp


In [83]:
def manual_predict(x):
    """Return the SVD rating estimate for one row's (reviewerID, asin) pair."""
    return svd.predict(uid=x['reviewerID'], iid=x['asin']).est


# Estimate every dev-set row, then report the root-mean-square error against
# the true ratings.
res = df_dev.copy()
res['pred'] = df_dev.apply(manual_predict, axis=1)
squared_error = (res['pred'] - res['overall']) ** 2
rmse = squared_error.mean() ** .5
print('rmse: ', rmse)

rmse:  1.0227951828079425


## SVD++

In [99]:
# Fit SVD++ on the full training set with no extra constructor parameters.
svdpp = build_model(data, model_name='SVDpp')

Takes 2747.39 seconds to train the model


In [100]:
def manual_predict(x):
    """Return the SVD++ rating estimate for one row's (reviewerID, asin) pair."""
    return svdpp.predict(uid=x['reviewerID'], iid=x['asin']).est


# Estimate every dev-set row, then report the root-mean-square error against
# the true ratings.
res = df_dev.copy()
res['pred'] = df_dev.apply(manual_predict, axis=1)
squared_error = (res['pred'] - res['overall']) ** 2
rmse = squared_error.mean() ** .5
print('rmse: ', rmse)

rmse:  1.0135342251858188


## NMF and BaselineOnly (these two combine to form SVD in the Surprise implementation)

In [92]:
# Fit NMF on the full training set with no extra constructor parameters.
nmf = build_model(data, model_name='NMF')

Takes 143.70 seconds to train the model


In [93]:
def manual_predict(x):
    """Return the NMF rating estimate for one row's (reviewerID, asin) pair."""
    return nmf.predict(uid=x['reviewerID'], iid=x['asin']).est


# Estimate every dev-set row, then report the root-mean-square error against
# the true ratings.
res = df_dev.copy()
res['pred'] = df_dev.apply(manual_predict, axis=1)
squared_error = (res['pred'] - res['overall']) ** 2
rmse = squared_error.mean() ** .5
print('rmse: ', rmse)

rmse:  1.107629183136025


In [94]:
# Fit BaselineOnly on the full training set with no extra constructor parameters.
bo = build_model(data, model_name='BaselineOnly')

Estimating biases using als...
Takes 19.46 seconds to train the model


In [95]:
def manual_predict(x):
    """Return the BaselineOnly rating estimate for one row's (reviewerID, asin) pair."""
    return bo.predict(uid=x['reviewerID'], iid=x['asin']).est


# Estimate every dev-set row, then report the root-mean-square error against
# the true ratings.
res = df_dev.copy()
res['pred'] = df_dev.apply(manual_predict, axis=1)
squared_error = (res['pred'] - res['overall']) ** 2
rmse = squared_error.mean() ** .5
print('rmse: ', rmse)

rmse:  1.018883959285338


## SlopeOne

In [96]:
# Try SlopeOne on the full training set.
# NOTE: in the recorded run this call ended in a MemoryError, so `sl` was
# never bound.
sl = build_model(data, model_name='SlopeOne')

MemoryError: 

## CoClustering

In [97]:
# Fit CoClustering on the full training set with no extra constructor parameters.
cc = build_model(data, model_name='CoClustering')

Takes 77.94 seconds to train the model


In [98]:
def manual_predict(x):
    """Return the CoClustering rating estimate for one row's (reviewerID, asin) pair."""
    return cc.predict(uid=x['reviewerID'], iid=x['asin']).est


# Estimate every dev-set row, then report the root-mean-square error against
# the true ratings.
res = df_dev.copy()
res['pred'] = df_dev.apply(manual_predict, axis=1)
squared_error = (res['pred'] - res['overall']) ** 2
rmse = squared_error.mean() ** .5
print('rmse: ', rmse)

rmse:  1.0572750272073037


## 5) GridSearch for parameter tuning

In [24]:
# GridSearchCV is not imported by any earlier cell; import it here so this
# cell runs without a NameError.
from surprise.model_selection import GridSearchCV

# Search over the number of factors and the regularisation strength for SVD,
# with the epoch count and the learning rate held fixed.
param_grid = {'n_factors': [15, 20, 25, 200], 'n_epochs': [20], 'lr_all': [0.010],
              'reg_all': [0.1, 0.15, 0.2], 'verbose':[True]}
# Every parameter combination is scored by RMSE with 5-fold cross-validation.
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing

Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processin

Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processi

In [85]:
# GridSearchCV is not imported by any earlier cell; import it here so this
# cell runs without a NameError.
from surprise.model_selection import GridSearchCV

# Search over the number of factors, the epoch count and the learning rate for
# SVD++, with the regularisation strength held fixed.
param_grid = {'n_factors': [5, 10, 15], 'n_epochs': [20, 30], 'lr_all': [0.009, 0.010],
              'reg_all': [0.1], 'verbose':[True]}
# Every parameter combination is scored by RMSE with 2-fold cross-validation.
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=2)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8


 processing epoch 28
 processing epoch 29
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 

## Try ensembling the models from step 4) with a linear model. It didn't work.

In [106]:
def manual_predict(x):
    """Return the SVD rating estimate for one row's (reviewerID, asin) pair."""
    return svd.predict(uid=x['reviewerID'], iid=x['asin']).est


# Row-wise SVD estimates for the training set, then for the dev set.
svd_train_pred, svd_dev_pred = (
    frame.apply(manual_predict, axis=1) for frame in (df, df_dev)
)

In [107]:
# Peek at the first few SVD estimates on the training set.
svd_train_pred.head()

0    3.947063
1    4.445154
2    4.350865
3    3.640673
4    4.129857
dtype: float64

In [108]:
def manual_predict(x):
    """Return the SVD++ rating estimate for one row's (reviewerID, asin) pair."""
    return svdpp.predict(uid=x['reviewerID'], iid=x['asin']).est


# Row-wise SVD++ estimates for the training set, then for the dev set.
svdpp_train_pred, svdpp_dev_pred = (
    frame.apply(manual_predict, axis=1) for frame in (df, df_dev)
)

In [109]:
def manual_predict(x):
    """Return the BaselineOnly rating estimate for one row's (reviewerID, asin) pair."""
    return bo.predict(uid=x['reviewerID'], iid=x['asin']).est


# Row-wise BaselineOnly estimates for the training set, then for the dev set.
bo_train_pred, bo_dev_pred = (
    frame.apply(manual_predict, axis=1) for frame in (df, df_dev)
)

In [110]:
def manual_predict(x):
    """Return the CoClustering rating estimate for one row's (reviewerID, asin) pair."""
    return cc.predict(uid=x['reviewerID'], iid=x['asin']).est


# Row-wise CoClustering estimates for the training set, then for the dev set.
cc_train_pred, cc_dev_pred = (
    frame.apply(manual_predict, axis=1) for frame in (df, df_dev)
)

In [114]:
# One column per base model, in the order cc, svdpp, bo, svd, for the
# training-set predictions ...
train_pred = pd.DataFrame(
    {'cc': cc_train_pred, 'svdpp': svdpp_train_pred,
     'bo': bo_train_pred, 'svd': svd_train_pred},
    columns=['cc', 'svdpp', 'bo', 'svd'],
)

# ... and the same layout for the dev-set predictions.
dev_pred = pd.DataFrame(
    {'cc': cc_dev_pred, 'svdpp': svdpp_dev_pred,
     'bo': bo_dev_pred, 'svd': svd_dev_pred},
    columns=['cc', 'svdpp', 'bo', 'svd'],
)

In [122]:
# Lasso is the estimator this cell actually instantiates, so it is imported
# here; LinearRegression stays imported as before.
from sklearn.linear_model import Lasso, LinearRegression
# Fit an intercept-free Lasso on the base models' training-set predictions
# against the true training ratings.
lasso = Lasso(alpha=0.5, max_iter=3000, fit_intercept=False)
lasso.fit(train_pred, df.overall)

Lasso(alpha=0.5, copy_X=True, fit_intercept=False, max_iter=3000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [123]:
# Blend the dev-set base predictions with the fitted Lasso and report the
# root-mean-square error of the blend.
ensembled_lasso = lasso.predict(dev_pred)
residuals = ensembled_lasso - df_dev.overall
rmse = (residuals ** 2).mean() ** .5
print('rmse: ', rmse)

rmse:  1.025091338593805


## 6) Concatenating train data and dev data for training algorithm

In [124]:
# Stack the training and dev reviews into one frame for the final models.
df_train = pd.concat([df, df_dev])

In [125]:
# Rebind `data` to a dataset built from the combined train + dev frame.
data = MyDataset(df_train, reader)

## SVDpp

In [37]:
from surprise import SVDpp

In [40]:
# Hyper-parameters for the final SVD++ model.
param = dict(n_factors=10, n_epochs=20, lr_all=0.009, reg_all=0.15, verbose=True)
algo_pp = SVDpp(**param)

In [41]:
import time

# Time trainset construction plus the SVD++ fit on the combined data.
now = time.time()
trainset = data.build_full_trainset()
algo_pp.fit(trainset)
elapsed = time.time() - now
print('Takes %.2f seconds to train the SVDpp' % elapsed)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
Takes 2195.05 seconds to train the SVDpp


## SVD (number of concept 200)

In [127]:
# Hyper-parameters for the 200-factor SVD model.
param = dict(n_factors=200, n_epochs=30, lr_all=0.01, reg_all=0.15, verbose=True)
algo_svd = SVD(**param)

In [128]:
# Time trainset construction plus the SVD fit on the combined data.
now = time.time()
trainset = data.build_full_trainset()
algo_svd.fit(trainset)
elapsed = time.time() - now
print('Takes %.2f seconds to train the SVD' % elapsed)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Takes 229.42 seconds to train the SVD


## SVD (number of concept 20)

In [162]:
# Hyper-parameters for the 20-factor SVD model; this rebinds `algo_svd`.
param = dict(n_factors=20, n_epochs=30, lr_all=0.01, reg_all=0.15, verbose=True)
algo_svd = SVD(**param)

In [163]:
# Time trainset construction plus the SVD fit on the combined data.
now = time.time()
trainset = data.build_full_trainset()
algo_svd.fit(trainset)
elapsed = time.time() - now
print('Takes %.2f seconds to train the SVD' % elapsed)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Takes 67.81 seconds to train the SVD


## Load my trained SVDpp algorithm (training took about 45 minutes)

In [152]:
from surprise import dump
# Load a previously dumped result from disk.
# NOTE(review): Surprise's dump.load is documented to return a
# (predictions, algo) tuple, so `svdppb` is presumably that tuple rather than
# the algorithm itself — confirm before calling .predict on it.
# NOTE(review): dump files are presumably pickle-based; only load files you
# created yourself.
svdppb = dump.load('svdpp_best1.b')

## 7) Make predictions (I made predictions with 3 models, each trained with different parameters, using either SVD or SVDpp)

In [153]:
# Load the unlabeled test rows.
test_set = pd.read_csv('reviews.test.unlabeled.csv')
# Keep only the last 9 characters of every asin, so 9- and 10-character ids
# end up in the same form as the trimmed training ids.
test_set['asin'] = test_set['asin'].map(lambda code: code[-9:])
test_set.head()

Unnamed: 0,datapointID,reviewerID,asin
0,85288b7fd23d48dcb4fd2c9b52a7fa3c,AT79BAVA063DG,0009UVCQC
1,06f33eaec5bb4c20857cc1f9aee60fb4,A2DAHERP7HYJGO,002ZG99TA
2,8f14a0d25996472d80a2e745b66f565a,A3NM0RAYSL6PA8,0001NBNDY
3,50095c59950e444eb2b35afb00009f44,A2KODQS5LJGHF8,304089767
4,abbbd3cd87d846b0a965ae7ce0ea1aaf,A2ULE2TYILL4BR,000056MOF


In [164]:
def manual_predict(x):
    """Return the `algo_svd` rating estimate for one row's (reviewerID, asin) pair."""
    return algo_svd.predict(uid=x['reviewerID'], iid=x['asin']).est

In [165]:
# Work on a copy so the loaded test frame itself stays unchanged.
res = test_set.copy()

In [166]:
# Estimate a rating for every test row with the current manual_predict.
res['overall'] = test_set.apply(manual_predict, axis=1)

In [167]:
# Write the (datapointID, overall) pairs to the submission file, without the index.
res[['datapointID', 'overall']].to_csv('test_5.csv', index=False)

## 8) Finally, I ensembled the models by averaging their predictions, which improved the result by more than 2 basis points.

## Further work: The parameters I chose were possibly not the best. The best parameters depend heavily on the size of the training set. I should have built a function based on GridSearch for testing models against the dev set.