In [15]:
# Import the correct algorithm
from surprise import Dataset, Reader, accuracy, SVD, CoClustering, SlopeOne 
from surprise.accuracy import rmse
from surprise.model_selection import GridSearchCV, train_test_split
import pandas as pd
import time
import pathlib
import numpy as np

#cross_validate() is a function to help give accuracy metric for a 
#given set of params
#GridSearchCV is meant to allow ntrying of diff combinations of params

In [28]:
data_100k =  Dataset.load_builtin('ml-100k')
data_1m = Dataset.load_builtin('ml-1m')
path = pathlib.Path('ml-10M100K').resolve() / 'ratings.dat'
reader = Reader(line_format="user item rating timestamp",sep='::')
print(path)
data_10m = Dataset.load_from_file(file_path=path,reader=reader)

/Users/kohjunkai/Documents/GitHub/CZ4032/ml-10M100K/ratings.dat


In [29]:
# 'user item rating timestamp', separated by '::' characters.
# reader = Reader(line_format="user item rating timestamp",sep='::')

# param_grid = {"n_epochs": [10], 
#               "lr_all": [0.002,0.003,0.005], 
#               "reg_all": [0.01,0.02,0.03],
#               'biased':[True,False],'random_state':[1],
#               'verbose':[True]}

# param_grid = {"n_cltr_u": [2,3,4], 
#               "n_cltr_i": [2,3,4], 
#               "random_state":[1],
#               "verbose":[True]}

# gs_SVD = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3, n_jobs=-1)
# gs_SlopeOne = GridSearchCV(SlopeOne, param_grid, measures=["rmse"], cv=3, n_jobs=-1)
# gs_CoClustering = GridSearchCV(CoClustering, param_grid, measures=["rmse"], cv=3, n_jobs=-1)
# gs_CoClustering.param_combinations

# gs_SVD.fit(data_100k)
# gs_SlopeOne.fit(data_100k)
# gs_CoClustering.fit(data_100k)
# print("SVD - 100k")
# print('RMSE_best_score:',gs_SVD.best_score["rmse"])
# print('RMSE_best_params',gs_SVD.best_params["rmse"])
# train_data_100k, test_data_100k = train_test_split(data_100k, test_size=0.25, random_state=1)
# algo = gs.best_estimator["rmse"]
# algo.fit(train_data_100k)

In [9]:
svd = SVD()
co_clustering = CoClustering()
slope_one = SlopeOne()
algo_dict = {"SVD":svd,"CoClustering":co_clustering,"SlopeOne":slope_one}

In [10]:
# Test set is 25%
train_data_100k, test_data_100k = train_test_split(data_100k, test_size=0.25, random_state=1)
train_data_1m, test_data_1m = train_test_split(data_1m, test_size=0.25, random_state=1)
# train_data_10m, test_data_10m = train_test_split(data_10m, test_size=0.25, random_state=1)

In [11]:
predictions = {}
for name,algo in algo_dict.items():
    begin = time.time()
    algo.fit(train_data_100k)
    predictions[name] = algo.test(test_data_100k)
    end = time.time()
    result = accuracy.rmse(predictions[name])
    print(f'{name}: {result}, time: {round(end-begin)}')
    print()

RMSE: 0.9437
SVD: 0.9436880694341191, time: 1

RMSE: 0.9691
CoClustering: 0.9690635812712349, time: 1

RMSE: 0.9477
SlopeOne: 0.9477498649038465, time: 3



In [13]:
test_data_100k = pd.DataFrame(test_data_100k)
test_data_100k = test_data_100k.rename(columns={
    0: "uid",
    1: "iid",
    2: "rating"
})

In [16]:
pred_dictionary={}
co_clustering_pred_array = []
slope_one_pred_array = []

for name,algo in algo_dict.items():
    begin = time.time()
    pred_dictionary[name] = []
    for i in range(len(test_data_100k)):
        y_pred = algo.predict(test_data_100k.loc[i]['uid'], test_data_100k.loc[i]['iid'])
        y_pred = np.array(y_pred)
        y_pred[2] = test_data_100k.loc[i]['rating']
        pred_dictionary[name].append(y_pred)
    end = time.time() 
    print(name)
    print(f"time taken to generate test predictions = {round(end-begin, 2)} seconds")
    print()
    


SVD
time taken to generate test predictions = 7.21 seconds

CoClustering
time taken to generate test predictions = 6.62 seconds

SlopeOne
time taken to generate test predictions = 9.44 seconds



In [25]:
df_dict = {}
for name,array in pred_dictionary.items():
    df_dict[name] = pd.DataFrame(pred_dictionary[name])
    df_dict[name] = df_dict[name].rename(columns={
        0: "uid",
        1: "iid",
        2: "actual rating",
        3: "predited rating",
        4: "-"
    })
#     df_dict[name]['err'] = abs(df_dict[name].est-df_dict[name].r_ui)


In [26]:
df_dict["SVD"]

Unnamed: 0,uid,iid,actual rating,predited rating,-
0,345,715,4.0,3.666596,{'was_impossible': False}
1,92,998,2.0,2.673463,{'was_impossible': False}
2,934,195,4.0,3.722099,{'was_impossible': False}
3,586,423,2.0,4.006315,{'was_impossible': False}
4,336,383,1.0,2.232531,{'was_impossible': False}
...,...,...,...,...,...
24995,26,840,2.0,2.394011,{'was_impossible': False}
24996,625,198,4.0,3.718215,{'was_impossible': False}
24997,56,568,4.0,3.975254,{'was_impossible': False}
24998,882,172,5.0,4.953627,{'was_impossible': False}


In [27]:
df_dict["CoClustering"]

Unnamed: 0,uid,iid,actual rating,predited rating,-
0,345,715,4.0,3.525968,{'was_impossible': False}
1,92,998,2.0,1.924749,{'was_impossible': False}
2,934,195,4.0,4.288074,{'was_impossible': False}
3,586,423,2.0,3.788044,{'was_impossible': False}
4,336,383,1.0,1.426213,{'was_impossible': False}
...,...,...,...,...,...
24995,26,840,2.0,2.180194,{'was_impossible': False}
24996,625,198,4.0,3.647103,{'was_impossible': False}
24997,56,568,4.0,3.850164,{'was_impossible': False}
24998,882,172,5.0,4.925572,{'was_impossible': False}


In [28]:
df_dict["SlopeOne"]

Unnamed: 0,uid,iid,actual rating,predited rating,-
0,345,715,4.0,3.516656,{'was_impossible': False}
1,92,998,2.0,2.519391,{'was_impossible': False}
2,934,195,4.0,4.069356,{'was_impossible': False}
3,586,423,2.0,3.698388,{'was_impossible': False}
4,336,383,1.0,1.738678,{'was_impossible': False}
...,...,...,...,...,...
24995,26,840,2.0,2.302933,{'was_impossible': False}
24996,625,198,4.0,3.662768,{'was_impossible': False}
24997,56,568,4.0,3.835087,{'was_impossible': False}
24998,882,172,5.0,4.787234,{'was_impossible': False}
