In [1]:
# Import the correct algorithm
from surprise import Dataset, Reader, accuracy, SVD, CoClustering, SlopeOne 
from surprise.accuracy import rmse
from surprise.model_selection import GridSearchCV, train_test_split
import pandas as pd
import time
import pathlib
import numpy as np

#cross_validate() is a function to help give accuracy metric for a 
#given set of params
#GridSearchCV is meant to allow ntrying of diff combinations of params

In [2]:
data_100k =  Dataset.load_builtin('ml-100k')
data_1m = Dataset.load_builtin('ml-1m')
path = pathlib.Path('ml-10M100K').resolve() / 'ratings.dat'
reader = Reader(line_format="user item rating timestamp",sep='::')
print(path)
data_10m = Dataset.load_from_file(file_path=path,reader=reader)

/Users/kohjunkai/Documents/GitHub/CZ4032/ml-10M100K/ratings.dat


In [3]:
# 'user item rating timestamp', separated by '::' characters.
# reader = Reader(line_format="user item rating timestamp",sep='::')

# param_grid = {"n_epochs": [10], 
#               "lr_all": [0.002,0.003,0.005], 
#               "reg_all": [0.01,0.02,0.03],
#               'biased':[True,False],'random_state':[1],
#               'verbose':[True]}

# param_grid = {"n_cltr_u": [2,3,4], 
#               "n_cltr_i": [2,3,4], 
#               "random_state":[1],
#               "verbose":[True]}

# gs_SVD = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3, n_jobs=-1)
# gs_SlopeOne = GridSearchCV(SlopeOne, param_grid, measures=["rmse"], cv=3, n_jobs=-1)
# gs_CoClustering = GridSearchCV(CoClustering, param_grid, measures=["rmse"], cv=3, n_jobs=-1)
# gs_CoClustering.param_combinations

# gs_SVD.fit(data_100k)
# gs_SlopeOne.fit(data_100k)
# gs_CoClustering.fit(data_100k)
# print("SVD - 100k")
# print('RMSE_best_score:',gs_SVD.best_score["rmse"])
# print('RMSE_best_params',gs_SVD.best_params["rmse"])
# train_data_100k, test_data_100k = train_test_split(data_100k, test_size=0.25, random_state=1)
# algo = gs.best_estimator["rmse"]
# algo.fit(train_data_100k)

In [4]:
svd = SVD()
co_clustering = CoClustering()
slope_one = SlopeOne()
algo_dict = {"SVD":svd,"CoClustering":co_clustering,"SlopeOne":slope_one}

In [5]:
# Test set is 25%
train_data_100k, test_data_100k = train_test_split(data_100k, test_size=0.25, random_state=1)
train_data_1m, test_data_1m = train_test_split(data_1m, test_size=0.25, random_state=1)
# train_data_10m, test_data_10m = train_test_split(data_10m, test_size=0.25, random_state=1)

In [6]:
predictions = {}
for name,algo in algo_dict.items():
    begin = time.time()
    algo.fit(train_data_100k)
    predictions[name] = algo.test(test_data_100k)
    end = time.time()
    result = accuracy.rmse(predictions[name])
    print(f'{name}: {result}, time: {round(end-begin,2)}')
    print()

RMSE: 0.9436
SVD: 0.9436080282685875, time: (1, 2)

RMSE: 0.9739
CoClustering: 0.9739038844830642, time: (1, 2)

RMSE: 0.9477
SlopeOne: 0.9477498649038465, time: (3, 2)



In [7]:
test_data_100k = pd.DataFrame(test_data_100k)
test_data_100k = test_data_100k.rename(columns={
    0: "uid",
    1: "iid",
    2: "rating"
})

In [None]:
pred_dictionary={}
co_clustering_pred_array = []
slope_one_pred_array = []

for name,algo in algo_dict.items():
    begin = time.time()
    pred_dictionary[name] = []
    for i in range(len(test_data_100k)):
        y_pred = algo.predict(test_data_100k.loc[i]['uid'], test_data_100k.loc[i]['iid'])
        y_pred = np.array(y_pred)
        y_pred[2] = test_data_100k.loc[i]['rating']
        pred_dictionary[name].append(y_pred)
    end = time.time() 
    print(name)
    print(f"time taken to generate test predictions = {round(end-begin, 2)} seconds")
    print()
    


SVD
time taken to generate test predictions = 5.01 seconds



In [None]:
df_dict = {}
for name,array in pred_dictionary.items():
    df_dict[name] = pd.DataFrame(pred_dictionary[name])
    df_dict[name] = df_dict[name].rename(columns={
        0: "uid",
        1: "iid",
        2: "actual rating",
        3: "predited rating",
        4: "-"
    })
#     df_dict[name]['err'] = abs(df_dict[name].est-df_dict[name].r_ui)


In [None]:
df_dict["SVD"]

In [None]:
df_dict["CoClustering"]

In [None]:
df_dict["SlopeOne"]