In [1]:
# Import the correct algorithm
from surprise import Dataset, Reader, accuracy, SVD, CoClustering, SlopeOne 
from surprise.accuracy import rmse
from surprise.model_selection import GridSearchCV, train_test_split
import pandas as pd
import time
import pathlib
import numpy as np

## Load Datasets

In [2]:
data_100k =  Dataset.load_builtin('ml-100k')
data_1m = Dataset.load_builtin('ml-1m')
path = pathlib.Path('ml-10M100K').resolve() / 'ratings.dat'
reader = Reader(line_format="user item rating timestamp",sep='::')
print(path)
data_10m = Dataset.load_from_file(file_path=path,reader=reader)

/Users/kohjunkai/Documents/GitHub/CZ4032/ml-10M100K/ratings.dat


In [3]:
# 'user item rating timestamp', separated by '::' characters.
# reader = Reader(line_format="user item rating timestamp",sep='::')

# param_grid = {"n_epochs": [10], 
#               "lr_all": [0.002,0.003,0.005], 
#               "reg_all": [0.01,0.02,0.03],
#               'biased':[True,False],'random_state':[1],
#               'verbose':[True]}

# param_grid = {"n_cltr_u": [2,3,4], 
#               "n_cltr_i": [2,3,4], 
#               "random_state":[1],
#               "verbose":[True]}

# gs_SVD = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3, n_jobs=-1)
# gs_SlopeOne = GridSearchCV(SlopeOne, param_grid, measures=["rmse"], cv=3, n_jobs=-1)
# gs_CoClustering = GridSearchCV(CoClustering, param_grid, measures=["rmse"], cv=3, n_jobs=-1)
# gs_CoClustering.param_combinations

# gs_SVD.fit(data_100k)
# gs_SlopeOne.fit(data_100k)
# gs_CoClustering.fit(data_100k)
# print("SVD - 100k")
# print('RMSE_best_score:',gs_SVD.best_score["rmse"])
# print('RMSE_best_params',gs_SVD.best_params["rmse"])
# train_data_100k, test_data_100k = train_test_split(data_100k, test_size=0.25, random_state=1)
# algo = gs.best_estimator["rmse"]
# algo.fit(train_data_100k)

## Initialise diff algo

In [4]:
svd = SVD()
co_clustering = CoClustering()
slope_one = SlopeOne()
algo_dict = {"SVD":svd,"CoClustering":co_clustering,"SlopeOne":slope_one}

## Split training and test data

In [5]:
# Test set is 25%
train_data_100k, test_data_100k = train_test_split(data_100k, test_size=0.25, random_state=1)
train_data_1m, test_data_1m = train_test_split(data_1m, test_size=0.25, random_state=1)
# train_data_10m, test_data_10m = train_test_split(data_10m, test_size=0.25, random_state=1)

## Compare fit timing and prediction accuracy

In [6]:
predictions = {}

print("----------- 100k ---------------")
for name,algo in algo_dict.items():
    begin = time.time()
    algo.fit(train_data_100k)
    end = time.time()
    predictions[name] = algo.test(test_data_100k)
    result = accuracy.rmse(predictions[name])
    print(f'{name}: {result}, fit_time: {round(end-begin,2)}')
    print()

# print("----------- 1m ---------------")
# for name,algo in algo_dict.items():
#     begin = time.time()
#     algo.fit(train_data_1m)
#     end = time.time()
#     predictions[name] = algo.test(test_data_1m)
#     result = accuracy.rmse(predictions[name])
#     print(f'{name}: {result}, fit_time: {round(end-begin,2)}')
#     print()

# print("----------- 10m ---------------")
# for name,algo in algo_dict.items():
#     begin = time.time()
#     algo.fit(train_data_10m)
#     end = time.time()
#     predictions[name] = algo.test(test_data_10m)
#     result = accuracy.rmse(predictions[name])
#     print(f'{name}: {result}, fit_time: {round(end-begin,2)}')
#     print()

----------- 100k ---------------
RMSE: 0.9439
SVD: 0.9438516881940346, fit_time: 0.53

RMSE: 0.9738
CoClustering: 0.9737509607466123, fit_time: 0.84

RMSE: 0.9477
SlopeOne: 0.9477498649038465, fit_time: 0.4



## Preparing dataframe for predictions

In [7]:
test_data_100k = pd.DataFrame(test_data_100k)
test_data_100k = test_data_100k.rename(columns={
    0: "uid",
    1: "iid",
    2: "rating"
})

# test_data_1m = pd.DataFrame(test_data_1m)
# test_data_1m = test_data_1m.rename(columns={
#     0: "uid",
#     1: "iid",
#     2: "rating"
# })

# test_data_10m = pd.DataFrame(test_data_10m)
# test_data_10m = test_data_10m.rename(columns={
#     0: "uid",
#     1: "iid",
#     2: "rating"
# })

## Compare time taken to predict missing data

In [8]:
pred_dict_100k={}
pred_dict_1m={}
pred_dict_10m={}


print("----------- 100k ---------------")
for name,algo in algo_dict.items():
    pred_dict_100k[name] = []
    begin = time.time()
    for i in range(len(test_data_100k)):
        y_pred = algo.predict(test_data_100k.loc[i]['uid'], test_data_100k.loc[i]['iid'])
        y_pred = np.array(y_pred)
        y_pred[2] = test_data_100k.loc[i]['rating']
        pred_dict_100k[name].append(y_pred)
    end = time.time() 
    print(name)
    print(f"time taken to generate test predictions = {round(end-begin, 2)} seconds")
    print()
    
# print("----------- 1m ---------------")
# for name,algo in algo_dict.items():
#     pred_dict_1m[name] = []
#     begin = time.time()
#     for i in range(len(test_data_1m)):
#         y_pred = algo.predict(test_data_1m.loc[i]['uid'], test_data_1m.loc[i]['iid'])
#         y_pred = np.array(y_pred)
#         y_pred[2] = test_data_1m.loc[i]['rating']
#         pred_dict_1m[name].append(y_pred)
#     end = time.time() 
#     print(name)
#     print(f"time taken to generate test predictions = {round(end-begin, 2)} seconds")
#     print()

    
# print("----------- 10m ---------------")
# for name,algo in algo_dict.items():
#     pred_dict_10m[name] = []
#     begin = time.time()
#     for i in range(len(test_data_10m)):
#         y_pred = algo.predict(test_data_10m.loc[i]['uid'], test_data_10m.loc[i]['iid'])
#         y_pred = np.array(y_pred)
#         y_pred[2] = test_data_10m.loc[i]['rating']
#         pred_dict_10m[name].append(y_pred)
#     end = time.time() 
#     print(name)
#     print(f"time taken to generate test predictions = {round(end-begin, 2)} seconds")
#     print()


----------- 100k ---------------
SVD
time taken to generate test predictions = 5.12 seconds

CoClustering
time taken to generate test predictions = 4.86 seconds

SlopeOne
time taken to generate test predictions = 6.99 seconds



## Analysis of prediction results

In [13]:
def get_Iu(uid):
    """ return the number of items rated by given user
    args: 
      uid: the id of the user
    returns: 
      the number of items rated by the user
    """
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """ return number of users that have rated given item
    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item.
    """
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0

df_dict_100k = {}
for name,array in pred_dict_100k.items():
    trainset = algo_dict[name].trainset
    df_dict_100k[name] = pd.DataFrame(pred_dict_100k[name])
    df_dict_100k[name] = df_dict_100k[name].rename(columns={
        0: "uid",
        1: "iid",
        2: "actual rating",
        3: "predicted rating",
        4: "-"
    })
    df_dict_100k[name]['no._items_rated_by_user'] = df_dict_100k[name].uid.apply(get_Iu)
    df_dict_100k[name]['no._user_that_rated_item'] = df_dict_100k[name].iid.apply(get_Ui)
    df_dict_100k[name]['error'] = abs(df_dict_100k[name]["actual rating"]-df_dict_100k[name]["predicted rating"])

# df_dict_1m = {}
# for name,array in pred_dict_1m.items():
#     trainset = algo_dict[name].trainset
#     df_dict_1m[name] = pd.DataFrame(pred_dict_1m[name])
#     df_dict_1m[name] = df_dict_1m[name].rename(columns={
#         0: "uid",
#         1: "iid",
#         2: "actual rating",
#         3: "predicted rating",
#         4: "-"
#     })
#     df_dict_1m[name]['no._items_rated_by_user'] = df_dict_1m[name].uid.apply(get_Iu)
#     df_dict_1m[name]['no._user_that_rated_item'] = df_dict_1m[name].iid.apply(get_Ui)
#     df_dict_1m[name]['error'] = abs(df_dict_1m[name]["actual rating"]-df_dict_1m[name]["predicted rating"])

# df_dict_10m = {}
# for name,array in pred_dict_10m.items():
#     trainset = algo_dict[name].trainset
#     df_dict_10m[name] = pd.DataFrame(pred_dict_10m[name])
#     df_dict_10m[name] = df_dict_10m[name].rename(columns={
#         0: "uid",
#         1: "iid",
#         2: "actual rating",
#         3: "predicted rating",
#         4: "-"
#     })
#     df_dict_10m[name]['no._items_rated_by_user'] = df_dict_10m[name].uid.apply(get_Iu)
#     df_dict_10m[name]['no._user_that_rated_item'] = df_dict_10m[name].iid.apply(get_Ui)
#     df_dict_10m[name]['error'] = abs(df_dict_10m[name]["actual rating"]-df_dict_10m[name]["predicted rating"])


In [None]:
best_predictions = df_dict_100k["SVD"].sort_values(by='error')[:10]
worst_predictions = df_dict_100k["SVD"].sort_values(by='error')[-10:]

In [None]:
best_predictions

In [None]:
worst_predictions

## Analysis of worst prediction

In [None]:
df_100k = pd.DataFrame(data_100k.__dict__['raw_ratings'], columns=['uid','iid','rating','timestamp'])

df_100k.loc[df_100k['iid'] == '1090']['rating'].describe()

In [None]:
import matplotlib.pyplot as plt
%matplotlib notebook

df_100k.loc[df_100k['iid'] == '1090']['rating'].hist()
plt.xlabel('rating')
plt.ylabel('Number of ratings')
plt.title('Number of ratings item 1090 has received')
plt.show();