In [1]:
# Import the correct algorithm
from surprise import Dataset, Reader, accuracy, SVD, CoClustering, SlopeOne 
from surprise.accuracy import rmse
from surprise.model_selection import GridSearchCV, train_test_split
import pandas as pd
import time
import pathlib
import numpy as np

## Load Datasets

In [11]:
data_100k =  Dataset.load_builtin('ml-100k')
data_1m = Dataset.load_builtin('ml-1m')
path = pathlib.Path('ml-10M100K').resolve() / 'ratings.dat'
reader = Reader(line_format="user item rating timestamp",sep='::')
print(path)
data_10m = Dataset.load_from_file(file_path=path,reader=reader)

/Users/siuu/dev/CZ4032/ml-10M100K/ratings.dat


In [12]:
# 'user item rating timestamp', separated by '::' characters.
# reader = Reader(line_format="user item rating timestamp",sep='::')

# param_grid = {"n_epochs": [10], 
#               "lr_all": [0.002,0.003,0.005], 
#               "reg_all": [0.01,0.02,0.03],
#               'biased':[True,False],'random_state':[1],
#               'verbose':[True]}

# param_grid = {"n_cltr_u": [2,3,4], 
#               "n_cltr_i": [2,3,4], 
#               "random_state":[1],
#               "verbose":[True]}

# gs_SVD = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3, n_jobs=-1)
# gs_SlopeOne = GridSearchCV(SlopeOne, param_grid, measures=["rmse"], cv=3, n_jobs=-1)
# gs_CoClustering = GridSearchCV(CoClustering, param_grid, measures=["rmse"], cv=3, n_jobs=-1)
# gs_CoClustering.param_combinations

# gs_SVD.fit(data_100k)
# gs_SlopeOne.fit(data_100k)
# gs_CoClustering.fit(data_100k)
# print("SVD - 100k")
# print('RMSE_best_score:',gs_SVD.best_score["rmse"])
# print('RMSE_best_params',gs_SVD.best_params["rmse"])
# train_data_100k, test_data_100k = train_test_split(data_100k, test_size=0.25, random_state=1)
# algo = gs.best_estimator["rmse"]
# algo.fit(train_data_100k)

## Initialise diff algo

In [13]:
svd = SVD()
co_clustering = CoClustering()
slope_one = SlopeOne()
algo_dict = {"SVD":svd,"CoClustering":co_clustering,"SlopeOne":slope_one}

## Split training and test data

In [14]:
# Test set is 25%
train_data_100k, test_data_100k = train_test_split(data_100k, test_size=0.25, random_state=1)
train_data_1m, test_data_1m = train_test_split(data_1m, test_size=0.25, random_state=1)
train_data_10m, test_data_10m = train_test_split(data_10m, test_size=0.25, random_state=1)

## Comparing fit timing and accuracy of each model

In [None]:
predictions = {}
results_100k=pd.DataFrame(columns=['Algorithm','Fit Time','Prediction Time'])
results_1m=pd.DataFrame(columns=['Algorithm','Fit Time','Prediction Time'])
results_10m=pd.DataFrame(columns=['Algorithm','Fit Time','Prediction Time'])

print("----------- 100k ---------------")
for name,algo in algo_dict.items():
    begin_fit = time.time()
    algo.fit(train_data_100k)
    end_fit = time.time()
    begin_predict = time.time()
    predictions[name] = algo.test(test_data_100k)
    end_predict = time.time()
    result = accuracy.rmse(predictions[name])
    row = {"Algorithm":name,"Fit Time":end_fit-begin_fit,"Prediction Time":end_predict-begin_predict}
    print(row)
    results_100k=results_100k.append(row,ignore_index=True)
    

print("----------- 1m ---------------")
for name,algo in algo_dict.items():
    begin_fit = time.time()
    algo.fit(train_data_1m)
    end_fit = time.time()
    begin_predict = time.time()
    predictions[name] = algo.test(test_data_1m)
    end_predict = time.time()
    result = accuracy.rmse(predictions[name])
    row = {"Algorithm":name,"Fit Time":end_fit-begin_fit,"Prediction Time":end_predict-begin_predict}
    print(row)
    results_1m=results_1m.append(row,ignore_index=True)

print("----------- 10m ---------------")
for name,algo in algo_dict.items():
    begin_fit = time.time()
    algo.fit(train_data_10m)
    end_fit = time.time()
    begin_predict = time.time()
    predictions[name] = algo.test(test_data_10m)
    end_predict = time.time()
    result = accuracy.rmse(predictions[name])
    row = {"Algorithm":name,"Fit Time":end_fit-begin_fit,"Prediction Time":end_predict-begin_predict}
    print(row)
    results_10m=results_10m.append(row,ignore_index=True)

## Results for 100k Dataset

In [16]:
results_100k

Unnamed: 0,Algorithm,Fit Time,Prediction Time
0,SVD,0.774207,0.157203
1,CoClustering,1.421905,0.118595
2,SlopeOne,0.58992,2.235047


## Results for 1m Dataset

In [17]:
results_1m

Unnamed: 0,Algorithm,Fit Time,Prediction Time
0,SVD,8.193289,1.607563
1,CoClustering,15.70024,1.127492
2,SlopeOne,12.65513,41.885131


## Results for 10m Dataset

In [19]:
results_10m

Unnamed: 0,Algorithm,Fit Time,Prediction Time
0,SVD,83.190718,27.148635
1,CoClustering,215.714359,23.304742
2,SlopeOne,185.632412,602.402012


## Preparing dataframe for predictions

In [6]:
test_data_100k = pd.DataFrame(test_data_100k)
test_data_100k = test_data_100k.rename(columns={
    0: "uid",
    1: "iid",
    2: "rating"
})

test_data_1m = pd.DataFrame(test_data_1m)
test_data_1m = test_data_1m.rename(columns={
    0: "uid",
    1: "iid",
    2: "rating"
})

test_data_10m = pd.DataFrame(test_data_10m)
test_data_10m = test_data_10m.rename(columns={
    0: "uid",
    1: "iid",
    2: "rating"
})

## Analysis of prediction results

In [20]:
def get_Iu(uid):
    """ return the number of items rated by given user
    args: 
      uid: the id of the user
    returns: 
      the number of items rated by the user
    """
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """ return number of users that have rated given item
    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item.
    """
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0

df_dict_100k = {}
for name,array in pred_dict_100k.items():
    trainset = algo_dict[name].trainset
    df_dict_100k[name] = pd.DataFrame(pred_dict_100k[name])
    df_dict_100k[name] = df_dict_100k[name].rename(columns={
        0: "uid",
        1: "iid",
        2: "actual rating",
        3: "predicted rating",
        4: "-"
    })
    df_dict_100k[name]['no._items_rated_by_user'] = df_dict_100k[name].uid.apply(get_Iu)
    df_dict_100k[name]['no._user_that_rated_item'] = df_dict_100k[name].iid.apply(get_Ui)
    df_dict_100k[name]['error'] = abs(df_dict_100k[name]["actual rating"]-df_dict_100k[name]["predicted rating"])

# df_dict_1m = {}
# for name,array in pred_dict_1m.items():
#     trainset = algo_dict[name].trainset
#     df_dict_1m[name] = pd.DataFrame(pred_dict_1m[name])
#     df_dict_1m[name] = df_dict_1m[name].rename(columns={
#         0: "uid",
#         1: "iid",
#         2: "actual rating",
#         3: "predicted rating",
#         4: "-"
#     })
#     df_dict_1m[name]['no._items_rated_by_user'] = df_dict_1m[name].uid.apply(get_Iu)
#     df_dict_1m[name]['no._user_that_rated_item'] = df_dict_1m[name].iid.apply(get_Ui)
#     df_dict_1m[name]['error'] = abs(df_dict_1m[name]["actual rating"]-df_dict_1m[name]["predicted rating"])

# df_dict_10m = {}
# for name,array in pred_dict_10m.items():
#     trainset = algo_dict[name].trainset
#     df_dict_10m[name] = pd.DataFrame(pred_dict_10m[name])
#     df_dict_10m[name] = df_dict_10m[name].rename(columns={
#         0: "uid",
#         1: "iid",
#         2: "actual rating",
#         3: "predicted rating",
#         4: "-"
#     })
#     df_dict_10m[name]['no._items_rated_by_user'] = df_dict_10m[name].uid.apply(get_Iu)
#     df_dict_10m[name]['no._user_that_rated_item'] = df_dict_10m[name].iid.apply(get_Ui)
#     df_dict_10m[name]['error'] = abs(df_dict_10m[name]["actual rating"]-df_dict_10m[name]["predicted rating"])


In [25]:
best_predictions = df_dict_100k["SVD"].sort_values(by='actual rating',ascending=False)[:50]
worst_predictions = df_dict_100k["SVD"].sort_values(by='actual rating',ascending=False)[-50:]

In [27]:
best_predictions

Unnamed: 0,uid,iid,actual rating,predicted rating,-,no._items_rated_by_user,no._user_that_rated_item,error
10261,664,480,5.0,3.981846,{'was_impossible': False},23,24558,1.018154
18840,274,100,5.0,3.279461,{'was_impossible': False},249,1948,1.720539
18830,848,428,5.0,4.850245,{'was_impossible': False},24,2046,0.149755
4274,94,464,5.0,2.869451,{'was_impossible': False},92,1478,2.130549
8126,94,959,5.0,4.075642,{'was_impossible': False},92,187,0.924358
4271,343,176,5.0,3.608604,{'was_impossible': False},24,1077,1.391396
4268,389,494,5.0,3.015706,{'was_impossible': False},91,5723,1.984294
8130,881,663,5.0,3.153465,{'was_impossible': False},65,1632,1.846535
13231,330,204,5.0,3.728757,{'was_impossible': False},107,3774,1.271243
4265,452,269,5.0,3.0494,{'was_impossible': False},103,300,1.9506


In [12]:
worst_predictions

Unnamed: 0,uid,iid,actual rating,predicted rating,-,no._items_rated_by_user,no._user_that_rated_item,error
6048,509,892,1.0,2.050783,{'was_impossible': False},22,40,1.050783
22267,405,647,1.0,2.178123,{'was_impossible': False},567,51,1.178123
12808,416,240,1.0,3.198073,{'was_impossible': False},379,119,2.198073
1856,5,437,1.0,2.358568,{'was_impossible': False},129,3,1.358568
18461,399,552,1.0,2.364403,{'was_impossible': False},229,34,1.364403
8430,254,90,1.0,2.753745,{'was_impossible': False},122,69,1.753745
12799,697,1245,1.0,3.814652,{'was_impossible': False},84,14,2.814652
1842,223,826,1.0,2.380455,{'was_impossible': False},79,58,1.380455
22288,405,772,1.0,1.435761,{'was_impossible': False},567,34,0.435761
22289,405,1553,1.0,2.248562,{'was_impossible': False},567,3,1.248562


## Analysis of worst prediction

In [None]:
df_100k = pd.DataFrame(data_100k.__dict__['raw_ratings'], columns=['uid','iid','rating','timestamp'])

df_100k.loc[df_100k['iid'] == '1090']['rating'].describe()

In [None]:
import matplotlib.pyplot as plt
%matplotlib notebook

df_100k.loc[df_100k['iid'] == '1090']['rating'].hist()
plt.xlabel('rating')
plt.ylabel('Number of ratings')
plt.title('Number of ratings item 1090 has received')
plt.show();

# Analysing the Sparsity of each Datasets