In [123]:
# Import the correct algorithm
from surprise import Dataset, Reader, accuracy, SVD, CoClustering, SlopeOne 
from surprise.accuracy import rmse
from surprise.model_selection import GridSearchCV, train_test_split
import pandas as pd
import time
import pathlib
import numpy as np
import math

## Load Datasets

In [173]:
data_100k =  Dataset.load_builtin('ml-100k')
data_1m = Dataset.load_builtin('ml-1m')
# path = pathlib.Path('ml-10M100K').resolve() / 'ratings.dat'
# reader = Reader(line_format="user item rating timestamp",sep='::')
# print(path)
# data_10m = Dataset.load_from_file(file_path=path,reader=reader)

In [174]:
# 'user item rating timestamp', separated by '::' characters.
# reader = Reader(line_format="user item rating timestamp",sep='::')

# param_grid = {"n_epochs": [10], 
#               "lr_all": [0.002,0.003,0.005], 
#               "reg_all": [0.01,0.02,0.03],
#               'biased':[True,False],'random_state':[1],
#               'verbose':[True]}

# param_grid = {"n_cltr_u": [2,3,4], 
#               "n_cltr_i": [2,3,4], 
#               "random_state":[1],
#               "verbose":[True]}

# gs_SVD = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3, n_jobs=-1)
# gs_SlopeOne = GridSearchCV(SlopeOne, param_grid, measures=["rmse"], cv=3, n_jobs=-1)
# gs_CoClustering = GridSearchCV(CoClustering, param_grid, measures=["rmse"], cv=3, n_jobs=-1)
# gs_CoClustering.param_combinations

# gs_SVD.fit(data_100k)
# gs_SlopeOne.fit(data_100k)
# gs_CoClustering.fit(data_100k)
# print("SVD - 100k")
# print('RMSE_best_score:',gs_SVD.best_score["rmse"])
# print('RMSE_best_params',gs_SVD.best_params["rmse"])
# train_data_100k, test_data_100k = train_test_split(data_100k, test_size=0.25, random_state=1)
# algo = gs.best_estimator["rmse"]
# algo.fit(train_data_100k)

## Initialise diff algo

In [178]:
svd = SVD()
co_clustering = CoClustering()
slope_one = SlopeOne()
algo_dict = {"SVD":svd,"CoClustering":co_clustering,"SlopeOne":slope_one}

## Split training and test data

In [179]:
# Test set is 25%
train_data_100k, test_data_100k = train_test_split(data_100k, test_size=0.25, random_state=1)
train_data_1m, test_data_1m = train_test_split(data_1m, test_size=0.25, random_state=1)
# train_data_10m, test_data_10m = train_test_split(data_10m, test_size=0.25, random_state=1)

## Compare fit timing and prediction accuracy

In [180]:
predictions = {}

# print("----------- 100k ---------------")
# for name,algo in algo_dict.items():
#     begin = time.time()
#     algo.fit(train_data_100k)
#     end = time.time()
#     predictions[name] = algo.test(test_data_100k)
#     result = accuracy.rmse(predictions[name])
#     print(f'{name}: {result}, fit_time: {round(end-begin,2)}')
#     print()

print("----------- 1m ---------------")
for name,algo in algo_dict.items():
    begin = time.time()
    algo.fit(train_data_1m)
    end = time.time()
    predictions[name] = algo.test(test_data_1m)
    result = accuracy.rmse(predictions[name])
    print(f'{name}: {result}, fit_time: {round(end-begin,2)}')
    print()

# print("----------- 10m ---------------")
# for name,algo in algo_dict.items():
#     begin = time.time()
#     algo.fit(train_data_10m)
#     end = time.time()
#     predictions[name] = algo.test(test_data_10m)
#     result = accuracy.rmse(predictions[name])
#     print(f'{name}: {result}, fit_time: {round(end-begin,2)}')
#     print()

----------- 1m ---------------
RMSE: 0.8764
SVD: 0.8764239195052286, fit_time: 7.35

RMSE: 0.9157
CoClustering: 0.9156636169630048, fit_time: 11.41

RMSE: 0.9062
SlopeOne: 0.9061517699339126, fit_time: 9.86



## Preparing dataframe for predictions

In [6]:
test_data_100k = pd.DataFrame(test_data_100k)
test_data_100k = test_data_100k.rename(columns={
    0: "uid",
    1: "iid",
    2: "rating"
})

# test_data_1m = pd.DataFrame(test_data_1m)
# test_data_1m = test_data_1m.rename(columns={
#     0: "uid",
#     1: "iid",
#     2: "rating"
# })

# test_data_10m = pd.DataFrame(test_data_10m)
# test_data_10m = test_data_10m.rename(columns={
#     0: "uid",
#     1: "iid",
#     2: "rating"
# })

## Compare time taken to predict missing data

In [7]:
pred_dict_100k={}
pred_dict_1m={}
pred_dict_10m={}


print("----------- 100k ---------------")
for name,algo in algo_dict.items():
    pred_dict_100k[name] = []
    total_time = 0.00
    for i in range(len(test_data_100k)):
        begin = time.time()
        y_pred = algo.predict(test_data_100k.loc[i]['uid'], test_data_100k.loc[i]['iid'])
        end = time.time()
        y_pred = np.array(y_pred)
        y_pred[2] = test_data_100k.loc[i]['rating']
        pred_dict_100k[name].append(y_pred)
        total_time += end-begin
    print(name)
    print(f"time taken to generate test predictions = {round(total_time, 2)} seconds")
    print()
    
# print("----------- 1m ---------------")
# for name,algo in algo_dict.items():
#     pred_dict_1m[name] = []
#     total_time = 0.00
#     for i in range(len(test_data_1m)):
#         begin = time.time()
#         y_pred = algo.predict(test_data_1m.loc[i]['uid'], test_data_1m.loc[i]['iid'])
#         end = time.time()
#         y_pred = np.array(y_pred)
#         y_pred[2] = test_data_1m.loc[i]['rating']
#         pred_dict_1m[name].append(y_pred)
#         total_time += end-begin
#     print(name)
#     print(f"time taken to generate test predictions = {round(total_time, 2)} seconds")
#     print()

    
# print("----------- 10m ---------------")
# for name,algo in algo_dict.items():
#     pred_dict_10m[name] = []
#     total_time = 0.00
#     for i in range(len(test_data_10m)):
#         begin = time.time()
#         y_pred = algo.predict(test_data_10m.loc[i]['uid'], test_data_10m.loc[i]['iid'])
#         end = time.time()
#         y_pred = np.array(y_pred)
#         y_pred[2] = test_data_10m.loc[i]['rating']
#         pred_dict_10m[name].append(y_pred)
#         total_time += end-begin
#     print(name)
#     print(f"time taken to generate test predictions = {round(total_time, 2)} seconds")
#     print()


----------- 100k ---------------
SVD
time taken to generate test predictions = 5.09 seconds

CoClustering
time taken to generate test predictions = 4.51 seconds

SlopeOne
time taken to generate test predictions = 6.93 seconds



## Analysis of prediction results

Analyse the Normalised Discounted Cumulative Gain for each clustering method. 
- 0 < NDCG < 1, higher the better
- ![Screenshot%202022-10-13%20at%207.20.30%20PM.png](attachment:Screenshot%202022-10-13%20at%207.20.30%20PM.png)

- ![Screenshot%202022-10-13%20at%207.22.47%20PM.png](attachment:Screenshot%202022-10-13%20at%207.22.47%20PM.png)
- Relevance = 0/1, 0 means that we did not predict the correct item at the correct position, 1 means that we predicted the correct item at the correct position

- iDCG is Ideal Discounted Cumulative Gain. It is calculated by assuming our N predicted recommendations follow exactly the user's top N favourites. In essence, it is the same formula as DCG but relevance = 1 for all N.

- We pick out all the users who gave rating of 5 to less than 5 items. This is to avoid rating of 5 to overcrowd the top N recommendations.

In [181]:
# function to calculate DCG
def generateDCG(actual, pred):
    dcg = 0
    for i in range(10):
        if actual[i] == pred[i]: # whenever there is a match in the item and position, add to the dcg
            dcg += 1/math.log2(i+2)
    return dcg

# function to calculate NDCG 
def generateNDCG(data, uid):
    user = data[data['uid'] == str(uid)]
    
    actual_top10 = user.sort_values(by=['rating'], ascending = False)[:10]
    predicted_top10 = user.sort_values(by=['predicted_rating'], ascending=False)[:10]
    
    # generate idcg where all relevance = 1
    idcg = 0
    for i in range(10):
        idcg += 1/math.log2(i+2)
    
    dcg = generateDCG(actual_top10['iid'].reset_index(drop=True),
                     predicted_top10['iid'].reset_index(drop=True))
    
    return dcg / idcg

### Co-Clustering

In [182]:
# Define algo: SVD, SlopeOne, CoClustering
algo = algo_dict['CoClustering']

# converting data to dataframe so that it's easier for prediction
data_100k_df = pd.DataFrame(data_1m.__dict__['raw_ratings'], 
                            columns=['user_id','item_id','rating','timestamp'])

# generating prediction for the ENTIRE dataset which means it includes the test and train data too
all_pred_dict_100k = []
print("----------- 1m ---------------")
for i in range(len(data_100k_df)):
    y_pred = algo.predict(data_100k_df.loc[i]['user_id'], data_100k_df.loc[i]['item_id'])
    y_pred = np.array(y_pred)
    y_pred[2] = data_100k_df.loc[i]['rating']
    all_pred_dict_100k.append(y_pred)

# converting data to dataframe so that it's easier below
all_data_100k = pd.DataFrame(all_pred_dict_100k)
all_data_100k = all_data_100k.rename(columns={
    0: "uid",
    1: "iid",
    2: "rating",
    3: "predicted_rating"
})

# group the data based on user id and rating and count
# this will return the number of items a user gave a rating of {1,2,3,4,5}
grouped = data_100k_df.groupby(by=['user_id','rating']).count()

# look for potential users to do our test on
# the condition is that they have to have given less than 5 items a rating of 5
candidate_users = []
for i in range(len(data_100k_df)):
    try:
        length = grouped.loc[(str(i+1), 5)]['item_id']
        if length < 5:
            candidate_users.append(i+1)
    except:
        continue
        
# # First 10 users will be used for evaluation
# users = []
# for i in range(10):
#     users.append(candidate_users[i])

ndcg_CC = []
for i in candidate_users:
    ndcg = generateNDCG(all_data_100k, i)
    ndcg_CC.append(ndcg)
    print(f"For user {i}, NDCG = {ndcg}")

----------- 1m ---------------
For user 15, NDCG = 0.0
For user 21, NDCG = 0.0
For user 47, NDCG = 0.06943122193677727
For user 50, NDCG = 0.0
For user 71, NDCG = 0.0
For user 100, NDCG = 0.0
For user 108, NDCG = 0.22009176629808017
For user 122, NDCG = 0.06362078819895171
For user 160, NDCG = 0.0
For user 167, NDCG = 0.0
For user 172, NDCG = 0.0
For user 179, NDCG = 0.22009176629808017
For user 207, NDCG = 0.31488013066763093
For user 209, NDCG = 0.13886244387355454
For user 219, NDCG = 0.06362078819895171
For user 221, NDCG = 0.22009176629808017
For user 227, NDCG = 0.13886244387355454
For user 250, NDCG = 0.0
For user 254, NDCG = 0.06625422345438903
For user 276, NDCG = 0.0
For user 277, NDCG = 0.06362078819895171
For user 279, NDCG = 0.0
For user 282, NDCG = 0.0
For user 317, NDCG = 0.07336392209936005
For user 328, NDCG = 0.07336392209936005
For user 360, NDCG = 0.0
For user 364, NDCG = 0.0
For user 373, NDCG = 0.0
For user 383, NDCG = 0.06943122193677727
For user 384, NDCG = 0.0


For user 4184, NDCG = 0.2934556883974402
For user 4192, NDCG = 0.22009176629808017
For user 4209, NDCG = 0.21726071285222986
For user 4211, NDCG = 0.0
For user 4270, NDCG = 0.0
For user 4284, NDCG = 0.0
For user 4316, NDCG = 0.09478836436955078
For user 4325, NDCG = 0.22009176629808017
For user 4330, NDCG = 0.0
For user 4338, NDCG = 0.0
For user 4341, NDCG = 0.0
For user 4349, NDCG = 0.0
For user 4365, NDCG = 0.20248323207250624
For user 4366, NDCG = 0.0
For user 4383, NDCG = 0.0
For user 4399, NDCG = 0.07839826897867533
For user 4419, NDCG = 0.0
For user 4421, NDCG = 0.15176219107803537
For user 4443, NDCG = 0.0
For user 4450, NDCG = 0.0
For user 4453, NDCG = 0.0
For user 4464, NDCG = 0.0
For user 4481, NDCG = 0.22009176629808017
For user 4500, NDCG = 0.1478294909154526
For user 4530, NDCG = 0.11004588314904008
For user 4547, NDCG = 0.0
For user 4548, NDCG = 0.0
For user 4554, NDCG = 0.15457433957839825
For user 4570, NDCG = 0.07336392209936005
For user 4577, NDCG = 0.0
For user 4587,

### SVD

In [183]:
# Define algo: SVD, SlopeOne, CoClustering
algo = algo_dict['SVD']

# converting data to dataframe so that it's easier for prediction
data_100k_df = pd.DataFrame(data_1m.__dict__['raw_ratings'], 
                            columns=['user_id','item_id','rating','timestamp'])

# generating prediction for the ENTIRE dataset which means it includes the test and train data too
all_pred_dict_100k = []
print("----------- 1m ---------------")
for i in range(len(data_100k_df)):
    y_pred = algo.predict(data_100k_df.loc[i]['user_id'], data_100k_df.loc[i]['item_id'])
    y_pred = np.array(y_pred)
    y_pred[2] = data_100k_df.loc[i]['rating']
    all_pred_dict_100k.append(y_pred)

# converting data to dataframe so that it's easier below
all_data_100k = pd.DataFrame(all_pred_dict_100k)
all_data_100k = all_data_100k.rename(columns={
    0: "uid",
    1: "iid",
    2: "rating",
    3: "predicted_rating"
})

# group the data based on user id and rating and count
# this will return the number of items a user gave a rating of {1,2,3,4,5}
grouped = data_100k_df.groupby(by=['user_id','rating']).count()

# look for potential users to do our test on
# the condition is that they have to have given less than 5 items a rating of 5
candidate_users = []
for i in range(len(data_100k_df)):
    try:
        length = grouped.loc[(str(i+1), 5)]['item_id']
        if length < 5:
            candidate_users.append(i+1)
    except:
        continue
        
# # First 10 users will be used for evaluation
# users = []
# for i in range(10):
#     users.append(candidate_users[i])

ndcg_SVD = []
for i in candidate_users:
    ndcg = generateNDCG(all_data_100k, i)
    ndcg_SVD.append(ndcg)
    print(f"For user {i}, NDCG = {ndcg}")

----------- 1m ---------------
For user 15, NDCG = 0.13886244387355454
For user 21, NDCG = 0.0
For user 47, NDCG = 0.0
For user 50, NDCG = 0.22009176629808017
For user 71, NDCG = 0.11004588314904008
For user 100, NDCG = 0.07336392209936005
For user 108, NDCG = 0.22009176629808017
For user 122, NDCG = 0.0
For user 160, NDCG = 0.06362078819895171
For user 167, NDCG = 0.13886244387355454
For user 172, NDCG = 0.0
For user 179, NDCG = 0.0
For user 207, NDCG = 0.0
For user 209, NDCG = 0.09478836436955078
For user 219, NDCG = 0.13886244387355454
For user 221, NDCG = 0.22009176629808017
For user 227, NDCG = 0.4537425745411855
For user 250, NDCG = 0.0
For user 254, NDCG = 0.33013764944712026
For user 276, NDCG = 0.0
For user 277, NDCG = 0.0
For user 279, NDCG = 0.0
For user 282, NDCG = 0.06943122193677727
For user 317, NDCG = 0.18340980524840012
For user 328, NDCG = 0.5326208815196265
For user 360, NDCG = 0.0
For user 364, NDCG = 0.0
For user 373, NDCG = 0.0
For user 383, NDCG = 0.0
For user 38

For user 3944, NDCG = 0.0
For user 3959, NDCG = 0.22009176629808017
For user 3962, NDCG = 0.0
For user 3969, NDCG = 0.31488013066763093
For user 3972, NDCG = 0.20248323207250624
For user 3980, NDCG = 0.0
For user 3983, NDCG = 0.0
For user 4008, NDCG = 0.0
For user 4010, NDCG = 0.0
For user 4051, NDCG = 0.0
For user 4056, NDCG = 0.06943122193677727
For user 4059, NDCG = 0.0
For user 4061, NDCG = 0.22009176629808017
For user 4073, NDCG = 0.0
For user 4178, NDCG = 0.0
For user 4184, NDCG = 0.3714891073940902
For user 4192, NDCG = 0.33013764944712026
For user 4209, NDCG = 0.13886244387355454
For user 4211, NDCG = 0.0
For user 4270, NDCG = 0.0
For user 4284, NDCG = 0.07839826897867533
For user 4316, NDCG = 0.0
For user 4325, NDCG = 0.07839826897867533
For user 4330, NDCG = 0.0
For user 4338, NDCG = 0.0
For user 4341, NDCG = 0.0
For user 4349, NDCG = 0.0
For user 4365, NDCG = 0.13886244387355454
For user 4366, NDCG = 0.11004588314904008
For user 4383, NDCG = 0.2863459897524692
For user 4399,

### Slope One

In [184]:
# Define algo: SVD, SlopeOne, CoClustering
algo = algo_dict['SlopeOne']

# converting data to dataframe so that it's easier for prediction
data_100k_df = pd.DataFrame(data_1m.__dict__['raw_ratings'], 
                            columns=['user_id','item_id','rating','timestamp'])

# generating prediction for the ENTIRE dataset which means it includes the test and train data too
all_pred_dict_100k = []
print("----------- 1m ---------------")
for i in range(len(data_100k_df)):
    y_pred = algo.predict(data_100k_df.loc[i]['user_id'], data_100k_df.loc[i]['item_id'])
    y_pred = np.array(y_pred)
    y_pred[2] = data_100k_df.loc[i]['rating']
    all_pred_dict_100k.append(y_pred)

# converting data to dataframe so that it's easier below
all_data_100k = pd.DataFrame(all_pred_dict_100k)
all_data_100k = all_data_100k.rename(columns={
    0: "uid",
    1: "iid",
    2: "rating",
    3: "predicted_rating"
})

# group the data based on user id and rating and count
# this will return the number of items a user gave a rating of {1,2,3,4,5}
grouped = data_100k_df.groupby(by=['user_id','rating']).count()

# look for potential users to do our test on
# the condition is that they have to have given less than 5 items a rating of 5
candidate_users = []
for i in range(len(data_100k_df)):
    try:
        length = grouped.loc[(str(i+1), 5)]['item_id']
        if length < 5:
            candidate_users.append(i+1)
    except:
        continue
        
# # First 10 users will be used for evaluation
# users = []
# for i in range(10):
#     users.append(candidate_users[i])

ndcg_SlopeOne = []
for i in candidate_users:
    ndcg = generateNDCG(all_data_100k, i)
    ndcg_SlopeOne.append(ndcg)
    print(f"For user {i}, NDCG = {ndcg}")

----------- 1m ---------------
For user 15, NDCG = 0.0
For user 21, NDCG = 0.0
For user 47, NDCG = 0.13886244387355454
For user 50, NDCG = 0.22009176629808017
For user 71, NDCG = 0.06943122193677727
For user 100, NDCG = 0.13886244387355454
For user 108, NDCG = 0.28371255449703187
For user 122, NDCG = 0.06362078819895171
For user 160, NDCG = 0.0
For user 167, NDCG = 0.11004588314904008
For user 172, NDCG = 0.08514311764162098
For user 179, NDCG = 0.22009176629808017
For user 207, NDCG = 0.42520843362602373
For user 209, NDCG = 0.0
For user 219, NDCG = 0.1736666713479918
For user 221, NDCG = 0.06625422345438903
For user 227, NDCG = 0.0
For user 250, NDCG = 0.0
For user 254, NDCG = 0.0
For user 276, NDCG = 0.11004588314904008
For user 277, NDCG = 0.0
For user 279, NDCG = 0.0
For user 282, NDCG = 0.22009176629808017
For user 317, NDCG = 0.07336392209936005
For user 328, NDCG = 0.09478836436955078
For user 360, NDCG = 0.0
For user 364, NDCG = 0.07336392209936005
For user 373, NDCG = 0.0
For

For user 4178, NDCG = 0.0
For user 4184, NDCG = 0.22009176629808017
For user 4192, NDCG = 0.28371255449703187
For user 4209, NDCG = 0.0
For user 4211, NDCG = 0.11004588314904008
For user 4270, NDCG = 0.13886244387355454
For user 4284, NDCG = 0.0
For user 4316, NDCG = 0.0
For user 4325, NDCG = 0.3589542101716347
For user 4330, NDCG = 0.0
For user 4338, NDCG = 0.0
For user 4341, NDCG = 0.0
For user 4349, NDCG = 0.0
For user 4365, NDCG = 0.0
For user 4366, NDCG = 0.0
For user 4383, NDCG = 0.0
For user 4399, NDCG = 0.0
For user 4419, NDCG = 0.07839826897867533
For user 4421, NDCG = 0.21726071285222986
For user 4443, NDCG = 0.0
For user 4450, NDCG = 0.11004588314904008
For user 4453, NDCG = 0.08514311764162098
For user 4464, NDCG = 0.22009176629808017
For user 4481, NDCG = 0.0
For user 4500, NDCG = 0.06943122193677727
For user 4530, NDCG = 0.0
For user 4547, NDCG = 0.11004588314904008
For user 4548, NDCG = 0.4440973278132557
For user 4554, NDCG = 0.06362078819895171
For user 4570, NDCG = 0.

In [185]:
print("--1m--")
print(f"Average Normalised Discounted Cumulative Mean for Co-Clustering: {np.mean(ndcg_CC)}")
print(f"Average Normalised Discounted Cumulative Mean for SVD: {np.mean(ndcg_SVD)}")
print(f"Average Normalised Discounted Cumulative Mean for Slope One: {np.mean(ndcg_SlopeOne)}")

--1m--
Average Normalised Discounted Cumulative Mean for Co-Clustering: 0.07184225525562561
Average Normalised Discounted Cumulative Mean for SVD: 0.11031873177957066
Average Normalised Discounted Cumulative Mean for Slope One: 0.08215823670608398
