In [1]:
import numpy as np
import pandas as pd
import scipy as sp
# import AgglomerativeClustering from sklearn
from sklearn.cluster import AgglomerativeClustering
from math import sqrt


In [2]:
# Dataset location. No cell visible in this part of the notebook reads it — TODO confirm where it is consumed.
path = "dataset"
# Fraction-valued setting. The visible test cell passes its own `top_users_percent`
# to similar_users instead, so this global looks unused here — TODO confirm.
top_percent = 0.4

## Creating the data matrix

In [3]:
# Build the M x N rating matrix (M users as rows, N items as columns) from a
# long-form (user, item, rating) table. The wide layout being encoded is:
#
#       Items  I1   I2   I3   I4   I5
# Users
# U1           1.0  4.0  5.0  3.0  2.0
# U2           2.0  3.0  4.0  5.0  1.0
# U3           NaN  4.0  3.0  NaN  1.0
# U4           1.0  2.0  3.0  4.0  5.0
# U5           5.0  4.0  3.0  2.0  1.0

user_labels = ["U1", "U2", "U3", "U4", "U5"]
item_labels = ["I1", "I2", "I3", "I4", "I5"]
ratings_by_user = [
    [1.0, 4.0, 5.0, 3.0, 2.0],
    [2.0, 3.0, 4.0, 5.0, 1.0],
    [np.nan, 4.0, 3.0, np.nan, 1.0],  # U3 has not rated I1 and I4
    [1.0, 2.0, 3.0, 4.0, 5.0],
    [5.0, 4.0, 3.0, 2.0, 1.0],
]

# Flatten to long form: each user repeated once per item, items cycling per user.
dataframe = {
    "Users": [user for user in user_labels for _ in item_labels],
    "Items": [item for _ in user_labels for item in item_labels],
    "Ratings": [rating for user_row in ratings_by_user for rating in user_row],
}
df = pd.DataFrame(dataframe)

# The pivot keeps the NaN cells, so unrated items stay missing in the matrix.
rating_matrix = df.pivot(index="Users", columns="Items", values="Ratings")
print(rating_matrix)

# Centre every row on that user's mean rating (the mean skips NaN cells).
per_user_mean = rating_matrix.mean(axis=1)
mean_centered_matrix = rating_matrix.sub(per_user_mean, axis=0)
print(mean_centered_matrix)






Items   I1   I2   I3   I4   I5
Users                         
U1     1.0  4.0  5.0  3.0  2.0
U2     2.0  3.0  4.0  5.0  1.0
U3     NaN  4.0  3.0  NaN  1.0
U4     1.0  2.0  3.0  4.0  5.0
U5     5.0  4.0  3.0  2.0  1.0
Items   I1        I2        I3   I4        I5
Users                                        
U1    -2.0  1.000000  2.000000  0.0 -1.000000
U2    -1.0  0.000000  1.000000  2.0 -2.000000
U3     NaN  1.333333  0.333333  NaN -1.666667
U4    -2.0 -1.000000  0.000000  1.0  2.000000
U5     2.0  1.000000  0.000000 -1.0 -2.000000


## User-based collaborative filtering

In [4]:
# Pearson similarity between every pair of users.
# Each user's missing ratings are first replaced by that user's own mean rating.
# The filled matrix is then transposed because DataFrame.corr correlates columns,
# while users are the rows of rating_matrix.
user_ratings_filled = rating_matrix.apply(
    lambda user_row: user_row.fillna(user_row.mean()), axis=1
)
user_pearson_similarity_matrix = user_ratings_filled.T.corr(method="pearson")





In [5]:

def similar_users(userID: str, top_percent: float, threshold: float = 0.2):
    """Return the users most similar to ``userID``, most similar first.

    The neighbourhood size is ``round(number_of_users * top_percent)``; from
    those candidates only users whose Pearson similarity is strictly greater
    than ``threshold`` are kept.

    Args:
        userID: Label of the target user in ``user_pearson_similarity_matrix``.
        top_percent: Fraction of all users to consider as candidates (e.g. 0.35).
        threshold: Minimum similarity (exclusive) a candidate must have.

    Returns:
        List of user labels ordered by decreasing similarity; may be empty.

    Raises:
        KeyError: If ``userID`` is not a label of the similarity matrix.
    """
    # Remove the target user by label instead of assuming it sorts into
    # position 0: that assumption breaks when another user ties at 1.0 or when
    # the self-correlation is NaN (NaN values are sorted last).
    neighbours = (
        user_pearson_similarity_matrix.loc[userID]
        .drop(labels=userID)
        # mergesort is stable, so tied similarities keep a deterministic order
        .sort_values(ascending=False, kind="mergesort")
    )
    top_users = round(len(user_pearson_similarity_matrix) * top_percent)
    candidates = neighbours.iloc[:max(top_users, 0)]
    # NaN similarities compare False here and are therefore dropped as well.
    return list(candidates[candidates > threshold].index)

def find_not_rated_books(userID: str):
    """List the item labels that ``userID`` has no rating for (NaN) in ``rating_matrix``."""
    user_ratings = rating_matrix.loc[userID]
    missing_mask = user_ratings.isnull()
    return list(user_ratings[missing_mask].index)


def predict_rating_user(userID: str , userIDs: list):
    """Predict the missing ratings of ``userID`` from a list of similar users.

    For every item the user has not rated, the prediction is the user's own
    mean rating plus the similarity-weighted average of the neighbours'
    mean-centred ratings for that item:

        mean(u) + sum(sim(u, v) * centred(v, i)) / sum(|sim(u, v)|)

    Only neighbours that actually rated the item contribute.

    Args:
        userID: Label of the user whose missing ratings are predicted.
        userIDs: Labels of the neighbour users to aggregate over
            (e.g. the result of ``similar_users``).

    Returns:
        Dict mapping item label -> predicted rating, ordered by decreasing
        prediction. An item for which no neighbour contributes gets 0.
    """
    # get the not rated books by the user
    not_rated_books = find_not_rated_books(userID)
    # loop-invariant lookups, done once instead of once per (book, neighbour)
    user_mean = rating_matrix.loc[userID].mean()
    similarities = user_pearson_similarity_matrix.loc[userID]

    # dictionary to store the predicted rating for each book
    prediction_dict = {}
    for book in not_rated_books:
        numerator = 0
        denominator = 0
        for user in userIDs:
            if np.isnan(rating_matrix.loc[user, book]):
                continue
            similarity = similarities.loc[user]
            if np.isnan(similarity):
                # undefined correlation carries no information; skipping it
                # keeps the whole prediction from turning into NaN
                continue
            numerator += similarity * mean_centered_matrix.loc[user, book]
            # normalise by |sim| so negative weights cannot cancel the
            # denominator to zero or flip the sign of the prediction
            denominator += abs(similarity)
        if denominator == 0:
            prediction_dict[book] = 0
        else:
            prediction_dict[book] = numerator / denominator + user_mean

    # sort the dictionary by the predicted rating in descending order
    return dict(sorted(prediction_dict.items(), key=lambda item: item[1], reverse=True))



#### User-based collaborative filtering test

In [6]:
# Predict U3's unrated items from the users most similar to U3.
top_users_percent = 0.35
mylist = similar_users(userID="U3", top_percent=top_users_percent)
user_based_prediction = predict_rating_user(userID="U3", userIDs=mylist)
print(
    "recommended books for user are: (Item: Rating Prediction)",
    user_based_prediction,
    sep="\n",
)



recommended books for user are: (Item: Rating Prediction)
{'I1': 2.9066666666666663, 'I4': 2.1066666666666665}


## Item-based collaborative filtering

In [7]:
# Pearson similarity between every pair of items.
# Missing ratings are replaced row-wise by the rating user's own mean. No
# transpose is needed: items are already the columns that DataFrame.corr compares.
item_ratings_filled = rating_matrix.apply(
    lambda user_row: user_row.fillna(user_row.mean()), axis=1
)
item_pearson_similarity_matrix = item_ratings_filled.corr(method="pearson")
print(item_pearson_similarity_matrix)






Items        I1        I2        I3        I4        I5
Items                                                  
I1     1.000000  0.508223 -0.508223 -0.614286 -0.583212
I2     0.508223  1.000000  0.250000 -0.711512 -0.806872
I3    -0.508223  0.250000  1.000000  0.237171 -0.161374
I4    -0.614286 -0.711512  0.237171  1.000000  0.285774
I5    -0.583212 -0.806872 -0.161374  0.285774  1.000000


In [8]:
def similar_items(itemID: str, top_items_number: "int | None" = None, threshold: float = 0.2):
    """Return the items most similar to ``itemID``, most similar first.

    Args:
        itemID: Label of the target item in ``item_pearson_similarity_matrix``.
        top_items_number: Maximum number of neighbour items to consider.
            ``None`` (the default) means "as many as there are item columns in
            ``rating_matrix``", resolved at call time rather than frozen when
            the function is defined.
        threshold: Minimum similarity (exclusive) a neighbour must have.

    Returns:
        List of item labels ordered by decreasing similarity; may be empty.

    Raises:
        KeyError: If ``itemID`` is not a label of the similarity matrix.
    """
    if top_items_number is None:
        top_items_number = len(rating_matrix.columns)

    # Remove the target item by label instead of assuming it sorts into
    # position 0: that assumption breaks when another item ties at 1.0 or when
    # the self-correlation is NaN (NaN values are sorted last).
    neighbours = (
        item_pearson_similarity_matrix.loc[itemID]
        .drop(labels=itemID)
        # mergesort is stable, so tied similarities keep a deterministic order
        .sort_values(ascending=False, kind="mergesort")
    )
    top_items = min(len(item_pearson_similarity_matrix), top_items_number)
    candidates = neighbours.iloc[:max(top_items, 0)]
    # NaN similarities compare False here and are therefore dropped as well.
    return list(candidates[candidates > threshold].index)



def predict_rating_item(userID: str):
    """Predict the missing ratings of ``userID`` from the items that user rated.

    For each unrated item, up to ``round(sqrt(number_of_items))`` similar items
    are requested from ``similar_items``; the prediction is the
    similarity-weighted average of the user's ratings on those items.

    Returns:
        Dict mapping item label -> predicted rating, ordered by decreasing
        prediction. An item with no usable similar item gets 0.
    """
    books_to_predict = find_not_rated_books(userID)
    neighbourhood_size = round(sqrt(len(rating_matrix.columns)))
    user_ratings = rating_matrix.loc[userID]

    scores = {}
    for book in books_to_predict:
        book_similarities = item_pearson_similarity_matrix.loc[book]
        weighted_sum = 0
        weight_total = 0
        for item in similar_items(book, neighbourhood_size):
            rating = user_ratings[item]
            if np.isnan(rating):
                # the user has not rated this similar item either
                continue
            weight = book_similarities[item]
            weighted_sum += weight * rating
            weight_total += weight
        scores[book] = weighted_sum / weight_total if weight_total != 0 else 0

    # highest predicted rating first
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

#### Item-based collaborative filtering test

In [9]:
# Item looked up without a known user: list the items most similar to I1.
item_prediction = similar_items(itemID="I1")
print(item_prediction)

# Known (logged-in) user: predict U3's unrated items from the items U3 rated.
item_based_prediction = predict_rating_item(userID="U3")
print(item_based_prediction)




['I2']
{'I1': 4.0, 'I4': 1.9070590341216882}


## Final predictions & statistics

In [10]:
# Mean of the ratings U3 has actually given (unrated NaN cells are skipped).
user_mean = rating_matrix.loc["U3"].mean()
print("user mean: {}".format(user_mean))

# Blend both recommenders: arithmetic mean of the item-based and the user-based
# prediction for every item the item-based recommender scored.
average_prediction = {}
for item, item_based_score in item_based_prediction.items():
    average_prediction[item] = (item_based_score + user_based_prediction[item]) / 2

print(average_prediction)

# RMSE-style score: spread of the predictions around the user's mean rating
def rmse(prediction_dict: dict, user_mean: float):
    """Root-mean-square deviation of the predicted ratings from ``user_mean``.

    Note that the reference value is the user's mean rating, not held-out true
    ratings, so this measures how far the predictions stray from the user's
    average rather than a prediction error against ground truth.

    Args:
        prediction_dict: Mapping item label -> predicted rating.
        user_mean: Reference rating every prediction is compared with.

    Returns:
        The root-mean-square deviation, or ``nan`` when ``prediction_dict`` is
        empty (no predictions means the score is undefined, and the division
        by ``len(prediction_dict)`` would otherwise raise ZeroDivisionError).
    """
    if not prediction_dict:
        return float("nan")
    # explicit left-to-right accumulation; the accumulator is deliberately not
    # named `rmse` so it does not shadow this function
    squared_deviation_sum = 0
    for predicted in prediction_dict.values():
        squared_deviation_sum += (predicted - user_mean) ** 2
    return sqrt(squared_deviation_sum / len(prediction_dict))


# Report the deviation-from-user-mean score of each recommender and of their blend.
for report_template, predictions in (
    ("RMSE for user based CF: {}", user_based_prediction),
    ("RMSE for item based CF: {}", item_based_prediction),
    ("RMSE for average prediction: {}", average_prediction),
):
    print(report_template.format(rmse(predictions, user_mean)))




user mean: 2.6666666666666665
{'I1': 3.453333333333333, 'I4': 2.0068628503941772}
RMSE for user based CF: 0.4308131845707603
RMSE for item based CF: 1.0850763874489124
RMSE for average prediction: 0.7260115427499019
