In [1]:
#libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
#load df
df_train = pd.read_csv('../data/train.csv')

book_ids = df_train['book_id'].unique()
user_ids = df_train['user_id'].unique()

nb_books = len(book_ids)
nb_users = len(user_ids)

nb_books, nb_users

(15712, 18905)

In [3]:
book_idx = {ids: i for i,ids in enumerate(book_ids)}
user_idx = {ids: i for i,ids in enumerate(user_ids)}

In [4]:

X = np.zeros((nb_users, nb_books))

#inspired from code in week 5 exercise
for line in df_train.itertuples():
    X[user_idx[line[2]], book_idx[line[1]]] = line[3]


In [5]:
item_similarity = 1-pairwise_distances(X.T, metric='cosine')
user_similarity = 1-pairwise_distances(X, metric='cosine')

In [6]:
df_test = pd.read_csv('../data/test.csv')
test_book = df_test['book_id'].values
test_user = df_test['user_id'].values

In [7]:
def item_based_predict(train_data_matrix, item_similarity):
    # Calculate the numerator (weighted sum of ratings) for all users and items at once
    numerator = train_data_matrix @ item_similarity
    
    # Calculate the denominator (sum of absolute similarities) for all items at once
    denominator = np.abs(item_similarity).sum(axis=1)
    
    # Avoid division by zero by setting zero denominators to NaN temporarily
    denominator = np.where(denominator == 0, np.nan, denominator)
    
    # Divide each user's weighted sum by the sum of similarities (broadcasting the denominator)
    filled_matrix = numerator / denominator
    
    # Replace NaNs (from zero-denominator cases) with random integer ratings between 1 and 5
    filled_matrix = np.where(np.isnan(filled_matrix), np.random.uniform(1, 5, size=filled_matrix.shape), filled_matrix)
    
    return filled_matrix

X_predict_item_filtering = item_based_predict(X, item_similarity)

In [9]:
def user_based_predict(ratings, user_similarity):
    # Copy ratings matrix and replace zero values with NaN for averaging
    tmp = ratings.copy()
    tmp[tmp == 0] = np.nan
    user_average_ratings = np.nanmean(tmp, axis=1)

    # Center the ratings by subtracting user averages
    centered_ratings = (ratings - user_average_ratings[:, None])
    centered_ratings[np.isnan(centered_ratings)] = 0  # Replace NaN with 0 for multiplication
    
    # Compute the weighted sum of centered ratings using user similarity
    numerator = user_similarity @ centered_ratings
    
    # Compute the denominator (sum of absolute similarities)
    denominator = np.abs(user_similarity).sum(axis=1, keepdims=True)
    
    # Avoid division by zero by setting zero denominators to NaN temporarily
    denominator = np.where(denominator == 0, np.nan, denominator)
    
    # Compute the filled matrix by adding back user average ratings
    filled_matrix = user_average_ratings[:, None] + numerator / denominator
    
    # Replace NaNs (from zero-denominator cases) with the user's average rating
    filled_matrix = np.where(np.isnan(filled_matrix), user_average_ratings[:, None], filled_matrix)
    
    # Ensure ratings are within the expected range (0 to 5)
    filled_matrix = np.clip(filled_matrix, 0, 5)
    
    return filled_matrix

X_predict_user_filtering = user_based_predict(X, user_similarity)

In [10]:
nb_test = len(df_test)
predictions_item = [0]*nb_test
for i in range(nb_test):
    predictions_item[i] = X_predict_item_filtering[user_idx[test_user[i]], book_idx[test_book[i]]]

In [11]:
nb_test = len(df_test)
predictions_user = [0]*nb_test
for i in range(nb_test):
    predictions_user[i] = X_predict_user_filtering[user_idx[test_user[i]], book_idx[test_book[i]]]

In [13]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()

    return sqrt(mean_squared_error(prediction, ground_truth))


rmse(X_predict_user_filtering, X)

1.7798997581196616

In [14]:
predictions_user = np.clip(predictions_user, 0, 5)
predictions_item = np.clip(predictions_item, 0, 5)

submission_user = pd.DataFrame({
            'id':range(len(df_test)),
            'rating': predictions_user
        })
submission_user.to_csv('./submission_user_filtering.csv', index=False)


submission_item = pd.DataFrame({
            'id':range(len(df_test)),
            'rating': predictions_item
        })
submission_item.to_csv('./submission_item_filtering.csv', index=False)
