In [4]:
import nltk
import math
import json
import pickle
import os
import string
import heapq
import time
import ast
import csv

import pandas as pd
import numpy as np


Source inspiration: https://www.ethanrosenthal.com/2015/11/02/intro-to-collaborative-filtering/

In [5]:
# Load the training ratings; the first CSV row supplies the column names.
train_ratings_df = pd.read_csv(
    '/kaggle/input/dis-project-2-recommender-systems/train_ratings.csv',
    header=0,
)

In [6]:
# Preview the first rows to check that the columns were parsed as expected.
train_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,509,7347,3.0,1435994597
1,326,71462,4.0,1322252335
2,57,2115,3.0,965798155
3,610,1127,4.0,1479544102
4,462,2409,2.0,1174438249


In [8]:
import numpy as np

user_id_map = {}  # Maps original user IDs to consecutive integers
item_id_map = {}  # Maps original item IDs to consecutive integers

def create_data_matrix(data, n_users, n_items):
    """Build a dense user-by-item rating matrix from a ratings table.

    Parameters
    ----------
    data : pandas.DataFrame
        Ratings table. The first three columns are read positionally as
        (user id, item id, rating); any further columns are ignored.
    n_users, n_items : int
        Shape of the returned matrix. They must be at least as large as the
        number of distinct IDs held in ``user_id_map`` / ``item_id_map``
        after this call, otherwise the assignment raises ``IndexError``.

    Returns
    -------
    numpy.ndarray
        ``(n_users, n_items)`` float array. Cell ``[u, i]`` holds the rating
        for the user mapped to row ``u`` and the item mapped to column ``i``;
        cells without a rating stay 0. If a (user, item) pair occurs more than
        once, the last occurrence wins.

    Notes
    -----
    The module-level ``user_id_map`` and ``item_id_map`` are extended in
    place: every ID not seen before gets the next free consecutive index, in
    order of first appearance. IDs already present keep their index.
    """
    data_matrix = np.zeros((n_users, n_items))

    # One pass over the rows: assign indices to unseen IDs and store the
    # rating immediately, instead of walking the frame twice.
    for user_id, item_id, rating, *_ in data.itertuples(index=False, name=None):
        # len(...) is evaluated before setdefault inserts, so a new ID gets
        # the current map size as its index.
        user_idx = user_id_map.setdefault(user_id, len(user_id_map))
        item_idx = item_id_map.setdefault(item_id, len(item_id_map))
        data_matrix[user_idx, item_idx] = rating

    return data_matrix

# Size the matrix by the number of distinct users and movies in the training set.
n_users, n_items = (
    train_ratings_df[column].nunique() for column in ("userId", "movieId")
)

train_data_matrix = create_data_matrix(
    data=train_ratings_df, n_users=n_users, n_items=n_items
)


In [9]:
# Inspect the dense rating matrix (NumPy abbreviates the printout of large arrays).
train_data_matrix

array([[3., 0., 0., ..., 0., 0., 0.],
       [0., 4., 0., ..., 0., 0., 0.],
       [0., 0., 3., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
def similarity(ratings, epsilon=1e-9):
    """Cosine similarity between every pair of rows of ``ratings``.

    ``epsilon`` is added to each pairwise dot product (including the
    diagonal) before normalising, which keeps the norm of an all-zero row
    strictly positive and so avoids division by zero.

    Returns a square array with one row and one column per row of
    ``ratings``.
    """
    gram = ratings.dot(ratings.T) + epsilon
    # The diagonal holds each row's squared norm (plus epsilon).
    row_norms = np.sqrt(np.diagonal(gram))
    # Divide column j by norm j first, then row i by norm i.
    scaled_by_column = gram / row_norms[np.newaxis, :]
    return scaled_by_column / row_norms[:, np.newaxis]

In [12]:
# Pairwise user-user similarity; print the top-left 4x4 corner as a sanity check.
user_similarity = similarity(ratings=train_data_matrix)
print(user_similarity[0:4, 0:4])

[[1.         0.06614519 0.11063292 0.20742591]
 [0.06614519 1.         0.05894987 0.14144961]
 [0.11063292 0.05894987 1.         0.12721823]
 [0.20742591 0.14144961 0.12721823 1.        ]]


In [16]:
def predict(ratings, similarity):
    """Predict every rating as a similarity-weighted average of ``ratings``.

    Row ``i`` of the result is ``similarity[i]`` applied to the rows of
    ``ratings``, divided by the sum of the absolute values of
    ``similarity[i]``.
    """
    weighted_sum = similarity.dot(ratings)
    total_weight = np.abs(similarity).sum(axis=1)
    # Column vector so that each output row is scaled by its own total weight.
    return weighted_sum / total_weight.reshape(-1, 1)

In [17]:
# Similarity-weighted average over all users, for every (user, item) cell.
prediction = predict(ratings=train_data_matrix, similarity=user_similarity)

In [18]:
# Inspect the predicted rating matrix.
prediction

array([[2.05089431e-01, 3.71876021e-02, 7.54057547e-01, ...,
        1.74041964e-02, 8.41945411e-03, 1.50313887e-02],
       [1.20704981e-01, 1.15232776e-01, 5.46078848e-01, ...,
        7.91609520e-03, 1.01740153e-02, 1.06419858e-02],
       [1.09825862e-01, 2.15415825e-02, 8.63164250e-01, ...,
        7.41215328e-03, 3.48586462e-03, 8.72275068e-03],
       ...,
       [5.23592042e-02, 1.35235308e-02, 4.11362388e-01, ...,
        5.27397564e-03, 2.65239183e-03, 5.63604570e-03],
       [9.59104127e-02, 2.79167654e-02, 6.04960228e-01, ...,
        7.88365489e-03, 1.45876692e-12, 8.33550678e-03],
       [1.17223142e-01, 8.71271391e-03, 8.48668733e-01, ...,
        5.78439983e-03, 8.42224770e-03, 9.66882501e-03]])

Using only the top-k most similar users

In [21]:
def predict_topk(ratings, similarity, kind='user', k=40):
    """Predict ratings from only the ``k`` most similar neighbours.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D rating matrix, indexed ``[user, item]``.
    similarity : numpy.ndarray
        Square similarity matrix: user-by-user when ``kind='user'``,
        item-by-item when ``kind='item'``.
    kind : {'user', 'item'}
        Which axis the neighbourhood is taken over.
    k : int
        Neighbourhood size. If ``k`` exceeds the number of candidates, all of
        them are used.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``ratings``. Each prediction is the
        similarity-weighted sum of the neighbours' ratings divided by the sum
        of the absolute similarities of those neighbours.

    Raises
    ------
    ValueError
        If ``kind`` is neither ``'user'`` nor ``'item'``.

    Notes
    -----
    Neighbours are ranked by column ``similarity[:, i]`` and weighted by row
    ``similarity[i, :]``; for a symmetric similarity matrix these coincide.
    The target itself is not excluded from its own neighbourhood.
    """
    if kind not in ('user', 'item'):
        raise ValueError("kind must be 'user' or 'item', got %r" % (kind,))

    pred = np.zeros(ratings.shape)

    if kind == 'user':
        for i in range(ratings.shape[0]):
            # Indices of the k largest similarities, best first. Kept as a
            # plain 1-D index array: wrapping it in a list is read as a 2-D
            # index by current NumPy releases.
            top_k = np.argsort(similarity[:, i])[::-1][:k]
            weights = similarity[i, top_k]
            # One vector-matrix product fills the whole row at once.
            pred[i, :] = weights.dot(ratings[top_k, :]) / np.sum(np.abs(weights))
    else:
        for j in range(ratings.shape[1]):
            top_k = np.argsort(similarity[:, j])[::-1][:k]
            weights = similarity[j, top_k]
            pred[:, j] = ratings[:, top_k].dot(weights) / np.sum(np.abs(weights))

    return pred

In [None]:
# Top-k user-based predictions using the 40 most similar users, then report MSE.
# NOTE(review): `get_mse` and `test` are not defined anywhere in the visible
# part of this notebook; unless they come from a cell not shown here, this
# cell raises NameError on a fresh kernel — TODO define/load them earlier.
pred = predict_topk(train_data_matrix, user_similarity, k=40)
print('Top-k User-based CF MSE: ' + str(get_mse(pred, test)))

In [None]:
# Translate the original IDs (userId 326, movieId 71462) into matrix indices.
# Plain indexing is used instead of .get(): for an unknown ID .get() returns
# None, and NumPy treats a None index as np.newaxis, so pred[None, None] would
# silently yield a reshaped view of the whole matrix instead of failing.
# Here a missing ID raises KeyError naming the offending ID.
user_idx = user_id_map[326]
movie_idx = item_id_map[71462]

# Get the predicted rating for the target user and item
predicted_rating = pred[user_idx, movie_idx]
print(predicted_rating)