# Collaborative Filtering -- Item Based Collaborative Filtering with Singular Value Decomposition

https://surprise.readthedocs.io/en/stable/FAQ.html

https://www.kaggle.com/code/ibtesama/getting-started-with-a-movie-recommendation-system

In [12]:
import pandas as pd 
import numpy as np 
import os

from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from collections import defaultdict
from surprise import Reader
from surprise import accuracy

In [13]:
# Load the ratings table; the cells below read its userId, movieId and rating columns.
# The other MovieLens tables are not used in the cells shown, so their loads stay disabled:
# df_movies=pd.read_csv('ml-25m/movies.csv')
# df_tags=pd.read_csv('ml-25m/tags.csv')
df_ratings=pd.read_csv('../db/movieLens/processed/rating.csv')
# df_links=pd.read_csv('ml-25m/links.csv')
# df_gscore=pd.read_csv('ml-25m/genome-scores.csv')
# df_gtag=pd.read_csv('ml-25m/genome-tags.csv')

## Training Data Preparation

In [14]:
# The Surprise library requires a Reader to describe the data format;
# here it declares the rating scale as 1 to 5.
# NOTE(review): confirm the processed ratings never go below 1 (e.g. half-star
# values such as 0.5) -- otherwise the lower bound should be adjusted.
reader = Reader(rating_scale=(1, 5.0))

# Optional subset for faster experiments: when training was too slow, only half
# of the data was used, i.e. the ratings from the first 4838 users.
# df_ratings_half = df_ratings.head(711900)
# data = Dataset.load_from_df(df_ratings_half[['userId', 'movieId', 'rating']], reader) 

# Full data: the columns are passed in the order user id, item id, rating.
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)

In [15]:
# Use the well-known SVD algorithm. No arguments are passed, so the library's
# default hyperparameters apply; they can be tuned later.
algo = SVD()

## Model Training
I haven't saved the model, since it is an in-memory object. However, I checked the documentation of the Surprise library, and we can reduce the size of the training data. In their documentation, training with 100000 ratings by 943 users on 1682 items takes 11 s. (Could we achieve online training by reducing the size of the training data?)

Here, `df_ratings_half` contains half of the rating data (711899 ratings) from 4838 users, and training the model on it took 33.2 s. I saved the model trained on the whole rating data, which took 22 min.

In [16]:
%%time
# Build the training set from every loaded rating (no hold-out split is made
# here) and fit the SVD model on it.
trainset = data.build_full_trainset()
algo.fit(trainset)

CPU times: user 3.32 s, sys: 11.5 ms, total: 3.34 s
Wall time: 3.34 s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ff324e46a00>

## Testing Data Preparation

In [17]:
# Select every rating given by user 100 to use as a small test sample.
# NOTE(review): these rows come from df_ratings, which was also used in full to
# train the model, so this is not held-out data.
df_ratings_test = df_ratings[df_ratings['userId'] == 100]

In [18]:
# Convert user 100's ratings into the list of (user id, item id, rating) tuples
# that is later passed to algo.test(): wrap the frame in a Dataset, build a
# trainset from all of it, then turn that trainset back into a testset.
test = Dataset.load_from_df(df_ratings_test[['userId', 'movieId', 'rating']], reader)
testset1 = test.build_full_trainset()
testset = testset1.build_testset()

In [19]:
# Display the generated test set (one tuple per rating by user 100).
testset

[(100, 1, 4.0),
 (100, 3, 4.0),
 (100, 6, 3.0),
 (100, 7, 3.0),
 (100, 25, 4.0),
 (100, 32, 5.0),
 (100, 52, 3.0),
 (100, 62, 3.0),
 (100, 86, 3.0),
 (100, 88, 2.0),
 (100, 95, 3.0),
 (100, 135, 3.0),
 (100, 141, 3.0),
 (100, 608, 4.0),
 (100, 648, 3.0),
 (100, 661, 3.0),
 (100, 708, 3.0),
 (100, 733, 3.0),
 (100, 736, 3.0),
 (100, 745, 4.0),
 (100, 780, 3.0),
 (100, 786, 3.0),
 (100, 802, 4.0),
 (100, 1073, 5.0),
 (100, 1356, 4.0)]

## Make prediction

In [20]:
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's n best-estimated items.

    Args:
        predictions: Iterable of 5-field prediction records, unpacked as
            (user id, item id, true rating, estimated rating, details),
            e.g. the list returned by an algorithm's ``test`` method.
        n (int): Number of items to keep per user. Defaults to 10.

    Returns:
        A defaultdict(list) mapping each user id to a list of
        (item id, estimated rating) tuples, ordered from the highest to the
        lowest estimate and sliced to the first n entries.

    Side effects:
        Prints the resulting mapping to stdout before returning it.
    """

    def estimated_rating(pair):
        # Sort key: the estimate is the second field of each (item, estimate) pair.
        return pair[1]

    # Bucket (item, estimate) pairs under the user they belong to.
    top_n = defaultdict(list)
    for user_id, item_id, _true_rating, estimate, _details in predictions:
        top_n[user_id].append((item_id, estimate))

    # Rank every bucket by estimate, best first, and trim it to n entries.
    # Only existing keys are reassigned, so the mapping keeps its size while
    # it is being iterated.
    for user_id in top_n:
        ranked = sorted(top_n[user_id], key=estimated_rating, reverse=True)
        top_n[user_id] = ranked[:n]

    print(top_n)
    return top_n

In [21]:
# Predict a rating for every (user, item) pair in the test set.
predictions = algo.test(testset)

# Optionally compute the RMSE of the predictions (currently disabled):
# accuracy.rmse(predictions)

# Keep the 3 items with the highest estimated rating for each user.
# NOTE(review): the test set holds only items user 100 has already rated, so
# these are the best-predicted known items, not unseen recommendations --
# confirm this is intended.
top_n = get_top_n(predictions, n=3)

# Print the recommended item ids for each user (the estimates are dropped).
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

defaultdict(<class 'list'>, {100: [(745, 4.071715495733882), (608, 4.014521879030427), (32, 3.8273643331855993)]})
100 [745, 608, 32]


In [22]:
# Display the full mapping, including the estimated ratings.
top_n

defaultdict(list,
            {100: [(745, 4.071715495733882),
              (608, 4.014521879030427),
              (32, 3.8273643331855993)]})