In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate
from surprise import SVD, SVDpp, NMF

# Data preparation

In [2]:
# Directory prefix and base names (without extension) of the CSV tables to load.
path = "data/"
items = ["links", "movies", "ratings", "tags"]

# Read the tables in the order listed in `items`, one DataFrame per file,
# and bind each to the variable of the same name.
links, movies, ratings, tags = (
    pd.read_csv(f"{path}{table_name}.csv") for table_name in items
)


In [3]:
# Tell Surprise which rating scale the `rating` column uses.
reader = Reader(rating_scale=(0.5, 5.0))

# Surprise expects exactly three columns, in (user, item, rating) order.
data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)

# Hold out 10% of the ratings. The seed is pinned so that a kernel restart
# followed by "Run All" reproduces the same train/test partition.
trainset, testset = train_test_split(data, test_size=0.1, random_state=42)

# Algorithms test

In [4]:
def _cross_validate_then_fit(algo):
    """Print 5-fold cross-validated RMSE for ``algo``, then fit it on ``trainset``.

    ``cross_validate`` fits the estimator it is given on every fold, so any
    fit performed *before* it is discarded. Fitting on ``trainset`` afterwards
    guarantees that the object used later for predictions was trained on the
    hold-out split's training part rather than on whichever CV fold ran last.

    Args:
        algo: An unfitted Surprise algorithm instance.

    Returns:
        The dictionary returned by ``cross_validate`` for this algorithm.
    """
    results = cross_validate(algo, data, measures=["RMSE"], cv=5, verbose=True)
    algo.fit(trainset)
    return results


algo_svd = SVD()
results_svd = _cross_validate_then_fit(algo_svd)

algo_svdpp = SVDpp()
results_svdpp = _cross_validate_then_fit(algo_svdpp)

algo_nmf = NMF()
results_nmf = _cross_validate_then_fit(algo_nmf)


Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8703  0.8731  0.8705  0.8717  0.8788  0.8729  0.0031  
Fit time          0.61    0.68    0.67    0.61    0.62    0.64    0.03    
Test time         0.10    0.10    0.11    0.05    0.10    0.09    0.02    
Evaluating RMSE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8595  0.8611  0.8670  0.8641  0.8610  0.8625  0.0027  
Fit time          58.25   58.55   58.48   58.34   58.88   58.50   0.22    
Test time         5.50    5.18    4.99    5.72    5.57    5.39    0.27    
Evaluating RMSE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9216  0.9164  0.9273  0.9202  0.9173  0.9206  0.0038  
Fit time          1.16    1.37    1.33    1.17    1.16    1.24    0.09    
Test time         0.05    

## Example of usage

In [5]:
def recommend_movies_by_genre(algo, user_id, num_recommendations=5):
    """Recommend unrated movies whose genres match the user's rating history.

    Reads the module-level ``ratings`` and ``movies`` DataFrames. Every movie
    the user has not rated gets a genre score: for each genre listed on the
    movie, the number of movies with that genre the user has rated, multiplied
    by a positional weight (5 for the first listed genre, 4 for the second,
    ... never below 0). The best-scoring movies form a candidate pool for
    which ``algo`` predicts a rating. The final ranking is by genre score,
    with the predicted rating only breaking ties between equal genre scores.

    Args:
        algo: Model exposing ``predict(user_id, movie_id)`` whose result has
            an ``est`` attribute.
        user_id: Value matched against ``ratings['userId']``.
        num_recommendations: Number of movies to return.

    Returns:
        tuple: ``(favorite_genres, recommendations)`` where
        ``favorite_genres`` is a Series with the user's (at most) five most
        frequent genres and their counts, and ``recommendations`` is a
        DataFrame with columns ``movieId``, ``title``, ``genres`` and
        ``predicted_rating``.
    """
    first_genre_weight = 5       # weight of the first listed genre, minus 1 per position
    candidate_pool_factor = 10   # candidates scored by `algo` per requested recommendation
    favorite_genres_shown = 5

    # Movies rated by user
    rated_ids = ratings.loc[ratings['userId'] == user_id, 'movieId'].tolist()
    already_rated = movies['movieId'].isin(rated_ids)

    # How many rated movies carry each genre (sorted most frequent first)
    genre_counts = movies.loc[already_rated, 'genres'].str.split('|').explode().value_counts()

    # Movies not rated by user
    unrated_movies = movies[~already_rated].copy()

    def genre_score(movie_genres):
        # Clamp the positional weight at 0: without the clamp, genres listed
        # 7th or later would get a negative weight and *lower* the score of a
        # movie for matching a genre the user likes.
        return sum(
            genre_counts.get(genre, 0) * max(first_genre_weight - idx, 0)
            for idx, genre in enumerate(movie_genres.split('|'))
        )

    unrated_movies['genre_score'] = unrated_movies['genres'].apply(genre_score)

    # Explicit copy so the column assignment below writes to an independent
    # frame instead of a slice (avoids pandas' SettingWithCopyWarning).
    candidates = (
        unrated_movies
        .sort_values(by='genre_score', ascending=False)
        .head(num_recommendations * candidate_pool_factor)
        .copy()
    )

    candidates['predicted_rating'] = candidates['movieId'].apply(
        lambda movie_id: algo.predict(user_id, movie_id).est
    )

    top_movies = candidates.sort_values(
        by=['genre_score', 'predicted_rating'], ascending=False
    ).head(num_recommendations)

    favorite_genres = genre_counts.head(favorite_genres_shown)

    return favorite_genres, top_movies[['movieId', 'title', 'genres', 'predicted_rating']]

In [6]:
# For each model, print its class name, user 50's favourite genres and the
# genre-based recommendations produced with that model.
for algo in (algo_svd, algo_svdpp, algo_nmf):
    print(type(algo).__name__)
    favorite_genres, recommendations = recommend_movies_by_genre(
        algo,
        user_id=50,
        num_recommendations=5,
    )
    print("Favourite genres:", favorite_genres, sep="\n")
    print("\nRecommendations by favourite genres:", recommendations, sep="\n")

SVD
Favourite genres:
genres
Drama        126
Comedy       121
Action        81
Adventure     80
Thriller      57
Name: count, dtype: int64

Recommendations by favourite genres:
      movieId                                  title  \
6570    55116              Hunting Party, The (2007)   
8597   117646  Dragonheart 2: A New Beginning (2000)   
6219    45672                           Click (2006)   
7111    70728                         Bronson (2009)   
6765    59947                  Protector, The (1985)   

                                              genres  predicted_rating  
6570          Action|Adventure|Comedy|Drama|Thriller          2.458624  
8597  Action|Adventure|Comedy|Drama|Fantasy|Thriller          2.805243  
6219          Adventure|Comedy|Drama|Fantasy|Romance          2.427357  
7111                    Action|Comedy|Drama|Thriller          2.723364  
6765                    Action|Comedy|Drama|Thriller          2.397964  
SVDpp
Favourite genres:
genres
Drama        126

# Summary

The SVD++ algorithm achieved the lowest RMSE, indicating the highest prediction accuracy among the three methods. However, when it comes to computational performance, both SVD and NMF were significantly faster, with SVD++ being particularly computationally intensive due to its complexity.

Despite the superior accuracy of SVD++, the SVD algorithm demonstrated the best balance between computational cost and prediction quality. SVD provided relatively low RMSE values while requiring less computational time compared to SVD++.

In conclusion, while SVD++ offers the best accuracy, the SVD algorithm is the optimal choice when considering the trade-off between computational cost and prediction quality, making it the most efficient algorithm in terms of both performance and resource usage.