<a href="https://www.kaggle.com/code/hazemegy/movie-recommendation-with-svd-based-on-ratings?scriptVersionId=186086083" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import pandas as pd
from surprise import Dataset, Reader, SVD, dump
from surprise.model_selection import train_test_split, cross_validate
from sklearn.metrics import r2_score

# Load the dataset
ratings = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv')
movies = pd.read_csv('/kaggle/input/movielens-20m-dataset/movie.csv')

In [2]:
display(ratings)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24


In [3]:
import pandas as pd
from surprise import Dataset, Reader

# Define the rating scale and load data into Surprise format
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [4]:
from surprise import SVD

# Split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.25)

# Train the SVD algorithm on the training set
svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7898e0de64a0>

In [5]:
from surprise import dump

# Save the trained model to a file
dump.dump('svd_model', algo=svd)

In [6]:
# Load the trained model from the file
_, loaded_model = dump.load('svd_model')

In [7]:
# Function to get top N recommendations
def get_top_n_recommendations(user_id, n=10):
    # Get a list of all movie IDs
    all_movie_ids = movies['movieId'].unique()
    
    # Get a list of movie IDs the user has already rated
    rated_movie_ids = ratings[ratings['userId'] == user_id]['movieId'].unique()
    
    # Create a list to store predicted ratings
    predictions = []
    
    # Predict ratings for all movies the user hasn't rated yet
    for movie_id in all_movie_ids:
        if movie_id not in rated_movie_ids:
            prediction = loaded_model.predict(user_id, movie_id)
            predictions.append((movie_id, prediction.est))
    
    # Sort the predictions by rating in descending order and select the top n
    top_n_predictions = sorted(predictions, key=lambda x: x[1], reverse=True)[:n]
    
    # Get the movie titles for the top n recommendations
    top_n_movie_ids = [pred[0] for pred in top_n_predictions]
    top_n_movies = movies[movies['movieId'].isin(top_n_movie_ids)]
    
    return top_n_movies[['movieId', 'title', 'genres']]

# Example usage: Get top 10 recommendations for a specific user
user_id = 1  # Replace with actual user ID
top_n_recommendations = get_top_n_recommendations(user_id, n=10)
display(top_n_recommendations)

Unnamed: 0,movieId,title,genres
2486,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
8532,25987,"Crucified Lovers, The (Chikamatsu monogatari) ...",Drama
12354,57069,Equinox Flower (Higanbana) (1958),Comedy|Drama
12419,57772,World on a Wire (Welt am Draht) (1973),Crime|Sci-Fi
12502,58376,Zeitgeist: The Movie (2007),Documentary|War
16191,81834,Harry Potter and the Deathly Hallows: Part 1 (...,Action|Adventure|Fantasy|IMAX
17499,88125,Harry Potter and the Deathly Hallows: Part 2 (...,Action|Adventure|Drama|Fantasy|Mystery|IMAX
20540,100553,Frozen Planet (2011),Documentary
21517,104374,About Time (2013),Drama|Fantasy|Romance
24725,116897,Wild Tales (2014),Comedy|Drama|Thriller


In [9]:
# Collect actual and predicted ratings for R² calculation
y_true = []
y_pred = []

for uid, mid, true_r, est, _ in loaded_model.test(testset):
    y_true.append(true_r)
    y_pred.append(est)

# Calculate R²
r2 = r2_score(y_true, y_pred)
print(f'R²: {r2:.4f}')

R²: 0.4375
