In [9]:

import pandas as pd

In [12]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

In [13]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [14]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [15]:
num_users = ratings['userId'].nunique()
num_movies = ratings['movieId'].nunique()

In [16]:
num_users

7120

In [17]:
num_movies

14026

In [18]:

rating_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)


In [19]:
rating_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,129350,129354,129428,129707,130052,130073,130219,130462,130490,130642
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7116,4.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7117,4.0,0.0,4.0,0.0,0.0,5.0,3.0,0.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7119,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:

reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)


model = SVD()


cross_validate(model, data, cv=3, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8426  0.8466  0.8457  0.8450  0.0017  
MAE (testset)     0.6460  0.6485  0.6480  0.6475  0.0011  
Fit time          6.71    7.22    7.20    7.04    0.24    
Test time         2.41    2.57    2.22    2.40    0.14    


{'test_rmse': array([0.84262878, 0.84663439, 0.8457131 ]),
 'test_mae': array([0.64600526, 0.648469  , 0.64799195]),
 'fit_time': (6.707607269287109, 7.218333721160889, 7.201330661773682),
 'test_time': (2.4128408432006836, 2.5713422298431396, 2.218877077102661)}

In [22]:
# Filter movies rated 5 stars by user id 5
ratings_1 = ratings[ratings['userId'] == 5]
ratings_1 = ratings_1[ratings_1['rating'] == 5]

# Merge with movies dataset to get movie titles
ratings_1 = pd.merge(ratings_1, movies, on='movieId')
print(ratings_1[['movieId', 'title']])


    movieId                                              title
0        11                     American President, The (1995)
1        62                          Mr. Holland's Opus (1995)
2       141                               Birdcage, The (1996)
3       150                                   Apollo 13 (1995)
4       260          Star Wars: Episode IV - A New Hope (1977)
5       318                   Shawshank Redemption, The (1994)
6       364                              Lion King, The (1994)
7       368                                    Maverick (1994)
8       377                                       Speed (1994)
9       380                                   True Lies (1994)
10      440                                        Dave (1993)
11      454                                   Firm, The (1993)
12      457                               Fugitive, The (1993)
13      500                              Mrs. Doubtfire (1993)
14      508                                Philadelphia

In [24]:
# Create a shallow copy of movies dataset
user_5 = movies.copy()
user_5

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)


In [27]:
# Train the model on the entire dataset
trainset = data.build_full_trainset()
model.fit(trainset)



<surprise.prediction_algorithms.matrix_factorization.SVD at 0x19dba3ca110>

In [28]:
# Predict ratings for user id 5
movie_ids = movies['movieId'].tolist()
predictions = [model.predict(uid=5, iid=mid) for mid in movie_ids]

# Convert predictions to a DataFrame
pred_df = pd.DataFrame([(pred.iid, pred.est) for pred in predictions], columns=['movieId', 'predicted_rating'])


In [29]:
pred_df

Unnamed: 0,movieId,predicted_rating
0,1,4.510682
1,2,3.495466
2,3,3.668443
3,4,3.482228
4,5,3.798075
...,...,...
27273,131254,4.016275
27274,131256,4.016275
27275,131258,4.016275
27276,131260,4.016275


In [26]:
# Merge predictions with movie titles
top_predictions = pd.merge(pred_df, movies, on='movieId')

# Get top 10 movie recommendations
top_10_recommendations = top_predictions.sort_values(by='predicted_rating', ascending=False).head(10)
print(top_10_recommendations[['title', 'predicted_rating']])


                                                   title  predicted_rating
1173   Raiders of the Lost Ark (Indiana Jones and the...          5.000000
6873   Passion of Joan of Arc, The (Passion de Jeanne...          5.000000
352                                  Forrest Gump (1994)          5.000000
523                              Schindler's List (1993)          4.976203
360                                Lion King, The (1994)          4.967658
1263           Indiana Jones and the Last Crusade (1989)          4.946493
2416                                  October Sky (1999)          4.907460
17874                               Avengers, The (2012)          4.906369
1557                    Hunt for Red October, The (1990)          4.899274
257            Star Wars: Episode IV - A New Hope (1977)          4.898929
