Model-Based Collaborative Filtering: Matrix Factorization

In [2]:
#!pip install surprise
import pandas as pd
from surprise import Reader, SVD, Dataset, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate
pd.set_option('display.max_columns', None)

 Preparing the Data Set

In [3]:
movie = pd.read_csv('/kaggle/input/recommendation-system/ml-25m/movies.csv')
rating = pd.read_csv('/kaggle/input/recommendation-system/ml-25m/ratings.csv')
df = movie.merge(rating, how="left", on="movieId")
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2.0,3.5,1141416000.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1439472000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,3.0,1573944000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,858625900.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,890492500.0


In [4]:
movie_ids = [130219, 356, 4422, 541]
movies = ["The Dark Knight (2011)",
          "Cries and Whispers (Viskningar och rop) (1972)",
          "Forrest Gump (1994)",
          "Blade Runner (1982)"]

In [5]:
sample_df = df[df.movieId.isin(movie_ids)]
sample_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
2466478,356,Forrest Gump (1994),Comedy|Drama|Romance|War,2.0,4.5,1141417000.0
2466479,356,Forrest Gump (1994),Comedy|Drama|Romance|War,3.0,4.0,1439472000.0
2466480,356,Forrest Gump (1994),Comedy|Drama|Romance|War,5.0,4.0,833146700.0
2466481,356,Forrest Gump (1994),Comedy|Drama|Romance|War,8.0,5.0,890489600.0
2466482,356,Forrest Gump (1994),Comedy|Drama|Romance|War,9.0,5.0,859383400.0


In [6]:
sample_df.shape

(118979, 6)

In [7]:
user_movie_df = sample_df.pivot_table(index=["userId"],
                                      columns=["title"],
                                      values="rating")

In [8]:
reader = Reader(rating_scale=(1, 5))

In [9]:
data = Dataset.load_from_df(sample_df[['userId',
                                       'movieId',
                                       'rating']], reader)

Modeling

In [10]:
trainset, testset = train_test_split(data, test_size=.25)
svd_model = SVD()
svd_model.fit(trainset)
predictions = svd_model.test(testset)

In [11]:
svd_model.predict(uid=3.0, iid=4422, verbose=True)
svd_model.predict(uid=3.0, iid=541, verbose=True)
sample_df[sample_df["userId"] == 3]

user: 3.0        item: 4422       r_ui = None   est = 4.04   {'was_impossible': False}
user: 3.0        item: 541        r_ui = None   est = 3.98   {'was_impossible': False}


Unnamed: 0,movieId,title,genres,userId,rating,timestamp
2466479,356,Forrest Gump (1994),Comedy|Drama|Romance|War,3.0,4.0,1439472000.0
3614459,541,Blade Runner (1982),Action|Sci-Fi|Thriller,3.0,5.0,1439475000.0


In [12]:
accuracy.rmse(predictions)

RMSE: 0.9343


0.9343183062043834

Model Tuning

In [13]:
param_grid = {'n_epochs': [5, 10, 20, 18, 15],
              'lr_all': [0.001, 0.002, 0.0015, 0.005, 0.007]}

In [14]:
gs = GridSearchCV(SVD,
                  param_grid,
                  measures=['rmse', 'mae'],
                  cv=3,
                  n_jobs=-1,
                  joblib_verbose=True)
gs.fit(data)
gs.best_score['rmse']
gs.best_params['rmse']

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.7s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:   43.0s finished


{'n_epochs': 15, 'lr_all': 0.001}

Final Model and Prediction

In [15]:
dir(svd_model)
svd_model.n_epochs
svd_model = SVD(**gs.best_params['rmse'])
data = data.build_full_trainset()
svd_model.fit(data)
svd_model.predict(uid=3.0, iid=541, verbose=True)

user: 3.0        item: 541        r_ui = None   est = 4.27   {'was_impossible': False}


Prediction(uid=3.0, iid=541, r_ui=None, est=4.265594767375949, details={'was_impossible': False})