# Collaborative filtering workflow

In [1]:
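# Preprocess and save the data for collaborative filtering modelling.
# Importing this module runs the preprocessing as a side effect (see the log
# printed below) and exposes the two user ids used later for recommendations.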
from data_script.preprocess_collaborative import hamid_user_id, sveta_user_id

Creating dataframes...
Filtering movies and users...
Out of total of  10370  movies,  8338  are considered rare and will be removed.
The final number of movies is  2032
Out of total of  138493  users,  138046  are considered lazy and will be removed.
The final number of users is  447
Adding Sviatlana's and Hamid's ratings...
Sveta 's user id is 110597
Hamid 's user id is 102850
Creating the pivot matrix...
The resulting sparcity of the matrix is: 0.4620430410293578
Preparing the final rating matrix...
Data preprocesssing for collaborative filtering modeling is completed!


In [2]:
from model.CollaborativeFilteringRec import cf_model

In [3]:
import numpy as np
import pandas as pd
from collections import defaultdict

from surprise import Dataset
from surprise import Reader
from surprise import BaselineOnly, Dataset, Reader, SVD, KNNBasic, KNNBaseline, KNNWithMeans

from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split

# Load the processed ratings table and the movie metadata table from disk.
print("Getting the ratings matrix...")
ratings = pd.read_csv('./data/processed/final_ratings.csv')
movie_df = pd.read_csv('./data/external/movies.csv')

# Wrap the ratings in a Surprise Dataset. Only the three columns listed below
# are handed over, in (user, item, rating) order, and the reader declares the
# rating scale as 0.5 to 5.
print("Preparing data in the Suprise format...")
rating_columns = ["userId", "movieId", "rating"]
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings[rating_columns], reader)

Getting the ratings matrix...
Preparing data in the Suprise format...


In [4]:
# Hold out 25% of the ratings as the test set; the fixed seed keeps the split
# repeatable. TODO: no separate validation set is carved out here.
trainset, testset = train_test_split(data, random_state=42, test_size=0.25)

In [5]:
# Memory-based KNN model: MSD similarity, not user-based (user_based=False),
# with a minimum support of 5 and up to k=30 neighbours.
sim_options = dict(name='msd', min_support=5, user_based=False)

knn_basic = KNNBasic(k=30, sim_options=sim_options)

In [6]:
# Wrap the KNNBasic algorithm in the project's cf_model helper, then fit it on
# the train split, score it on the test split and cross-validate it.
knn_basic_model = cf_model(knn_basic, trainset, testset, data)
knn_basic_model.fit_and_predict()
knn_basic_model.cross_validate()

# Top-20 request for Sveta; the left merge attaches title/genres from movie_df
# while keeping every recommended row.
sveta_knn_basic_recs = knn_basic_model.recommend(sveta_user_id, 20)
sveta_knn_basic_recs.merge(movie_df, how='left', on='movieId')

Fitting the train data...
Computing the msd similarity matrix...
Done computing similarity matrix.
Predicting the test data...
RMSE: 0.7648
RMSE for the predicted result is 0.765
Cross Validating the data...
Mean CV RMSE is 0.759
Recommending top 20 products for userid : 110597 ...


Unnamed: 0,userId,movieId,rating,title,genres
0,110597,1206,4.467957,"Clockwork Orange, A (1971)",Crime|Drama|Sci-Fi|Thriller
1,110597,541,4.405479,Blade Runner (1982),Action|Sci-Fi|Thriller
2,110597,527,4.288721,Schindler's List (1993),Drama|War
3,110597,58559,4.279162,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
4,110597,4973,4.20484,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",Comedy|Romance
5,110597,4993,4.196493,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
6,110597,5952,4.117202,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
7,110597,48385,4.011684,Borat: Cultural Learnings of America for Make ...,Comedy
8,110597,1036,3.985201,Die Hard (1988),Action|Crime|Thriller
9,110597,4011,3.98019,Snatch (2000),Comedy|Crime|Thriller


In [17]:
# Query the already-fitted KNNBasic model for Hamid's user id and attach
# title/genres from movie_df via a left merge.
hamid_knn_basic_recs = knn_basic_model.recommend(hamid_user_id, 20)
hamid_knn_basic_recs.merge(movie_df, how='left', on='movieId')

Recommending top 20 products for userid : 102850 ...


Unnamed: 0,userId,movieId,rating,title,genres
0,102850,924,4.222926,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi
1,102850,923,4.16415,Citizen Kane (1941),Drama|Mystery
2,102850,2010,4.151352,Metropolis (1927),Drama|Sci-Fi
3,102850,2019,4.111878,Seven Samurai (Shichinin no samurai) (1954),Action|Adventure|Drama
4,102850,593,4.09561,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
5,102850,1228,4.084757,Raging Bull (1980),Drama
6,102850,1217,4.052776,Ran (1985),Drama|War
7,102850,1206,4.052221,"Clockwork Orange, A (1971)",Crime|Drama|Sci-Fi|Thriller
8,102850,3462,4.027317,Modern Times (1936),Comedy|Drama|Romance
9,102850,1211,4.016986,"Wings of Desire (Himmel über Berlin, Der) (1987)",Drama|Fantasy|Romance


In [14]:
# KNNWithMeans variant: cosine similarity, not user-based (user_based=False),
# with a minimum support of 5 and up to k=25 neighbours.
# Note: this rebinds the notebook-level `sim_options` name.
sim_options = dict(name='cosine', min_support=5, user_based=False)

knn_with_means = KNNWithMeans(k=25, sim_options=sim_options)

In [15]:
# Wrap the KNNWithMeans algorithm in the project's cf_model helper, then fit,
# score on the test split and cross-validate.
knn_with_means_model = cf_model(knn_with_means, trainset, testset, data)
knn_with_means_model.fit_and_predict()
knn_with_means_model.cross_validate()

# Top-20 request for Sveta. Because the merge is how='left', a recommended
# movieId with no row in movie_df is kept with empty title/genres.
sveta_knn_means_recs = knn_with_means_model.recommend(sveta_user_id, 20)
sveta_knn_means_recs.merge(movie_df, how='left', on='movieId')

Fitting the train data...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Predicting the test data...
RMSE: 0.7731
RMSE for the predicted result is 0.773
Cross Validating the data...
Mean CV RMSE is 0.768
Recommending top 20 products for userid : 110597 ...


Unnamed: 0,userId,movieId,rating,title,genres
0,110597,202439,5.0,,
1,110597,1206,4.569605,"Clockwork Orange, A (1971)",Crime|Drama|Sci-Fi|Thriller
2,110597,58559,4.385288,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
3,110597,527,4.348882,Schindler's List (1993),Drama|War
4,110597,541,4.340022,Blade Runner (1982),Action|Sci-Fi|Thriller
5,110597,4993,4.241033,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
6,110597,4973,4.198562,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",Comedy|Romance
7,110597,5952,4.136885,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
8,110597,1036,4.032884,Die Hard (1988),Action|Crime|Thriller
9,110597,4011,3.909083,Snatch (2000),Comedy|Crime|Thriller


In [16]:
# Matrix-factorisation model. SVD initialises its factors randomly, so a fixed
# seed (the same value used for the train/test split) makes the fit repeatable
# across kernel restarts instead of producing slightly different RMSE values
# and recommendations on every run.
svd = SVD(random_state=42)

# Wrap the algorithm in the project's cf_model helper, then fit it on the
# train split, score it on the test split and cross-validate it.
svd_model = cf_model(svd, trainset, testset, data)
svd_model.fit_and_predict()
svd_model.cross_validate()

# Top-20 request for Sveta; the left merge attaches title/genres from movie_df
# while keeping every recommended row.
svd_model.recommend(sveta_user_id, 20).merge(movie_df, on='movieId', how='left')

Fitting the train data...
Predicting the test data...
RMSE: 0.7417
RMSE for the predicted result is 0.742
Cross Validating the data...
Mean CV RMSE is 0.737
Recommending top 20 products for userid : 110597 ...


Unnamed: 0,userId,movieId,rating,title,genres
0,110597,58559,4.849629,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
1,110597,527,4.836052,Schindler's List (1993),Drama|War
2,110597,1206,4.563817,"Clockwork Orange, A (1971)",Crime|Drama|Sci-Fi|Thriller
3,110597,1036,4.445824,Die Hard (1988),Action|Crime|Thriller
4,110597,541,4.382573,Blade Runner (1982),Action|Sci-Fi|Thriller
5,110597,3147,4.28614,"Green Mile, The (1999)",Crime|Drama
6,110597,4973,4.239556,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",Comedy|Romance
7,110597,48385,4.201484,Borat: Cultural Learnings of America for Make ...,Comedy
8,110597,4011,4.142976,Snatch (2000),Comedy|Crime|Thriller
9,110597,4720,4.118383,"Others, The (2001)",Drama|Horror|Mystery|Thriller


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
