# Collaborative filtering workflow

In [1]:
import numpy as np
import pandas as pd
import random
from collections import defaultdict

from surprise import Dataset
from surprise import Reader
from surprise import BaselineOnly, Dataset, Reader, SVD, KNNBasic, KNNBaseline, KNNWithMeans, dump

from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split

from model.CollaborativeFilteringRec import cf_model

from evaluation.Evaluate import Evaluate

## Data preprocessing

In [2]:
# Preprocess and save the data for collaborative-filtering modelling.
# Importing this module runs the preprocessing pipeline as a side effect (see the log
# printed below this cell) and brings the ids assigned to Hamid and Sveta into scope.
from data_script.preprocess_collaborative import hamid_user_id, sveta_user_id

Creating dataframes...
Filtering users...
Out of total of  138493  users,  138266  are considered lazy and will be removed.
The final number of users is  227
Adding Sviatlana's and Hamid's ratings... we are lazy users, but still...
Sveta 's user id is 92024
Hamid 's user id is 105352
Adding anti-Sviatlana's and anti-Hamid's ratings... they are lazy users, but still...
anti-Sveta 's user id is 117575
anti-Hamid 's user id is 119550
Now, the final number of users is  231
Filtering movies...
Out of total of  10383  movies,  8478  are considered rare and will be removed.
The final number of movies is  1905
Creating the pivot matrix...
The resulting sparcity of the matrix is: 0.3586460783311177
Preparing the final rating matrix...
Data preprocesssing for collaborative filtering modeling is completed!


In [3]:
# Ids of the hand-added users and their "anti" counterparts, as reported in the
# preprocessing log above.
# NOTE(review): these literals shadow the values imported from
# data_script.preprocess_collaborative — re-check them whenever preprocessing is re-run.
sveta_user_id, hamid_user_id = 92024, 105352
anti_sveta_id, anti_hamid_id = 117575, 119550

In [4]:
# Load the final ratings table and the movie metadata (titles, genres) that is
# later merged onto the recommendations.
print("Getting the ratings matrix...")
ratings, movie_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/processed/final_ratings.csv', './data/external/movies.csv')
)

Getting the ratings matrix...


In [5]:
# Prepare the data in Surprise's format.
print("Preparing data in the Suprise format...")
# Ratings range from 0.5 to 5.
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)

# Hold-out split for evaluation, currently disabled (TODO: what about a validation split?).
#trainset, testset = train_test_split(data, test_size=.25, random_state=42)

# For final predictions: train on every known rating and predict the (user, item)
# pairs that are not in the training set.
# NOTE(review): per the Surprise docs, build_anti_testset() fills the unknown "true"
# rating with the global mean by default, so an RMSE computed on this testset would not
# measure real prediction error — confirm how cf_model uses it.
trainset = data.build_full_trainset()
testset = trainset.build_anti_testset()

Preparing data in the Suprise format...


## Calculation of the quality of the collaborative filtering

 **Variety**

In [10]:
# Reload the (predictions, algorithm) pair written by the dump cell of the "KNN Basic"
# section further down; that section must have been run at least once beforehand.
knn_basic_preds, knn_basic_algo = dump.load('./model/trained_models/KNNBasic_Model')
# The literal 10 is presumably the top-N list size (the training cells pass 20) — TODO confirm against cf_model.
knn_basic_model = cf_model(knn_basic_algo, trainset, testset, data, 10, knn_basic_preds)
knn_basic_model.fit_and_predict()
eval_knn_basic = Evaluate(knn_basic_model)
# Last expression of the cell: the saved output shows a 2-tuple.
eval_knn_basic.Variety()

RMSE: 0.6760


(0.40996, 0.01603865331005069)

In [6]:
# Reload the (predictions, algorithm) pair written by the dump cell of the
# "KNN With Means" section further down; that section must have been run beforehand.
knn_wmeans_preds, knn_wmeans_algo = dump.load('./model/trained_models/KNNWithMeans_Model')
# The literal 10 is presumably the top-N list size (the training cells pass 20) — TODO confirm against cf_model.
knn_wmeans_model = cf_model(knn_wmeans_algo, trainset, testset, data, 10, knn_wmeans_preds)
knn_wmeans_model.fit_and_predict()
eval_knn_wmeans = Evaluate(knn_wmeans_model)
# Last expression of the cell: the saved output shows a 2-tuple.
eval_knn_wmeans.Variety()

RMSE: 0.6965


(0.30147, 0.011564994595761815)

In [7]:
# Reload the (predictions, algorithm) pair written by the dump cell of the "SVD"
# section further down; that section must have been run beforehand.
svd_preds, svd_algo = dump.load('./model/trained_models/SVDDefault_Model')
# The literal 10 is presumably the top-N list size (the training cells pass 20) — TODO confirm against cf_model.
svd_model = cf_model(svd_algo, trainset, testset, data, 10, svd_preds)
svd_model.fit_and_predict()
eval_svd = Evaluate(svd_model)
# Last expression of the cell: the saved output shows a 2-tuple.
eval_svd.Variety()

RMSE: 0.7238


(0.38679, 0.014470863830469832)

**Personalization**

In [None]:
# TODO: find pairs of most similar users

## Pre-clustering users

## KNN Basic

In [6]:
# Memory-based KNN model.
# user_based=False makes Surprise compute item-item similarities; 'msd' is the
# mean-squared-difference measure and min_support=3 is the minimum overlap required
# for a similarity to be non-zero.
sim_options = dict(name='msd', min_support=3, user_based=False)

knn_basic = KNNBasic(k=25, sim_options=sim_options)

In [7]:
# Wrap the estimator in the project's cf_model helper, then fit and predict.
# The saved output shows the msd similarity matrix being computed and an RMSE reported.
# The literal 20 is presumably the number of recommendations per user — TODO confirm against cf_model.
knn_basic_model = cf_model(knn_basic, trainset, testset, data, 20)
knn_basic_model.fit_and_predict()

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.6760


0.676

In [7]:
# Cross-validate the model; the saved output reports the mean CV RMSE.
knn_basic_model.cross_validate()

Cross Validating the data...
Mean CV RMSE is 0.767


0.767

In [8]:
# Persist the model and its `pred_test` predictions; the "Variety" cell near the top
# reloads this file.
# NOTE(review): `algo` receives the cf_model wrapper, not the underlying Surprise
# algorithm (the "SVD tuned" section dumps the raw algorithm instead) — confirm intended.
dump.dump('model/trained_models/KNNBasic_Model',algo=knn_basic_model,predictions=knn_basic_model.pred_test)

In [9]:
# Recommendations for Sveta, labelled with movie title and genres (first 20 rows).
(
    knn_basic_model
    .recommend_all(sveta_user_id)
    .merge(movie_df, on='movieId', how='left')
    .head(20)
)

Unnamed: 0,userId,movieId,pred_rating,title,genres
0,92024,1222,4.584248,Full Metal Jacket (1987),Drama|War
1,92024,1997,4.479583,"Exorcist, The (1973)",Horror|Mystery
2,92024,56782,4.463551,There Will Be Blood (2007),Drama|Western
3,92024,3424,4.444007,Do the Right Thing (1989),Drama
4,92024,1276,4.435548,Cool Hand Luke (1967),Drama
5,92024,1213,4.433005,Goodfellas (1990),Crime|Drama
6,92024,1228,4.42208,Raging Bull (1980),Drama
7,92024,2731,4.412642,"400 Blows, The (Les quatre cents coups) (1959)",Crime|Drama
8,92024,1237,4.409852,"Seventh Seal, The (Sjunde inseglet, Det) (1957)",Drama
9,92024,1090,4.40896,Platoon (1986),Drama|War


In [13]:
# Recommendations for Hamid, labelled with movie title and genres (first 20 rows).
(
    knn_basic_model
    .recommend_all(hamid_user_id)
    .merge(movie_df, on='movieId', how='left')
    .head(20)
)

Unnamed: 0,userId,movieId,pred_rating,title,genres
0,121198,1252,4.320389,Chinatown (1974),Crime|Film-Noir|Mystery|Thriller
1,121198,904,4.31007,Rear Window (1954),Mystery|Thriller
2,121198,3742,4.301861,Battleship Potemkin (1925),Drama|War
3,121198,2132,4.288105,Who's Afraid of Virginia Woolf? (1966),Drama
4,121198,903,4.262193,Vertigo (1958),Drama|Mystery|Romance|Thriller
5,121198,246,4.2518,Hoop Dreams (1994),Documentary
6,121198,3030,4.24033,Yojimbo (1961),Action|Adventure
7,121198,1254,4.237478,"Treasure of the Sierra Madre, The (1948)",Action|Adventure|Drama|Western
8,121198,72226,4.237116,Fantastic Mr. Fox (2009),Adventure|Animation|Children|Comedy|Crime
9,121198,908,4.234339,North by Northwest (1959),Action|Adventure|Mystery|Romance|Thriller


## KNN With Means

In [11]:
# KNN-with-means model.
# user_based=False makes Surprise compute item-item similarities; here with the
# cosine measure and a minimum overlap (min_support) of 5.
sim_options = dict(name='cosine', min_support=5, user_based=False)

knn_with_means = KNNWithMeans(k=24, sim_options=sim_options)

In [12]:
# Wrap the estimator in the project's cf_model helper, then fit and predict.
# The saved output shows the cosine similarity matrix being computed and an RMSE reported.
# The literal 20 is presumably the number of recommendations per user — TODO confirm against cf_model.
knn_with_means_model = cf_model(knn_with_means, trainset, testset, data, 20)
knn_with_means_model.fit_and_predict()

Fitting the train data...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Predicting the test data...
RMSE: 0.6965
RMSE for the predicted result is 0.697


0.697

In [13]:
# Cross-validate the model; the saved output reports the mean CV RMSE.
knn_with_means_model.cross_validate()

Cross Validating the data...
Mean CV RMSE is 0.77


0.77

In [14]:
# Persist the model and its `pred_test` predictions; the "Variety" cell near the top
# reloads this file.
# NOTE(review): `algo` receives the cf_model wrapper, not the underlying Surprise
# algorithm (the "SVD tuned" section dumps the raw algorithm instead) — confirm intended.
dump.dump('model/trained_models/KNNWithMeans_Model',algo=knn_with_means_model,predictions=knn_with_means_model.pred_test)

In [14]:
# Recommendations for Sveta, labelled with movie title and genres (first 20 rows).
# NOTE(review): the saved output of this cell is a NameError from an out-of-order run;
# execute the fit cell above first and re-run.
(
    knn_with_means_model
    .recommend_all(sveta_user_id)
    .merge(movie_df, on='movieId', how='left')
    .head(20)
)

NameError: name 'knn_with_means_model' is not defined

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


In [16]:
# Recommendations for Hamid, labelled with movie title and genres (first 20 rows).
(
    knn_with_means_model
    .recommend_all(hamid_user_id)
    .merge(movie_df, on='movieId', how='left')
    .head(20)
)

All ratings for userid : 121198 ...


Unnamed: 0,userId,movieId,pred_rating,title,genres
0,121198,1254,4.43919,"Treasure of the Sierra Madre, The (1948)",Action|Adventure|Drama|Western
1,121198,3435,4.423601,Double Indemnity (1944),Crime|Drama|Film-Noir
2,121198,904,4.411895,Rear Window (1954),Mystery|Thriller
3,121198,912,4.407544,Casablanca (1942),Drama|Romance
4,121198,1204,4.403957,Lawrence of Arabia (1962),Adventure|Drama|War
5,121198,1212,4.370332,"Third Man, The (1949)",Film-Noir|Mystery|Thriller
6,121198,1207,4.368798,To Kill a Mockingbird (1962),Drama
7,121198,922,4.346298,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),Drama|Film-Noir|Romance
8,121198,1252,4.33974,Chinatown (1974),Crime|Film-Noir|Mystery|Thriller
9,121198,3362,4.337757,Dog Day Afternoon (1975),Crime|Drama


## SVD

In [17]:
# Retrain the model
svd = SVD()
svd_model = cf_model(svd, trainset, testset, data, 20)
svd_model.fit_and_predict()

Fitting the train data...
Predicting the test data...
RMSE: 0.7238
RMSE for the predicted result is 0.724


0.724

In [18]:
# Cross-validate the model; the saved output reports the mean CV RMSE.
svd_model.cross_validate()

Cross Validating the data...
Mean CV RMSE is 0.751


0.751

In [19]:
# Persist the model and its `pred_test` predictions; the "Variety" and "SVD default"
# cells reload this file.
# NOTE(review): `algo` receives the cf_model wrapper, not the underlying Surprise
# algorithm (the "SVD tuned" section dumps the raw algorithm instead) — confirm intended.
dump.dump('model/trained_models/SVDDefault_Model',algo=svd_model,predictions=svd_model.pred_test)

In [20]:
# Recommendations for Sveta, labelled with movie title and genres (first 20 rows).
# NOTE(review): this uses how='inner' while the KNN sections use how='left'; an inner
# join drops any recommended movieId that has no row in movie_df — confirm intended.
(
    svd_model
    .recommend_all(sveta_user_id)
    .merge(movie_df, on='movieId', how='inner')
    .head(20)
)

All ratings for userid : 94622 ...


Unnamed: 0,userId,movieId,pred_rating,title,genres
0,94622,912,4.643404,Casablanca (1942),Drama|Romance
1,94622,908,4.578564,North by Northwest (1959),Action|Adventure|Mystery|Romance|Thriller
2,94622,1247,4.519101,"Graduate, The (1967)",Comedy|Drama|Romance
3,94622,904,4.505851,Rear Window (1954),Mystery|Thriller
4,94622,922,4.497562,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),Drama|Film-Noir|Romance
5,94622,750,4.452001,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War
6,94622,1617,4.441549,L.A. Confidential (1997),Crime|Film-Noir|Mystery|Thriller
7,94622,903,4.435428,Vertigo (1958),Drama|Mystery|Romance|Thriller
8,94622,2858,4.41114,American Beauty (1999),Comedy|Drama
9,94622,2762,4.407439,"Sixth Sense, The (1999)",Drama|Horror|Mystery


In [21]:
# Recommendations for Hamid, labelled with movie title and genres (first 20 rows).
# NOTE(review): this uses how='inner' while the KNN sections use how='left'; an inner
# join drops any recommended movieId that has no row in movie_df — confirm intended.
(
    svd_model
    .recommend_all(hamid_user_id)
    .merge(movie_df, on='movieId', how='inner')
    .head(20)
)

All ratings for userid : 121198 ...


Unnamed: 0,userId,movieId,pred_rating,title,genres
0,121198,904,4.672941,Rear Window (1954),Mystery|Thriller
1,121198,1212,4.555386,"Third Man, The (1949)",Film-Noir|Mystery|Thriller
2,121198,953,4.547739,It's a Wonderful Life (1946),Drama|Fantasy|Romance
3,121198,903,4.491078,Vertigo (1958),Drama|Mystery|Romance|Thriller
4,121198,1252,4.488416,Chinatown (1974),Crime|Film-Noir|Mystery|Thriller
5,121198,2160,4.472871,Rosemary's Baby (1968),Drama|Horror|Thriller
6,121198,1203,4.413382,12 Angry Men (1957),Drama
7,121198,912,4.399907,Casablanca (1942),Drama|Romance
8,121198,3435,4.391734,Double Indemnity (1944),Crime|Drama|Film-Noir
9,121198,2076,4.387187,Blue Velvet (1986),Drama|Mystery|Thriller


## SVD tuned

In [None]:
# Hyper-parameter grid for SVD: 3 candidate values for each of 4 parameters.
param_grid = {
    'n_factors': [10, 100, 500],
    'n_epochs': [5, 20, 50],
    'lr_all': [0.001, 0.005, 0.02],
    'reg_all': [0.005, 0.02, 0.1]}

# Exhaustive 3-fold grid search on all available cores, scored on MAE.
# NOTE(review): the other sections of this notebook report RMSE; confirm that
# selecting on MAE is intended.
gs = GridSearchCV(SVD, param_grid, measures=['mae'], cv=3, n_jobs=-1)
gs.fit(data)
params = gs.best_params['mae']

# Build the final model directly from the winning combination so that it always
# stays in sync with whatever keys param_grid contains.
svdtuned = SVD(**params)
svdtuned.fit(trainset)
svdtuned_predictions = svdtuned.test(testset)

In [None]:
# Persist the tuned SVD algorithm together with its predictions on `testset`.
dump.dump('model/trained_models/SVDTuned_Model',algo=svdtuned,predictions=svdtuned_predictions)

In [None]:
# Display the best hyper-parameter combination selected by the grid search.
params

In [None]:
# Wrap the tuned SVD in the project's cf_model helper, then fit and predict.
# NOTE(review): svdtuned was already fitted in the grid-search cell; fit_and_predict()
# presumably fits it again — confirm against cf_model.
svd_tuned_model = cf_model(svdtuned, trainset, testset, data, 20)
svd_tuned_model.fit_and_predict()

In [None]:
# Cross-validate the tuned model (the other models' saved outputs report a mean CV RMSE here).
svd_tuned_model.cross_validate()

In [None]:
# Recommendations for Sveta, labelled with movie title and genres (first 20 rows).
# NOTE(review): this uses how='inner' while the KNN sections use how='left'; an inner
# join drops any recommended movieId that has no row in movie_df — confirm intended.
(
    svd_tuned_model
    .recommend_all(sveta_user_id)
    .merge(movie_df, on='movieId', how='inner')
    .head(20)
)

In [None]:
# Recommendations for Hamid, labelled with movie title and genres (first 20 rows).
# NOTE(review): this uses how='inner' while the KNN sections use how='left'; an inner
# join drops any recommended movieId that has no row in movie_df — confirm intended.
(
    svd_tuned_model
    .recommend_all(hamid_user_id)
    .merge(movie_df, on='movieId', how='inner')
    .head(20)
)

## SVD default

In [118]:
# Reload the default SVD dump written by the "SVD" section: predictions first, then the algorithm.
svd_preds, svd_algo = dump.load('./model/trained_models/SVDDefault_Model')

In [119]:
# Rebuild the cf_model wrapper around the reloaded algorithm, passing the stored predictions too.
# The literal 20 is presumably the number of recommendations per user — TODO confirm against cf_model.
svd_default_model = cf_model(svd_algo, trainset, testset, data, 20, svd_preds)

In [120]:
# The saved output reports the RMSE for the reloaded model; cross-validation is left disabled here.
svd_default_model.fit_and_predict()
#svd_model.cross_validate()

RMSE: 0.7241
RMSE for the predicted result is 0.724


0.724

In [8]:
# Load the personal rating files of Sveta and Hamid from the external data folder.
sveta_ratings, hamid_ratings = (
    pd.read_csv(csv_path)
    for csv_path in ('data/external/sveta-ratings.csv', 'data/external/hamid-ratings.csv')
)

## Calculation of the quality of the collaborative filtering