# SVD - Collaborative filtering

In [43]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [44]:
import pandas as pd
import numpy as np

In [45]:
#Load the data
ratings = pd.read_csv("C:/Users/messi/OneDrive/Documents/ratings_sub.csv",encoding = "ISO-8859-1")

In [46]:
#Number of rows and columns
ratings.shape

(487469, 7)

In [48]:
ratings.userId=ratings.userId.astype(str)
ratings.movieId=ratings.movieId.astype(str)

In [49]:
#Column names
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres', 'year'], dtype='object')

In [50]:
# Total unique users 
print("total unique users - ",len(ratings["userId"].unique()))

total unique users -  2827


In [51]:
# Users with max no of movies watches
ratings["userId"].value_counts().head()

3218     200
75694    200
61382    200
47594    200
29990    200
Name: userId, dtype: int64

In [52]:
from surprise import Dataset,Reader
reader = Reader(rating_scale=(1, 5))

In [53]:
data = Dataset.load_from_df(ratings[['userId', 'title', 'rating']], reader)

In [54]:
# Split data to train and test
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25,random_state=123)

# KNN

In [56]:
from surprise import KNNWithMeans
from surprise import accuracy

In [57]:
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson', 'user_based': False})
algo.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1ca3b67f610>

In [58]:
len(testset)

121868

In [59]:
testset[0:5]

[('107317', 'Signs (2002)', 2.5),
 ('103061', 'Inconvenient Truth, An (2006)', 4.5),
 ('84115', 'Battlefield Earth (2000)', 2.5),
 ('130756',
  'Fast and the Furious: Tokyo Drift, The (Fast and the Furious 3, The) (2006)',
  2.0),
 ('24878', 'Drive (2011)', 4.5)]

In [60]:
# Evalute on test set
test_pred = algo.test(testset)

# compute RMSE
accuracy.rmse(test_pred)

RMSE: 0.8114


0.8113764816533418

In [61]:
# View a particular prediction
test_pred[12]

Prediction(uid='7051', iid='Black Hawk Down (2001)', r_ui=5.0, est=3.8665487014970545, details={'actual_k': 50, 'was_impossible': False})

In [62]:
test_pred[12].details["actual_k"]

50

In [63]:
# convert results to dataframe
test_pred_df = pd.DataFrame(test_pred)
test_pred_df["was_impossible"] = [x["was_impossible"] for x in test_pred_df["details"]]

In [64]:
test_pred_df.loc[test_pred_df.was_impossible].head(5)

Unnamed: 0,uid,iid,r_ui,est,details,was_impossible
159,36730,Grill Point (Halbe Treppe) (2002),3.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
604,131040,Escape from Planet Earth (2013),2.0,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
827,116349,No Good Deed (2014),3.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
1865,124431,Films to Keep You Awake: The Christmas Tale (P...,0.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
2020,21811,Insanitarium (2008),3.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True


In [85]:
# Mkae prediction for a single user
algo.predict(uid="user_405",iid="Wrong Trousers, The (1993)")

Prediction(uid='user_405', iid='Wrong Trousers, The (1993)', r_ui=None, est=3.511396303620614, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})

# Generating top n recommendations

In [86]:
testset_new = trainset.build_anti_testset()

In [87]:
len(testset_new)

17308818

In [88]:
testset_new[0:5]

[('248', 'Disturbia (2007)', 3.511396303620614),
 ('248', 'Hamlet 2 (2008)', 3.511396303620614),
 ('248', 'Unbreakable (2000)', 3.511396303620614),
 ('248', 'Finding Neverland (2004)', 3.511396303620614),
 ('248', 'X2: X-Men United (2003)', 3.511396303620614)]

In [89]:
predictions = algo.test(testset_new[0:10000])

In [90]:
predictions_df = pd.DataFrame([[x.uid,x.est] for x in predictions])

In [91]:
predictions_df.columns = ["userId","est_rating"]
predictions_df.sort_values(by = ["userId", "est_rating"],ascending=False,inplace=True)

In [92]:
predictions_df.head()

Unnamed: 0,userId,est_rating
8040,45844,5.0
9039,45844,5.0
9147,45844,5.0
9413,45844,5.0
9497,45844,5.0


In [93]:
top_10_recos = predictions_df.groupby("userId").head(10).reset_index(drop=True)

In [94]:
top_10_recos

Unnamed: 0,userId,est_rating
0,45844,5.0
1,45844,5.0
2,45844,5.0
3,45844,5.0
4,45844,5.0
5,45844,5.0
6,45844,5.0
7,45844,5.0
8,45844,5.0
9,45844,5.0


# SVD Based Recommendation

In [35]:
# Lets exclude movies with very few ratings, say less than 5
movie_count = ratings["title"].value_counts(ascending=False)
pop_movie = movie_count.loc[movie_count.values > 200].index
len(pop_movie)

567

In [36]:
ratings = ratings.loc[ratings.title.isin(pop_movie)]
ratings.shape

(350710, 7)

In [37]:
from surprise import Dataset,Reader
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'title', 'rating']], reader)

In [38]:
# Split data to train and test
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.84,random_state=123)

In [39]:
#SVD Collaborative filtering
from surprise import SVD
from surprise import accuracy

In [40]:
svd_model = SVD(n_factors=50,biased=False)
svd_model.fit(trainset)

Wall time: 1.42 s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1ca3b67f370>

In [41]:
test_pred = svd_model.test(testset)

In [42]:
# compute RMSE
accuracy.rmse(test_pred)

RMSE: 0.8551


0.855069335414922

In [65]:
user_factors = svd_model.pu
user_factors.shape
item_factors = svd_model.qi
item_factors.shape

(2827, 50)

(567, 50)

In [66]:
pred = np.dot(user_factors,np.transpose(item_factors))

In [67]:
pred[1523,0:5]

array([3.15071533, 2.58299381, 3.89721043, 4.04191251, 3.82454252])

In [68]:
svd_model.predict(uid = trainset.to_raw_uid(1523), iid = trainset.to_raw_iid(0))

Prediction(uid='54453', iid='Life of Pi (2012)', r_ui=None, est=3.5731381319836757, details={'was_impossible': True, 'reason': 'User and item are unknown.'})

# Parameter tuning of SVD Recommendation system

In [69]:
from surprise.model_selection import GridSearchCV
param_grid = {'n_factors' : [5,10,15], "reg_all":[0.01,0.02]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3,refit = True)

In [70]:
gs.fit(data)

In [71]:
# get all parameter combinations
gs.param_combinations

[{'n_factors': 5, 'reg_all': 0.01},
 {'n_factors': 5, 'reg_all': 0.02},
 {'n_factors': 10, 'reg_all': 0.01},
 {'n_factors': 10, 'reg_all': 0.02},
 {'n_factors': 15, 'reg_all': 0.01},
 {'n_factors': 15, 'reg_all': 0.02}]

In [72]:
# get best parameters
gs.best_params

{'rmse': {'n_factors': 15, 'reg_all': 0.02}}

In [83]:
test_pred_best = gs.test(testset)

In [84]:
# compute RMSE
accuracy.rmse(test_pred_best)

RMSE: 0.7426


0.7426129216395535