# Matrix factorization

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from scipy.sparse.linalg import svds

from surprise import SVD, SVDpp, NMF
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import accuracy

DATA_PATH = '../data/ml-100k-convert/'
GENERATED_PATH = '../generated/'
RESULT_PATH = '../results/'

## Data preparation

In [2]:
# Load data
ratings = pd.read_csv(DATA_PATH+'data.tsv', sep='\t', names=['UserId', 'MovieId', 'Ratings', 'Timestamp'])
print(ratings.shape)
ratings.head()

(100000, 4)


Unnamed: 0,UserId,MovieId,Ratings,Timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
# Load the movie id
movies = pd.read_csv(GENERATED_PATH+'final_movies.csv')
movies_id = movies['ML-100k-convertId'].to_list()
# Keep only ratings from final movies
ratings = ratings[ratings.MovieId.isin(movies_id)]
print(ratings.shape)
ratings.head()

(59218, 4)


Unnamed: 0,UserId,MovieId,Ratings,Timestamp
1,186,302,3,891717742
3,244,51,2,880606923
4,166,346,1,886397596
6,115,265,2,881171488
8,305,451,3,886324817


In [4]:
ratings = ratings.drop(columns=['Timestamp']).reset_index(drop=True)
ratings.head()

Unnamed: 0,UserId,MovieId,Ratings
0,186,302,3
1,244,51,2
2,166,346,1
3,115,265,2
4,305,451,3


In [5]:
cut = int(0.8*len(ratings))
train_df = ratings.loc[:cut]
test_df = ratings.loc[cut+1:]

Load data into `Surprise` dataset

In [6]:
trainset = Dataset.load_from_df(train_df, Reader())
testset = Dataset.load_from_df(test_df, Reader())

In [7]:
_ , testset = train_test_split(testset, test_size=0.99999)

## Model Selection

We will test the SVD model. We will perform a cross validation to find the appropriate number of factors and then we will test it on the testset.

In [8]:
param_grid = {'n_factors': range(1,21)}

### SVD

In [9]:
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(trainset)

algo = SVD(n_factors=gs.best_params['rmse']['n_factors'])
algo.fit(trainset.build_full_trainset())
predictions = algo.test(testset)

print("Number of factors : ", gs.best_params['rmse']['n_factors'])
print("Validation RMSE : ", gs.best_score['rmse'])
print("Test RMSE : ", accuracy.rmse(predictions, verbose=False))

Number of factors :  13
Validation RMSE :  0.949126628544378
Test RMSE :  0.9301446457761109


We can see that the best number of factor is 13. We got a RMSE on the test set of 0.93.