In [1]:
import pandas as pd
import numpy as np



In [2]:
movies = pd.read_csv('./data/movies.csv')
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [3]:
links = pd.read_csv('./data/links.csv')
links

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


In [4]:
ratings = pd.read_csv('./data/ratings.csv')
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [5]:
tags = pd.read_csv('./data/tags.csv')
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


https://realpython.com/build-recommendation-engine-collaborative-filtering/


How do you determine which users or items are similar to one another?

Given that you know which users are similar, how do you determine the rating that a user would give to an item based on the ratings of similar users?

How do you measure the accuracy of the ratings you calculate?


In [12]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
import os


In [18]:
# path to dataset file
file_path = os.path.expanduser('./data/ratings.csv')

# As we're loading a custom dataset, we need to define a reader. In the
# movielens-100k dataset, each line has the following format:
# 'user item rating timestamp', separated by '\t' characters.
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)


In [19]:
data = Dataset.load_from_file(file_path, reader=reader)

In [20]:
data

<surprise.dataset.DatasetAutoFolds at 0x7fef23d5ec88>

In [22]:
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25)

In [25]:
from surprise import SVD
# We'll use the famous SVD algorithm.
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)


In [26]:
from surprise import accuracy
accuracy.rmse(predictions)

RMSE: 0.8728


0.8728301981423847

In [28]:
from surprise.model_selection import GridSearchCV
param_grid = {
    'n_epochs': [5, 10], 
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6]
}

In [29]:
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])


0.8933911996106166
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [30]:
from surprise.prediction_algorithms.slope_one import SlopeOne

In [32]:
so = SlopeOne()
so.fit(trainset)
predictions = so.test(testset)
accuracy.rmse(predictions)

RMSE: 0.9006


0.9006244619941541

In [33]:
from surprise.prediction_algorithms.co_clustering import CoClustering
so = CoClustering()
so.fit(trainset)
predictions = so.test(testset)
accuracy.rmse(predictions)

RMSE: 0.9372


0.9371715786127287