In [1]:
# Importing relevant libraries
import warnings
warnings.simplefilter("ignore")

import pandas as pd
import numpy as np

from surprise.model_selection import cross_validate
from surprise.model_selection import RandomizedSearchCV, GridSearchCV

from surprise.prediction_algorithms import SVD, SVDpp, NMF
from surprise.prediction_algorithms import SlopeOne, CoClustering
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline

In [2]:
ratings = pd.read_csv('cleaningData/clean_rating.csv', index_col=0)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3175823 entries, 10344 to 7813240
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 96.9 MB


In [3]:
print('No. of Unique Users    :', ratings['user_id'].nunique())
print('No. of Unique Animes    :', ratings['anime_id'].nunique())
print('No. of Unique Ratings  :', ratings['rating'].nunique())

No. of Unique Users    : 71753
No. of Unique Animes    : 4059
No. of Unique Ratings  : 11


In [4]:
ratings.describe()

Unnamed: 0,user_id,anime_id,rating
count,3175823.0,3175823.0,3175823.0
mean,36804.16,8161.12,6.281125
std,21041.33,8768.075,3.67833
min,1.0,1.0,-1.0
25%,18951.0,813.0,6.0
50%,36999.0,5112.0,8.0
75%,54883.0,12189.0,9.0
max,73516.0,34519.0,10.0


In [5]:
# Transform the dataset into something compatible with surprise
from surprise import Reader, Dataset
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(ratings,reader)

In [6]:
# Preview
dataset = data.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  71753 

Number of items:  4059


In [8]:
# k-NN Based Algorithm
knnbasic_cv = cross_validate(KNNBasic(), data, cv=3, n_jobs=-1, verbose=False)
knnbaseline_cv = cross_validate(KNNBaseline(), data, cv=3, n_jobs=-1, verbose=False)
knnmeans_cv = cross_validate(KNNWithMeans(), data, cv=3, n_jobs=-1, verbose=False)

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

In [None]:
# Matrix Factorization Based Algorithms
svd_cv = cross_validate(SVD(), data, cv=3, n_jobs=5, verbose=False)
nmf_cv = cross_validate(NMF(), data, cv=3, n_jobs=5, verbose=False)
svdpp_cv = cross_validate(SVDpp(), data, cv=3, n_jobs=5, verbose=False)

In [None]:
# Other Collaborative Filtering Algorithms
slope_cv = cross_validate(SlopeOne(), data, cv=3, n_jobs=5, verbose=False)
coclus_cv = cross_validate(CoClustering(), data, cv=3, n_jobs=5, verbose=False)