In [1]:
# Importing relevant libraries
import warnings
warnings.simplefilter("ignore")

import pandas as pd
import numpy as np

from surprise import accuracy

from surprise.model_selection import cross_validate, train_test_split
from surprise.model_selection import RandomizedSearchCV, GridSearchCV

from surprise.prediction_algorithms import SVD, SVDpp, NMF
from surprise.prediction_algorithms import SlopeOne, CoClustering
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline

In [2]:
ratings = pd.read_csv('cleaningData/clean_rating.csv', index_col=0)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3175823 entries, 10344 to 7813240
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 96.9 MB


In [3]:
print('No. of Unique Users    :', ratings['user_id'].nunique())
print('No. of Unique Animes    :', ratings['anime_id'].nunique())
print('No. of Unique Ratings  :', ratings['rating'].nunique())

No. of Unique Users    : 71753
No. of Unique Animes    : 4059
No. of Unique Ratings  : 11


In [4]:
ratings.describe()

Unnamed: 0,user_id,anime_id,rating
count,3175823.0,3175823.0,3175823.0
mean,36804.16,8161.12,6.281125
std,21041.33,8768.075,3.67833
min,1.0,1.0,-1.0
25%,18951.0,813.0,6.0
50%,36999.0,5112.0,8.0
75%,54883.0,12189.0,9.0
max,73516.0,34519.0,10.0


In [5]:
# Transform the dataset into something compatible with surprise
from surprise import Reader, Dataset
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(ratings,reader)

In [6]:
# Preview
dataset = data.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  71753 

Number of items:  4059


In [7]:
trainset, testset = train_test_split(data, test_size=.97)

In [8]:
algorithm_list = [KNNBasic(), KNNBaseline(), KNNWithMeans(), SVD(), SVDpp(),
                 NMF(), SlopeOne(), CoClustering()]
rmse_list = []
mae_list = []

In [9]:
# For all the algorithms
for algorithm in algorithm_list:    
    # Define algorithm.
    algo = algorithm

    # Train the algorithm on the trainset, and predict ratings for the testset
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and append RMSE
    rmse_list.append(accuracy.rmse(predictions))

    # Compute and append MAE
    mae_list.append(accuracy.mae(predictions))
    
    # Del algorithm for more RAM
    del algo

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 3.7136
MAE:  2.6724
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 3.4129
MAE:  2.4743
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 2.9534
MAE:  2.1382
RMSE: 3.2764
MAE:  2.4705
RMSE: 3.3605
MAE:  2.5473
RMSE: 5.7344
MAE:  5.0916
RMSE: 3.0183
MAE:  2.2086
RMSE: 2.8542
MAE:  2.0404


In [14]:
algorithm_str = ['KNNBasic', 'KNNBaseline', 'KNNWithMeans', 'SVD', 'SVDpp',
                 'NMF', 'SlopeOne', 'CoClustering']
df_algorithm = pd.DataFrame(
    {'algorithm': algorithm_str, 'RMSE':rmse_list, 'MAE': mae_list})

In [15]:
df_algorithm

Unnamed: 0,algorithm,RMSE,MAE
0,KNNBasic,3.713634,2.672429
1,KNNBaseline,3.412892,2.474254
2,KNNWithMeans,2.953377,2.13822
3,SVD,3.276355,2.470511
4,SVDpp,3.360475,2.547291
5,NMF,5.734411,5.091614
6,SlopeOne,3.018283,2.208561
7,CoClustering,2.854187,2.040356


In [17]:
df_algorithm.to_csv(
    "/Users/jrchen/flatiron-ds-course/Milestones/capstone/cleaningData/algorithm.csv")