In [1]:
import sys
sys.path.append('..')

In [2]:
from source.code.utils import preprocessing

In [3]:
import pandas as pd
import numpy as np

In [4]:
ratings = pd.read_csv('data/BX-Book-Ratings.csv', sep=';', header=0, error_bad_lines=False, encoding='Windows-1251', low_memory=False)

In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
User-ID        1149780 non-null int64
ISBN           1149780 non-null object
Book-Rating    1149780 non-null int64
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [6]:
books = pd.read_csv('data/BX-Books.csv', sep=';', header=0, error_bad_lines=False, encoding='Windows-1251', low_memory=False)

In [7]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 0 to 271378
Data columns (total 8 columns):
ISBN                   271379 non-null object
Book-Title             271379 non-null object
Book-Author            271378 non-null object
Year-Of-Publication    271379 non-null object
Publisher              271377 non-null object
Image-URL-S            271379 non-null object
Image-URL-M            271379 non-null object
Image-URL-L            271376 non-null object
dtypes: object(8)
memory usage: 16.6+ MB


In [8]:
books = books[~books['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard'])]

In [9]:
books['Year-Of-Publication'] = books['Year-Of-Publication'].astype(np.int)

In [10]:
books.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 271376 entries, 0 to 271378
Data columns (total 8 columns):
ISBN                   271376 non-null object
Book-Title             271376 non-null object
Book-Author            271375 non-null object
Year-Of-Publication    271376 non-null int32
Publisher              271374 non-null object
Image-URL-S            271376 non-null object
Image-URL-M            271376 non-null object
Image-URL-L            271376 non-null object
dtypes: int32(1), object(7)
memory usage: 17.6+ MB


In [11]:
users = pd.read_csv('data/BX-Users.csv', sep=';', header=0, error_bad_lines=False, encoding='Windows-1251', low_memory=False)

In [12]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
User-ID     278858 non-null int64
Location    278858 non-null object
Age         168096 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


# Preprocessing phase

In [13]:
data_dict = {}
data_dict['books'] = books
data_dict['users'] = users
data_dict['ratings'] = ratings

In [14]:
preprocessed_data_dict = preprocessing(data_dict=data_dict, is_explicit=True, book_ratings_count_threshold=2, user_ratings_count_threshold=2)

In [15]:
preprocessed_data_dict['ratings'].head()

Unnamed: 0,User-ID,ISBN,Book-Rating
16,276747,60517794,9
19,276747,671537458,9
20,276747,679776818,8
59,276772,553572369,7
61,276772,3499230933,10


In [16]:
preprocessed_data_dict['ratings'] = preprocessed_data_dict['ratings'].rename({'User-ID': 'userID', 'ISBN': 'itemID', 'Book-Rating': 'rating'}, axis='columns')

In [17]:
preprocessed_data_dict['ratings'].head()

Unnamed: 0,userID,itemID,rating
16,276747,60517794,9
19,276747,671537458,9
20,276747,679776818,8
59,276772,553572369,7
61,276772,3499230933,10


# Surprise experiments

In [25]:
from surprise import SVD
from surprise import NMF
from surprise.prediction_algorithms.knns import KNNBasic
from surprise.model_selection import cross_validate
from surprise import Dataset
from surprise import Reader

## Data reader

In [19]:
reader = Reader(rating_scale=(1, 5))

In [20]:
data = Dataset.load_from_df(preprocessed_data_dict['ratings'][['userID', 'itemID', 'rating']], reader)

## SVD

In [21]:
algo_svd = SVD()

In [22]:
cross_validate(algo_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True, n_jobs=-1)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.2676  3.2760  3.2749  3.2629  3.2612  3.2685  0.0061  
MAE (testset)     2.8744  2.8820  2.8829  2.8683  2.8688  2.8753  0.0062  
Fit time          10.84   10.85   10.77   10.73   10.68   10.77   0.07    
Test time         0.37    0.33    0.33    0.31    0.27    0.32    0.03    


{'fit_time': (10.841763019561768,
  10.847985982894897,
  10.766520738601685,
  10.730874061584473,
  10.677535057067871),
 'test_mae': array([ 2.8743808 ,  2.88198274,  2.88292508,  2.86826567,  2.86882269]),
 'test_rmse': array([ 3.2675783 ,  3.27604999,  3.27485981,  3.26285386,  3.26115656]),
 'test_time': (0.3674752712249756,
  0.33240342140197754,
  0.33341503143310547,
  0.3052215576171875,
  0.270829439163208)}

## NMF

In [23]:
algo_nmf = NMF()

In [24]:
cross_validate(algo_nmf, data, measures=['RMSE', 'MAE'], cv=5, verbose=True, n_jobs=-1)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.4031  3.4071  3.4122  3.4056  3.4170  3.4090  0.0050  
MAE (testset)     3.0339  3.0388  3.0464  3.0395  3.0521  3.0421  0.0064  
Fit time          15.35   15.08   15.15   15.16   15.17   15.18   0.09    
Test time         0.30    0.26    0.25    0.26    0.24    0.26    0.02    


{'fit_time': (15.346777439117432,
  15.077441453933716,
  15.15070128440857,
  15.15864634513855,
  15.172360181808472),
 'test_mae': array([ 3.03386027,  3.03879187,  3.04639825,  3.03951103,  3.05212681]),
 'test_rmse': array([ 3.40306955,  3.40711116,  3.41219934,  3.40560382,  3.41702137]),
 'test_time': (0.3038060665130615,
  0.255678653717041,
  0.2542262077331543,
  0.2557559013366699,
  0.2356555461883545)}

## KNN (Item-based)

In [26]:
sim_options = {
    'name': 'cosine',
    'user_based': False
}

In [30]:
algo_knn = KNNBasic(k=5, sim_options=sim_options)

In [32]:
cross_validate(algo_knn, data, measures=['RMSE', 'MAE'], cv=5, verbose=True, n_jobs=1)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.2783  3.2742  3.2786  3.2973  3.2774  3.2812  0.0082  
MAE (testset)     2.8843  2.8779  2.8826  2.9024  2.8827  2.8859  0.0085  
Fit time          32.05   35.55   35.90   37.55   33.11   34.83   1.99    
Test time         3.26    3.17    3.50    3.11    3.15    3.24    0.14    


{'fit_time': (32.05273199081421,
  35.55475306510925,
  35.89849853515625,
  37.55265831947327,
  33.10518431663513),
 'test_mae': array([ 2.88430427,  2.87786967,  2.88256826,  2.90235005,  2.88265611]),
 'test_rmse': array([ 3.27825125,  3.27422483,  3.27863254,  3.29733388,  3.2774494 ]),
 'test_time': (3.2581968307495117,
  3.1743669509887695,
  3.495800018310547,
  3.1102428436279297,
  3.1533429622650146)}

## KNN (User-based)

In [33]:
sim_options = {
    'name': 'cosine'
}

In [34]:
algo_knn = KNNBasic(k=5, sim_options=sim_options)

In [35]:
cross_validate(algo_knn, data, measures=['RMSE', 'MAE'], cv=5, verbose=True, n_jobs=1)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.2969  3.2803  3.2810  3.2895  3.2799  3.2855  0.0067  
MAE (testset)     2.8975  2.8843  2.8859  2.8937  2.8848  2.8892  0.0054  
Fit time          12.35   12.03   10.80   10.82   10.75   11.35   0.69    
Test time         1.11    1.14    1.13    1.02    1.16    1.11    0.05    


{'fit_time': (12.348301410675049,
  12.030579566955566,
  10.796773672103882,
  10.823979616165161,
  10.75337266921997),
 'test_mae': array([ 2.89745398,  2.88429517,  2.88593928,  2.89373923,  2.88476129]),
 'test_rmse': array([ 3.29694101,  3.2802732 ,  3.28103154,  3.28947632,  3.27986773]),
 'test_time': (1.1117057800292969,
  1.1355373859405518,
  1.1337437629699707,
  1.0182163715362549,
  1.1636104583740234)}