In [1]:
import sys
sys.path.append('..')

In [2]:
from source.code.utils import preprocessing

In [3]:
import pandas as pd
import numpy as np

In [4]:
ratings = pd.read_csv('data/BX-Book-Ratings.csv', sep=';', header=0, error_bad_lines=False, encoding='Windows-1251', low_memory=False)

In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
User-ID        1149780 non-null int64
ISBN           1149780 non-null object
Book-Rating    1149780 non-null int64
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [6]:
books = pd.read_csv('data/BX-Books.csv', sep=';', header=0, error_bad_lines=False, encoding='Windows-1251', low_memory=False)

In [7]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 0 to 271378
Data columns (total 8 columns):
ISBN                   271379 non-null object
Book-Title             271379 non-null object
Book-Author            271378 non-null object
Year-Of-Publication    271379 non-null object
Publisher              271377 non-null object
Image-URL-S            271379 non-null object
Image-URL-M            271379 non-null object
Image-URL-L            271376 non-null object
dtypes: object(8)
memory usage: 16.6+ MB


In [8]:
books = books[~books['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard'])]

In [9]:
books['Year-Of-Publication'] = books['Year-Of-Publication'].astype(np.int)

In [10]:
books.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 271376 entries, 0 to 271378
Data columns (total 8 columns):
ISBN                   271376 non-null object
Book-Title             271376 non-null object
Book-Author            271375 non-null object
Year-Of-Publication    271376 non-null int32
Publisher              271374 non-null object
Image-URL-S            271376 non-null object
Image-URL-M            271376 non-null object
Image-URL-L            271376 non-null object
dtypes: int32(1), object(7)
memory usage: 17.6+ MB


In [11]:
users = pd.read_csv('data/BX-Users.csv', sep=';', header=0, error_bad_lines=False, encoding='Windows-1251', low_memory=False)

In [12]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
User-ID     278858 non-null int64
Location    278858 non-null object
Age         168096 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


# Preprocessing phase

In [13]:
data_dict = {}
data_dict['books'] = books
data_dict['users'] = users
data_dict['ratings'] = ratings

In [14]:
preprocessed_data_dict = preprocessing(data_dict=data_dict, is_explicit=False, book_ratings_count_threshold=50, user_ratings_count_threshold=50)

In [15]:
preprocessed_data_dict['ratings'].head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [19]:
preprocessed_data_dict['ratings'] = preprocessed_data_dict['ratings'].rename({'User-ID': 'userID', 'ISBN': 'itemID', 'Book-Rating': 'rating'}, axis='columns')

In [20]:
preprocessed_data_dict['ratings'].head()

Unnamed: 0,userID,itemID,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


# Surprise experiments

In [28]:
from surprise import SVD
from surprise import NMF
from surprise.model_selection import cross_validate
from surprise import Dataset
from surprise import Reader

In [22]:
algo = SVD()

In [24]:
reader = Reader(rating_scale=(1, 5))

In [25]:
data = Dataset.load_from_df(preprocessed_data_dict['ratings'][['userID', 'itemID', 'rating']], reader)

In [27]:
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True, n_jobs=-1)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.5045  3.5044  3.5083  3.4978  3.5101  3.5050  0.0042  
MAE (testset)     2.9730  2.9759  2.9791  2.9671  2.9798  2.9750  0.0046  
Fit time          56.96   57.65   58.92   57.70   56.88   57.62   0.73    
Test time         1.85    1.73    1.67    1.53    1.46    1.65    0.14    


{'fit_time': (56.961883783340454,
  57.65003442764282,
  58.91723442077637,
  57.69597053527832,
  56.88344097137451),
 'test_mae': array([ 2.9730223 ,  2.97588738,  2.9791195 ,  2.96710088,  2.97981789]),
 'test_rmse': array([ 3.50453291,  3.50443604,  3.50830918,  3.49776638,  3.51007749]),
 'test_time': (1.8504199981689453,
  1.7311375141143799,
  1.673949956893921,
  1.5335171222686768,
  1.458726406097412)}

In [None]:
algo1 = NMF()