In [1]:
import sys
sys.path.append('..')

In [2]:
from source.code.utils import preprocessing

In [3]:
import pandas as pd
import numpy as np

In [4]:
from surprise import SVD
from surprise import NMF
from surprise.prediction_algorithms.knns import KNNBasic
from surprise.model_selection import cross_validate
from surprise import Dataset
from surprise import Reader

In [5]:
# Load the raw ratings table from the semicolon-separated CSV.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` keeps the same behaviour (malformed rows are skipped
# and reported as warnings) on current pandas.
ratings = pd.read_csv(
    'data/BX-Book-Ratings.csv',
    sep=';',
    header=0,
    on_bad_lines='warn',
    encoding='Windows-1251',
    low_memory=False,
)

In [6]:
# Print the ratings frame's column dtypes, non-null counts and memory usage.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
User-ID        1149780 non-null int64
ISBN           1149780 non-null object
Book-Rating    1149780 non-null int64
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [7]:
# Load the raw book catalogue from the semicolon-separated CSV.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` keeps the same behaviour (malformed rows are skipped
# and reported as warnings) on current pandas.
books = pd.read_csv(
    'data/BX-Books.csv',
    sep=';',
    header=0,
    on_bad_lines='warn',
    encoding='Windows-1251',
    low_memory=False,
)

In [8]:
# Print the raw books frame's column dtypes, non-null counts and memory usage.
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 0 to 271378
Data columns (total 8 columns):
ISBN                   271379 non-null object
Book-Title             271379 non-null object
Book-Author            271378 non-null object
Year-Of-Publication    271379 non-null object
Publisher              271377 non-null object
Image-URL-S            271379 non-null object
Image-URL-M            271379 non-null object
Image-URL-L            271376 non-null object
dtypes: object(8)
memory usage: 16.6+ MB


In [10]:
# Drop the rows whose 'Year-Of-Publication' cell holds one of these
# non-numeric strings, so the column can be cast to an integer afterwards.
non_year_values = ['DK Publishing Inc', 'Gallimard']
has_non_year_value = books['Year-Of-Publication'].isin(non_year_values)
books = books[~has_non_year_value]

In [11]:
# Cast the publication year from strings to integers.
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
# `DataFrame.astype` returns a new frame, which also avoids the
# SettingWithCopyWarning raised by assigning a column on the filtered frame.
books = books.astype({'Year-Of-Publication': int})

In [12]:
# Re-inspect the books frame to confirm the row count after filtering and the
# integer dtype of 'Year-Of-Publication'.
books.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 271376 entries, 0 to 271378
Data columns (total 8 columns):
ISBN                   271376 non-null object
Book-Title             271376 non-null object
Book-Author            271375 non-null object
Year-Of-Publication    271376 non-null int32
Publisher              271374 non-null object
Image-URL-S            271376 non-null object
Image-URL-M            271376 non-null object
Image-URL-L            271376 non-null object
dtypes: int32(1), object(7)
memory usage: 17.6+ MB


In [13]:
# Load the raw users table from the semicolon-separated CSV.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` keeps the same behaviour (malformed rows are skipped
# and reported as warnings) on current pandas.
users = pd.read_csv(
    'data/BX-Users.csv',
    sep=';',
    header=0,
    on_bad_lines='warn',
    encoding='Windows-1251',
    low_memory=False,
)

In [14]:
# Print the users frame's column dtypes, non-null counts and memory usage.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
User-ID     278858 non-null int64
Location    278858 non-null object
Age         168096 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


# Preprocessing phase

In [15]:
# Bundle the three frames under the keys passed on to `preprocessing`.
data_dict = {
    'books': books,
    'users': users,
    'ratings': ratings,
}

In [16]:
# Run the project's preprocessing helper on the bundled frames.
# Both count thresholds are set to 2 and explicit-rating mode is enabled.
preprocessed_data_dict = preprocessing(
    data_dict=data_dict,
    is_explicit=True,
    book_ratings_count_threshold=2,
    user_ratings_count_threshold=2,
)

Before pre-processing: Unique users count: 105283; unique items count: 340556
After pre-processing: Unique users count: 19297; unique items count: 24016


In [17]:
# Preview the first rows of the preprocessed ratings frame.
preprocessed_data_dict['ratings'].head()

Unnamed: 0,User-ID,ISBN,Book-Rating
16,276747,60517794,9
19,276747,671537458,9
20,276747,679776818,8
59,276772,553572369,7
61,276772,3499230933,10


In [18]:
# Rename the rating columns to the userID / itemID / rating names used by the
# Surprise cells further down.
surprise_column_names = {
    'User-ID': 'userID',
    'ISBN': 'itemID',
    'Book-Rating': 'rating',
}
preprocessed_data_dict['ratings'] = preprocessed_data_dict['ratings'].rename(
    columns=surprise_column_names
)

In [19]:
# Preview the ratings frame again to confirm the renamed columns.
preprocessed_data_dict['ratings'].head()

Unnamed: 0,userID,itemID,rating
16,276747,60517794,9
19,276747,671537458,9
20,276747,679776818,8
59,276772,553572369,7
61,276772,3499230933,10


# Surprise experiments

## Data reader

In [20]:
# The preprocessed ratings previewed above contain values of 7 to 10, so the
# previous (1, 5) scale did not match the data; Surprise uses this scale to
# bound its predictions, which distorts RMSE/MAE when it is too narrow.
# NOTE(review): a lower bound of 1 assumes `is_explicit=True` preprocessing
# removed the 0 ratings; confirm against `preprocessing`.
reader = Reader(rating_scale=(1, 10))

In [21]:
# Select the rating columns in user, item, rating order and wrap them in a
# Surprise Dataset using the reader configured above.
surprise_columns = ['userID', 'itemID', 'rating']
data = Dataset.load_from_df(
    preprocessed_data_dict['ratings'][surprise_columns],
    reader,
)

## SVD

In [None]:
# Instantiate Surprise's SVD algorithm with the library's default arguments.
# NOTE(review): no seed / random_state is passed, so results presumably vary
# between runs; confirm whether reproducibility is required.
algo_svd = SVD()

In [None]:
# 5-fold cross-validation of the SVD model, reporting RMSE and MAE.
# n_jobs=-1 presumably uses every available core (joblib convention).
cross_validate(
    algo_svd,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
    n_jobs=-1,
)

## NMF

In [None]:
# Instantiate Surprise's NMF algorithm with the library's default arguments.
algo_nmf = NMF()

In [None]:
# 5-fold cross-validation of the NMF model, reporting RMSE and MAE.
# n_jobs=-1 presumably uses every available core (joblib convention).
cross_validate(
    algo_nmf,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
    n_jobs=-1,
)

## KNN (Item-based)

In [None]:
# Similarity settings for the item-based KNN run: cosine similarity with
# `user_based` switched off.
sim_options = dict(name='cosine', user_based=False)

In [None]:
# Build a KNNBasic model with k=5 using the `sim_options` dict currently in
# scope (the item-based settings when the cells are run top to bottom).
algo_knn = KNNBasic(k=5, sim_options=sim_options)

In [None]:
# 5-fold cross-validation of the item-based KNN model, reporting RMSE and MAE.
# Unlike the SVD/NMF runs this one uses n_jobs=1.
cross_validate(
    algo_knn,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
    n_jobs=1,
)

## KNN (User-based)

In [None]:
# Similarity settings for the user-based KNN run.
# `user_based` is spelled out (True is Surprise's documented default) so this
# cell mirrors the item-based configuration and does not rely on an implicit
# default to differ from it.
sim_options = {
    'name': 'cosine',
    'user_based': True
}

In [None]:
# Rebinds `algo_knn` to a new KNNBasic model with k=5. It picks up whichever
# `sim_options` was assigned most recently, so this cell must run after the
# user-based `sim_options` cell directly above it.
algo_knn = KNNBasic(k=5, sim_options=sim_options)

In [None]:
# 5-fold cross-validation of the user-based KNN model, reporting RMSE and MAE.
# Unlike the SVD/NMF runs this one uses n_jobs=1.
cross_validate(
    algo_knn,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
    n_jobs=1,
)