# Импорт необходимых модулей

In [1]:
import sys
sys.path.append('..')

In [2]:
from source.code.utils import preprocessing

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from surprise import SVD
from surprise import NMF
from surprise.prediction_algorithms.knns import KNNBasic
from surprise.model_selection import cross_validate
from surprise import Dataset
from surprise import Reader

# Загрузка данных

In [5]:
# Load the book-ratings table (semicolon-separated, Windows-1251 encoded).
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0,
# where passing it raises TypeError. `on_bad_lines='warn'` still drops
# malformed rows but reports each one, so lost data stays visible.
# Requires pandas >= 1.3.
ratings = pd.read_csv(
    'data/BX-Book-Ratings.csv',
    sep=';',
    header=0,
    on_bad_lines='warn',
    encoding='Windows-1251',
    low_memory=False,
)

In [6]:
# Load the books table (semicolon-separated, Windows-1251 encoded).
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0,
# where passing it raises TypeError. `on_bad_lines='warn'` still drops
# malformed rows but reports each one, so lost data stays visible.
# Requires pandas >= 1.3.
books = pd.read_csv(
    'data/BX-Books.csv',
    sep=';',
    header=0,
    on_bad_lines='warn',
    encoding='Windows-1251',
    low_memory=False,
)

In [7]:
# Load the users table (semicolon-separated, Windows-1251 encoded).
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0,
# where passing it raises TypeError. `on_bad_lines='warn'` still drops
# malformed rows but reports each one, so lost data stays visible.
# Requires pandas >= 1.3.
users = pd.read_csv(
    'data/BX-Users.csv',
    sep=';',
    header=0,
    on_bad_lines='warn',
    encoding='Windows-1251',
    low_memory=False,
)

In [8]:
# Bundle the three raw tables into a single dict keyed by table name.
data_dict = {
    'books': books,
    'users': users,
    'ratings': ratings,
}

# Предварительная обработка

In [9]:
# Run the project's preprocessing helper on the raw tables. The flag and
# both thresholds are forwarded unchanged; what exactly they filter is
# defined in source.code.utils.preprocessing, not in this notebook.
preprocessed_data_dict = preprocessing(
    data_dict=data_dict,
    is_explicit=True,
    book_ratings_count_threshold=2,
    user_ratings_count_threshold=2,
)

In [10]:
# Rename the ratings columns to userID / itemID / rating and store the
# renamed frame back under the same dict key.
preprocessed_data_dict['ratings'] = preprocessed_data_dict['ratings'].rename(
    columns={
        'User-ID': 'userID',
        'ISBN': 'itemID',
        'Book-Rating': 'rating',
    }
)

In [11]:
# Surprise Reader declaring that ratings lie on a 1-10 scale.
# NOTE(review): assumes preprocessing with is_explicit=True leaves no
# ratings outside 1..10 (e.g. 0) in the frame -- confirm.
reader = Reader(rating_scale=(1, 10))

In [12]:
# Build the Surprise dataset from the ratings frame. The column selection
# fixes the order to user, item, rating before handing it to the loader.
data = Dataset.load_from_df(
    preprocessed_data_dict['ratings'][['userID', 'itemID', 'rating']],
    reader,
)

# Тестирование различных алгоритмов на кросс-валидации

## Метрики - RMSE, MAE

### SVD

In [13]:
# Surprise SVD model; no arguments are passed, so library defaults apply.
algo_svd = SVD()

In [14]:
# 5-fold cross-validation of the SVD model; RMSE and MAE are computed per
# fold and the summary table is printed because verbose=True.
svd_cv = cross_validate(
    algo_svd,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5985  1.6061  1.6035  1.5973  1.5892  1.5989  0.0058  
MAE (testset)     1.2355  1.2392  1.2366  1.2346  1.2267  1.2346  0.0042  
Fit time          14.15   14.27   14.19   13.68   13.59   13.98   0.28    
Test time         0.45    0.44    0.45    0.45    0.39    0.44    0.02    


### NMF

In [15]:
# Surprise NMF model; no arguments are passed, so library defaults apply.
algo_nmf = NMF()

In [16]:
# 5-fold cross-validation of the NMF model; RMSE and MAE are computed per
# fold and the summary table is printed because verbose=True.
nmf_cv = cross_validate(
    algo_nmf,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    2.6339  2.6206  2.6209  2.6265  2.6337  2.6271  0.0058  
MAE (testset)     2.2489  2.2347  2.2364  2.2455  2.2504  2.2432  0.0065  
Fit time          15.99   15.77   15.61   15.44   15.23   15.61   0.26    
Test time         0.29    0.29    0.27    0.27    0.23    0.27    0.02    


### KNN (item-based)

In [17]:
# Item-based KNN: cosine similarity is computed between items
# (user_based=False) and k=5 is passed as the neighbourhood size.
sim_options = dict(name='cosine', user_based=False)
algo_knn = KNNBasic(sim_options=sim_options, k=5)

In [19]:
# 5-fold cross-validation of the item-based KNN model in a single process
# (n_jobs=1). NOTE: `algo_knn` is rebound to the user-based model further
# down, so this cell must run before that reassignment.
knn_item_based_cv = cross_validate(
    algo_knn,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
    n_jobs=1,
)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.8036  1.7975  1.8059  1.7988  1.8031  1.8018  0.0031  
MAE (testset)     1.3501  1.3395  1.3450  1.3412  1.3492  1.3450  0.0042  
Fit time          34.29   39.59   38.07   35.12   31.55   35.72   2.84    
Test time         3.20    3.08    3.01    4.00    2.95    3.25    0.39    


### KNN (user-based)

In [20]:
# User-based KNN: cosine similarity is computed between users
# (user_based=True) and k=5 is passed as the neighbourhood size.
# This rebinds both `sim_options` and `algo_knn` from the item-based setup.
sim_options = dict(name='cosine', user_based=True)
algo_knn = KNNBasic(sim_options=sim_options, k=5)

In [21]:
# 5-fold cross-validation of the user-based KNN model in a single process
# (n_jobs=1); relies on `algo_knn` having been rebound to the user-based
# configuration in the preceding cell.
knn_user_based_cv = cross_validate(
    algo_knn,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
    n_jobs=1,
)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.9944  1.9846  1.9953  1.9904  2.0057  1.9941  0.0069  
MAE (testset)     1.5489  1.5399  1.5502  1.5447  1.5556  1.5479  0.0053  
Fit time          9.81    11.03   14.94   13.50   14.37   12.73   1.98    
Test time         0.95    1.37    1.17    1.21    1.52    1.24    0.19    


## Результаты тестирования