# Imports


In [1]:
import pandas as pd
import pickle
import sampling
from collections import defaultdict
from surprise import NormalPredictor
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from surprise.model_selection import KFold
from surprise import accuracy
from surprise import Dataset
from surprise import Reader
import random
from surprise import SVD, SVDpp, NMF
from surprise.model_selection import GridSearchCV
from surprise.model_selection import RandomizedSearchCV

import numpy as np
my_seed = 13
random.seed(my_seed)

# Constants

In [2]:
RATINGS_SMALL = "../EDA_files/ratings_small.parquet"
RECIPES_SMALL = "../EDA_files/recipes_small.parquet"
INDEX_TO_RECIPE_OBJ = "../EDA_files/index_to_recipe.obj"
RECIPE_TO_INDEX_OBJ = "../EDA_files/recipe_to_index.obj"

# Load data


In [3]:
ratings_small = pd.read_parquet(RATINGS_SMALL)

author_min_20 = sampling.get_rating_with_min_number(ratings_small, 20, col_name='AuthorId')
recipe_min_20 = sampling.get_rating_with_min_number(ratings_small, 20, col_name='RecipeId')
ratings_sample = author_min_20.merge(recipe_min_20, how='inner')

# Create surprise dataset

In [4]:
ratings_sample_dataset = Dataset.load_from_df(ratings_sample[['AuthorId', 'RecipeId', 'Rating']], Reader(rating_scale=(0, 5)))
# authors_min_20_dataset = Dataset.load_from_df(author_min_20[['AuthorId', 'RecipeId', 'Rating']], Reader(rating_scale=(0, 5)))
# ratings_train_set, ratings_test_set = train_test_split(ratings_min_20_dataset, train_size=0.8)
# authors_train_set, authors_test_set = train_test_split(authors_min_20_dataset, train_size=0.8)
# all_train_set, all_test_set = train_test_split(all_ratings_dataset, train_size=0.8)
# full_trainset = user_item_ratings_dataset.build_full_trainset()
# full_testset = full_trainset.build_testset()

# SVDpp

Ocena jest wyznaczana jako:

$$\hat{r}_{ui}=\mu + b_{u} + b_{i} + q_{i}^{T}(p_{u} + |R(u)|^{-1/2}\sum y_{j})$$


## Sample

### Gridsearch

In [8]:
%%time
param_grid = {'n_epochs': [50, 150], 
              'n_factors': [10, 20], 
              'lr_all': [0.001, 0.005], 
              'reg_all': [0.02, 0.07] }

gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=3, joblib_verbose=5, n_jobs=-1)

print("Fitting...")
gs.fit(ratings_sample_dataset)

# best RMSE score
print(f"Best RMSE score: {gs.best_score['rmse']}")

# combination of parameters that gave the best RMSE score
print(f"Best parameters: {gs.best_params['rmse']}")

results_df = pd.DataFrame.from_dict(gs.cv_results)

algoSVD_sample = gs.best_estimator['rmse']

with open("../SVDpp/sample/results_df.obj", 'wb') as pickle_file:
     pickle.dump(results_df, pickle_file)
        
with open("../SVDpp/sample/SVDpp_best_model.obj", 'wb') as pickle_file:
     pickle.dump(algoSVD_sample, pickle_file)
        
with open("../SVDpp/sample/gridsearch_object.obj", 'wb') as pickle_file:
     pickle.dump(gs, pickle_file)

Fitting...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 73.8min


Best RMSE score: 0.8810037694564796
Best parameters: {'n_epochs': 50, 'n_factors': 10, 'lr_all': 0.001, 'reg_all': 0.07}
Wall time: 11h 47min 55s


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed: 707.9min finished


In [2]:
with open("../SVDpp/sample/results_df.obj", 'rb') as pickle_file:
     results_df = pickle.load(pickle_file)

In [5]:
# results_df

### Kfold for sample

In [7]:
n_factors = 10
n_epochs = 50
init_mean= 0
init_std_dev = 0.1
lr_all = 0.001
reg_all = 0.08

algoSVD_min = SVDpp(n_factors=n_factors,
                  n_epochs=n_epochs, 
                  init_mean=init_mean,
                  init_std_dev = init_std_dev,
                  lr_all = lr_all,
                  reg_all = reg_all,
                    verbose=True)

j = 1
kf = KFold(n_splits=5)
rmses = []
for trainset, testset in kf.split(ratings_sample_dataset):
    algoSVD_min.fit(trainset)
    predictions = algoSVD_min.test(testset)
        
    print(f"RMSE for fold {j}: {accuracy.rmse(predictions, verbose=False)}")
    rmses.append(accuracy.rmse(predictions, verbose=False))
    j = j + 1

print(f"Mean RMSE: {np.mean(rmses)}")

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 20
 processing epoch 21
 processing epoch 22
 processing epoch 23
 processing epoch 24
 processing epoch 25
 processing epoch 26
 processing epoch 27
 processing epoch 28
 processing epoch 29
 processing epoch 30
 processing epoch 31
 processing epoch 32
 processing epoch 33
 processing epoch 34
 processing epoch 35
 processing epoch 36
 processing epoch 37
 processing epoch 38
 processing epoch 39
 processing epoch 40
 processing epoch 41
 processing epoch 42
 processing epoch 43
 processing epoch 44
 processing epoch 45
 processing epoch 46
 processing epoch 47
 p