# Imports

In [42]:
import pandas as pd
import pickle
import sampling
from collections import defaultdict
from surprise import NormalPredictor
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from surprise.model_selection import KFold
from surprise import accuracy
from surprise import Dataset
from surprise import Reader
import random
from surprise import SVD, SVDpp, NMF
from surprise.model_selection import GridSearchCV
from surprise.model_selection import RandomizedSearchCV
import os
import numpy as np

# Seed every global RNG the notebook relies on. Surprise's FAQ on reproducible
# experiments recommends seeding NumPy in addition to the stdlib `random`
# module, because fold shuffling and factor initialisation fall back to
# NumPy's global generator when no explicit random_state is passed.
my_seed = 13
random.seed(my_seed)
np.random.seed(my_seed)

# Constants

In [3]:
# Every input artefact lives in the same directory, so each path is built
# from one shared prefix ("/" is kept literal to reproduce the exact strings).
EDA_DIR = "../EDA_files"

RATINGS_SMALL = EDA_DIR + "/ratings_small.parquet"
RECIPES_SMALL = EDA_DIR + "/recipes_small.parquet"
INDEX_TO_RECIPE_OBJ = EDA_DIR + "/index_to_recipe.obj"
RECIPE_TO_INDEX_OBJ = EDA_DIR + "/recipe_to_index.obj"

# Load data


In [4]:
# Read both parquet tables named in the constants cell into DataFrames.
ratings_small = pd.read_parquet(RATINGS_SMALL)
recipes_small = pd.read_parquet(RECIPES_SMALL)

In [5]:
# Two views of the ratings, thresholded at 20 on different columns.
# NOTE(review): `sampling.get_rating_with_min_number` is project-local;
# presumably it keeps rows whose `col_name` value occurs at least N times — confirm.
author_min_20 = sampling.get_rating_with_min_number(
    ratings_small, 20, col_name='AuthorId'
)
recipe_min_20 = sampling.get_rating_with_min_number(
    ratings_small, 20, col_name='RecipeId'
)

# Inner merge without explicit keys: pandas joins on every column the two
# frames have in common, so only rows present in both views are kept.
ratings_min_20 = pd.merge(author_min_20, recipe_min_20, how='inner')

In [6]:
# Build the per-author subsets for the 5- and 40-rating thresholds in one pass;
# the generator yields them in the same order as the target names.
author_min_5, author_min_40 = (
    sampling.get_rating_with_min_number(ratings_small, threshold, col_name='AuthorId')
    for threshold in (5, 40)
)

In [32]:
# Same helper with a threshold of 0 on the author column.
author_min_0 = sampling.get_rating_with_min_number(
    ratings_small,
    0,
    col_name='AuthorId',
)

# Create surprise dataset

In [9]:
# Surprise reads the frame as (user, item, rating) columns in that order;
# all datasets share the same column selection and 0-5 rating scale.
rating_columns = ['AuthorId', 'RecipeId', 'Rating']
rating_reader = Reader(rating_scale=(0, 5))

ratings_min_20_dataset = Dataset.load_from_df(ratings_min_20[rating_columns], rating_reader)
authors_min_20_dataset = Dataset.load_from_df(author_min_20[rating_columns], rating_reader)
all_ratings_dataset = Dataset.load_from_df(ratings_small[rating_columns], rating_reader)

In [10]:
# Wrap the 5- and 40-rating author subsets as Surprise datasets, using the
# (user, item, rating) column order and a 0-5 rating scale.
rating_columns = ['AuthorId', 'RecipeId', 'Rating']
rating_reader = Reader(rating_scale=(0, 5))

authors_min_5_dataset = Dataset.load_from_df(author_min_5[rating_columns], rating_reader)
authors_min_40_dataset = Dataset.load_from_df(author_min_40[rating_columns], rating_reader)

In [13]:
# Split each dataset 80/20 into train and test sets. The notebook seed is
# passed as random_state so the shuffle — and therefore the split — is the
# same every time this cell is executed, independent of global RNG state.
ratings_train_set, ratings_test_set = train_test_split(
    ratings_min_20_dataset, train_size=0.8, random_state=my_seed
)
authors_train_set, authors_test_set = train_test_split(
    authors_min_20_dataset, train_size=0.8, random_state=my_seed
)
all_train_set, all_test_set = train_test_split(
    all_ratings_dataset, train_size=0.8, random_state=my_seed
)

# SVD algorithm

Ocena jest wyznaczana jako:

$$\hat{r}_{ui}=\mu + b_{u} + b_{i} + q_{i}^{T}p_{u}$$

$\mu$ - średnia ze wszystkich ocen

$b_{u}$ - odchylenie ocen użytkownika $u$ od średniej globalnej (bias użytkownika)

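$b_{i}$ - odchylenie ocen przepisu $i$ od średniej globalnej (bias przepisu)

$q_{i}$ - wektor czynników ukrytych przepisu $i$

$p_{u}$ - wektor czynników ukrytych użytkownika $u$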

Algorytm wykorzystuje SGD do minimalizacji zregularyzowanego błędu kwadratowego:

$$\sum_{r_{ui} \in R_{train}} (r_{ui}-\hat{r}_{ui})^2 + \lambda(b_{i}^2+b_{u}^2 + ||q_{i}||^2 + ||p_{u}||^2)$$

PARAMETRY:
$n\_factors$ - liczba czynników ukrytych (wymiar wektorów $p_{u}$ i $q_{i}$)

$n\_epochs$ - liczba iteracji SGD

$init\_mean$ - średnia rozkładu normalnego wykorzystywanego do inicjalizacji

$init\_std\_dev$ - odchylenie standardowe rozkładu normalnego wykorzystywanego do inicjalizacji

$lr\_all$ - współczynnik uczenia wszystkich parametrów

$reg\_all$ - współczynnik regularyzacji dla wszystkich parametrów

$lr\_bu$ - współczynnik uczenia $b_{u}$

$lr\_bi$ - współczynnik uczenia $b_{i}$

$lr\_pu$ - współczynnik uczenia $p_{u}$

$lr\_qi$ - współczynnik uczenia $q_{i}$

$reg\_bu$ - współczynnik regularyzacji $b_{u}$

$reg\_bi$ - współczynnik regularyzacji $b_{i}$

$reg\_pu$ - współczynnik regularyzacji $p_{u}$

$reg\_qi$ - współczynnik regularyzacji $q_{i}$

In [14]:
# Start with an empty mapping; the threshold loop later in the notebook fills it.
rmse_per_num_of_min = dict()

## All ratings

### Grid search

Searching for best parameters of SVD algorithm for all ratings in the dataset. 

In [18]:
%%time
# SVD hyper-parameter grid for the unfiltered ratings:
# 3 * 2 * 2 * 3 = 36 combinations, each evaluated with 3-fold cross-validation.
param_grid = {
    'n_epochs': [50, 100, 150],
    'n_factors': [10, 20],
    'lr_all': [0.001, 0.002],
    'reg_all': [0.05, 0.08, 0.1],
}

gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, joblib_verbose=3, n_jobs=-1)

print("Fitting...")
gs.fit(all_ratings_dataset)

# best RMSE score
print(f"Best RMSE score: {gs.best_score['rmse']}")

# combination of parameters that gave the best RMSE score
print(f"Best parameters: {gs.best_params['rmse']}")

results_df = pd.DataFrame.from_dict(gs.cv_results)

# With the default refit=False, GridSearchCV only *constructs* the best
# estimator from the winning parameters — it has not been trained. Fit it on
# the full dataset so the pickled "best model" can actually predict.
# (The same object is referenced by `gs`, so the pickled search holds it too.)
algoSVD_all_best = gs.best_estimator['rmse']
algoSVD_all_best.fit(all_ratings_dataset.build_full_trainset())

# Make sure the target directory exists so a missing folder does not abort
# the cell after the long search has finished.
output_dir = "../SVD/all_ratings"
os.makedirs(output_dir, exist_ok=True)

with open(f"{output_dir}/results_df.obj", 'wb') as pickle_file:
    pickle.dump(results_df, pickle_file)

with open(f"{output_dir}/SVD_best_model.obj", 'wb') as pickle_file:
    pickle.dump(algoSVD_all_best, pickle_file)

with open(f"{output_dir}/gridsearch_object.obj", 'wb') as pickle_file:
    pickle.dump(gs, pickle_file)

Fitting...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 17.1min


Best RMSE score: 1.2210389820646907
Best parameters: {'n_epochs': 100, 'n_factors': 10, 'lr_all': 0.001, 'reg_all': 0.1}
Wall time: 2h 22min 42s


[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 142.3min finished


## Each user - min 5 ratings

### Grid search
Searching for best parameters for dataset with min 5 ratings per user

In [21]:
%%time
# SVD hyper-parameter grid for authors with at least 5 ratings:
# 3 * 3 * 2 * 3 = 54 combinations, each evaluated with 5-fold cross-validation.
param_grid = {
    'n_epochs': [50, 100, 150],
    'n_factors': [10, 15, 20],
    'lr_all': [0.001, 0.005],
    'reg_all': [0.02, 0.05, 0.08],
}

gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5, joblib_verbose=3, n_jobs=-1)

print("Fitting...")
gs.fit(authors_min_5_dataset)

# best RMSE score
print(f"Best RMSE score: {gs.best_score['rmse']}")

# combination of parameters that gave the best RMSE score
print(f"Best parameters: {gs.best_params['rmse']}")

results_df = pd.DataFrame.from_dict(gs.cv_results)

# With the default refit=False, GridSearchCV only *constructs* the best
# estimator from the winning parameters — it has not been trained. Fit it on
# the full dataset so the pickled "best model" can actually predict.
# (The same object is referenced by `gs`, so the pickled search holds it too.)
algoSVD_author_min_5 = gs.best_estimator['rmse']
algoSVD_author_min_5.fit(authors_min_5_dataset.build_full_trainset())

# Make sure the target directory exists so a missing folder does not abort
# the cell after the long search has finished.
output_dir = "../SVD/author_min_5"
os.makedirs(output_dir, exist_ok=True)

with open(f"{output_dir}/results_df.obj", 'wb') as pickle_file:
    pickle.dump(results_df, pickle_file)

with open(f"{output_dir}/SVD_best_model.obj", 'wb') as pickle_file:
    pickle.dump(algoSVD_author_min_5, pickle_file)

with open(f"{output_dir}/gridsearch_object.obj", 'wb') as pickle_file:
    pickle.dump(gs, pickle_file)

Fitting...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 86.4min


Best RMSE score: 0.9351586063179143
Best parameters: {'n_epochs': 100, 'n_factors': 10, 'lr_all': 0.001, 'reg_all': 0.08}
Wall time: 5h 17min 13s


[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 316.8min finished


## Each user - min 20 ratings

### Grid search

Searching for the best parameters of the SVD algorithm on the dataset where each user has at least 20 ratings.

In [19]:
# SVD hyper-parameter grid for authors with at least 20 ratings:
# 3 * 3 * 2 * 3 = 54 combinations, each evaluated with 5-fold cross-validation.
param_grid = {
    'n_epochs': [50, 100, 150],
    'n_factors': [10, 15, 20],
    'lr_all': [0.001, 0.005],
    'reg_all': [0.02, 0.05, 0.07],
}

gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5, joblib_verbose=3, n_jobs=-1)

print("Fitting...")
gs.fit(authors_min_20_dataset)

# best RMSE score
print(f"Best RMSE score: {gs.best_score['rmse']}")

# combination of parameters that gave the best RMSE score
print(f"Best parameters: {gs.best_params['rmse']}")

results_df = pd.DataFrame.from_dict(gs.cv_results)

# With the default refit=False, GridSearchCV only *constructs* the best
# estimator from the winning parameters — it has not been trained. Fit it on
# the full dataset so the pickled "best model" can actually predict.
# (The same object is referenced by `gs`, so the pickled search holds it too.)
algoSVD_author_min_20 = gs.best_estimator['rmse']
algoSVD_author_min_20.fit(authors_min_20_dataset.build_full_trainset())

# Make sure the target directory exists so a missing folder does not abort
# the cell after the long search has finished.
output_dir = "../SVD/author_min_20"
os.makedirs(output_dir, exist_ok=True)

with open(f"{output_dir}/results_df.obj", 'wb') as pickle_file:
    pickle.dump(results_df, pickle_file)

with open(f"{output_dir}/SVD_best_model.obj", 'wb') as pickle_file:
    pickle.dump(algoSVD_author_min_20, pickle_file)

with open(f"{output_dir}/gridsearch_object.obj", 'wb') as pickle_file:
    pickle.dump(gs, pickle_file)

Fitting...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 71.8min


Best RMSE score: 0.8773034194006073
Best parameters: {'n_epochs': 50, 'n_factors': 10, 'lr_all': 0.001, 'reg_all': 0.07}


[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 260.8min finished


#### Check reg_all

In [20]:
%%time
# Follow-up search on the min-20 author dataset: epochs, factors and learning
# rate are held at single values and only reg_all varies (3 candidates, 5 folds).
param_grid = {
    'n_epochs': [50],
    'n_factors': [10],
    'lr_all': [0.001],
    'reg_all': [0.07, 0.08, 0.1],
}

gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5, joblib_verbose=3, n_jobs=-1)

print("Fitting...")
gs.fit(authors_min_20_dataset)

# best RMSE score
print(f"Best RMSE score: {gs.best_score['rmse']}")

# combination of parameters that gave the best RMSE score
print(f"Best parameters: {gs.best_params['rmse']}")

results_df = pd.DataFrame.from_dict(gs.cv_results)

# With the default refit=False, GridSearchCV only *constructs* the best
# estimator from the winning parameters — it has not been trained. Fit it on
# the full dataset so the pickled "best model" can actually predict.
# (The same object is referenced by `gs`, so the pickled search holds it too.)
algoSVD_author_min_20 = gs.best_estimator['rmse']
algoSVD_author_min_20.fit(authors_min_20_dataset.build_full_trainset())

# Make sure the target directory exists so a missing folder does not abort
# the cell once the search has finished. The "_reg" suffix keeps these
# artefacts separate from the ones written by the full min-20 search.
output_dir = "../SVD/author_min_20"
os.makedirs(output_dir, exist_ok=True)

with open(f"{output_dir}/results_df_reg.obj", 'wb') as pickle_file:
    pickle.dump(results_df, pickle_file)

with open(f"{output_dir}/SVD_best_model_reg.obj", 'wb') as pickle_file:
    pickle.dump(algoSVD_author_min_20, pickle_file)

with open(f"{output_dir}/gridsearch_object_reg.obj", 'wb') as pickle_file:
    pickle.dump(gs, pickle_file)

Fitting...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best RMSE score: 0.8771325518694285
Best parameters: {'n_epochs': 50, 'n_factors': 10, 'lr_all': 0.001, 'reg_all': 0.08}
Wall time: 7min 16s


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  6.9min finished


## Each user - min 40 ratings

### Grid search

In [22]:
# SVD hyper-parameter grid for authors with at least 40 ratings:
# 3 * 3 * 2 * 3 = 54 combinations, each evaluated with 5-fold cross-validation.
param_grid = {
    'n_epochs': [50, 100, 150],
    'n_factors': [10, 15, 20],
    'lr_all': [0.001, 0.005],
    'reg_all': [0.02, 0.05, 0.07],
}

gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5, joblib_verbose=3, n_jobs=-1)

print("Fitting...")
gs.fit(authors_min_40_dataset)

# best RMSE score
print(f"Best RMSE score: {gs.best_score['rmse']}")

# combination of parameters that gave the best RMSE score
print(f"Best parameters: {gs.best_params['rmse']}")

results_df = pd.DataFrame.from_dict(gs.cv_results)

# With the default refit=False, GridSearchCV only *constructs* the best
# estimator from the winning parameters — it has not been trained. Fit it on
# the full dataset so the pickled "best model" can actually predict.
# (The same object is referenced by `gs`, so the pickled search holds it too.)
algoSVD_author_min_40 = gs.best_estimator['rmse']
algoSVD_author_min_40.fit(authors_min_40_dataset.build_full_trainset())

# Make sure the target directory exists so a missing folder does not abort
# the cell after the long search has finished.
output_dir = "../SVD/author_min_40"
os.makedirs(output_dir, exist_ok=True)

with open(f"{output_dir}/results_df.obj", 'wb') as pickle_file:
    pickle.dump(results_df, pickle_file)

with open(f"{output_dir}/SVD_best_model.obj", 'wb') as pickle_file:
    pickle.dump(algoSVD_author_min_40, pickle_file)

with open(f"{output_dir}/gridsearch_object.obj", 'wb') as pickle_file:
    pickle.dump(gs, pickle_file)

Fitting...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 66.1min


Best RMSE score: 0.847676725842636
Best parameters: {'n_epochs': 50, 'n_factors': 10, 'lr_all': 0.001, 'reg_all': 0.07}


[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 240.7min finished


## RMSE based on number of min ratings

In [43]:
%%time
# Fixed SVD configuration reused for every threshold, so the only thing that
# varies between runs of the loop body is the minimum-ratings filter.
n_factors = 10
n_epochs = 50
init_mean = 0
init_std_dev = 0.1
lr_all = 0.001
reg_all = 0.08

# Maps minimum-ratings threshold -> mean RMSE over the 5 folds.
rmse_per_num_of_min = {}

# The rating scale is identical for every subset, so one Reader is enough.
rating_reader = Reader(rating_scale=(0, 5))

# Thresholds: 0..30 in steps of 5, then 40..100 in steps of 10.
for i in [*range(0, 35, 5), *range(40, 110, 10)]:

    print(f"Creating dataset with min rating {i}")
    author_min = sampling.get_rating_with_min_number(ratings_small, i, col_name='AuthorId')
    authors_min_dataset = Dataset.load_from_df(
        author_min[['AuthorId', 'RecipeId', 'Rating']], rating_reader
    )

    print("Initialize algorithm")
    algoSVD_min = SVD(
        n_factors=n_factors,
        n_epochs=n_epochs,
        init_mean=init_mean,
        init_std_dev=init_std_dev,
        lr_all=lr_all,
        reg_all=reg_all,
    )

    print("Create kfold split...")
    kf = KFold(n_splits=5)
    rmses = []
    for j, (trainset, testset) in enumerate(kf.split(authors_min_dataset), start=1):
        # The same estimator object is re-fitted on each fold's trainset.
        algoSVD_min.fit(trainset)
        predictions = algoSVD_min.test(testset)

        # Score the fold once and reuse the value for both the log line and
        # the running list (the RMSE was previously computed twice).
        fold_rmse = accuracy.rmse(predictions, verbose=False)
        print(f"RMSE for fold {j}: {fold_rmse}")
        rmses.append(fold_rmse)

    mean_rmse = np.mean(rmses)
    rmse_per_num_of_min[i] = mean_rmse
    print(f"Mean RMSE: {mean_rmse}")
    print()

Creating dataset with min rating 0
Initialize algorithm
Create kfold split...
RMSE for fold 1: 1.2202385085617704
RMSE for fold 2: 1.2222583814580281
RMSE for fold 3: 1.2230983388375285
RMSE for fold 4: 1.2163192751550642
RMSE for fold 5: 1.2179900549850615
Mean RMSE: 1.2199809117994904

Creating dataset with min rating 5
Initialize algorithm
Create kfold split...
RMSE for fold 1: 0.9322900631911505
RMSE for fold 2: 0.9357032087028943
RMSE for fold 3: 0.9406698147765497
RMSE for fold 4: 0.93754544791208
RMSE for fold 5: 0.9357880707786516
Mean RMSE: 0.9363993210722652

Creating dataset with min rating 10
Initialize algorithm
Create kfold split...
RMSE for fold 1: 0.9020364899216541
RMSE for fold 2: 0.9040446345035779
RMSE for fold 3: 0.904385010346579
RMSE for fold 4: 0.9119118394381012
RMSE for fold 5: 0.9045982436158848
Mean RMSE: 0.9053952435651593

Creating dataset with min rating 15
Initialize algorithm
Create kfold split...
RMSE for fold 1: 0.8945829500687447
RMSE for fold 2: 0.8

In [44]:
# Display the collected mapping: minimum-ratings threshold -> mean 5-fold RMSE.
rmse_per_num_of_min

{0: 1.2199809117994904,
 5: 0.9363993210722652,
 10: 0.9053952435651593,
 15: 0.8892767502913882,
 20: 0.8773265269933377,
 25: 0.868525506528759,
 30: 0.8608010610721755,
 40: 0.8475672961560793,
 50: 0.8361515899157878,
 60: 0.8293034396363781,
 70: 0.8229779407826321,
 80: 0.8176107851803873,
 90: 0.8129851517876064,
 100: 0.8063038340858226}

In [45]:
# Persist the threshold -> mean-RMSE mapping so it can be reloaded without
# re-running the cross-validation loop. The directory is created first so a
# missing folder does not make the write fail.
os.makedirs('../SVD', exist_ok=True)
with open('../SVD/rmse_per_num_of_min.obj', 'wb') as pickle_file:
    pickle.dump(rmse_per_num_of_min, pickle_file)