# Imports

In [2]:
import pandas as pd
import pickle
import sampling
from collections import defaultdict
from surprise import NormalPredictor
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from surprise.model_selection import KFold
from surprise import accuracy
from surprise import Dataset
from surprise import Reader
import random
from surprise import SVD, SVDpp, NMF
from surprise.model_selection import GridSearchCV
from surprise.model_selection import RandomizedSearchCV
my_seed = 13
random.seed(my_seed)

# Constants

In [3]:
RATINGS_SMALL = "../EDA_files/ratings_small.parquet"
RECIPES_SMALL = "../EDA_files/recipes_small.parquet"
INDEX_TO_RECIPE_OBJ = "../EDA_files/index_to_recipe.obj"
RECIPE_TO_INDEX_OBJ = "../EDA_files/recipe_to_index.obj"

# Load data


In [4]:
recipes_small = pd.read_parquet(RECIPES_SMALL)
ratings_small = pd.read_parquet(RATINGS_SMALL)

In [5]:
author_min_20 = sampling.get_rating_with_min_number(ratings_small, 20, col_name='AuthorId')
recipe_min_20 = sampling.get_rating_with_min_number(ratings_small, 20, col_name='RecipeId')

ratings_min_20 = author_min_20.merge(recipe_min_20, how='inner')

In [42]:
author_min_5 = sampling.get_rating_with_min_number(ratings_small, 5, col_name='AuthorId')
author_min_10 = sampling.get_rating_with_min_number(ratings_small, 10, col_name='AuthorId')
author_min_15 = sampling.get_rating_with_min_number(ratings_small, 15, col_name='AuthorId')
author_min_25 = sampling.get_rating_with_min_number(ratings_small, 25, col_name='AuthorId')
author_min_30 = sampling.get_rating_with_min_number(ratings_small, 30, col_name='AuthorId')

In [55]:
author_min_40 = sampling.get_rating_with_min_number(ratings_small, 40, col_name='AuthorId')

In [61]:
author_min_50 = sampling.get_rating_with_min_number(ratings_small, 50, col_name='AuthorId')

# Create surprise dataset

In [6]:
ratings_min_20_dataset = Dataset.load_from_df(ratings_min_20[['AuthorId', 'RecipeId', 'Rating']], Reader(rating_scale=(0, 5)))
authors_min_20_dataset = Dataset.load_from_df(author_min_20[['AuthorId', 'RecipeId', 'Rating']], Reader(rating_scale=(0, 5)))
all_ratings_dataset = Dataset.load_from_df(ratings_small[['AuthorId', 'RecipeId', 'Rating']], Reader(rating_scale=(0, 5)))

In [48]:
authors_min_5_dataset = Dataset.load_from_df(author_min_5[['AuthorId', 'RecipeId', 'Rating']], Reader(rating_scale=(0, 5)))
authors_min_10_dataset = Dataset.load_from_df(author_min_10[['AuthorId', 'RecipeId', 'Rating']], Reader(rating_scale=(0, 5)))
authors_min_15_dataset = Dataset.load_from_df(author_min_15[['AuthorId', 'RecipeId', 'Rating']], Reader(rating_scale=(0, 5)))
authors_min_25_dataset = Dataset.load_from_df(author_min_25[['AuthorId', 'RecipeId', 'Rating']], Reader(rating_scale=(0, 5)))
authors_min_30_dataset = Dataset.load_from_df(author_min_30[['AuthorId', 'RecipeId', 'Rating']], Reader(rating_scale=(0, 5)))


In [58]:
authors_min_40_dataset = Dataset.load_from_df(author_min_40[['AuthorId', 'RecipeId', 'Rating']], Reader(rating_scale=(0, 5)))

In [62]:
authors_min_50_dataset = Dataset.load_from_df(author_min_50[['AuthorId', 'RecipeId', 'Rating']], Reader(rating_scale=(0, 5)))

In [7]:
ratings_train_set, ratings_test_set = train_test_split(ratings_min_20_dataset, train_size=0.8)
authors_train_set, authors_test_set = train_test_split(authors_min_20_dataset, train_size=0.8)
all_train_set, all_test_set = train_test_split(all_ratings_dataset, train_size=0.8)
# full_trainset = user_item_ratings_dataset.build_full_trainset()
# full_testset = full_trainset.build_testset()

# SVD algorithm

Ocena jest wyznaczana jako:

$$\hat{r}_{ui}=\mu + b_{u} + b_{i} + q_{i}^{T}p_{u}$$

$\mu$ - średnia ze wszystkich ocen

$b_{u}$ - średnia ocen użytkownika $u$

$b_{i}$ - średnia ocen przepisu $i$

Algorytm wykorzystuje SGD do minimalizacji zregularyzowanego błędu kwadratowego:

$$\sum (r_{ui}-\hat{r}_{ui})^2 + \lambda(b_{i}^2+b_{u}^2 + ||q_{i}||^2 + ||p_{i}||^2)$$

PARAMETRY:
$n\_factors$ - liczba współczynników w rozłożonej macierzy

$n\_epochs$ - liczba iteracji SGD

$init\_mean$ - średnia rozkładu normalnego wykorzystywanego do inicjalizacji

$inid\_std\_dev$ - odchylenie standardowe rozkładu normalnego wykorzystywanego do inicjalizacji

$lr\_all$ - współczynnik uczenia wszystkich parametrów

$reg\_all$ - współćzynnik regularyzacji dla wszystkich parametrów

$lr\_bu$ - współczynnik uczenia $b_{u}$

$lr\_bi$ - współczynnik uczenia $b_{i}$

$lr\_pu$ - współczynnik uczenia $p_{u}$

$lr\_qi$ - współczynnik uczenia $q_{i}$

$reg\_bu$ - współczynnik regularyzacji $b_{u}$

$reg\_bi$ - współczynnik regularyzacji $b_{i}$

$reg\_pu$ - współczynnik regularyzacji $p_{u}$

$reg\_qi$ - współczynnik regularyzacji $q_{i}$

In [68]:
rmse_per_num_of_min = {}

## All ratings

In [64]:
param_grid = {'n_epochs': [50, 100, 150], 
              'n_factors': [10, 20], 
              'lr_all': [0.0005, 0.001], 
              'reg_all': [0.05, 0.08] }
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(all_ratings_dataset)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
results_df = pd.DataFrame.from_dict(gs.cv_results)

algoSVD_all_best = gs.best_estimator['rmse']
# algoSVD.fit(authors_train_set)

# predictions = algoSVD.test(authors_test_set)

1.2208013623900709
{'n_epochs': 100, 'n_factors': 10, 'lr_all': 0.001, 'reg_all': 0.08}


In [65]:
n_factors = 10
n_epochs = 100
init_mean= 0
init_std_dev = 0.1
lr_all = 0.001
reg_all = 0.08
algoSVD_all = SVD(n_factors=n_factors,
              n_epochs=n_epochs, 
              init_mean=init_mean,
              init_std_dev = init_std_dev,
              lr_all = lr_all,
              reg_all = reg_all)

In [66]:
kf = KFold(n_splits=5)
rmses = []
for trainset, testset in kf.split(all_ratings_dataset):

    # train and test algorithm.
    algoSVD_all.fit(trainset)
    predictions = algoSVD_all.test(testset)

    # Compute and print Root Mean Squared Error
    rmses.append(accuracy.rmse(predictions, verbose=True))

RMSE: 1.2160
RMSE: 1.2188
RMSE: 1.2178
RMSE: 1.2187
RMSE: 1.2172


In [69]:
rmse_per_num_of_min[0] = np.mean(rmses)

## Ratings author min 20

In [12]:
param_grid = {'n_epochs': [50, 100, 150], 'n_factors': [10, 15, 20], 'lr_all': [0.001, 0.005, 0.0007], 'reg_all': [0.01, 0.02, 0.05] }
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5)
gs.fit(authors_min_20_dataset)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
results_df = pd.DataFrame.from_dict(gs.cv_results)

algoSVD = gs.best_estimator['rmse']
algoSVD.fit(authors_train_set)

predictions = algoSVD.test(authors_test_set)

0.8773073112350238
{'n_epochs': 50, 'n_factors': 10, 'lr_all': 0.001, 'reg_all': 0.05}


In [26]:
n_factors = 10
n_epochs = 150
init_mean= 0
init_std_dev = 0.1
lr_all = 0.0005
reg_all = 0.07
algoSVD = SVD(n_factors=n_factors,
              n_epochs=n_epochs, 
              init_mean=init_mean,
              init_std_dev = init_std_dev,
              lr_all = lr_all,
              reg_all = reg_all)

In [35]:
algoSVD.fit(authors_train_set)

predictions = algoSVD.test(authors_test_set)

In [32]:
kf = KFold(n_splits=5)
for trainset, testset in kf.split(authors_min_20_dataset):

    # train and test algorithm.
    algoSVD.fit(trainset)
    predictions = algoSVD.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.8806
RMSE: 0.8680
RMSE: 0.8782
RMSE: 0.8851
RMSE: 0.8740


In [70]:
rmse_per_num_of_min[20] = np.mean([0.8806, 0.8680, 0.8782, 0.8851, 0.8740])

# Ratings author min 5

In [45]:
n_factors = 10
n_epochs = 150
init_mean= 0
init_std_dev = 0.1
lr_all = 0.0005
reg_all = 0.07
algoSVD_5 = SVD(n_factors=n_factors,
              n_epochs=n_epochs, 
              init_mean=init_mean,
              init_std_dev = init_std_dev,
              lr_all = lr_all,
              reg_all = reg_all)

In [49]:
kf = KFold(n_splits=5)
rmses_5 = []
for trainset, testset in kf.split(authors_min_5_dataset):

    # train and test algorithm.
    algoSVD_5.fit(trainset)
    predictions = algoSVD_5.test(testset)

    # Compute and print Root Mean Squared Error
    rmses_5.append(accuracy.rmse(predictions, verbose=True))

RMSE: 0.9410
RMSE: 0.9374
RMSE: 0.9375
RMSE: 0.9293
RMSE: 0.9313


In [71]:
rmse_per_num_of_min[5] = np.mean(rmses_5)

# Ratings author min 10

In [51]:
n_factors = 10
n_epochs = 150
init_mean= 0
init_std_dev = 0.1
lr_all = 0.0005
reg_all = 0.07
algoSVD_10 = SVD(n_factors=n_factors,
              n_epochs=n_epochs, 
              init_mean=init_mean,
              init_std_dev = init_std_dev,
              lr_all = lr_all,
              reg_all = reg_all)

kf = KFold(n_splits=5)
rmses_10 = []
for trainset, testset in kf.split(authors_min_10_dataset):

    # train and test algorithm.
    algoSVD_10.fit(trainset)
    predictions = algoSVD_10.test(testset)

    # Compute and print Root Mean Squared Error
    rmses_10.append(accuracy.rmse(predictions, verbose=True))
    
np.mean(rmses_10)

RMSE: 0.8995
RMSE: 0.9050
RMSE: 0.9104
RMSE: 0.9023
RMSE: 0.9065


0.9047183380297661

In [72]:
rmse_per_num_of_min[10] = np.mean(rmses_10)

# Ratings author min 15

In [52]:
n_factors = 10
n_epochs = 150
init_mean= 0
init_std_dev = 0.1
lr_all = 0.0005
reg_all = 0.07
algoSVD_15 = SVD(n_factors=n_factors,
              n_epochs=n_epochs, 
              init_mean=init_mean,
              init_std_dev = init_std_dev,
              lr_all = lr_all,
              reg_all = reg_all)

kf = KFold(n_splits=5)
rmses_15 = []
for trainset, testset in kf.split(authors_min_15_dataset):

    # train and test algorithm.
    algoSVD_15.fit(trainset)
    predictions = algoSVD_15.test(testset)

    # Compute and print Root Mean Squared Error
    rmses_15.append(accuracy.rmse(predictions, verbose=True))
    
np.mean(rmses_15)

RMSE: 0.8893
RMSE: 0.8842
RMSE: 0.8943
RMSE: 0.8906
RMSE: 0.8880


0.8892745668110991

In [73]:
rmse_per_num_of_min[15] = np.mean(rmses_15)

# 25

In [53]:
n_factors = 10
n_epochs = 150
init_mean= 0
init_std_dev = 0.1
lr_all = 0.0005
reg_all = 0.07
algoSVD_25 = SVD(n_factors=n_factors,
              n_epochs=n_epochs, 
              init_mean=init_mean,
              init_std_dev = init_std_dev,
              lr_all = lr_all,
              reg_all = reg_all)

kf = KFold(n_splits=5)
rmses_25 = []
for trainset, testset in kf.split(authors_min_25_dataset):

    # train and test algorithm.
    algoSVD_25.fit(trainset)
    predictions = algoSVD_25.test(testset)

    # Compute and print Root Mean Squared Error
    rmses_25.append(accuracy.rmse(predictions, verbose=True))
    
np.mean(rmses_25)

RMSE: 0.8773
RMSE: 0.8550
RMSE: 0.8732
RMSE: 0.8663
RMSE: 0.8716


0.868672156609982

In [74]:
rmse_per_num_of_min[25] = np.mean(rmses_25)

# 30

In [54]:
n_factors = 10
n_epochs = 150
init_mean= 0
init_std_dev = 0.1
lr_all = 0.0005
reg_all = 0.07
algoSVD_30 = SVD(n_factors=n_factors,
              n_epochs=n_epochs, 
              init_mean=init_mean,
              init_std_dev = init_std_dev,
              lr_all = lr_all,
              reg_all = reg_all)

kf = KFold(n_splits=5)
rmses_30 = []
for trainset, testset in kf.split(authors_min_30_dataset):

    # train and test algorithm.
    algoSVD_30.fit(trainset)
    predictions = algoSVD_30.test(testset)

    # Compute and print Root Mean Squared Error
    rmses_30.append(accuracy.rmse(predictions, verbose=True))
    
np.mean(rmses_30)

RMSE: 0.8654
RMSE: 0.8606
RMSE: 0.8549
RMSE: 0.8595
RMSE: 0.8643


0.8609516438172335

In [75]:
rmse_per_num_of_min[30] = np.mean(rmses_30)

# 40

In [60]:
n_factors = 10
n_epochs = 150
init_mean= 0
init_std_dev = 0.1
lr_all = 0.0005
reg_all = 0.07
algoSVD_40 = SVD(n_factors=n_factors,
              n_epochs=n_epochs, 
              init_mean=init_mean,
              init_std_dev = init_std_dev,
              lr_all = lr_all,
              reg_all = reg_all)

kf = KFold(n_splits=5)
rmses_40 = []
for trainset, testset in kf.split(authors_min_40_dataset):

    # train and test algorithm.
    algoSVD_40.fit(trainset)
    predictions = algoSVD_40.test(testset)

    # Compute and print Root Mean Squared Error
    rmses_40.append(accuracy.rmse(predictions, verbose=True))
    
np.mean(rmses_40)

RMSE: 0.8460
RMSE: 0.8460
RMSE: 0.8528
RMSE: 0.8441
RMSE: 0.8508


0.8479486931119986

In [76]:
rmse_per_num_of_min[40] = np.mean(rmses_40)

# 50

In [63]:
n_factors = 10
n_epochs = 150
init_mean= 0
init_std_dev = 0.1
lr_all = 0.0005
reg_all = 0.07
algoSVD_50 = SVD(n_factors=n_factors,
              n_epochs=n_epochs, 
              init_mean=init_mean,
              init_std_dev = init_std_dev,
              lr_all = lr_all,
              reg_all = reg_all)

kf = KFold(n_splits=5)
rmses_50 = []
for trainset, testset in kf.split(authors_min_50_dataset):

    # train and test algorithm.
    algoSVD_50.fit(trainset)
    predictions = algoSVD_50.test(testset)

    # Compute and print Root Mean Squared Error
    rmses_50.append(accuracy.rmse(predictions, verbose=True))
    
np.mean(rmses_50)

RMSE: 0.8373
RMSE: 0.8358
RMSE: 0.8323
RMSE: 0.8417
RMSE: 0.8361


0.8366287553891965

In [77]:
rmse_per_num_of_min[50] = np.mean(rmses_50)

# Other mins

In [79]:
for i in range(60, 110, 10):
    author_min = sampling.get_rating_with_min_number(ratings_small, i, col_name='AuthorId')    
    authors_min_dataset = Dataset.load_from_df(author_min[['AuthorId', 'RecipeId', 'Rating']], Reader(rating_scale=(0, 5))) 
    
    
    n_factors = 10
    n_epochs = 150
    init_mean= 0
    init_std_dev = 0.1
    lr_all = 0.0005
    reg_all = 0.07
    algoSVD_min = SVD(n_factors=n_factors,
                  n_epochs=n_epochs, 
                  init_mean=init_mean,
                  init_std_dev = init_std_dev,
                  lr_all = lr_all,
                  reg_all = reg_all)

    kf = KFold(n_splits=5)
    rmses = []
    for trainset, testset in kf.split(authors_min_50_dataset):
        # train and test algorithm.
        algoSVD_min.fit(trainset)
        predictions = algoSVD_min.test(testset)

        # Compute and print Root Mean Squared Error
        rmses.append(accuracy.rmse(predictions, verbose=True))

    rmse_per_num_of_min[i] = np.mean(rmses)

RMSE: 0.8442
RMSE: 0.8318
RMSE: 0.8394
RMSE: 0.8352
RMSE: 0.8321
RMSE: 0.8343
RMSE: 0.8343
RMSE: 0.8419
RMSE: 0.8388
RMSE: 0.8336
RMSE: 0.8384
RMSE: 0.8425
RMSE: 0.8292
RMSE: 0.8352
RMSE: 0.8373
RMSE: 0.8343
RMSE: 0.8357
RMSE: 0.8429
RMSE: 0.8380
RMSE: 0.8326
RMSE: 0.8339
RMSE: 0.8423
RMSE: 0.8323
RMSE: 0.8364
RMSE: 0.8384


In [80]:
rmse_per_num_of_min

{0: 1.217694110082712,
 20: 0.8771800000000001,
 5: 0.9352851165426397,
 10: 0.9047183380297661,
 15: 0.8892745668110991,
 25: 0.868672156609982,
 30: 0.8609516438172335,
 40: 0.8479486931119986,
 50: 0.8366287553891965,
 60: 0.8365198464266438,
 70: 0.8365915560335528,
 80: 0.8365424124726861,
 90: 0.836690673061297,
 100: 0.8366559472872573}

In [81]:
with open('../SVD/rmse_per_num_of_min.obj', 'wb') as pickle_file:
     pickle.dump(rmse_per_num_of_min, pickle_file)

# SVDpp

In [13]:
metrics_svd = {}
metrics_svd["RMSE"] = accuracy.rmse(predictions, verbose=False)
metrics_svd["MAE"] = accuracy.mae(predictions, verbose=False)
print(metrics_svd)

{'RMSE': 0.8825321667914738, 'MAE': 0.527460981547377}


Ocena jest wyznaczana jako:

$$\hat{r}_{ui}=\mu + b_{u} + b_{i} + q_{i}^{T}(p_{u} + |R(u)|^{-1/2}\sum y_{j})$$


In [20]:
algoSVDpp = SVDpp(verbose=True)

In [22]:
algoSVDpp.fit(train_set)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x2240010a2e0>

In [23]:
predictions_svdpp = algoSVDpp.test(test_set)

In [24]:
metrics_svdpp = {}
metrics_svdpp["RMSE"] = accuracy.rmse(predictions_svdpp, verbose=False)
metrics_svdpp["MAE"] = accuracy.mae(predictions_svdpp, verbose=False)
print(metrics_svdpp)

{'RMSE': 0.9053626993420172, 'MAE': 0.5183473938673195}
