# Imports


In [1]:
import pickle
import random
from collections import defaultdict

import numpy as np
import pandas as pd
from surprise import Dataset
from surprise import NormalPredictor
from surprise import Reader
from surprise import SVD, SVDpp, NMF
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from surprise.model_selection import KFold
from surprise.model_selection import LeaveOneOut
from surprise.model_selection import RandomizedSearchCV
from surprise.model_selection import train_test_split

import sampling
# Seed both RNGs up front so that Restart & Run All reproduces the same
# results. The Surprise FAQ on reproducible experiments recommends seeding
# NumPy's global RNG in addition to Python's `random`; seeding `random`
# alone leaves the experiments non-deterministic.
my_seed = 13
random.seed(my_seed)
np.random.seed(my_seed)

# Constants

In [2]:
# Every input artefact of this notebook lives in the same EDA output directory.
EDA_DIR = "../EDA_files"

# Parquet tables.
RATINGS_SMALL = EDA_DIR + "/ratings_small.parquet"
RECIPES_SMALL = EDA_DIR + "/recipes_small.parquet"

# Pickled lookup objects (index <-> recipe, going by the file names).
INDEX_TO_RECIPE_OBJ = EDA_DIR + "/index_to_recipe.obj"
RECIPE_TO_INDEX_OBJ = EDA_DIR + "/recipe_to_index.obj"

# Load data


In [3]:
# Read the two parquet tables named in the constants cell.
recipes_small = pd.read_parquet(RECIPES_SMALL)
ratings_small = pd.read_parquet(RATINGS_SMALL)

# Threshold of 20 applied once per author and once per recipe.
# NOTE(review): presumably `get_rating_with_min_number` keeps only the ratings
# whose `col_name` value occurs at least N times - confirm in sampling.py.
author_min_20 = sampling.get_rating_with_min_number(ratings_small, 20, col_name='AuthorId')
recipe_min_20 = sampling.get_rating_with_min_number(ratings_small, 20, col_name='RecipeId')

# Inner merge of the author-filtered and recipe-filtered frames; no `on=` is
# given, so pandas joins on the columns the two frames have in common.
ratings_min_20 = pd.merge(author_min_20, recipe_min_20, how='inner')

# Same per-author filter for the remaining thresholds, evaluated in ascending
# order and unpacked into one variable per threshold.
(
    author_min_5,
    author_min_10,
    author_min_15,
    author_min_25,
    author_min_30,
    author_min_40,
    author_min_50,
) = (
    sampling.get_rating_with_min_number(ratings_small, min_count, col_name='AuthorId')
    for min_count in (5, 10, 15, 25, 30, 40, 50)
)

# Create surprise dataset

In [4]:
def _as_surprise_dataset(ratings_df):
    """Wrap a ratings DataFrame as a Surprise ``Dataset``.

    The columns are passed in (user, item, rating) order, which is what
    ``Dataset.load_from_df`` requires, and a fresh ``Reader`` with rating
    scale (0, 5) is created for every dataset.
    """
    return Dataset.load_from_df(
        ratings_df[['AuthorId', 'RecipeId', 'Rating']],
        Reader(rating_scale=(0, 5)),
    )


# Datasets for the min-20 variants and for the unfiltered ratings.
ratings_min_20_dataset = _as_surprise_dataset(ratings_min_20)
authors_min_20_dataset = _as_surprise_dataset(author_min_20)
all_ratings_dataset = _as_surprise_dataset(ratings_small)

# Datasets for the other per-author thresholds.
authors_min_5_dataset = _as_surprise_dataset(author_min_5)
authors_min_10_dataset = _as_surprise_dataset(author_min_10)
authors_min_15_dataset = _as_surprise_dataset(author_min_15)
authors_min_25_dataset = _as_surprise_dataset(author_min_25)
authors_min_30_dataset = _as_surprise_dataset(author_min_30)
authors_min_40_dataset = _as_surprise_dataset(author_min_40)
authors_min_50_dataset = _as_surprise_dataset(author_min_50)

# 80/20 hold-out splits. `random_state` is set explicitly so the splits are
# the same on every run and do not depend on which cells ran before this one.
ratings_train_set, ratings_test_set = train_test_split(
    ratings_min_20_dataset, train_size=0.8, random_state=my_seed
)
authors_train_set, authors_test_set = train_test_split(
    authors_min_20_dataset, train_size=0.8, random_state=my_seed
)
all_train_set, all_test_set = train_test_split(
    all_ratings_dataset, train_size=0.8, random_state=my_seed
)
# Disabled: full-trainset variant. It refers to `user_item_ratings_dataset`,
# which none of the cells above define.
# full_trainset = user_item_ratings_dataset.build_full_trainset()
# full_testset = full_trainset.build_testset()

# SVDpp

Ocena jest wyznaczana jako:

$$\hat{r}_{ui}=\mu + b_{u} + b_{i} + q_{i}^{T}\left(p_{u} + |R(u)|^{-1/2}\sum_{j \in R(u)} y_{j}\right)$$


In [5]:
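# NOTE: hyper-parameter grid search, currently disabled (commented out).
# `gs`, `results_df` and `algoSVDpp_best`, which the pickle cells further down
# save to disk, are assigned only in this cell; re-enable it before running them.
# param_grid = {'n_epochs': [50, 100, 150], 'n_factors': [10, 15, 20], 'lr_all': [0.001, 0.005, 0.0007], 'reg_all': [0.02, 0.08, 0.05] }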
# gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=3)
# gs.fit(authors_min_20_dataset)

# # best RMSE score
# print(gs.best_score['rmse'])

# # combination of parameters that gave the best RMSE score
# print(gs.best_params['rmse'])
# results_df = pd.DataFrame.from_dict(gs.cv_results)

# algoSVDpp_best = gs.best_estimator['rmse']

In [7]:
# SVD++ hyper-parameters for this run (50 epochs, reg_all = 0.07).
n_factors = 10
n_epochs = 50
init_mean = 0
init_std_dev = 0.1
lr_all = 0.005
reg_all = 0.07

# `random_state` fixes the factor initialisation so repeated runs of this
# cell give the same model regardless of what was executed before it.
algoSVDpp_20 = SVDpp(
    n_factors=n_factors,
    n_epochs=n_epochs,
    init_mean=init_mean,
    init_std_dev=init_std_dev,
    lr_all=lr_all,
    reg_all=reg_all,
    random_state=my_seed,
    verbose=True,
)

# 5-fold cross-validation on the authors-with-at-least-20-ratings dataset.
# Seeding KFold makes the fold assignment reproducible.
kf = KFold(n_splits=5, random_state=my_seed)
rmses_20 = []
for trainset, testset in kf.split(authors_min_20_dataset):
    print('Fitting...')
    # Refit the same estimator object on this fold's training part.
    algoSVDpp_20.fit(trainset)

    print('Calculating predictions')
    predictions = algoSVDpp_20.test(testset)

    # accuracy.rmse prints the fold RMSE (verbose=True) and returns it.
    rmses_20.append(accuracy.rmse(predictions, verbose=True))

# After the loop `trainset`, `testset` and `predictions` stay bound to the
# last fold that ran. The cell's displayed value is the mean RMSE over folds.
np.mean(rmses_20)

Fitting...
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 20
 processing epoch 21
 processing epoch 22
 processing epoch 23
 processing epoch 24
 processing epoch 25
 processing epoch 26
 processing epoch 27
 processing epoch 28
 processing epoch 29
 processing epoch 30
 processing epoch 31
 processing epoch 32
 processing epoch 33
 processing epoch 34
 processing epoch 35
 processing epoch 36
 processing epoch 37
 processing epoch 38
 processing epoch 39
 processing epoch 40
 processing epoch 41
 processing epoch 42
 processing epoch 43
 processing epoch 44
 processing epoch 45
 processing epoch 46
 processing 

KeyboardInterrupt: 

In [None]:
# Pickle the `algoSVDpp_best` estimator to disk.
# NOTE(review): in the cells shown, `algoSVDpp_best` is assigned only inside
# the commented-out grid-search cell - confirm it exists before running this.
with open('../SVD/svdpp_best_min_20.obj', 'wb') as out_file:
    pickle.dump(algoSVDpp_best, out_file)

In [None]:
# Pickle the `results_df` frame to disk.
# NOTE(review): in the cells shown, `results_df` is assigned only inside the
# commented-out grid-search cell - confirm it exists before running this.
with open('../SVD/svdpp_best_min_20_results_df.obj', 'wb') as out_file:
    pickle.dump(results_df, out_file)

In [None]:
# Pickle the whole grid-search object `gs` to disk.
# NOTE(review): in the cells shown, `gs` is assigned only inside the
# commented-out grid-search cell - confirm it exists before running this.
with open('../SVD/svdpp_gs.obj', 'wb') as out_file:
    pickle.dump(gs, out_file)

In [6]:
# SVD++ hyper-parameters for this run (100 epochs, reg_all = 0.08).
# This cell rebinds `algoSVDpp_20`, `kf` and `rmses_20`.
n_factors = 10
n_epochs = 100
init_mean = 0
init_std_dev = 0.1
lr_all = 0.005
reg_all = 0.08

# `random_state` fixes the factor initialisation so repeated runs of this
# cell give the same model regardless of what was executed before it.
algoSVDpp_20 = SVDpp(
    n_factors=n_factors,
    n_epochs=n_epochs,
    init_mean=init_mean,
    init_std_dev=init_std_dev,
    lr_all=lr_all,
    reg_all=reg_all,
    random_state=my_seed,
    verbose=True,
)

# 5-fold cross-validation on the authors-with-at-least-20-ratings dataset.
# Seeding KFold makes the fold assignment reproducible.
kf = KFold(n_splits=5, random_state=my_seed)
rmses_20 = []
for trainset, testset in kf.split(authors_min_20_dataset):
    print('Fitting...')
    # Refit the same estimator object on this fold's training part.
    algoSVDpp_20.fit(trainset)

    print('Calculating predictions')
    predictions = algoSVDpp_20.test(testset)

    # accuracy.rmse prints the fold RMSE (verbose=True) and returns it.
    rmses_20.append(accuracy.rmse(predictions, verbose=True))

# After the loop `trainset`, `testset` and `predictions` stay bound to the
# last fold that ran. The cell's displayed value is the mean RMSE over folds.
np.mean(rmses_20)

Fitting...
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 20
 processing epoch 21
 processing epoch 22
 processing epoch 23
 processing epoch 24
 processing epoch 25
 processing epoch 26
 processing epoch 27
 processing epoch 28
 processing epoch 29
 processing epoch 30
 processing epoch 31
 processing epoch 32
 processing epoch 33
 processing epoch 34
 processing epoch 35
 processing epoch 36
 processing epoch 37
 processing epoch 38
 processing epoch 39
 processing epoch 40
 processing epoch 41
 processing epoch 42
 processing epoch 43
 processing epoch 44
 processing epoch 45
 processing epoch 46
 processing 

KeyboardInterrupt: 

In [7]:
# Score the current `algoSVDpp_20` on `testset`, i.e. whichever test fold the
# most recently executed KFold loop iteration left bound.
# NOTE(review): the recorded KFold runs above ended in KeyboardInterrupt, so
# the model state evaluated here depends on execution order - confirm before
# relying on the resulting RMSE.
predictions = algoSVDpp_20.test(testset)

In [8]:
# Print the RMSE of `predictions` (verbose=True) and return it as the cell's
# displayed value.
accuracy.rmse(predictions, verbose=True)

RMSE: 0.9163


0.9162763881700828