## Overview

This notebook optimizes the hyperparameters of my recommender algorithms on the smaller subsets of data that I sampled earlier.

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy import stats
import scipy as sci

In [12]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from scipy.sparse import csr_matrix
from mlxtend.frequent_patterns import apriori, association_rules

# Function to reduce the memory usage of a DataFrame.
def reduce_memory(df):
    """Downcast 64-bit numeric columns of ``df`` to 32-bit to save memory.

    ``float64`` columns become ``float32``.  ``int64`` columns become ``int32``
    only when every value fits in the 32-bit range; columns holding larger
    values (for example very large IDs) are left as ``int64`` because a plain
    ``astype('int32')`` would silently wrap them around.

    The frame is modified in place and is also returned, so the call can be
    wrapped directly around a loader such as ``pd.read_csv(...)``.
    """
    int32_min, int32_max = -2**31, 2**31 - 1
    for col in df.columns:
        dtype = df[col].dtype
        if dtype == 'float64':
            df[col] = df[col].astype('float32')
        elif dtype == 'int64':
            column = df[col]
            # An empty column has no values that could overflow.
            fits = column.empty or (
                column.min() >= int32_min and column.max() <= int32_max
            )
            if fits:
                df[col] = column.astype('int32')
    return df

# Generator function to load data in chunks.
def data_generator(df, chunksize=10000):
    """Lazily yield consecutive row slices of ``df``.

    Each slice holds at most ``chunksize`` rows; the final slice contains
    whatever rows remain.  An empty frame yields nothing.
    """
    total_rows = len(df)
    for start in range(0, total_rows, chunksize):
        stop = start + chunksize
        yield df.iloc[start:stop]


        
        
# Load both sampled datasets, then shrink their numeric columns to save memory.
comp_recs = pd.read_csv('../Data/sampled_composite_recommendations.csv')
comp_recs = reduce_memory(comp_recs)

recs = pd.read_csv('../Data/sampled_recommendations.csv')
recs = reduce_memory(recs)

In [18]:
from surprise import Dataset
from surprise.reader import Reader
from surprise.prediction_algorithms.matrix_factorization import SVD as FunkSVD
from surprise.model_selection import GridSearchCV
from surprise import SVDpp
from surprise import SVD

In [14]:
# Cast the is_recommended column to integers so it can be passed to Surprise as a numeric rating.
recs['is_recommended'] = recs['is_recommended'].astype(int)

In [15]:
# Wrap the is_recommended dataframe as a Surprise dataset: the columns are
# selected in (user, item, rating) order and the ratings are declared to lie
# on a 0-1 scale.
rec_reader = Reader(rating_scale=(0, 1))
rec_columns = ['user_id', 'app_id', 'is_recommended']
rec_dataset = Dataset.load_from_df(recs[rec_columns], rec_reader)

# Build a trainset from every rating in the dataset (nothing is held out).
my_train_dataset = rec_dataset.build_full_trainset()

In [16]:
# Wrap the composite-rating dataframe as a Surprise dataset: the columns are
# selected in (user, item, rating) order and the ratings are declared to lie
# on a 0-10 scale.
comp_reader = Reader(rating_scale=(0, 10))
comp_columns = ['user_id', 'app_id', 'composite_rating']
comp_recs_dataset = Dataset.load_from_df(comp_recs[comp_columns], comp_reader)

# Build a trainset from every rating in the dataset (nothing is held out).
my_train_dataset_comp = comp_recs_dataset.build_full_trainset()

The two commented-out code blocks below are my grid searches. Each one records the best parameters and score found at every iteration that was run. They are left commented out so that you can run them yourself if you like, but be aware that they can take a very long time.

In [12]:
# # Grid search that was used for the recs dataset (0-1 scale) commented out to avoid runtime


 
# learning_rate = np.logspace(-4, -1, num=5)

# parameters = {
#     'n_factors': [11],
#     'lr_all': [0.003],
#     'reg_all': [0.05],
#     'n_epochs': [55,60,65]
    
# }


# # {'n_factors': 20, 'lr_all': 0.0031622776601683794, 'reg_all': 0.05, 'n_epochs': 30} first iteration

# # 0.3258830122922809
# #{'n_factors': 15, 'lr_all': 0.003, 'reg_all': 0.05, 'n_epochs': 50} second iteration, not much better 

# #0.3258285888565499
# #{'n_factors': 11, 'lr_all': 0.003, 'reg_all': 0.05, 'n_epochs': 60}


# # 0.32581415433422056
# # {'n_factors': 11, 'lr_all': 0.003, 'reg_all': 0.05, 'n_epochs': 55} # fourth iteration, that's the final score for is_recommended


# gs = GridSearchCV(SVD, parameters, measures=['rmse', 'mae'], n_jobs=-1, cv=3, joblib_verbose=10)

# # Fit the grid search to the data
# gs.fit(rec_dataset)

# # Best RMSE score
# print(gs.best_score['rmse'])

# # Combination of hyperparameters that gave the best RMSE score
# print(gs.best_params['rmse'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:   45.3s remaining:  2.6min
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed:   49.8s remaining:  1.7min
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:   57.0s remaining:  1.2min
[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed:  1.0min remaining:   49.1s
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:  1.1min remaining:   33.4s
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:  1.2min remaining:   20.9s


0.32581415433422056
{'n_factors': 11, 'lr_all': 0.003, 'reg_all': 0.05, 'n_epochs': 55}


[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  1.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  1.4min finished


In [18]:
# # Grid search that was used for the comp recs dataset (0-10 scale) commented out to avoid runtime

# learning_rate = np.logspace(-4, -1, num=5)

# parameters = {
#     'n_factors': [10,15,20],
#     'lr_all': [0.1],
#     'reg_all': [0.15],
#     'n_epochs': [13,15,11]
    
# }

# # 1.82 RMSE
# # "n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.1 '" Previous best parameters for 1-10 scale 


# # 1.244827687220985 RMSE
# # {'n_factors': 20, 'lr_all': 0.1, 'reg_all': 0.15, 'n_epochs': 15} # first iteration


# # 1.2417033081913984
# # {'n_factors': 15, 'lr_all': 0.1, 'reg_all': 0.15, 'n_epochs': 13} second iteration

# # 1.2612799231507899
# # {'n_factors': 10, 'lr_all': 0.1, 'reg_all': 0.15, 'n_epochs': 11} third iteration, underperformed compared to second



# gs = GridSearchCV(SVD, parameters, measures=['rmse', 'mae'], n_jobs=-1, cv=3, joblib_verbose=10)


# gs.fit(comp_recs_dataset)


# print(gs.best_score['rmse'])


# print(gs.best_params['rmse'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  27 | elapsed:   25.5s remaining:  5.3min
[Parallel(n_jobs=-1)]: Done   5 out of  27 | elapsed:   43.3s remaining:  3.2min
[Parallel(n_jobs=-1)]: Done   8 out of  27 | elapsed:   59.5s remaining:  2.4min
[Parallel(n_jobs=-1)]: Done  11 out of  27 | elapsed:  1.3min remaining:  1.8min
[Parallel(n_jobs=-1)]: Done  14 out of  27 | elapsed:  1.5min remaining:  1.4min
[Parallel(n_jobs=-1)]: Done  17 out of  27 | elapsed:  1.8min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done  20 out of  27 | elapsed:  2.1min remaining:   44.0s
[Parallel(n_jobs=-1)]: Done  23 out of  27 | elapsed:  2.4min remaining:   24.8s


1.2612799231507899
{'n_factors': 10, 'lr_all': 0.1, 'reg_all': 0.15, 'n_epochs': 11}


[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  2.7min finished


In [20]:
# The hyperparameters here were adjusted after each grid-search iteration; the
# resulting scores are appended to the results dataframe further down.

# Top-performing configuration for the composite rating dataset.
comp_svdpp_params = {
    'n_factors': 15,
    'n_epochs': 13,
    'lr_all': 0.1,
    'reg_all': 0.15,
    'cache_ratings': True,
    'verbose': 1,
}
my_algorithm_comp = SVDpp(**comp_svdpp_params)

my_algorithm_comp.fit(my_train_dataset_comp)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x178af80e050>

In [21]:
# Top-performing configuration for the 0-1 scale is_recommended dataset.
rec_svdpp_params = {
    'n_factors': 11,
    'n_epochs': 55,
    'lr_all': 0.003,
    'reg_all': 0.05,
    'cache_ratings': True,
    'verbose': 1,
}
my_algorithm_rec = SVDpp(**rec_svdpp_params)

my_algorithm_rec.fit(my_train_dataset)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 20
 processing epoch 21
 processing epoch 22
 processing epoch 23
 processing epoch 24
 processing epoch 25
 processing epoch 26
 processing epoch 27
 processing epoch 28
 processing epoch 29
 processing epoch 30
 processing epoch 31
 processing epoch 32
 processing epoch 33
 processing epoch 34
 processing epoch 35
 processing epoch 36
 processing epoch 37
 processing epoch 38
 processing epoch 39
 processing epoch 40
 processing epoch 41
 processing epoch 42
 processing epoch 43
 processing epoch 44
 processing epoch 45
 processing epoch 46
 processing epoch 47
 p

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x1788ce05550>

## NOTE:

The code below was used to test and record the performance of each algorithm on its dataset. Once you have found the best parameters and fitted the model, you can simply pickle the fitted model for later use, as shown below.

In [23]:
from surprise import accuracy
from surprise.model_selection import train_test_split

# NOTE: both models were fitted on the *full* trainsets earlier in the
# notebook, so every rating sampled here was already seen during training.
# Scores computed from these predictions are therefore in-sample (optimistic)
# and are not a true hold-out estimate.

# tester for comp dataset
# The train half of the split is not used.  It gets its own name so that it
# does not overwrite `my_train_dataset` (the full is_recommended trainset),
# which the is_recommended model cell fits on whenever it is re-run.
# `random_state` makes the sampled test half reproducible between runs.
comp_train_split, my_test_dataset = train_test_split(
    comp_recs_dataset, test_size=0.5, random_state=42
)

predictions = my_algorithm_comp.test(my_test_dataset)

In [27]:
# tester for rec dataset, preferably test one at a time: each tester cell
# overwrites `predictions`, which is what the scoring cell reads.
# The train half of the split is not used.  It gets its own name so that it
# does not overwrite `my_train_dataset` (the full is_recommended trainset),
# which the is_recommended model cell fits on whenever it is re-run.
# `random_state` makes the sampled test half reproducible between runs.
rec_train_split, my_test_dataset = train_test_split(
    rec_dataset, test_size=0.5, random_state=42
)

predictions = my_algorithm_rec.test(my_test_dataset)

In [29]:
# Re-run this cell after each tester cell so the scores reflect the latest
# `predictions`.  Every metric is rounded to two decimal places.
RMSE, MSE, MAE, FCP = (
    round(metric(predictions, verbose=False), 2)
    for metric in (accuracy.rmse, accuracy.mse, accuracy.mae, accuracy.fcp)
)

In [31]:
# Show the most recently computed scores without storing them anywhere.
print(f'RMSE:{RMSE},MSE:{MSE},MAE:{MAE},FCP:{FCP}') # use this for a quick look at a test run that you don't want to add to the results dataframe

RMSE:0.28,MSE:0.08,MAE:0.18,FCP:0.6


In [None]:
# Empty results table with one column per accuracy metric; later cells add one
# row per test.  Re-running this cell replaces the table with an empty one.
metric_names = ['RMSE', 'MSE', 'MAE', 'FCP']
algo_results = pd.DataFrame(columns=metric_names)

In [26]:
# Store the latest scores as a row of the results table.  Use a new label for
# each test, otherwise the existing row with that label is overwritten.
# Each score is stored in its default string form.
algo_results.loc[
    'Ratings test SVD: n_factors: 15, lr_all: 0.1, reg_all: 0.15, n_epochs: 13'
] = [format(score) for score in (RMSE, MSE, MAE, FCP)]


algo_results

Unnamed: 0,RMSE,MSE,MAE,FCP
"Ratings test: n_factors: 15, lr_all: 0.1, reg_all: 0.15, n_epochs: 13",6.28,39.44,6.21,0.52
"Ratings test SVD: n_factors: 15, lr_all: 0.1, reg_all: 0.15, n_epochs: 13",6.28,39.44,6.21,0.52


In [30]:
# Store the latest scores under this label; each score is kept in its default
# string form.
algo_results.loc[
    'Ratings test SVDpp: Original values (fixed)'
] = [format(score) for score in (RMSE, MSE, MAE, FCP)]


algo_results

Unnamed: 0,RMSE,MSE,MAE,FCP
"Ratings test: n_factors: 15, lr_all: 0.1, reg_all: 0.15, n_epochs: 13",6.28,39.44,6.21,0.52
"Ratings test SVD: n_factors: 15, lr_all: 0.1, reg_all: 0.15, n_epochs: 13",6.28,39.44,6.21,0.52
Ratings test SVDpp: Original values (fixed),0.82,0.68,0.63,0.59


In [34]:
# Store the latest scores under this label; each score is kept in its default
# string form.
algo_results.loc[
    'Ratings test SVDpp: n_factors: 15, lr_all: 0.1, reg_all: 0.15, n_epochs: 13'
] = [format(score) for score in (RMSE, MSE, MAE, FCP)]


algo_results

Unnamed: 0,RMSE,MSE,MAE,FCP
"Ratings test: n_factors: 15, lr_all: 0.1, reg_all: 0.15, n_epochs: 13",6.28,39.44,6.21,0.52
"Ratings test SVD: n_factors: 15, lr_all: 0.1, reg_all: 0.15, n_epochs: 13",6.28,39.44,6.21,0.52
Ratings test SVDpp: Original values (fixed),0.82,0.68,0.63,0.59
"Ratings test SVDpp: n_factors: 15, lr_all: 0.1, reg_all: 0.15, n_epochs: 13",0.78,0.61,0.57,0.59


In [54]:
# Store the latest scores under this label; each score is kept in its default
# string form.
algo_results.loc[
    'Recommended test SVD (0-1): {n_factors: 11, lr_all: 0.003, reg_all: 0.05, n_epochs: 55'
] = [format(score) for score in (RMSE, MSE, MAE, FCP)]


algo_results

Unnamed: 0,RMSE,MSE,MAE,FCP
"Ratings test: n_factors: 15, lr_all: 0.1, reg_all: 0.15, n_epochs: 13",6.28,39.44,6.21,0.52
"Ratings test SVD: n_factors: 15, lr_all: 0.1, reg_all: 0.15, n_epochs: 13",6.28,39.44,6.21,0.52
Ratings test SVDpp: Original values (fixed),0.82,0.68,0.63,0.59
"Ratings test SVDpp: n_factors: 15, lr_all: 0.1, reg_all: 0.15, n_epochs: 13",0.78,0.61,0.57,0.59
"Recommended test SVDpp (0-1): {n_factors: 11, lr_all: 0.003, reg_all: 0.05, n_epochs: 55",0.28,0.08,0.18,0.59
"Recommended test SVD (0-1): {n_factors: 11, lr_all: 0.003, reg_all: 0.05, n_epochs: 55",0.3,0.09,0.2,0.56


In [50]:
# Store the latest scores under this label; each score is kept in its default
# string form.
algo_results.loc[
    'Recommended test SVDpp (0-1): {n_factors: 11, lr_all: 0.003, reg_all: 0.05, n_epochs: 55'
] = [format(score) for score in (RMSE, MSE, MAE, FCP)]


algo_results

Unnamed: 0,RMSE,MSE,MAE,FCP
"Ratings test: n_factors: 15, lr_all: 0.1, reg_all: 0.15, n_epochs: 13",6.28,39.44,6.21,0.52
"Ratings test SVD: n_factors: 15, lr_all: 0.1, reg_all: 0.15, n_epochs: 13",6.28,39.44,6.21,0.52
Ratings test SVDpp: Original values (fixed),0.82,0.68,0.63,0.59
"Ratings test SVDpp: n_factors: 15, lr_all: 0.1, reg_all: 0.15, n_epochs: 13",0.78,0.61,0.57,0.59
"Recommended test SVDpp (0-1): {n_factors: 11, lr_all: 0.003, reg_all: 0.05, n_epochs: 55",0.28,0.08,0.18,0.59
"Recommended test SVD (0-1): {n_factors: 11, lr_all: 0.003, reg_all: 0.05, n_epochs: 55",0.28,0.08,0.18,0.59


In [56]:
algo_results # final results table; the ratings test with SVDpp appears to be the best model.

Unnamed: 0,RMSE,MSE,MAE,FCP
"Ratings test: n_factors: 15, lr_all: 0.1, reg_all: 0.15, n_epochs: 13",6.28,39.44,6.21,0.52
"Ratings test SVD: n_factors: 15, lr_all: 0.1, reg_all: 0.15, n_epochs: 13",6.28,39.44,6.21,0.52
Ratings test SVDpp: Original values (fixed),0.82,0.68,0.63,0.59
"Ratings test SVDpp: n_factors: 15, lr_all: 0.1, reg_all: 0.15, n_epochs: 13",0.78,0.61,0.57,0.59
"Recommended test SVDpp (0-1): {n_factors: 11, lr_all: 0.003, reg_all: 0.05, n_epochs: 55",0.28,0.08,0.18,0.59
"Recommended test SVD (0-1): {n_factors: 11, lr_all: 0.003, reg_all: 0.05, n_epochs: 55",0.3,0.09,0.2,0.56


In [32]:
# Serialise the fitted composite-rating algorithm to disk for later use.
import pickle

comp_model_path = '../Models/composite_rating_predictor.pkl'
with open(comp_model_path, mode='wb') as model_file:
    pickle.dump(my_algorithm_comp, model_file)

In [33]:
# Serialise the fitted is_recommended algorithm to disk for later use.
rec_model_path = '../Models/recommender_predictor.pkl'
with open(rec_model_path, mode='wb') as model_file:
    pickle.dump(my_algorithm_rec, model_file)