#### Data Visualization

In [28]:
import pandas as pd
import matplotlib.pyplot as pyplot
import numpy as np

In [29]:
# Re-attach the scalar aliases that recent NumPy releases no longer expose,
# pointing each one at the matching Python builtin.
# NOTE(review): presumably needed by code imported later in this notebook
# that still references np.int / np.float / np.bool — confirm.
np.int, np.float, np.bool = int, float, bool

In [30]:
# Load the training interactions from disk. The first row of the file is the
# header; the dtype map is keyed by 0/1/2 (presumably column positions) so the
# first two columns are read as int and the third as float.
column_dtypes = {0: int, 1: int, 2: float}

URM_all_dataframe = pd.read_csv(
    'data_train.csv',
    sep=",",
    header=0,
    dtype=column_dtypes,
    engine='python',
)

# Replace whatever header names the file carries with the names used below.
URM_all_dataframe.columns = ["UserID", "ItemID", "Interaction"]

In [31]:
# Preview the first 100 rows of the interaction table.
URM_all_dataframe.head(100)

Unnamed: 0,UserID,ItemID,Interaction
0,1,7,1.0
1,1,15,1.0
2,1,16,1.0
3,1,133,1.0
4,1,161,1.0
...,...,...,...
95,4,47,1.0
96,4,70,1.0
97,4,79,1.0
98,4,119,1.0


In [12]:
# Report how many rows (one per interaction) the table holds.
print("The number of interactions is {}".format(URM_all_dataframe.shape[0]))

The number of interactions is 478730


In [32]:
# Collect the distinct user ids and the distinct item ids that appear in the table.
userID_unique, itemID_unique = (
    URM_all_dataframe[column].unique() for column in ("UserID", "ItemID")
)

In [33]:
# Basic dataset statistics: how many distinct users/items exist and the largest
# id of each kind (the two differ when the ids are not contiguous).
n_interactions = len(URM_all_dataframe)
n_users = len(userID_unique)
n_items = len(itemID_unique)

print("Number of items\t {}, Number of users\t {}".format(n_items, n_users))

max_item_id, max_user_id = max(itemID_unique), max(userID_unique)
print("Max ID items\t {}, Max Id users\t {}\n".format(max_item_id, max_user_id))

Number of items	 22222, Number of users	 12638
Max ID items	 22347, Max Id users	 13024



In [34]:
# Build the user-item interaction matrix in sparse COO format.
import scipy.sparse as sps

# `.values` turns each pandas Series into a plain numpy array.
interaction_values = URM_all_dataframe["Interaction"].values
user_index = URM_all_dataframe["UserID"].values
item_index = URM_all_dataframe["ItemID"].values

# Rows are indexed by user id and columns by item id; no explicit shape is
# passed, so the matrix shape is left for scipy to infer from the indices.
URM_all = sps.coo_matrix((interaction_values, (user_index, item_index)))


In [35]:
# The matrix has one row per user index and one column per item index.
N_USERS_TRAIN = URM_all.shape[0]
N_ITEMS_TRAIN = URM_all.shape[1]
print("Training set: Number of items\t {}, Number of users\t {}".format(N_ITEMS_TRAIN, N_USERS_TRAIN))

Training set: Number of items	 22348, Number of users	 13025


Removing all the items that have no interactions (or, in our case, no strongly positive interactions) may be useful!

In [36]:
from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# Two nested holdouts: first carve a test set out of the full matrix, then
# carve a validation set out of what is left, each time keeping 80% for training.
# NOTE(review): no seed is passed here, so the split may differ between runs — confirm.
URM_train_validation, URM_test = split_train_in_two_percentage_global_sample(
    URM_all,
    train_percentage=0.8,
)
URM_train, URM_validation = split_train_in_two_percentage_global_sample(
    URM_train_validation,
    train_percentage=0.8,
)

# One evaluator per held-out matrix, both with a single cutoff of 10.
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])
evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])

EvaluatorHoldout: Ignoring 2941 (22.6%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 2517 (19.3%) Users that have less than 1 test interactions


In [42]:
from skopt.space import Real, Integer, Categorical
from HyperparameterTuning.SearchAbstractClass import SearchInputRecommenderArgs

In [39]:
# Search budget: total number of configurations to try (10 is just an example)
# and how many of them, 30%, are used as random starts.
n_cases = 10
n_random_starts = int(0.3 * n_cases)

# Metric and cutoff the search optimizes.
metric_to_optimize, cutoff_to_optimize = "MAP", 10

In [40]:
# Search space explored by the optimizer, one entry per hyperparameter.
# The key order is kept as-is because it defines the order of the dimensions.
hyperparameters_range_dictionary = {
    # Single-choice category: the epoch count is fixed at 800.
    "epochs": Categorical([800]),
    "num_factors": Integer(1, 200),
    "sgd_mode": Categorical(["sgd", "adagrad", "adam"]),
    # Powers of two from 1 up to 1024.
    "batch_size": Categorical([2 ** exponent for exponent in range(11)]),
    # Regularization weights and learning rate are sampled on a log scale.
    "item_reg": Real(low=1e-5, high=1e-1, prior='log-uniform'),
    "user_reg": Real(low=1e-5, high=1e-1, prior='log-uniform'),
    "learning_rate": Real(low=1e-4, high=1e-1, prior='log-uniform'),
}

In [41]:
# Early-stopping settings handed to the search through
# EARLYSTOPPING_KEYWORD_ARGS; validation uses the validation evaluator and
# the same metric the search optimizes.
earlystopping_keywargs = dict(
    validation_every_n=5,
    stop_on_validation=True,
    evaluator_object=evaluator_validation,
    lower_validations_allowed=5,
    validation_metric=metric_to_optimize,
)

In [43]:
# Arguments used to build and fit the recommender during the search itself:
# the model is constructed on URM_train only.
recommender_input_args = SearchInputRecommenderArgs(
    # For a CBF model simply put [URM_train, ICM_train]
    CONSTRUCTOR_POSITIONAL_ARGS=[URM_train],
    CONSTRUCTOR_KEYWORD_ARGS={},
    FIT_POSITIONAL_ARGS=[],
    FIT_KEYWORD_ARGS={},
    # Additional hyperparameters for the fit function
    EARLYSTOPPING_KEYWORD_ARGS=earlystopping_keywargs,
)

In [44]:
# Same arguments as for the search, except that the model is constructed on
# the train + validation matrix (URM_train_validation).
recommender_input_args_last_test = SearchInputRecommenderArgs(
    # For a CBF model simply put [URM_train_validation, ICM_train]
    CONSTRUCTOR_POSITIONAL_ARGS=[URM_train_validation],
    CONSTRUCTOR_KEYWORD_ARGS={},
    FIT_POSITIONAL_ARGS=[],
    FIT_KEYWORD_ARGS={},
    # Additional hyperparameters for the fit function
    EARLYSTOPPING_KEYWORD_ARGS=earlystopping_keywargs,
)

In [51]:
from Recommenders.MatrixFactorization.Cython.MatrixFactorization_Cython import MatrixFactorization_SVDpp_Cython
from HyperparameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

# Model whose hyperparameters are tuned.
recommender_class = MatrixFactorization_SVDpp_Cython

# Bayesian search object, given both the validation and the test evaluator.
hyperparameterSearch = SearchBayesianSkopt(
    recommender_class,
    evaluator_validation=evaluator_validation,
    evaluator_test=evaluator_test,
)

In [47]:
import os

# Folder passed to the search as `output_folder_path` (where results are saved).
output_folder_path = "result_experiments_FUNK/"

# Create the folder, including missing parents. `exist_ok=True` makes this a
# no-op when the folder is already there and removes the check-then-create
# race of a separate `os.path.exists` test.
os.makedirs(output_folder_path, exist_ok=True)

# Search budget and target metric. These repeat the values assigned in the
# earlier configuration cell and are kept so this cell still works on its own.
n_cases = 10  # using 10 as an example
n_random_starts = int(n_cases*0.3)  # 30% of the cases are random starts
metric_to_optimize = "MAP"
cutoff_to_optimize = 10

In [52]:
# Keyword arguments for the search, gathered in one place before the call.
search_kwargs = dict(
    recommender_input_args_last_test=recommender_input_args_last_test,
    hyperparameter_search_space=hyperparameters_range_dictionary,
    n_cases=n_cases,
    n_random_starts=n_random_starts,
    save_model="last",
    output_folder_path=output_folder_path,  # Where to save the results
    output_file_name_root=recommender_class.RECOMMENDER_NAME,  # How to call the files
    metric_to_optimize=metric_to_optimize,
    cutoff_to_optimize=cutoff_to_optimize,
)

# Launch the hyperparameter search.
hyperparameterSearch.search(recommender_input_args, **search_kwargs)

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'epochs': 800, 'num_factors': 167, 'sgd_mode': 'adam', 'batch_size': 512, 'item_reg': 7.520439410975463e-05, 'user_reg': 0.0029926841643206754, 'learning_rate': 0.00042249375654328735}
MatrixFactorization_SVDpp_Cython_Recommender: URM Detected 875 ( 6.7%) users with no interactions.
MatrixFactorization_SVDpp_Cython_Recommender: URM Detected 457 ( 2.0%) items with no interactions.
SVD++: Processed 306688 (100.0%) in 17.74 sec. MSE loss 6.50E-01. Sample per second: 17286
SVD++: Epoch 1 of 800. Elapsed time 17.23 sec
SVD++: Processed 306688 (100.0%) in 17.79 sec. MSE loss 2.11E-01. Sample per second: 17241
SVD++: Epoch 2 of 800. Elapsed time 34.27 sec
SVD++: Processed 306688 (100.0%) in 18.31 sec. MSE loss 7.70E-02. Sample per second: 16748
SVD++: Epoch 3 of 800. Elapsed time 51.81 sec
SVD++: Processed 306688 (100.0%) in 17.53 sec. MSE loss 3.48E-02. Sample per second: 17496
SVD++: Epoch 4 

In [53]:
from Recommenders.DataIO import DataIO

# Read back the metadata archive from the search output folder; its name is
# the recommender name followed by "_metadata.zip".
data_loader = DataIO(folder_path=output_folder_path)
metadata_file_name = recommender_class.RECOMMENDER_NAME + "_metadata.zip"
search_metadata = data_loader.load_data(metadata_file_name)

# Show which entries the metadata contains.
search_metadata.keys()

dict_keys(['algorithm_name_recommender', 'algorithm_name_search', 'cutoff_to_optimize', 'exception_list', 'hyperparameters_best', 'hyperparameters_best_index', 'hyperparameters_df', 'metric_to_optimize', 'result_on_earlystopping_df', 'result_on_last', 'result_on_test_best', 'result_on_test_df', 'result_on_validation_best', 'result_on_validation_df', 'time_df', 'time_on_last_df', 'time_on_test_avg', 'time_on_test_total', 'time_on_train_avg', 'time_on_train_total', 'time_on_validation_avg', 'time_on_validation_total'])

In [54]:
# Pull the "hyperparameters_df" entry out of the loaded search metadata and
# display it (presumably one row per configuration tested — verify).
hyperparameters_df = search_metadata["hyperparameters_df"]
hyperparameters_df

Unnamed: 0,epochs,num_factors,sgd_mode,batch_size,item_reg,user_reg,learning_rate
0,5,167,adam,512,7.5e-05,0.002993,0.000422
1,5,41,adagrad,1024,0.02605,1.6e-05,0.007689
2,5,150,adam,8,0.001072,0.000452,0.001032
3,50,127,adagrad,512,0.000138,0.001145,0.00183
4,5,158,adagrad,1,1.4e-05,9.5e-05,0.099773
5,5,2,sgd,1,0.000231,5.1e-05,0.000154
6,5,166,sgd,32,0.004222,0.003445,0.000693
7,5,1,sgd,1024,0.010241,0.037251,0.01067
8,10,6,adagrad,4,0.000187,0.098912,0.01188
9,290,56,adagrad,1024,0.1,0.1,0.001592


In [55]:
# Pull the "time_df" entry out of the loaded search metadata and display it
# (presumably train/validation/test timings per configuration — verify units).
time_df = search_metadata["time_df"]
time_df

Unnamed: 0,train,validation,test
0,627.751454,15.741757,16.566189
1,211.506473,15.151663,16.267511
2,601.166981,15.809643,
3,1141.529805,15.326785,
4,577.319536,16.325424,
5,97.348252,14.98931,
6,228.711997,16.689337,
7,95.149482,14.41711,
8,129.597735,17.202142,
9,2524.702591,7.473447,7.881483


In [56]:
# Pull the "hyperparameters_best" entry out of the loaded search metadata and
# display it.
best_hyperparameters = search_metadata["hyperparameters_best"]
best_hyperparameters

{'epochs': 290,
 'num_factors': 56,
 'sgd_mode': 'adagrad',
 'batch_size': 1024,
 'item_reg': 0.1,
 'user_reg': 0.1,
 'learning_rate': 0.0015918885754011409}