In [30]:
import pandas as pd
import numpy as np
import scipy.sparse as sps

from scipy.sparse import *

In [31]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

In [32]:
# Relative location of the interactions file.
urm_path = './content/data_train.csv'

# Column types keyed by column position: the first two columns are read as
# integers and the third as a float.
urm_column_dtypes = {0: int, 1: int, 2: float}

# The first row of the file is treated as the header (header=0).
urm_all_df = pd.read_csv(
    filepath_or_buffer=urm_path,
    sep=",",
    header=0,
    dtype=urm_column_dtypes,
    engine='python',
)

# Replace whatever header names the file carries with the names used below.
urm_all_df.columns = ["UserID", "ItemID", "Interaction"]

In [33]:
# Distinct user and item identifiers that appear in the interactions table.
userID_unique = urm_all_df["UserID"].unique()
itemID_unique = urm_all_df["ItemID"].unique()

# Size summary: number of distinct users, distinct items and interaction rows.
n_users, n_items = len(userID_unique), len(itemID_unique)
n_interactions = urm_all_df.shape[0]

In [34]:
# Build the user-item matrix in COO format: UserID gives the row index, ItemID
# the column index and Interaction the stored value. No explicit shape is
# passed, so the matrix dimensions are inferred from the indices.
user_rows = urm_all_df["UserID"].values
item_cols = urm_all_df["ItemID"].values
interaction_values = urm_all_df["Interaction"].values

urm_all = sps.coo_matrix((interaction_values, (user_rows, item_cols)))

urm_all

<13025x22348 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in COOrdinate format>

In [35]:
# Expose the interaction matrix under the name URM_all, which the split step reads.
URM_all = urm_all

### Step 1: Split the data and create the evaluator objects

In [36]:
from Evaluation.Evaluator import EvaluatorHoldout

# Seed NumPy's global RNG before the random hold-out splits so that a
# "Restart & Run All" reproduces the same train/validation/test partitions.
# NOTE(review): assumes split_train_in_two_percentage_global_sample samples
# through NumPy's global RNG -- confirm against the helper's implementation.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# Two successive splits, both with train_percentage = 0.8:
#   1) URM_all              -> URM_train_validation + URM_test
#   2) URM_train_validation -> URM_train + URM_validation
URM_train_validation, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.8)
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_train_validation, train_percentage = 0.8)

# One evaluator per held-out matrix, both reporting metrics at cutoff 10.
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])
evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])

EvaluatorHoldout: Ignoring 2973 (22.8%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 2568 (19.7%) Users that have less than 1 test interactions


### Step 2: Define the hyperparameter search space for the desired model, in this case SLIM ElasticNet

In [37]:
from skopt.space import Real, Integer, Categorical

# Search space explored by the optimizer: one entry per hyperparameter that is
# later passed to the recommender's fit().
hyperparameters_range_dictionary = {
    # Integer range for topK.
    "topK": Integer(low=50, high=300),
    # Continuous range for alpha.
    "alpha": Real(low=1e-6, high=0.03),
    # Continuous range for l1_ratio.
    "l1_ratio": Real(low=0.001, high=0.1),
}

### Step 3: Create SearchBayesianSkopt object, providing the desired recommender class and evaluator objects

In [38]:
from Recommenders.SLIM.SLIMElasticNetRecommender import SLIMElasticNetRecommender
from HyperparameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

# Model class to tune; its RECOMMENDER_NAME is reused later for the output file names.
recommender_class = SLIMElasticNetRecommender

# The search object receives both evaluators: one built on the validation
# matrix and one built on the test matrix.
hyperparameterSearch = SearchBayesianSkopt(
    recommender_class,
    evaluator_validation=evaluator_validation,
    evaluator_test=evaluator_test,
)

### Step 4: Provide the data needed to create an instance of the model, one trained only on URM_train, the other on URM_train_validation

In [39]:
from HyperparameterTuning.SearchAbstractClass import SearchInputRecommenderArgs

# Arguments for the models built during the search: the constructor receives
# URM_train only; no extra constructor, fit or early-stopping arguments are set.
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS=[URM_train],  # For a CBF model simply put [URM_train, ICM_train]
    CONSTRUCTOR_KEYWORD_ARGS={},
    FIT_POSITIONAL_ARGS=[],
    FIT_KEYWORD_ARGS={},
    EARLYSTOPPING_KEYWORD_ARGS={},
)

In [40]:
# Arguments for the "last" model: same structure as the search arguments, but
# the constructor receives URM_train_validation instead of URM_train.
recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS=[URM_train_validation],  # For a CBF model simply put [URM_train_validation, ICM_train]
    CONSTRUCTOR_KEYWORD_ARGS={},
    FIT_POSITIONAL_ARGS=[],
    FIT_KEYWORD_ARGS={},
    EARLYSTOPPING_KEYWORD_ARGS={},
)

### Step 5: Create a result folder and select the number of cases (50 with 30% random is a good number)

In [41]:
import os

# Folder that receives the search results.
output_folder_path = "result_experiments/"

# Create the folder if needed. exist_ok=True makes the cell safe to re-run and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(output_folder_path, exist_ok=True)

# Total number of configurations to evaluate. 5 keeps this demo run short;
# raise it (e.g. to 50) for a real search.
n_cases = 5

# 30% of the cases are random starts. The max() guard ensures that a small
# n_cases never truncates to zero random starts.
n_random_starts = max(1, int(n_cases * 0.3))

# Metric and cutoff that the search optimizes.
metric_to_optimize = "MAP"
cutoff_to_optimize = 10

### Step 6: Run!

In [42]:
# Run the search with the inputs, search space and budget defined in the cells above.
hyperparameterSearch.search(
    recommender_input_args,
    recommender_input_args_last_test=recommender_input_args_last_test,
    hyperparameter_search_space=hyperparameters_range_dictionary,
    # Search budget
    n_cases=n_cases,
    n_random_starts=n_random_starts,
    # Optimization target
    metric_to_optimize=metric_to_optimize,
    cutoff_to_optimize=cutoff_to_optimize,
    # Persistence
    save_model="last",
    output_folder_path=output_folder_path,  # Where to save the results
    output_file_name_root=recommender_class.RECOMMENDER_NAME,  # How to call the files
)

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'topK': 278, 'alpha': 0.00984742352027931, 'l1_ratio': 0.001403680909889978}
SLIMElasticNetRecommender: URM Detected 851 ( 6.5%) users with no interactions.
SLIMElasticNetRecommender: URM Detected 472 ( 2.1%) items with no interactions.
SLIMElasticNetRecommender: Processed 8164 (36.5%) in 5.00 min. Items per second: 27.21
SLIMElasticNetRecommender: Processed 16154 (72.3%) in 10.00 min. Items per second: 26.92
SLIMElasticNetRecommender: Processed 22348 (100.0%) in 14.72 min. Items per second: 25.30
EvaluatorHoldout: Processed 10052 (100.0%) in 12.12 sec. Users per second: 829
SearchBayesianSkopt: New best config found. Config 0: {'topK': 278, 'alpha': 0.00984742352027931, 'l1_ratio': 0.001403680909889978} - results: PRECISION: 0.0657083, PRECISION_RECALL_MIN_DEN: 0.1296048, RECALL: 0.1165827, MAP: 0.0300598, MAP_MIN_DEN: 0.0602312, MRR: 0.2012811, NDCG: 0.1116270, F1: 0.0840464, HIT_RATE:

### Check the best model

In [43]:
from Recommenders.DataIO import DataIO

# The metadata archive is named after the recommender, followed by "_metadata.zip".
metadata_file_name = recommender_class.RECOMMENDER_NAME + "_metadata.zip"

# Read it back from the folder the search wrote its results to.
data_loader = DataIO(folder_path=output_folder_path)
search_metadata = data_loader.load_data(metadata_file_name)

# List the entries available in the loaded metadata.
search_metadata.keys()

dict_keys(['algorithm_name_recommender', 'algorithm_name_search', 'cutoff_to_optimize', 'exception_list', 'hyperparameters_best', 'hyperparameters_best_index', 'hyperparameters_df', 'metric_to_optimize', 'result_on_earlystopping_df', 'result_on_last', 'result_on_test_best', 'result_on_test_df', 'result_on_validation_best', 'result_on_validation_df', 'time_df', 'time_on_last_df', 'time_on_test_avg', 'time_on_test_total', 'time_on_train_avg', 'time_on_train_total', 'time_on_validation_avg', 'time_on_validation_total'])

In [44]:
# Table of the hyperparameter configurations stored in the search metadata
# (one row per configuration, one column per hyperparameter).
hyperparameters_df = search_metadata["hyperparameters_df"]
hyperparameters_df

Unnamed: 0,topK,alpha,l1_ratio
0,278,0.009847,0.001404
1,50,0.03,0.1
2,89,0.010706,0.068162
3,259,0.008595,0.013436
4,50,1e-06,0.001


In [45]:
# Metrics stored under "result_on_validation_df": one row per configuration
# index and cutoff.
result_on_validation_df = search_metadata["result_on_validation_df"]
result_on_validation_df

Unnamed: 0_level_0,Unnamed: 1_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_USER,COVERAGE_USER_HIT,USERS_IN_GT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
Unnamed: 0_level_1,cutoff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,10,0.065708,0.129605,0.116583,0.03006,0.060231,0.201281,0.111627,0.084046,0.431954,0.244911,...,0.771747,0.333359,0.771747,0.027408,9.266498,0.996006,0.080028,0.714549,2.382799,0.350454
1,10,0.041106,0.078046,0.069509,0.018894,0.037266,0.13953,0.070587,0.051661,0.298349,0.161863,...,0.771747,0.23025,0.771747,0.002752,6.296981,0.979814,0.008035,0.485567,3.397866,0.363172
2,10,0.055651,0.110311,0.0994,0.025579,0.051144,0.179802,0.095966,0.071353,0.387286,0.213635,...,0.771747,0.298887,0.771747,0.009329,7.92974,0.991355,0.027238,0.611471,3.030277,0.342248
3,10,0.064266,0.126776,0.114105,0.029503,0.058895,0.199312,0.109539,0.082222,0.427179,0.241589,...,0.771747,0.329674,0.771747,0.023647,9.092968,0.99558,0.069044,0.701168,2.491025,0.347372
4,10,0.010088,0.017437,0.014783,0.004751,0.008955,0.041353,0.01755,0.011992,0.08645,0.044174,...,0.771747,0.066718,0.771747,0.382638,13.390054,1.000192,1.117242,1.032521,0.22439,0.472838


In [46]:
# Read the "result_on_last" entry of the metadata.
# NOTE(review): presumably the test-set evaluation of the final model built from
# recommender_input_args_last_test with the best configuration -- confirm
# against SearchBayesianSkopt.
result_best_on_test = search_metadata["result_on_last"]
result_best_on_test

Unnamed: 0_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_USER,COVERAGE_USER_HIT,USERS_IN_GT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
cutoff,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,0.091652,0.159798,0.135013,0.046475,0.079046,0.2639,0.144797,0.109185,0.510663,0.346577,...,0.802841,0.409981,0.802841,0.031239,9.49176,0.996746,0.090115,0.731267,2.161043,0.283342


In [47]:
# Hyperparameter values stored under "hyperparameters_best" in the search metadata.
best_hyperparameters = search_metadata["hyperparameters_best"]
best_hyperparameters

{'topK': 278, 'alpha': 0.00984742352027931, 'l1_ratio': 0.001403680909889978}

In [48]:
# Timing table stored in the metadata, with train / validation / test columns
# and one row per configuration.
# NOTE(review): values look like seconds judging by the training log -- confirm.
time_df = search_metadata["time_df"]
time_df

Unnamed: 0,train,validation,test
0,883.481501,12.134131,12.94904
1,414.26564,8.227705,
2,419.957088,9.146699,
3,727.276564,12.022983,
4,3855.19254,9.597985,


In [49]:
# Per-configuration exception records stored in the metadata.
# NOTE(review): presumably None means the configuration ran without raising -- confirm.
exception_list = search_metadata["exception_list"]
exception_list

[None, None, None, None, None]

# Evaluation

In [None]:
# Fit SLIM ElasticNet on URM_train with a manually chosen configuration.
recommender = SLIMElasticNetRecommender(URM_train)
recommender.fit(
    topK=100,
    alpha=0.0015746723778813712,
    l1_ratio=0.005,
)

# evaluateRecommender returns a pair; only its first element is kept.
result_df, _ = evaluator_validation.evaluateRecommender(recommender)

In [None]:
# Display the evaluation results currently bound to result_df.
result_df

In [None]:
# Fit SLIM ElasticNet on URM_train with a second manually chosen configuration.
recommender = SLIMElasticNetRecommender(URM_train)
recommender.fit(
    topK=67,
    alpha=0.02147558429127613,
    l1_ratio=0.09078609701165972,
)

# evaluateRecommender returns a pair; only its first element is kept and displayed.
result_df, _ = evaluator_validation.evaluateRecommender(recommender)
result_df