In [69]:
import pandas as pd
import numpy as np
import scipy.sparse as sps

from scipy.sparse import *

In [70]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

In [71]:
# Location of the training interactions CSV, relative to the working directory.
urm_path = './content/data_train.csv'

# Read the interaction list. The first row of the file is consumed as the header
# and the dtype mapping is keyed by the integers 0, 1 and 2.
urm_all_df = pd.read_csv(
    filepath_or_buffer=urm_path,
    sep=",",
    header=0,
    dtype={0: int, 1: int, 2: float},
    engine='python',
)

# Overwrite the header names with the ones used throughout the notebook.
urm_all_df.columns = ["UserID", "ItemID", "Interaction"]

In [72]:
# Distinct user and item identifiers found in the interaction frame.
userID_unique = urm_all_df["UserID"].unique()
itemID_unique = urm_all_df["ItemID"].unique()

# Dataset sizes: number of distinct users, distinct items, and rows in the frame.
n_users, n_items = len(userID_unique), len(itemID_unique)
n_interactions = urm_all_df.shape[0]

In [73]:
# Build the user-item interaction matrix in COO format: "Interaction" supplies the
# stored values, "UserID" the row indices and "ItemID" the column indices.
# No explicit shape is passed, so it is derived from the index arrays.
urm_all = sps.coo_matrix(
    (
        urm_all_df["Interaction"].to_numpy(),
        (urm_all_df["UserID"].to_numpy(), urm_all_df["ItemID"].to_numpy()),
    )
)

# Last expression of the cell: display the matrix summary.
urm_all

<13025x22348 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in COOrdinate format>

In [74]:
# Alias only: URM_all is bound to the same matrix object as urm_all (no copy is made).
# The upper-case name is the one used by the split and search cells below.
URM_all = urm_all

### Step 1: Split the data and create the evaluator objects

In [75]:
from Evaluation.Evaluator import EvaluatorHoldout

# Seed the RNG before the random holdout splits so that Restart & Run All yields
# the same train/validation/test partition (and therefore comparable metrics).
# NOTE(review): assumes the split helper draws from NumPy's global RNG — confirm
# against Data_manager.split_functions; if it does not, this line is harmless.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# Two nested splits, each called with train_percentage = 0.8:
#   1) URM_all              -> URM_train_validation + URM_test
#   2) URM_train_validation -> URM_train + URM_validation
URM_train_validation, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.8)
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_train_validation, train_percentage = 0.8)

# One evaluator per held-out matrix, both reporting metrics at cutoff 10
# (the same cutoff later passed to the search as cutoff_to_optimize).
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])
evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])

EvaluatorHoldout: Ignoring 2983 (22.9%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 2536 (19.5%) Users that have less than 1 test interactions


### Step 2: Define the hyperparameter search space for the desired model, in this case RP3beta

In [76]:
from skopt.space import Real, Integer, Categorical

# Search space handed to the optimizer: one skopt dimension per hyperparameter
# name, each given as (lower bound, upper bound).
hyperparameters_range_dictionary = dict(
    topK=Integer(190, 210),
    alpha=Real(0.1, 0.3),
    beta=Real(0.4, 0.6),
    min_rating=Real(0.05, 0.15),
)

### Step 3: Create SearchBayesianSkopt object, providing the desired recommender class and evaluator objects

In [77]:
from Recommenders.GraphBased.RP3betaRecommender import RP3betaRecommender
from HyperparameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

# The model to tune is passed as a class, not as an instance.
recommender_class = RP3betaRecommender

# The search object is given the recommender class and both holdout evaluators.
hyperparameterSearch = SearchBayesianSkopt(
    recommender_class,
    evaluator_validation=evaluator_validation,
    evaluator_test=evaluator_test,
)

### Step 4: Provide the data needed to create two instances of the model: one trained only on URM_train, the other on URM_train_validation

In [78]:
from HyperparameterTuning.SearchAbstractClass import SearchInputRecommenderArgs

# Constructor/fit arguments for the model built on URM_train only.
# Every other argument group is left empty.
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS=[URM_train],  # For a CBF model simply put [URM_train, ICM_train]
    CONSTRUCTOR_KEYWORD_ARGS=dict(),
    FIT_POSITIONAL_ARGS=list(),
    FIT_KEYWORD_ARGS=dict(),
    EARLYSTOPPING_KEYWORD_ARGS=dict(),
)

In [79]:
# Same argument bundle as recommender_input_args, but the model is built on
# URM_train_validation; it is passed to search() as recommender_input_args_last_test.
recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS=[URM_train_validation],  # For a CBF model simply put [URM_train_validation, ICM_train]
    CONSTRUCTOR_KEYWORD_ARGS=dict(),
    FIT_POSITIONAL_ARGS=list(),
    FIT_KEYWORD_ARGS=dict(),
    EARLYSTOPPING_KEYWORD_ARGS=dict(),
)

### Step 5: Create a result folder and select the number of cases (50 cases, 30% of them random starts, is a good choice)

In [80]:
import os

# Folder that receives the files written by the hyperparameter search.
output_folder_path = "result_experiments/"

# Create the folder (and any missing parents) if needed. exist_ok=True makes this
# a single call with no gap between an existence check and the creation, and
# keeps the cell safe to re-run.
os.makedirs(output_folder_path, exist_ok=True)

# Search budget: total configurations to evaluate, of which 30% (rounded down)
# are passed to the search as random starts.
n_cases = 10  # using 10 as an example
n_random_starts = int(n_cases*0.3)

# Metric and recommendation-list cutoff that the search optimizes.
metric_to_optimize = "MAP"
cutoff_to_optimize = 10

### Step 6: Run!

In [81]:
# Launch the search with the settings prepared in the previous cells: the search
# space, the budget (n_cases / n_random_starts), the metric and cutoff to optimize,
# and the argument bundles for the URM_train and URM_train_validation models.
hyperparameterSearch.search(
    recommender_input_args,
    recommender_input_args_last_test=recommender_input_args_last_test,
    hyperparameter_search_space=hyperparameters_range_dictionary,
    n_cases=n_cases,
    n_random_starts=n_random_starts,
    save_model="last",
    output_folder_path=output_folder_path,  # Where to save the results
    output_file_name_root=recommender_class.RECOMMENDER_NAME,  # How to call the files
    metric_to_optimize=metric_to_optimize,
    cutoff_to_optimize=cutoff_to_optimize,
)

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'topK': 191, 'alpha': 0.15320468622041009, 'beta': 0.4157634766833682, 'min_rating': 0.09626225721420514}
RP3betaRecommender: URM Detected 880 ( 6.8%) users with no interactions.
RP3betaRecommender: URM Detected 461 ( 2.1%) items with no interactions.


RP3betaRecommender: Similarity column 22348 (100.0%), 1618.48 column/sec. Elapsed time 13.81 sec
EvaluatorHoldout: Processed 10042 (100.0%) in 10.68 sec. Users per second: 940
SearchBayesianSkopt: New best config found. Config 0: {'topK': 191, 'alpha': 0.15320468622041009, 'beta': 0.4157634766833682, 'min_rating': 0.09626225721420514} - results: PRECISION: 0.0581358, PRECISION_RECALL_MIN_DEN: 0.1165322, RECALL: 0.1044903, MAP: 0.0251377, MAP_MIN_DEN: 0.0513338, MRR: 0.1736678, NDCG: 0.0974323, F1: 0.0747067, HIT_RATE: 0.3963354, ARHR_ALL_HITS: 0.2081874, NOVELTY: 0.0051864, AVERAGE_POPULARITY: 0.2869823, DIVERSITY_MEAN_INTER_LIST: 0.9428745, DIVERSITY_HERFINDAHL: 0.9942781, COVERAGE_ITEM: 0.5553965, COVERAGE_ITEM_HIT: 0.0593342, ITEMS_IN_GT: 0.7408269, COVERAGE_USER: 0.7709789, COVERAGE_USER_HIT: 0.3055662, USERS_IN_GT: 0.7709789, DIVERSITY_GINI: 0.1114391, SHANNON_ENTROPY: 10.1003223, RATIO_DIVERSITY_HERFINDAHL: 0.9946391, RATIO_DIVERSITY_GINI: 0.3254716, RATIO_SHANNON_ENTROPY: 0.7787

### Check the best model

In [82]:
from Recommenders.DataIO import DataIO

# Reader rooted at the folder the search wrote its results to.
data_loader = DataIO(folder_path=output_folder_path)

# The metadata file name is the recommender name (the output_file_name_root given
# to search()) followed by "_metadata.zip".
search_metadata = data_loader.load_data(
    recommender_class.RECOMMENDER_NAME + "_metadata.zip"
)

# Show which entries the metadata contains.
search_metadata.keys()

dict_keys(['algorithm_name_recommender', 'algorithm_name_search', 'cutoff_to_optimize', 'exception_list', 'hyperparameters_best', 'hyperparameters_best_index', 'hyperparameters_df', 'metric_to_optimize', 'result_on_earlystopping_df', 'result_on_last', 'result_on_test_best', 'result_on_test_df', 'result_on_validation_best', 'result_on_validation_df', 'time_df', 'time_on_last_df', 'time_on_test_avg', 'time_on_test_total', 'time_on_train_avg', 'time_on_train_total', 'time_on_validation_avg', 'time_on_validation_total'])

In [83]:
# Table of the hyperparameter configurations stored by the search: one row per
# evaluated case, one column per hyperparameter (topK, alpha, beta, min_rating).
hyperparameters_df = search_metadata["hyperparameters_df"]
hyperparameters_df

Unnamed: 0,topK,alpha,beta,min_rating
0,191,0.153205,0.415763,0.096262
1,195,0.180048,0.430709,0.051321
2,191,0.244912,0.543993,0.076863
3,201,0.283493,0.599987,0.13137
4,206,0.101982,0.490537,0.147859
5,210,0.287552,0.4,0.074654
6,191,0.261103,0.465796,0.072548
7,197,0.117521,0.599472,0.051687
8,190,0.3,0.423306,0.15
9,196,0.251972,0.428469,0.084991


In [84]:
# Validation metrics stored by the search: one row per evaluated case and cutoff.
result_on_validation_df = search_metadata["result_on_validation_df"]
result_on_validation_df

Unnamed: 0_level_0,Unnamed: 1_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_USER,COVERAGE_USER_HIT,USERS_IN_GT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
Unnamed: 0_level_1,cutoff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,10,0.058136,0.116532,0.10449,0.025138,0.051334,0.173668,0.097432,0.074707,0.396335,0.208187,...,0.770979,0.305566,0.770979,0.111439,10.100322,0.994639,0.325472,0.778796,2.428833,0.37993
1,10,0.058116,0.116293,0.104162,0.025094,0.051048,0.172938,0.097078,0.074606,0.396236,0.207534,...,0.770979,0.305489,0.770979,0.118544,10.186441,0.994805,0.346222,0.785437,2.38934,0.383111
2,10,0.050856,0.100886,0.090243,0.021593,0.043064,0.149166,0.083338,0.065053,0.351524,0.178725,...,0.770979,0.271017,0.770979,0.178934,11.06193,0.996066,0.522599,0.852942,1.930246,0.424393
3,10,0.041605,0.083005,0.074299,0.017612,0.035111,0.12312,0.068345,0.053341,0.294065,0.146602,...,0.770979,0.226718,0.770979,0.218929,11.959534,0.998703,0.63941,0.922153,1.166918,0.469281
4,10,0.053595,0.106647,0.095316,0.02306,0.046545,0.160891,0.089165,0.068611,0.368353,0.191829,...,0.770979,0.283992,0.770979,0.147085,10.550136,0.995266,0.429581,0.81348,2.205501,0.401919
5,10,0.058594,0.118221,0.106088,0.025169,0.051889,0.173922,0.098321,0.075492,0.399622,0.208489,...,0.770979,0.3081,0.770979,0.112266,10.04182,0.994049,0.327885,0.774286,2.509984,0.377418
6,10,0.057409,0.114672,0.102646,0.024543,0.049791,0.169197,0.09523,0.073635,0.391058,0.203013,...,0.770979,0.301497,0.770979,0.135242,10.395307,0.995202,0.39499,0.801542,2.289242,0.391372
7,10,0.03801,0.077083,0.06951,0.015759,0.032249,0.111741,0.062754,0.049146,0.272754,0.132144,...,0.770979,0.210288,0.770979,0.201885,11.925596,0.998993,0.58963,0.919536,1.034704,0.478989
8,10,0.058813,0.11776,0.105507,0.025283,0.051683,0.174359,0.098208,0.075526,0.398825,0.209161,...,0.770979,0.307486,0.770979,0.118274,10.16778,0.994621,0.345433,0.783998,2.419057,0.381554
9,10,0.058126,0.116327,0.10421,0.02506,0.051238,0.173147,0.097268,0.074627,0.396535,0.20748,...,0.770979,0.30572,0.770979,0.119867,10.183551,0.99469,0.350085,0.785214,2.406137,0.382497


In [85]:
# Note the key: this reads "result_on_last", not "result_on_test_best".
# NOTE(review): presumably the metrics of the final model (best hyperparameters,
# built from recommender_input_args_last_test) on the test evaluator — confirm
# against SearchBayesianSkopt before reporting it as the test score.
result_best_on_test = search_metadata["result_on_last"]
result_best_on_test

Unnamed: 0_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_USER,COVERAGE_USER_HIT,USERS_IN_GT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
cutoff,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,0.082896,0.147385,0.125281,0.041163,0.072364,0.240702,0.132361,0.099774,0.482982,0.311432,...,0.805298,0.388944,0.805298,0.083438,9.840057,0.994482,0.24089,0.7581,2.450768,0.295912


In [86]:
# Best hyperparameter configuration recorded by the search, as a plain dict
# keyed by hyperparameter name.
best_hyperparameters = search_metadata["hyperparameters_best"]
best_hyperparameters

{'topK': 190, 'alpha': 0.3, 'beta': 0.4233057604632479, 'min_rating': 0.15}

In [87]:
# Per-case timings with columns train / validation / test (presumably seconds —
# confirm). The test column is only filled for some cases; in the recorded run
# these are the cases that set a new best MAP on validation.
time_df = search_metadata["time_df"]
time_df

Unnamed: 0,train,validation,test
0,19.994527,10.693831,11.469868
1,19.318479,10.554817,
2,19.42883,10.508951,
3,20.135541,10.662126,
4,20.311627,10.610811,
5,20.657636,10.510118,11.140869
6,19.763679,10.71585,
7,19.710493,10.557808,
8,19.525661,10.504816,11.104589
9,19.697504,10.715818,


In [88]:
# One entry per evaluated case; every entry is None in the recorded run.
# NOTE(review): presumably a non-None entry holds the error raised by that case — confirm.
exception_list = search_metadata["exception_list"]
exception_list

[None, None, None, None, None, None, None, None, None, None]