In [1]:
from Recommenders.Ensambles import LinearCombination
from Recommenders.KNN.UserKNNCFRecommender import UserKNNCFRecommender
from Recommenders.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender

In [2]:
import pandas as pd
import matplotlib.pyplot as pyplot
import numpy as np

In [3]:
np.int = int
np.float = float 
np.bool = bool

In [None]:
!python run_compile_all_cython.py

## Import Dataset

In [4]:
URM_all_dataframe = pd.read_csv('data_train.csv', 
                                sep=",", 
                                header= 0, 
                                dtype={0:int, 1:int, 2:float},
                                engine='python')

URM_all_dataframe.columns = ["UserID", "ItemID", "Interaction"]

In [5]:
# Move to sparse format
import scipy.sparse as sps

URM_all = sps.coo_matrix((URM_all_dataframe["Interaction"].values, 
                          (URM_all_dataframe["UserID"].values, URM_all_dataframe["ItemID"].values))) ## .values --> numpy array, df[..] --> pd series

In [6]:
from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)

evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[200])

EvaluatorHoldout: Ignoring 2534 (19.5%) Users that have less than 1 test interactions


## kNN-Ensamble

In [8]:
# Initialize the two recommenders for the ensamble
kNN_user = UserKNNCFRecommender(URM_train)
kNN_item = ItemKNNCFRecommender(URM_train)

kNN_list = [kNN_user, kNN_item]

UserKNNCFRecommender: URM Detected 548 ( 4.2%) users with no interactions.
UserKNNCFRecommender: URM Detected 210 ( 0.9%) items with no interactions.
ItemKNNCFRecommender: URM Detected 548 ( 4.2%) users with no interactions.
ItemKNNCFRecommender: URM Detected 210 ( 0.9%) items with no interactions.


In [9]:
# Create hyperparameters list
hyperp_user = {
    "topK": 50, 
    "shrink": 100, 
    "similarity": 'cosine', 
    "normalize": True, 
    "feature_weighting": "none", 
    "URM_bias": False,
}

hyperp_item = {
    "topK": 50, 
    "shrink": 100, 
    "similarity": 'cosine', 
    "normalize": True, 
    "feature_weighting": "none", 
    "URM_bias": False,
}

hyperp_list = [hyperp_user, hyperp_item]

In [10]:
# Instantiate the Ensamble
# If you don't specify the weights, the contribute of each Recommender will be the same.
kNNHybrid = LinearCombination(URM_train, recommenders_list= kNN_list, hyperparameters_dicts_list= hyperp_list)

Linear_Combination_Ensamble_Recommender_Class: URM Detected 548 ( 4.2%) users with no interactions.
Linear_Combination_Ensamble_Recommender_Class: URM Detected 210 ( 0.9%) items with no interactions.


In [11]:
# Fit the models in the ensamble
kNNHybrid.fit()

Similarity column 13025 (100.0%), 3441.22 column/sec. Elapsed time 3.78 sec
Successfully fitted Recommender 1 : UserKNNCFRecommender
Similarity column 22348 (100.0%), 3587.47 column/sec. Elapsed time 6.23 sec
Successfully fitted Recommender 2 : ItemKNNCFRecommender


In [14]:
# Make recommendations
kNNHybrid.recommend([2,3,4,5])

[0.5, 0.5]
(2, 4, 22348)
(4, 22348)


[[7,
  19,
  42,
  2,
  5,
  4,
  17,
  3,
  15,
  12,
  9,
  6,
  16,
  81,
  78,
  51,
  50,
  359,
  44,
  14,
  131,
  8,
  26,
  31,
  18,
  229,
  22,
  108,
  90,
  52,
  29,
  69,
  47,
  155,
  949,
  114,
  227,
  11,
  23,
  222,
  13,
  72,
  136,
  276,
  88,
  85,
  290,
  28,
  137,
  62,
  54,
  75,
  21,
  105,
  125,
  130,
  68,
  345,
  32,
  33,
  157,
  158,
  67,
  365,
  80,
  154,
  271,
  221,
  101,
  264,
  121,
  159,
  46,
  30,
  118,
  416,
  422,
  435,
  196,
  300,
  39,
  38,
  338,
  263,
  100,
  199,
  104,
  84,
  140,
  192,
  58,
  174,
  656,
  402,
  20,
  180,
  324,
  102,
  203,
  380,
  289,
  132,
  117,
  237,
  134,
  191,
  167,
  57,
  629,
  96,
  283,
  152,
  475,
  178,
  61,
  223,
  65,
  250,
  206,
  177,
  173,
  92,
  737,
  37,
  48,
  128,
  272,
  317,
  150,
  1095,
  93,
  525,
  277,
  238,
  498,
  1227,
  284,
  79,
  260,
  45,
  322,
  286,
  41,
  202,
  215,
  209,
  311,
  188,
  3069,
  549,
  145,
  241,
  31

## Linear combination hyperparameter tuning with Optuna


In [19]:
import optuna

class SaveResults(object):
    
    def __init__(self):
        self.results_df = pd.DataFrame(columns=["result", "train_time (min)"])
    
    def __call__(self, optuna_study, optuna_trial):
        hyperparam_dict = optuna_trial.params.copy()
        hyperparam_dict["result"] = optuna_trial.values[0]
        
        if not isinstance(self.results_df, pd.DataFrame):
            self.results_df = pd.DataFrame()  # Reinitialize as DataFrame if not already
        
        self.results_df = pd.concat([self.results_df, pd.DataFrame([hyperparam_dict])], ignore_index=True)

In [20]:
def objective_function_KNN_ensamble(optuna_trial):
    # Initialize the two recommenders for the ensamble
    kNN_user = UserKNNCFRecommender(URM_train)
    kNN_item = ItemKNNCFRecommender(URM_train)

    kNN_list = [kNN_user, kNN_item]
    
    # Set hyperparameters for each of the methods in the Ensamble
    # Maybe tune the fact of using merge_topPop before or after the ensamble
    similarity_user = optuna_trial.suggest_categorical("similarity", ['cosine', 'dice', 'jaccard', 'asymmetric', 'tversky', 'euclidean'])
    similarity_item = optuna_trial.suggest_categorical("similarity", ['cosine', 'dice', 'jaccard', 'asymmetric', 'tversky', 'euclidean'])

    hyperp_user = {
    "topK": optuna_trial.suggest_int("topK_user", 5, 1000),
    "shrink": optuna_trial.suggest_int("shrink_user", 0, 1000),
    "similarity": similarity_user, 
    "normalize": optuna_trial.suggest_categorical("normalize_user", [True, False]),
    "feature_weighting": optuna_trial.suggest_categorical("feature_weighting_user", ["BM25", "TF-IDF", "none"]),
    "URM_bias": optuna_trial.suggest_categorical("URM_bias_user", [True, False]),
    "merge_topPop": optuna_trial.suggest_categorical("merge_topPop_user", [True, False]), 
    }
    if hyperp_user["merge_topPop"]:
        hyperp_user["topPop_factor"] = optuna_trial.suggest_float("topPop_factor_user", 1e-8, 1e-1, log=True)
    if similarity_user == "asymmetric":
        hyperp_user["asymmetric_alpha"] = optuna_trial.suggest_float("asymmetric_alpha_user", 0, 2, log=False)
        hyperp_user["normalize"] = True     
    elif similarity_user == "tversky":
        hyperp_user["tversky_alpha"] = optuna_trial.suggest_float("tversky_alpha_user", 0, 2, log=False)
        hyperp_user["tversky_beta"] = optuna_trial.suggest_float("tversky_beta_user", 0, 2, log=False)
        hyperp_user["normalize"] = True 
    elif similarity_user == "euclidean":
        hyperp_user["normalize_avg_row"] = optuna_trial.suggest_categorical("normalize_avg_row_user", [True, False])
        hyperp_user["similarity_from_distance_mode"] = optuna_trial.suggest_categorical("similarity_from_distance_mode_user", ["lin", "log", "exp"])

    hyperp_item = {
        "topK": optuna_trial.suggest_int("topK_item", 5, 1000),
        "shrink": optuna_trial.suggest_int("shrink_item", 0, 1000),
        "similarity": similarity_item, 
        "normalize": optuna_trial.suggest_categorical("normalize_item", [True, False]),
        "feature_weighting": optuna_trial.suggest_categorical("feature_weighting_item", ["BM25", "TF-IDF", "none"]),
        "URM_bias": optuna_trial.suggest_categorical("URM_bias_item", [True, False]),
        "merge_topPop": optuna_trial.suggest_categorical("merge_topPop_item", [True, False]), 
    }
    if hyperp_item["merge_topPop"]:
        hyperp_item["topPop_factor"] = optuna_trial.suggest_float("topPop_factor_item", 1e-8, 1e-1, log=True)
    if similarity_item == "asymmetric":
        hyperp_item["asymmetric_alpha"] = optuna_trial.suggest_float("asymmetric_alpha_item", 0, 2, log=False)
        hyperp_item["normalize"] = True     
    elif similarity_item == "tversky":
        hyperp_item["tversky_alpha"] = optuna_trial.suggest_float("tversky_alpha_item", 0, 2, log=False)
        hyperp_item["tversky_beta"] = optuna_trial.suggest_float("tversky_beta_item", 0, 2, log=False)
        hyperp_item["normalize"] = True 
    elif similarity_item == "euclidean":
        hyperp_item["normalize_avg_row"] = optuna_trial.suggest_categorical("normalize_avg_row_item", [True, False])
        hyperp_item["similarity_from_distance_mode"] = optuna_trial.suggest_categorical("similarity_from_distance_mode_item", ["lin", "log", "exp"])

    hyperp_list = [hyperp_user, hyperp_item]

    # Set the weights of the Ensamble. Optuna will take care of tuning them
    userkNN_contribute = optuna_trial.suggest_float("userkNN_contribute", 0, 1)
    itemkNN_contribute = optuna_trial.suggest_float("itemkNN_contribute", 0, 1)
    weights_list = [userkNN_contribute, itemkNN_contribute]

    # Instantiate the Ensamble
    recommender_instance = LinearCombination(URM_train, 
                                             recommenders_list= kNN_list, 
                                             hyperparameters_dicts_list= hyperp_list,
                                             weights_list= weights_list)    
    
    merge_topPop_ensamble = optuna_trial.suggest_categorical("merge_topPop_ensamble", [True, False])
    if merge_topPop_ensamble:
        topPop_factor_ensamble = optuna_trial.suggest_float("topPop_factor_ensamble", 1e-8, 1e-1, log=True)
    else: 
        topPop_factor_ensamble = 0.0
    recommender_instance.fit(merge_topPop= merge_topPop_ensamble, topPop_factor= topPop_factor_ensamble)
    
    result_df, _ = evaluator_validation.evaluateRecommender(recommender_instance)
    
    return result_df.loc[10]["MAP_MIN_DEN"]

In [None]:
optuna_study = optuna.create_study(direction="maximize")
        
save_results = SaveResults()
        
optuna_study.optimize(objective_function_KNN_ensamble,
                      callbacks=[save_results],
                      n_trials = 1000)

In [None]:
# recommender_instance = LinearCombination(URM_train + URM_validation, ...)
# recommender_instance.fit(**optuna_study.best_trial.params)

#result_df, _ = evaluator_test.evaluateRecommender(recommender_instance)
#result_df

## Evaluate and save Predictions

In [13]:
# Initialize the two recommenders for the ensamble
kNN_user = UserKNNCFRecommender(URM_train)
kNN_item = ItemKNNCFRecommender(URM_train)

kNN_list_tuned = [kNN_user, kNN_item]

UserKNNCFRecommender: URM Detected 563 ( 4.3%) users with no interactions.
UserKNNCFRecommender: URM Detected 199 ( 0.9%) items with no interactions.
ItemKNNCFRecommender: URM Detected 563 ( 4.3%) users with no interactions.
ItemKNNCFRecommender: URM Detected 199 ( 0.9%) items with no interactions.


In [15]:
hyperp_user = {
    "topK": 381, 
    "shrink": 0, 
    "similarity": 'cosine', 
    # "normalize": True, 
    "feature_weighting": "none", 
    # "URM_bias": False,
    "merge_topPop": False, 
    # "topPop_factor": 0.004402997536537422,
    }

hyperp_item = {
        "topK": 23, 
        "shrink": 14, 
        "similarity": 'tversky', 
        # "normalize": True, 
        "feature_weighting": "TF-IDF", 
        # "URM_bias": False,
        "merge_topPop": True, 
        "topPop_factor": 0.004402997536537422,
        "tversky_alpha": 0.15897114516548666,
        "tversky_beta": 1.9368574349052465,
    }

hyperp_list_tuned = [hyperp_user, hyperp_item]

In [16]:
weights = [0.004770352974176887, 0.7810236750305526]
kNNHybrid_final = LinearCombination(URM_train, recommenders_list= kNN_list_tuned, hyperparameters_dicts_list= hyperp_list_tuned, weights_list= weights)
kNNHybrid_final.fit()

Linear_Combination_Ensamble_Recommender_Class: URM Detected 563 ( 4.3%) users with no interactions.
Linear_Combination_Ensamble_Recommender_Class: URM Detected 199 ( 0.9%) items with no interactions.
Similarity column 13025 (100.0%), 3306.04 column/sec. Elapsed time 3.94 sec
Successfully fitted Recommender 1 : UserKNNCFRecommender
Similarity column 22348 (100.0%), 3104.61 column/sec. Elapsed time 7.20 sec
Successfully fitted Recommender 2 : ItemKNNCFRecommender


In [17]:
kNNHybrid_final.set_URM_train(URM_all)


Linear_Combination_Ensamble_Recommender_Class: Detected 387 ( 3.0%) users with no interactions.


In [18]:
from Utils.PredictionsWriter import PredictionsWriter

predictionsWriter = PredictionsWriter(kNNHybrid_final, URM_all)
predictionsWriter.evaluate_recommender()

EvaluatorHoldout: Ignoring 3110 (23.9%) Users that have less than 1 test interactions
EvaluatorHoldout: Processed 9915 (100.0%) in 27.77 sec. Users per second: 357


Unnamed: 0_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_USER,COVERAGE_USER_HIT,USERS_IN_GT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
cutoff,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.761228,0.0,0.761228,0.070561,10.242204,0.997098,0.201437,0.788433,1.939472,0.237031


In [19]:
predictionsWriter.write_predictions()

KeyboardInterrupt: 

## Pipeline with RP3Recommender

In [35]:
from Recommenders.Ensambles import PipelineStep
from Recommenders.GraphBased.RP3betaRecommender import RP3betaRecommender

In [36]:
#hyperp = {'alpha': 0.32193553890547405, 'beta': 0.2682175877441568, 'topK': 205, 'normalize_similarity': True} #200
#hyperp = {'alpha': 0.3271287936556111, 'beta': 0.2227174356558911, 'topK': 66, 'normalize_similarity': True} #30
#hyperp = {'alpha': 0.4206525239970085, 'beta': 0.23317274347749048, 'topK': 45, 'normalize_similarity': True} #20
hyperp = {'alpha': 0.26006102452971075, 'beta': 0.18667231091872954, 'topK': 36, 'normalize_similarity': True} #10

In [37]:
pipStep = PipelineStep(URM_train, RP3betaRecommender(URM_train), hyperp, n_relevant_per_user= 10)

RP3betaRecommender: URM Detected 615 ( 4.7%) users with no interactions.
RP3betaRecommender: URM Detected 235 ( 1.1%) items with no interactions.
Pipeline_Step_Ensamble_Recommender_Class: URM Detected 615 ( 4.7%) users with no interactions.
Pipeline_Step_Ensamble_Recommender_Class: URM Detected 235 ( 1.1%) items with no interactions.


In [38]:
pipStep.fit()

RP3betaRecommender: Similarity column 22348 (100.0%), 1976.87 column/sec. Elapsed time 11.30 sec


In [13]:
rank, score = pipStep.recommend(cutoff= 30, remove_zero_scores= True)

In [28]:
lens = [len(rank[i]) for i in range(0, URM_train.shape[0])]
np.unique(lens)

array([  0,   3,  11,  13,  17,  18,  22,  23,  24,  25,  26,  27,  28,
        29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
        42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,
        55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,
        68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,
        81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,
        94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106,
       107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
       120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132,
       133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145,
       146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
       159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171,
       172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184,
       185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 19

In [15]:
URM_all.getrow(0).toarray()


array([[0., 0., 0., ..., 0., 0., 0.]])

In [17]:
print(len(rank))
print(URM_train.shape[0])

13025
13025


In [12]:
rel_it = pipStep.compute_relevant_items()

200


In [39]:
output = pipStep.compute_output_URM(remove_non_relevant_items= True, n_relevant_items_per_user= 10)
output


Successfully removed items non-relevant to the model.


<13025x18278 sparse matrix of type '<class 'numpy.float64'>'
	with 369134 stored elements in Compressed Sparse Row format>

In [40]:
final_users, final_items = output.shape
print("n_users\tinput: ", URM_train.shape[0], "\toutput: ", final_users)
print("n_items\tinput: ", URM_train.shape[1], "\toutput: ", final_items)
print("reduction factor for items: ", (URM_train.shape[1]-final_items)/URM_train.shape[1])

n_users	input:  13025 	output:  13025
n_items	input:  22348 	output:  18278
reduction factor for items:  0.18211920529801323
