In [2]:
import os
os.chdir( "../")

In [3]:
import pandas as pd
import numpy as np

path = "Dataset/data_train.csv"
df = pd.read_csv(filepath_or_buffer=path,
                               sep=",",
                               header=1,
                               engine='python',
                               names=['UserID', 'ItemID', 'Interaction'])


df

Unnamed: 0,UserID,ItemID,Interaction
0,1,15,1.0
1,1,16,1.0
2,1,133,1.0
3,1,161,1.0
4,1,187,1.0
...,...,...,...
478724,13024,13605,1.0
478725,13024,13823,1.0
478726,13024,15122,1.0
478727,13024,18185,1.0


In [4]:
df.Interaction.value_counts()

1.0    478729
Name: Interaction, dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478729 entries, 0 to 478728
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   UserID       478729 non-null  int64  
 1   ItemID       478729 non-null  int64  
 2   Interaction  478729 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 11.0 MB


In [6]:
user_ids = df["UserID"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
item_ids = df["ItemID"].unique().tolist()
item2item_encoded = {x: i for i, x in enumerate(item_ids)}
item_encoded2item = {i: x for i, x in enumerate(item_ids)}
df["User"] = df["UserID"].map(user2user_encoded)
df["Item"] = df["ItemID"].map(item2item_encoded)

num_users = len(user2user_encoded)
num_items = len(item_encoded2item)
df["Interaction"] = df["Interaction"].values.astype(np.float32)

# min and max ratings will be used to normalize the ratings later
min_rating = 0.0
max_rating = max(df["Interaction"])

print(
    "Number of users: {}, Number of Items: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_items, min_rating, max_rating
    )
)

Number of users: 12638, Number of Items: 22222, Min rating: 0.0, Max rating: 1.0


In [7]:
df.head()

Unnamed: 0,UserID,ItemID,Interaction,User,Item
0,1,15,1.0,0,0
1,1,16,1.0,0,1
2,1,133,1.0,0,2
3,1,161,1.0,0,3
4,1,187,1.0,0,4


In [8]:
userId_unique = df["UserID"].unique()
itemId_unique = df["ItemID"].unique()

In [9]:
from sklearn.model_selection import train_test_split
import numpy as np
import scipy.sparse as sps
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample



urm_all = sps.coo_matrix((df["Interaction"].values, 
                          (df["User"].values, df["Item"].values)))

urm_train_validation, urm_test = split_train_in_two_percentage_global_sample(urm_all, train_percentage = 0.80)
urm_train, urm_validation = split_train_in_two_percentage_global_sample(urm_train_validation, train_percentage = 0.80)



In [10]:
num_users = len(userId_unique)
num_items = len(itemId_unique)

In [11]:
from Recommenders.Recommender_import_list import *
from Evaluation.Evaluator import EvaluatorHoldout
evaluator_validation = EvaluatorHoldout(urm_validation, cutoff_list=[10], ignore_users=[])



EvaluatorHoldout: Ignoring 2602 (20.6%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 0 Users


## Insert model here

In [12]:
from Recommenders.Neural.MultVAE_PyTorch_Recommender import MultVAERecommender_PyTorch
model = MultVAERecommender_PyTorch

In [1]:
import optuna as op

def objective(trial):


    hyperparameters_range_dictionary = {
                "epochs": trial.suggest_categorical("epochs",[500]),
                "learning_rate": trial.suggest_float("learning_rate",low=1e-6, high=1e-2, log=True),
                "l2_reg": trial.suggest_float("l2_reg",low=1e-6, high=1e-2, log=True),
                "dropout": trial.suggest_float("dropout", low=0., high=0.8),
                "total_anneal_steps": trial.suggest_int("total_anneal_steps",100000, 600000),
                "anneal_cap": trial.suggest_float("anneal_cap",low=0., high=0.6),
                "batch_size": trial.suggest_categorical("batch_size",[128, 256, 512, 1024]),
                "p_dims": trial.suggest_categorical("p_dims",[[num_users, num_items]]),
                "sgd_mode": trial.suggest_categorical("sgd_mode",["Adam"]),

                #"encoding_size": trial.suggest_int("encoding_size",1, min(512, urm_all.shape[1]-1)),
                #"next_layer_size_multiplier": trial.suggest_int("next_layer_size_multiplier",2, 10),
                #"max_n_hidden_layers": trial.suggest_int("max_n_hidden_layers",1, 4),

                # Constrain the model to a maximum number of parameters so that its size does not exceed 7 GB
                # Estimate size by considering each parameter uses float32
                #"max_parameters": trial.suggest_categorical("max_parameters",[7*1e9*8/32]),
            }




     # Earlystopping hyperparameters available in the framework
    full_hyperp = {"validation_every_n": 5,
                   "stop_on_validation": True,
                   "evaluator_object": evaluator_validation,
                   "lower_validations_allowed": 5,   # Higher values will result in a more "patient" earlystopping
                   "validation_metric": "MAP",
                  }
                      

    
    recommender = model(urm_train, verbose=True)
    recommender.fit(**trial.params, **full_hyperp)
    
    # Add the number of epochs selected by earlystopping as a "user attribute" of the optuna trial
    epochs = recommender.get_early_stopping_final_epochs_dict()["epochs"]
    trial.set_user_attr("epochs", epochs) 

    result, _ = evaluator_validation.evaluateRecommender(recommender)
    MAP_result = result["MAP"].item()
       
    return MAP_result

In [13]:
#best_params = {'topK': 6658, 'l1_ratio': 0.03659854387723134, 'alpha': 0.0012812054186341782}

study = op.create_study(direction="maximize")
#study.enqueue_trial(best_params)
study.optimize(objective, n_trials=30)

[I 2023-12-27 22:57:08,447] A new study created in memory with name: no-name-20d2f918-8e0e-4311-b65e-7ab064dd9222


MultVAERecommender_PyTorch: URM Detected 472 ( 3.7%) users with no interactions.
MultVAERecommender_PyTorch: URM Detected 341 ( 1.5%) items with no interactions.


  user_batch_tensor = torch.sparse_csr_tensor(user_batch_tensor.indptr,
 17%|█▋        | 2/12 [03:24<17:00, 102.03s/it]
[W 2023-12-27 23:00:44,632] Trial 0 failed with parameters: {'epochs': 500, 'learning_rate': 7.498836939485468e-06, 'l2_reg': 0.0009149400773775601, 'dropout': 0.7195442913520576, 'total_anneal_steps': 207729, 'anneal_cap': 0.4629055588938552, 'batch_size': 1024, 'p_dims': [12638, 22222], 'sgd_mode': 'Adam'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/japo/miniconda3/envs/RecSysFramework/lib/python3.8/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/kj/3wy9xr4j2vlcr59wg5_cqy200000gn/T/ipykernel_2184/1162066407.py", line 40, in objective
    recommender.fit(**trial.params, **full_hyperp)
  File "/Users/japo/RecSys/RecSys-Challenge/Recommenders/Neural/MultVAE_PyTorch_Recommender.py", line 242, in fit
    self._train_with_early_stopping(epoch

KeyboardInterrupt: 

In [None]:
study.trials_dataframe().to_csv("MultVAE_optuna_V1.csv")

In [None]:
study.best_trial

FrozenTrial(number=18, state=TrialState.COMPLETE, values=[0.030364252184200514], datetime_start=datetime.datetime(2023, 12, 8, 22, 37, 13, 551044), datetime_complete=datetime.datetime(2023, 12, 8, 22, 41, 24, 250874), params={'topK': 7765, 'l1_ratio': 0.029537380631026046, 'alpha': 0.0018328668672485519}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'topK': IntDistribution(high=10000, log=False, low=5, step=1), 'l1_ratio': FloatDistribution(high=0.1, log=False, low=0.01, step=None), 'alpha': FloatDistribution(high=0.015, log=False, low=0.0005, step=None)}, trial_id=18, value=None)

In [None]:
study.best_value

0.030364252184200514

In [None]:
study.best_params

{'topK': 7765,
 'l1_ratio': 0.029537380631026046,
 'alpha': 0.0018328668672485519}

In [None]:
final = model(urm_train_validation)
final.fit(**study.best_params)

SLIMElasticNetRecommender: URM Detected 218 ( 1.7%) users with no interactions.
SLIMElasticNetRecommender: URM Detected 110 ( 0.5%) items with no interactions.
SLIMElasticNetRecommender: Processed 22222 (100.0%) in 4.42 min. Items per second: 83.87


In [None]:
from Evaluation.Evaluator import EvaluatorHoldout
evaluator_test = EvaluatorHoldout(urm_test, cutoff_list=[10], ignore_users=[])
evaluator_test.evaluateRecommender(final)

EvaluatorHoldout: Ignoring 2128 (16.8%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 0 Users
EvaluatorHoldout: Processed 10510 (100.0%) in 4.52 sec. Users per second: 2324


(       PRECISION PRECISION_RECALL_MIN_DEN    RECALL       MAP MAP_MIN_DEN  \
 cutoff                                                                      
 10      0.095852                 0.167628  0.142246  0.048987    0.084815   
 
              MRR      NDCG        F1  HIT_RATE ARHR_ALL_HITS  ...  \
 cutoff                                                        ...   
 10      0.274723  0.152679  0.114529  0.526261      0.362642  ...   
 
        COVERAGE_USER COVERAGE_USER_HIT USERS_IN_GT DIVERSITY_GINI  \
 cutoff                                                              
 10          0.831619          0.437648    0.831619       0.049873   
 
        SHANNON_ENTROPY RATIO_DIVERSITY_HERFINDAHL RATIO_DIVERSITY_GINI  \
 cutoff                                                                   
 10           10.047246                   0.997434             0.142905   
 
        RATIO_SHANNON_ENTROPY RATIO_AVERAGE_POPULARITY RATIO_NOVELTY  
 cutoff                                   

In [None]:
opt_df = study.trials_dataframe()