In [1]:
import sys
import optuna
import numpy as np
from scipy.sparse import csr_array
import math

sys.path.append("..")
from herec.utils import *

from eTREE import eTREE
from IHSR import IHSR

from dotenv import load_dotenv
import mlflow
load_dotenv("../.env")



In [8]:
class train:
    """
    Hyper-parameter search driver for one (model, dataset, seed) combination.

    Instantiating the class performs the whole workflow: it resolves the
    MLflow experiment, loads the dataset, builds dense rating / mask matrices
    and runs an Optuna TPE study in which every trial trains one model inside
    its own MLflow run.
    """

    # Model names that `_buildModel` knows how to construct.
    SUPPORTED_MODELS = ("eTREE", "IHSR")

    @staticmethod
    def _clusterSizesByLevel(clusterNum, depth):
        """
        Build the {level: number of clusters} mapping handed to IHSR.

        Level 1 holds `clusterNum` clusters. Every further level l+1
        (for l = 1 .. depth-1) holds ceil(clusterNum / 2**l) clusters,
        but never fewer than one.

        args:
            clusterNum: number of clusters on the first level
            depth: number of levels in the hierarchy
        returns:
            dict mapping level (starting at 1) to its cluster count
        """
        sizes = {1: clusterNum}
        for l in range(1, depth):
            sizes[l + 1] = max(math.ceil(clusterNum / (2 ** l)), 1)
        return sizes

    @staticmethod
    def _toDenseWithMask(df, shape):
        """
        Convert a rating table into a dense matrix and its observation mask.

        args:
            df: table exposing "user_id", "item_id" and "rating" columns
            shape: (number of users, number of items)
        returns:
            (X, W) where X is the dense rating matrix and W is 1. where X is
            non-zero and 0. elsewhere
        """
        values = df["rating"].to_numpy()
        rows = df["user_id"].to_numpy()
        cols = df["item_id"].to_numpy()
        X = csr_array((values, (rows, cols)), shape=shape).toarray()
        # NOTE(review): the mask is derived from X == 0, so an observed rating
        # whose value is exactly 0 is treated as unobserved -- confirm that the
        # datasets never contain zero ratings.
        W = np.where(X == 0, 0., 1.)
        return X, W

    def _buildModel(self, hyparams, run):
        """
        Instantiate the model selected by `self.modelName`.

        args:
            hyparams: hyper-parameter dict; model settings live under "model"
            run: active MLflow run, forwarded to the model constructor
        returns:
            a freshly constructed eTREE or IHSR instance
        raises:
            ValueError: if `self.modelName` is not a supported model
        """
        params = hyparams["model"]

        if self.modelName == "eTREE":
            return eTREE(
                R = params["R"],
                item_clusters = params["item_clusters"],
                lbd = params["lbd"],
                mu = params["mu"],
                eta = params["eta"],
                seed = self.seed,
                run = run,
            )

        if self.modelName == "IHSR":
            n_by_level = self._clusterSizesByLevel(params["userClusterNum"], params["userClusterDepth"])
            m_by_level = self._clusterSizesByLevel(params["itemClusterNum"], params["itemClusterDepth"])
            model = IHSR(
                d = params["d"],
                n_by_level = n_by_level,
                m_by_level = m_by_level,
                lam = params["lam"],
                seed = self.seed,
                run = run,
            )
            # Show the user-side hierarchy of this trial. This is printed only
            # for IHSR because the keys it needs ("userClusterNum", ...) are
            # presumably absent from the eTREE settings.
            print(n_by_level)
            return model

        raise ValueError(
            f"Unknown modelName {self.modelName!r}; expected one of {self.SUPPORTED_MODELS}"
        )

    def objective(self, trial):
        """
        Optuna objective: train one model and report its validation loss.

        The sampled hyper-parameters and the optional memo are logged to a
        new MLflow run belonging to `self.experiment_id`.

        args:
            trial: Optuna trial used by the suggester to sample hyper-parameters
        returns:
            `best_valid_loss` of the fitted model
        """

        with mlflow.start_run(experiment_id=self.experiment_id) as run:

            # Get Hyper-parameter Setting
            hyparams = self.suggester.suggest_hyparam(trial)

            # Save Hyper-parameter to MLFlow
            mlflow.log_params(hyparams["model"])
            mlflow.log_dict(hyparams, "params.json")

            # Save memo to MLFlow
            if self.memo is not None:
                mlflow.set_tag('memo', self.memo)

            # Define Model
            model = self._buildModel(hyparams, run)

            # Train
            model.fit(self.X, self.W, self.X_VALID, self.W_VALID)
            # Blank line separating this trial's output from the next log entry.
            print()

        return model.best_valid_loss

    def readyMLflow(self):
        """
        Look up (or create) the MLflow experiment "<dataset>-<model>-TRAIN".

        Stores the experiment id in `self.experiment_id`, prints the name and
        id, and returns `self` so the call can be chained.
        """

        # NOTE(review): the import cell loads "../.env" whereas this loads
        # ".env" relative to the working directory -- confirm which is intended.
        load_dotenv(".env")

        EXPERIMENT_NAME = f"{self.datasetName}-{self.modelName}-TRAIN"
        if (experiment := mlflow.get_experiment_by_name(EXPERIMENT_NAME)) is None:
            self.experiment_id = mlflow.create_experiment(name=EXPERIMENT_NAME)
        else:
            self.experiment_id = experiment.experiment_id

        print("Experiment Name:", EXPERIMENT_NAME)
        print("Experiment ID:", self.experiment_id)

        return self

    def __init__(self, modelName, datasetName, suggester, seed, memo=None, n_trials=100):

        """
            func: hyper-parameter search (training + validation) of the specified model on a dataset
            args:
                modelName: name of model ("eTREE" or "IHSR")
                datasetName: name of dataset
                suggester: suggester of hyperparameter
                seed: seed of Optuna sampler, dataset loader and model initializer
                memo: optional free-text tag attached to every MLflow run
                n_trials: number of Optuna trials to run (default: 100)
            raises:
                ValueError: if modelName is not a supported model
        """

        # Fail before touching MLflow or loading data when the model is unknown.
        if modelName not in self.SUPPORTED_MODELS:
            raise ValueError(
                f"Unknown modelName {modelName!r}; expected one of {self.SUPPORTED_MODELS}"
            )

        # Set args.
        self.modelName = modelName
        self.datasetName = datasetName
        self.suggester = suggester
        self.seed = seed
        self.memo = memo

        # Setup MLflow
        self.readyMLflow()

        # Load Dataset for Training
        DATA = getDataset(self.datasetName, self.seed, "train")

        # Generate Matrices (dense ratings + observation masks)
        shape = (DATA["user_num"], DATA["item_num"])
        self.X, self.W = self._toDenseWithMask(DATA["df_TRAIN"], shape)
        self.X_VALID, self.W_VALID = self._toDenseWithMask(DATA["df_EVALUATION"], shape)

        # TPE: minimise the validation loss returned by `objective`.
        study = optuna.create_study(
            direction="minimize",
            sampler=optuna.samplers.TPESampler(seed=self.seed),
        )
        study.optimize( self.objective, n_trials=n_trials )

        # Keep the finished study so best parameters can be inspected afterwards.
        self.study = study

In [None]:
# Tune IHSR on every benchmark dataset, once for each of the seeds 0, 1 and 2.
# Constructing `train` runs the complete Optuna study as part of __init__.
modelName = "IHSR"
suggester = hyParamSuggester(["../setting/model/IHSR.yaml"])

for datasetName in ("ML100K", "ML1M", "Ciao_PART", "Ciao", "Yelp"):
    for seed in (0, 1, 2):
        train(modelName, datasetName, suggester, seed, memo="des")

[I 2023-12-29 21:28:46,367] A new study created in memory with name: no-name-f7d6fcc2-d56d-4432-a70d-b2e230e93afb


Experiment Name: ML100K-IHSR-TRAIN
Experiment ID: 223
shape of df_TRAIN: (59205, 4)
shape of df_EVALUATION: (2224, 4)
User #: 584
Item #: 1507
{1: 607, 2: 304, 3: 152}
[Step2] done!
[Step3-5] done!
[Step6-8] done!
[Step9] done!


[Step10-20]:   4%|▍         | 9/200 [00:14<04:57,  1.56s/it]





[I 2023-12-29 21:29:14,455] Trial 0 finished with value: 1.4523714742063112 and parameters: {'d': 122, 'userClusterDepth': 3, 'userClusterNum': 607, 'itemClusterDepth': 2, 'itemClusterNum': 429, 'lam': 0.07505241622349541}. Best is trial 0 with value: 1.4523714742063112.


{1: 964, 2: 482, 3: 241}
[Step2] done!
[Step3-5] done!
[Step6-8] done!
[Step9] done!


[Step10-20]:   7%|▋         | 14/200 [00:20<04:27,  1.44s/it]





[I 2023-12-29 21:29:50,659] Trial 1 finished with value: 1.5207448329408193 and parameters: {'d': 73, 'userClusterDepth': 3, 'userClusterNum': 964, 'itemClusterDepth': 2, 'itemClusterNum': 794, 'lam': 0.014906288366101634}. Best is trial 0 with value: 1.4523714742063112.


{1: 80, 2: 40, 3: 20}
[Step2] done!
[Step3-5] done!
[Step6-8] done!
[Step9] done!


[Step10-20]:   6%|▌         | 12/200 [00:18<04:56,  1.58s/it]





[I 2023-12-29 21:30:18,608] Trial 2 finished with value: 1.1363999422010047 and parameters: {'d': 134, 'userClusterDepth': 3, 'userClusterNum': 80, 'itemClusterDepth': 1, 'itemClusterNum': 30, 'lam': 0.9901912249089659}. Best is trial 2 with value: 1.1363999422010047.


{1: 979, 2: 490, 3: 245}
[Step2] done!
[Step3-5] done!
[Step6-8] done!
[Step9] done!


[Step10-20]:  28%|██▊       | 55/200 [01:08<03:01,  1.25s/it]





[I 2023-12-29 21:31:52,887] Trial 3 finished with value: 1.1461791629721598 and parameters: {'d': 356, 'userClusterDepth': 3, 'userClusterNum': 979, 'itemClusterDepth': 3, 'itemClusterNum': 467, 'lam': 0.48214210285277564}. Best is trial 2 with value: 1.1363999422010047.


{1: 152, 2: 76}
[Step2] done!
[Step3-5] done!
[Step6-8] done!
[Step9] done!


[Step10-20]:   8%|▊         | 15/200 [00:20<04:10,  1.35s/it]





[I 2023-12-29 21:32:22,632] Trial 4 finished with value: 1.5137017621042341 and parameters: {'d': 16, 'userClusterDepth': 2, 'userClusterNum': 152, 'itemClusterDepth': 3, 'itemClusterNum': 527, 'lam': 0.0030758959477348562}. Best is trial 2 with value: 1.1363999422010047.


{1: 462, 2: 231, 3: 116}
[Step2] done!
[Step3-5] done!
[Step6-8] done!
[Step9] done!


[Step10-20]:   5%|▌         | 10/200 [00:14<04:40,  1.48s/it]





[I 2023-12-29 21:32:44,362] Trial 5 finished with value: 1.2920874017101054 and parameters: {'d': 33, 'userClusterDepth': 3, 'userClusterNum': 462, 'itemClusterDepth': 2, 'itemClusterNum': 28, 'lam': 0.05079406641139767}. Best is trial 2 with value: 1.1363999422010047.
