# Hyperparameter optimization and model finalization

This notebook trains and optimizes the Random Forest (RF) and XGBoost models for the **combined** dataset.

In [1]:
import os
import pickle
import json
import optuna
import pandas as pd
import logging

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from hpo_trainer import objective_rf, SEED, finalize_model, objective_xgboost
from utils import label_to_idx

In [2]:
import warnings

# Install a blanket "ignore" filter so Python warnings do not clutter the cell output.
# NOTE(review): this also hides deprecation and other diagnostic warnings raised by
# the libraries used below — confirm that suppressing all of them is intended.
warnings.filterwarnings("ignore")
# Raise Optuna's logging threshold to WARNING so per-trial info messages are not printed.
optuna.logging.set_verbosity(optuna.logging.WARNING)  # Disabling trial info messages

In [3]:
# Logger used by the HPO cells in this notebook to report progress through `logger.warning`.
logger = logging.getLogger(__name__)

# Hyperparameter Optimization (HPO)

### Random Forest

In [4]:
# Random Forest HPO: for each fingerprint, tune with Optuna, refit on the full
# training split with the best parameters, and hand the model to finalize_model.
# The trailing `break` ends the outer loop after its first pass, so only the
# first experiment type ("combined") is actually processed.
for exp_type in ["combined", "gram-positive", "gram-negative", "fungi", "acid-fast"]:
    for fingerprint_name in ["ecfp4", "rdkit", "maccs", "chem_phys", "mhfp6", "erg"]:
        # A model file already on disk means this fingerprint is done; skip it.
        rf_model_file = f"../models/{exp_type}/{fingerprint_name}_rf.pkl"
        if os.path.exists(rf_model_file):
            continue

        # Load the training split and separate the features from the "label" column.
        train_csv = f"../data/splits/{exp_type}/{fingerprint_name}_smote_train.csv"
        train_data = pd.read_csv(train_csv)
        X_train = train_data.drop(columns="label")
        y_train = train_data["label"]

        # Run the hyperparameter search (objective is maximized).
        logger.warning(f"Starting HPO for {fingerprint_name}...")
        study = optuna.create_study(
            study_name=f"{fingerprint_name}_rf",
            direction="maximize",
        )

        def rf_objective(trial):
            """Forward one Optuna trial to objective_rf with the current data."""
            return objective_rf(
                trial,
                study_name=study.study_name,
                X_train=X_train,
                y_train=y_train,
                exp_type=exp_type,
            )

        study.optimize(rf_objective, n_trials=15, show_progress_bar=True)

        logger.warning("All trials saved in experiments folder.")

        # Refit a fresh classifier on the whole training split using the winning parameters.
        best_model = RandomForestClassifier(random_state=SEED, **study.best_params)
        best_model.fit(X_train, y_train)

        # Make sure the output directory exists, then store the model and its parameters.
        # NOTE(review): presumably finalize_model writes `<exp_name>.pkl` under
        # model_path (the skip check above looks for that name) — confirm in hpo_trainer.
        os.makedirs(f"../models/{exp_type}", exist_ok=True)
        finalize_model(
            model=best_model,
            params=study.best_params,
            exp_name=study.study_name,
            model_path=f"../models/{exp_type}",
        )
    break

Starting HPO for ecfp4...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for rdkit...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for maccs...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for chem_phys...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for mhfp6...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for erg...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.


### XGBoost

In [5]:
# XGBoost HPO: for each fingerprint, tune with Optuna, refit on the full training
# split with the best parameters, then pickle the model and dump its parameters.
# The trailing `break` ends the outer loop after its first pass, so only the
# first experiment type ("combined") is actually processed.
for exp_type in ["combined", "gram-positive", "gram-negative", "fungi", "acid-fast"]:
    for fingerprint_name in ["ecfp4", "rdkit", "maccs", "chem_phys", "mhfp6", "erg"]:
        # A pickled model already on disk means this fingerprint is done; skip it.
        if os.path.exists(
            f"../models/{exp_type}/{fingerprint_name}_xgboost.pickle.dat"
        ):
            continue

        train_data = pd.read_csv(
            f"../data/splits/{exp_type}/{fingerprint_name}_smote_train.csv"
        )

        # Separate the features from the "label" column.
        X_train, y_train = train_data.drop("label", axis=1), train_data["label"]

        # Start HPO process
        logger.warning(f"Starting HPO for {fingerprint_name}...")
        study = optuna.create_study(
            direction="maximize", study_name=f"{fingerprint_name}_xgboost"
        )

        study.optimize(
            lambda trial: objective_xgboost(
                trial,
                study_name=study.study_name,
                X_train=X_train,
                y_train=y_train,
                label_to_idx=label_to_idx,
                exp_type=exp_type,
            ),
            n_trials=15,
            show_progress_bar=True,
        )

        logger.warning("All trials saved in experiments folder.")

        # Map the labels through label_to_idx before the final fit.
        # NOTE(review): presumably this turns class labels into the integer
        # indices XGBoost expects — confirm against utils.label_to_idx.
        y_train = y_train.map(label_to_idx)

        # Train a new model using the best parameters
        best_model = xgb.XGBClassifier(**study.best_params, seed=SEED)
        best_model.fit(X_train, y_train)

        model_path = f"../models/{exp_type}"

        # Save the best model. The context manager guarantees the file handle is
        # flushed and closed even if pickling fails part-way through.
        os.makedirs(model_path, exist_ok=True)
        with open(f"{model_path}/{study.study_name}.pickle.dat", "wb") as model_file:
            pickle.dump(best_model, model_file)

        # Save params
        with open(f"{model_path}/{study.study_name}_params.json", "w") as f:
            json.dump(
                study.best_params, f, indent=4, sort_keys=True, ensure_ascii=False
            )

    break

Starting HPO for ecfp4...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for rdkit...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for maccs...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for chem_phys...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for mhfp6...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for erg...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
