# Hyperparameter optimization and model finalization

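This notebook runs hyperparameter optimization (HPO) for the Random Forest (RF) and XGBoost models on each fingerprint representation of our dataset, then retrains each model with its best parameters and saves it.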

In [1]:
import os
import pickle
import json
import optuna
import pandas as pd
import logging

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from hpo_trainer import objective_rf, SEED, finalize_model, objective_xgboost

In [2]:
import warnings

# Keep the cell outputs readable: drop every Python warning and raise Optuna's
# log threshold to WARNING so per-trial info messages are not printed.
warnings.simplefilter("ignore")
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [3]:
# Notebook-level logger; the HPO cells below report their progress through it.
logger = logging.getLogger(__name__)

# Hyperparameter Optimization (HPO)

### Random Forest

In [4]:
# Random Forest HPO, one Optuna study per fingerprint representation.
# A fingerprint is skipped when ../models/<fingerprint>_rf.pkl already exists,
# so re-running the cell does not repeat searches that have already finished.
for fingerprint_name in ["ecfp4", "rdkit", "maccs", "chem_phys", "mhfp6", "erg"]:
    if os.path.exists(f"../models/{fingerprint_name}_rf.pkl"):
        continue

    train_data = pd.read_csv(f"../data/splits/{fingerprint_name}_smote_train.csv")

    # Every column except "label" is a feature.
    X_train, y_train = train_data.drop("label", axis=1), train_data["label"]

    # Start HPO process
    logger.warning(f"Starting HPO for {fingerprint_name}...")
    study_name = f"{fingerprint_name}_rf"
    study = optuna.create_study(
        direction="maximize",
        study_name=study_name,
        # Seed the sampler so the sequence of suggested hyperparameters (and
        # therefore the selected best_params) is reproducible across runs.
        sampler=optuna.samplers.TPESampler(seed=SEED),
    )
    study.optimize(
        lambda trial: objective_rf(
            trial, study_name=study_name, X_train=X_train, y_train=y_train
        ),
        n_trials=15,
        show_progress_bar=True,
    )

    logger.warning("All trials saved in experiments folder.")

    # Train a new model on the full training split using the best parameters
    best_params = study.best_params
    best_model = RandomForestClassifier(random_state=SEED, **best_params)
    best_model.fit(X_train, y_train)

    # Save the best model (persistence is delegated to hpo_trainer.finalize_model)
    finalize_model(model=best_model, params=best_params, exp_name=study_name)

Starting HPO for ecfp4...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for rdkit...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for maccs...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for chem_phys...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for mhfp6...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for erg...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.


### XGBoost

In [5]:
# XGBoost HPO, one Optuna study per fingerprint representation.
# A fingerprint is skipped when its pickled model already exists in ../models,
# so re-running the cell does not repeat searches that have already finished.

# Maps the string class labels to integer class ids. It is the same for every
# fingerprint, so it is defined once outside the loop.
label_to_idx = {
    "gram-negative": 0,
    "gram-positive": 1,
    "acid-fast": 2,
    "fungi": 3,
}

for fingerprint_name in ["ecfp4", "rdkit", "maccs", "chem_phys", "mhfp6", "erg"]:
    if os.path.exists(f"../models/{fingerprint_name}_xgboost.pickle.dat"):
        continue

    train_data = pd.read_csv(f"../data/splits/{fingerprint_name}_smote_train.csv")

    # Every column except "label" is a feature.
    X_train, y_train = train_data.drop("label", axis=1), train_data["label"]

    # Start HPO process
    logger.warning(f"Starting HPO for {fingerprint_name}...")
    study_name = f"{fingerprint_name}_xgboost"
    study = optuna.create_study(
        direction="maximize",
        study_name=study_name,
        # Seed the sampler so the sequence of suggested hyperparameters (and
        # therefore the selected best_params) is reproducible across runs.
        sampler=optuna.samplers.TPESampler(seed=SEED),
    )

    # The objective receives the raw labels together with the label mapping.
    study.optimize(
        lambda trial: objective_xgboost(
            trial,
            study_name=study_name,
            X_train=X_train,
            y_train=y_train,
            label_to_idx=label_to_idx,
        ),
        n_trials=15,
        show_progress_bar=True,
    )

    logger.warning("All trials saved in experiments folder.")

    # Encode the string labels as integer class ids for the final fit. A new
    # name is used so the raw labels in y_train are not overwritten.
    y_train_encoded = y_train.map(label_to_idx)

    # Train a new model using the best parameters
    best_params = study.best_params
    best_model = xgb.XGBClassifier(**best_params, seed=SEED)
    best_model.fit(X_train, y_train_encoded)

    # Save the best model; the context manager guarantees the file is flushed
    # and closed even if pickling fails.
    model_path = "../models"
    os.makedirs(model_path, exist_ok=True)
    with open(f"{model_path}/{study_name}.pickle.dat", "wb") as model_file:
        pickle.dump(best_model, model_file)

    # Save params
    with open(f"{model_path}/{study_name}_params.json", "w") as f:
        json.dump(best_params, f, indent=4, sort_keys=True, ensure_ascii=False)

Starting HPO for ecfp4...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for rdkit...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for maccs...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for chem_phys...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for mhfp6...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
Starting HPO for erg...


  0%|          | 0/15 [00:00<?, ?it/s]

All trials saved in experiments folder.
