# **Hyperparameter Tuning for CTZ & GEN**
Goal: Improve prediction performance (ROC AUC) for the two underperforming antibiotics, CTZ and GEN, by tuning the Random Forest hyperparameters.
Method: Bayesian optimization with Optuna (TPE sampler).

In [None]:
!pip install optuna --q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/400.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m266.2/400.9 kB[0m [31m9.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, make_scorer
import optuna
from datetime import datetime
import json

## **LOAD DATA**

In [None]:
# Load the feature table and the phenotype table (one column per antibiotic).
# The phenotype file uses its first column as the row index.
data = pd.read_csv("/content/drive/MyDrive/ML-iAMR_Recreation/01_data/raw/giessen/cip_ctx_ctz_gen_multi_data.csv")
pheno = pd.read_csv("/content/drive/MyDrive/ML-iAMR_Recreation/01_data/raw/giessen/cip_ctx_ctz_gen_pheno.csv", index_col=0)

# Features and labels are paired purely by row position further down, so fail
# early with a clear message if the two files do not even have the same length.
# NOTE(review): this assumes both files list the samples in the same order;
# presumably 'prename' is the sample identifier -- confirm against pheno.index.
if len(data) != len(pheno):
    raise ValueError(
        f"Feature rows ({len(data)}) and phenotype rows ({len(pheno)}) differ; "
        "cannot pair samples by position."
    )

# Feature matrix: every column except the 'prename' column.
X = data.drop(columns='prename').to_numpy()

In [None]:
# Timestamped identifier for this run; it is reused in the names of the
# result files so that repeated runs do not overwrite each other.
run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
EXPERIMENT_ID = "EXP-006-" + run_stamp

print("Experiment: " + EXPERIMENT_ID)
print("Hyperparameter Tuning for CTZ & GEN")

Experiment: EXP-006-20251106_172439
Hyperparameter Tuning for CTZ & GEN


## **BASELINE RESULTS**

In [None]:
# Reference AUC per antibiotic; the tuned models are compared against these.
BASELINE_RESULTS = dict(
    CTZ=0.8453,  #from our results
    GEN=0.7656,
)

## **OPTUNA OBJECTIVE**

In [None]:
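# NOTE: superseded, wider search space kept for reference only; the active objective() is defined in the next cell.
# def objective(trial, X, y, antibiotic):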
#     """Optuna optimization objective"""

#     #hyperparameter search space
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 500),
#         'max_depth': trial.suggest_int('max_depth', 10, 50),
#         'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
#         'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
#         'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
#         'class_weight': trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample', None]),
#         'random_state': 42,
#         'n_jobs': -1
#     }

#     #cross-validation
#     rf = RandomForestClassifier(**params)
#     cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

#     scores = cross_val_score(rf, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
#     return scores.mean()

In [None]:
def objective(trial, X, y, antibiotic):
    """Optuna objective: mean 3-fold cross-validated ROC AUC of a Random Forest.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters listed below.
    X : array-like
        Feature matrix, passed unchanged to ``cross_val_score``.
    y : array-like
        Labels, passed to ``cross_val_score``; the folds are stratified on them.
    antibiotic : str
        Name of the antibiotic being tuned. Currently not used in the
        computation; kept so existing callers can keep passing it.

    Returns
    -------
    float
        Mean ROC AUC over the 3 stratified folds (higher is better).
    """

    #hyperparameter search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 150, 300),  #narrower + faster
        'max_depth': trial.suggest_int('max_depth', 12, 25),          #RF rarely needs >25 here
        'min_samples_split': trial.suggest_int('min_samples_split', 3, 12),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 5),  #avoid leaf=1 (very expensive + overfit)
        'max_features': trial.suggest_categorical('max_features', ['sqrt', None]),
        # Deliberately a single-choice categorical rather than a plain constant:
        # this way the value is recorded among the trial's parameters, so code
        # that rebuilds a model from the study's best parameters gets the same
        # class weighting.
        'class_weight': trial.suggest_categorical('class_weight', ['balanced_subsample']),
        'random_state': 42,
        'n_jobs': -1,
    }

    #cross-validation
    rf = RandomForestClassifier(**params)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    # Parallelise at the forest level only. Requesting all cores both here and
    # in the estimator asks for more workers than there are CPUs; the scores
    # are unaffected because the forest and the folds are seeded.
    scores = cross_val_score(rf, X, y, cv=cv, scoring='roc_auc')
    return scores.mean()

## **OPTIMIZE EACH ANTIBIOTIC**

In [None]:
# One summary record per antibiotic; turned into a DataFrame when saving.
results = []

n_trials = 10          #reduced for Colab (increase to 100 if time permits)
min_auc_delta = 0.01   # AUC changes smaller than this (either sign) count as 'Marginal'

for ab in ['CTZ', 'GEN']:
    print(f"\n{'='*60}")
    print(f"Optimizing RF for {ab}")
    print(f"Baseline AUC: {BASELINE_RESULTS[ab]:.4f}")
    print(f"{'='*60}\n")

    y = pheno[ab].values

    #run Optuna optimization
    # A fresh study per antibiotic; the sampler seed is the same for each one.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=42)
    )

    study.optimize(
        lambda trial: objective(trial, X, y, ab),
        n_trials=n_trials,
        show_progress_bar=True
    )

    #best parameters
    best_params = study.best_params
    best_auc = study.best_value
    # NOTE(review): this difference is only meaningful if the baseline AUC was
    # measured with a comparable cross-validation scheme -- confirm.
    improvement = best_auc - BASELINE_RESULTS[ab]

    # Label the outcome symmetrically so that a clear drop below the baseline
    # is not reported as merely 'Marginal'.
    if improvement > min_auc_delta:
        status = 'Improved'
    elif improvement < -min_auc_delta:
        status = 'Regressed'
    else:
        status = 'Marginal'

    print(f"\nOptimization complete for {ab}")
    print(f"  Best AUC: {best_auc:.4f} (Δ = {improvement:+.4f})")
    print(f"  Best parameters: {best_params}")

    #save results
    result = {
        'Experiment_ID': EXPERIMENT_ID,
        'Antibiotic': ab,
        'Baseline_AUC': BASELINE_RESULTS[ab],
        'Optimized_AUC': round(best_auc, 4),
        'Improvement': round(improvement, 4),
        'Status': status,
        # Stored as a JSON string so the parameters fit in a single CSV cell.
        'Best_Params': json.dumps(best_params)
    }
    results.append(result)

[I 2025-11-06 17:24:39,963] A new study created in memory with name: no-name-9a14f2fa-0e65-4f83-8d61-6d61d37255b3



Optimizing RF for CTZ
Baseline AUC: 0.8453



  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-11-06 17:24:48,179] Trial 0 finished with value: 0.8377178789041334 and parameters: {'n_estimators': 206, 'max_depth': 25, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'class_weight': 'balanced_subsample'}. Best is trial 0 with value: 0.8377178789041334.
[I 2025-11-06 17:37:47,897] Trial 1 finished with value: 0.8407878025533669 and parameters: {'n_estimators': 158, 'max_depth': 24, 'min_samples_split': 9, 'min_samples_leaf': 4, 'max_features': None, 'class_weight': 'balanced_subsample'}. Best is trial 1 with value: 0.8407878025533669.
[I 2025-11-06 18:04:56,187] Trial 2 finished with value: 0.8430305890665409 and parameters: {'n_estimators': 275, 'max_depth': 14, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': None, 'class_weight': 'balanced_subsample'}. Best is trial 2 with value: 0.8430305890665409.
[I 2025-11-06 18:23:11,672] Trial 3 finished with value: 0.8413154223212143 and parameters: {'n_estimators': 215, 'max_depth': 16, 'min_

[I 2025-11-06 19:10:06,000] A new study created in memory with name: no-name-2c0b04ff-d0fe-4dec-a3f4-a75e32094dbc


[I 2025-11-06 19:10:05,964] Trial 9 finished with value: 0.8423779068537209 and parameters: {'n_estimators': 240, 'max_depth': 24, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': None, 'class_weight': 'balanced_subsample'}. Best is trial 2 with value: 0.8430305890665409.

Optimization complete for CTZ
  Best AUC: 0.8430 (Δ = -0.0023)
  Best parameters: {'n_estimators': 275, 'max_depth': 14, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': None, 'class_weight': 'balanced_subsample'}

Optimizing RF for GEN
Baseline AUC: 0.7656



  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-11-06 19:10:14,688] Trial 0 finished with value: 0.7798254965934567 and parameters: {'n_estimators': 206, 'max_depth': 25, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'class_weight': 'balanced_subsample'}. Best is trial 0 with value: 0.7798254965934567.
[I 2025-11-06 19:20:37,606] Trial 1 finished with value: 0.7776130780260436 and parameters: {'n_estimators': 158, 'max_depth': 24, 'min_samples_split': 9, 'min_samples_leaf': 4, 'max_features': None, 'class_weight': 'balanced_subsample'}. Best is trial 0 with value: 0.7798254965934567.
[I 2025-11-06 19:40:30,592] Trial 2 finished with value: 0.7838364199592188 and parameters: {'n_estimators': 275, 'max_depth': 14, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': None, 'class_weight': 'balanced_subsample'}. Best is trial 2 with value: 0.7838364199592188.
[I 2025-11-06 19:56:04,525] Trial 3 finished with value: 0.7799401061828988 and parameters: {'n_estimators': 215, 'max_depth': 16, 'min_

## **EVALUATE OPTIMIZED MODELS**

In [None]:
# Fold count for the final evaluation. The banner is derived from this value so
# the printed header cannot disagree with the cross-validation actually run.
# Using 5 folds also gives a different partition from the 3-fold split used
# during tuning, so this step does not simply recompute the tuning score.
n_eval_splits = 5

print(f"EVALUATING OPTIMIZED MODELS WITH {n_eval_splits}-FOLD CV")

# One summary record per antibiotic (mean/std AUC and change vs. baseline).
optimized_results = []

for ab in ['CTZ', 'GEN']:
    y = pheno[ab].values

    #get best parameters from results
    best_params_dict = next(r for r in results if r['Antibiotic'] == ab)
    best_params = json.loads(best_params_dict['Best_Params'])

    #train with best parameters
    rf_optimized = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)

    cv = StratifiedKFold(n_splits=n_eval_splits, shuffle=True, random_state=42)
    fold_aucs = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        rf_optimized.fit(X_train, y_train)
        # Score with the probability from the second predict_proba column.
        # NOTE(review): assumes binary labels whose positive class is the
        # second entry of the classifier's classes_ -- confirm.
        y_pred = rf_optimized.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        fold_aucs.append(auc)
        print(f"  {ab} Fold {fold}: AUC = {auc:.4f}")

    mean_auc = np.mean(fold_aucs)
    std_auc = np.std(fold_aucs)  # population std (numpy default, ddof=0)

    optimized_results.append({
        'Antibiotic': ab,
        'Optimized_AUC_Mean': round(mean_auc, 4),
        'Optimized_AUC_Std': round(std_auc, 4),
        'Baseline_AUC': BASELINE_RESULTS[ab],
        'Improvement': round(mean_auc - BASELINE_RESULTS[ab], 4)
    })

    print(f"{ab}: {mean_auc:.4f}±{std_auc:.4f} (Δ = {mean_auc - BASELINE_RESULTS[ab]:+.4f})\n")

EVALUATING OPTIMIZED MODELS WITH 5-FOLD CV
  CTZ Fold 1: AUC = 0.8637
  CTZ Fold 2: AUC = 0.8457
  CTZ Fold 3: AUC = 0.8196
CTZ: 0.8430±0.0181 (Δ = -0.0023)

  GEN Fold 1: AUC = 0.7716
  GEN Fold 2: AUC = 0.7633
  GEN Fold 3: AUC = 0.8166
GEN: 0.7838±0.0234 (Δ = +0.0182)



## **SAVE RESULTS**

In [None]:
# Turn the per-antibiotic records into tables: tuning parameters first, then
# the cross-validated evaluation summary.
results_df, optimized_df = (
    pd.DataFrame(records) for records in (results, optimized_results)
)

print("HYPERPARAMETER TUNING SUMMARY")
print(optimized_df.to_string(index=False))

# Write both tables to CSV; the experiment id in the file name keeps the
# output of separate runs apart.
for frame, suffix in ((results_df, "tuning_params"), (optimized_df, "tuning_results")):
    frame.to_csv(
        f"/content/drive/MyDrive/ML-iAMR_Recreation/05_evaluation/results/{EXPERIMENT_ID}_{suffix}.csv",
        index=False,
    )

print(f"\nResults saved to results/{EXPERIMENT_ID}_*.csv")

## **COMPARISON TABLE**

In [None]:
# AUC values reported in the paper, per antibiotic.
paper_auc = {'CTZ': 0.93, 'GEN': 0.95}

# Look values up by antibiotic name instead of relying on list position, so
# every row is compared against its own reference numbers.
optimized_auc = {r['Antibiotic']: r['Optimized_AUC_Mean'] for r in optimized_results}
antibiotics = ['CTZ', 'GEN']

comparison = pd.DataFrame({
    'Antibiotic': antibiotics,
    'Paper_AUC': [paper_auc[ab] for ab in antibiotics],
    'Our_Baseline': [BASELINE_RESULTS[ab] for ab in antibiotics],
    'Our_Optimized': [optimized_auc[ab] for ab in antibiotics],
    # Gap is measured against each antibiotic's own paper AUC; rounded to the
    # 4 decimals used for the other AUC columns to avoid float noise.
    'Gap_to_Paper': [round(paper_auc[ab] - optimized_auc[ab], 4) for ab in antibiotics]
})

print("COMPARISON TO PAPER")
print(comparison.to_string(index=False))

COMPARISON TO PAPER
Antibiotic  Paper_AUC  Our_Baseline  Our_Optimized  Gap_to_Paper
       CTZ       0.93        0.8453         0.8430        0.0870
       GEN       0.95        0.7656         0.7838        0.1462
