# **Train RF for all 4 antibiotics**

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample

In [None]:
#load data
# Folder (on the mounted Colab Drive) that holds both input CSVs.
DATA_DIR = "/content/drive/MyDrive/ML-iAMR_Recreation/01_data/raw/giessen"

# Feature table: one row per sample, identified by the 'prename' column.
data = pd.read_csv(f"{DATA_DIR}/cip_ctx_ctz_gen_multi_data.csv")
# Phenotype table: first CSV column (sample ID) becomes the index; one 0/1 column per antibiotic.
pheno = pd.read_csv(f"{DATA_DIR}/cip_ctx_ctz_gen_pheno.csv", index_col=0)

# Downstream cells pair rows of X with rows of pheno purely by position
# (pheno[ab].values), so the two tables must list samples in the same order.
# Verify that, and reorder pheno by sample ID when they differ, instead of
# silently training on mismatched labels.
sample_ids = data['prename'].to_numpy()
already_aligned = (
    len(pheno) == len(sample_ids)
    and bool((pheno.index.to_numpy() == sample_ids).all())
)
if not already_aligned:
    if pheno.index.has_duplicates:
        raise ValueError("Cannot align phenotypes to features: duplicate sample IDs in the phenotype table")
    has_label = pd.Index(sample_ids).isin(pheno.index)
    if not has_label.all():
        raise ValueError(
            f"{int((~has_label).sum())} samples in the feature table have no phenotype row "
            f"(e.g. {sample_ids[~has_label][0]!r})"
        )
    pheno = pheno.loc[sample_ids]

# Feature matrix: every column except the sample identifier.
X = data.drop('prename', axis=1).values

In [None]:
# Preview the first five rows of the feature table.
data.head(n=5)

Unnamed: 0,prename,X393,X588,X747,X759,X774,X966,X1299,X1302,X1407,...,X4639956.3,X4640785.3,X4640908.3,X4640924.3,X4641031.3,X4641131.3,X4641217.3,X4641296.3,X4641439.3,X4641440.3
0,H100_S2_L001,2,1,2,4,3,3,3,3,4,...,3,4,4,2,1,3,1,3,3,1
1,H105_S3_L001,2,1,2,4,3,3,3,3,4,...,3,4,4,2,1,3,1,3,3,1
2,H108_S5_L001,2,1,2,4,3,3,0,0,4,...,3,4,4,2,1,3,1,3,3,1
3,H109_S2_L001,2,1,2,4,3,3,3,3,4,...,3,4,4,2,1,3,1,3,3,1
4,H113_S6_L001,0,0,2,4,3,3,3,3,4,...,3,4,0,0,2,3,1,3,3,1


In [None]:
# Preview the first five rows of the phenotype table (one 0/1 column per antibiotic).
pheno.head(n=5)

Unnamed: 0_level_0,CIP,CTX,CTZ,GEN
prename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
H100_S2_L001,0,1,0,0
H105_S3_L001,1,1,0,0
H108_S5_L001,0,1,0,0
H109_S2_L001,0,1,0,0
H113_S6_L001,1,1,0,0


In [None]:
#train for all antibiotics
antibiotics = ['CIP', 'CTX', 'CTZ', 'GEN']
N_SPLITS = 5      # folds of the stratified cross-validation
N_TREES = 200     # trees per random forest
SEED = 42         # shared seed for fold assignment, resampling and the forest


def upsample_minority(X_train, y_train, random_state=SEED):
    """Balance a binary training set by upsampling its smaller class.

    The smaller class (by count in ``y_train``) is resampled with replacement
    until it has as many rows as the larger class. On a tie, class 1 is the one
    resampled. Only the training portion of a fold should be passed in, so that
    duplicated rows never reach the validation fold.

    Parameters
    ----------
    X_train : np.ndarray
        Feature rows of the training fold.
    y_train : np.ndarray
        Integer 0/1 labels, one per row of ``X_train``.
    random_state : int
        Seed forwarded to ``sklearn.utils.resample``.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Balanced features and labels: the untouched larger class first,
        followed by the upsampled smaller class.

    Raises
    ------
    ValueError
        If one of the two classes is absent from ``y_train``.
    """
    counts = np.bincount(y_train, minlength=2)
    # Pick the minority from the data instead of assuming it is label 1;
    # otherwise a majority class 1 would be shrunk rather than class 0 grown.
    minority = 1 if counts[1] <= counts[0] else 0
    majority = 1 - minority
    if counts[minority] == 0:
        raise ValueError(f"Training fold contains no samples of class {minority}")

    is_minority = y_train == minority
    is_majority = y_train == majority
    X_up, y_up = resample(
        X_train[is_minority], y_train[is_minority],
        n_samples=int(counts[majority]),
        random_state=random_state
    )
    X_balanced = np.vstack([X_train[is_majority], X_up])
    y_balanced = np.hstack([y_train[is_majority], y_up])
    return X_balanced, y_balanced


results = []

for ab in antibiotics:
    print(f"\n{'='*50}")
    print(f"Training RF for {ab}")
    print(f"{'='*50}")

    # Labels are taken positionally, so pheno rows must follow the row order of X.
    y = pheno[ab].values
    print(f"Class distribution: {np.bincount(y)} ({y.sum()}/{len(y)} resistant)")

    cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
    # One estimator object is reused; fit() retrains it from scratch in every fold.
    rf = RandomForestClassifier(n_estimators=N_TREES, random_state=SEED)

    aucs = []
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        #upsample minority class (training fold only; validation stays untouched)
        X_train_up, y_train_up = upsample_minority(X_train, y_train)

        rf.fit(X_train_up, y_train_up)
        # Column 1 of predict_proba is the probability of the positive (label 1) class.
        y_pred = rf.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        aucs.append(auc)
        print(f"  Fold {fold}: AUC = {auc:.4f}")

    mean_auc = np.mean(aucs)
    # np.std defaults to the population standard deviation (ddof=0) across folds.
    std_auc = np.std(aucs)

    results.append({
        'Antibiotic': ab,
        'Our_AUC': f"{mean_auc:.4f} ± {std_auc:.4f}",
        'Mean_AUC': mean_auc,
        'Std_AUC': std_auc
    })

    print(f"{ab}: AUC = {mean_auc:.4f} ± {std_auc:.4f}")


Training RF for CIP
Class distribution: [443 366] (366/809 resistant)
  Fold 1: AUC = 0.8939
  Fold 2: AUC = 0.9411
  Fold 3: AUC = 0.9762
  Fold 4: AUC = 0.9387
  Fold 5: AUC = 0.9652
CIP: AUC = 0.9430 ± 0.0284

Training RF for CTX
Class distribution: [451 358] (358/809 resistant)
  Fold 1: AUC = 0.8326
  Fold 2: AUC = 0.8197
  Fold 3: AUC = 0.8218
  Fold 4: AUC = 0.8365
  Fold 5: AUC = 0.8220
CTX: AUC = 0.8265 ± 0.0067

Training RF for CTZ
Class distribution: [533 276] (276/809 resistant)
  Fold 1: AUC = 0.9000
  Fold 2: AUC = 0.8557
  Fold 3: AUC = 0.8346
  Fold 4: AUC = 0.8288
  Fold 5: AUC = 0.8029
CTZ: AUC = 0.8444 ± 0.0325

Training RF for GEN
Class distribution: [621 188] (188/809 resistant)
  Fold 1: AUC = 0.7852
  Fold 2: AUC = 0.7393
  Fold 3: AUC = 0.7499
  Fold 4: AUC = 0.7651
  Fold 5: AUC = 0.8067
GEN: AUC = 0.7692 ± 0.0243


In [None]:
#comparison table
# AUCs reported in the paper (their Table 2), keyed by antibiotic.
paper_values = dict(CIP=0.96, CTX=0.81, CTZ=0.93, GEN=0.95)

# Build the table in one chain: attach the paper's AUC, the signed gap
# (ours minus paper), and a flag for an absolute gap below 0.05 AUC.
comparison = pd.DataFrame(results).assign(
    Paper_AUC=lambda df: df['Antibiotic'].map(paper_values),
    Difference=lambda df: df['Mean_AUC'] - df['Paper_AUC'],
    **{'Within_5%': lambda df: df['Difference'].abs() < 0.05},
)

summary_columns = ['Antibiotic', 'Our_AUC', 'Paper_AUC', 'Difference', 'Within_5%']

print("BASELINE RF REPRODUCTION SUMMARY")
print(comparison[summary_columns].to_string(index=False))

BASELINE RF REPRODUCTION SUMMARY
Antibiotic         Our_AUC  Paper_AUC  Difference  Within_5%
       CIP 0.9430 ± 0.0284       0.96   -0.016968       True
       CTX 0.8265 ± 0.0067       0.81    0.016493       True
       CTZ 0.8444 ± 0.0325       0.93   -0.085603      False
       GEN 0.7692 ± 0.0243       0.95   -0.180779      False


### **Save results**

In [None]:
# Write the full comparison table (without the row index) to the current working directory.
results_csv = "baseline_rf_results.csv"
comparison.to_csv(results_csv, index=False)
print(f"\nResults saved to {results_csv}")


Results saved to baseline_rf_results.csv
