# **BASELINE RANDOM FOREST**

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.utils import resample
import matplotlib.pyplot as plt

In [None]:
# Input CSVs on the mounted Google Drive: the per-sample feature table and the
# per-sample resistance labels.
features_csv = "/content/drive/MyDrive/ML-iAMR_Recreation/01_data/raw/giessen/cip_ctx_ctz_gen_multi_data.csv"
labels_csv = "/content/drive/MyDrive/ML-iAMR_Recreation/01_data/raw/giessen/cip_ctx_ctz_gen_pheno.csv"

# `data` keeps the default integer index; `pheno` uses its first CSV column as the index.
data = pd.read_csv(features_csv)
pheno = pd.read_csv(labels_csv, index_col=0)

In [None]:
# Preview the first rows of the feature table (sample name in 'prename', then one column per feature).
data.head()

Unnamed: 0,prename,X393,X588,X747,X759,X774,X966,X1299,X1302,X1407,...,X4639956.3,X4640785.3,X4640908.3,X4640924.3,X4641031.3,X4641131.3,X4641217.3,X4641296.3,X4641439.3,X4641440.3
0,H100_S2_L001,2,1,2,4,3,3,3,3,4,...,3,4,4,2,1,3,1,3,3,1
1,H105_S3_L001,2,1,2,4,3,3,3,3,4,...,3,4,4,2,1,3,1,3,3,1
2,H108_S5_L001,2,1,2,4,3,3,0,0,4,...,3,4,4,2,1,3,1,3,3,1
3,H109_S2_L001,2,1,2,4,3,3,3,3,4,...,3,4,4,2,1,3,1,3,3,1
4,H113_S6_L001,0,0,2,4,3,3,3,3,4,...,3,4,0,0,2,3,1,3,3,1


In [None]:
# Preview the first rows of the label table (one 0/1 column per antibiotic, indexed by sample name).
pheno.head()

Unnamed: 0_level_0,CIP,CTX,CTZ,GEN
prename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
H100_S2_L001,0,1,0,0
H105_S3_L001,1,1,0,0
H108_S5_L001,0,1,0,0
H109_S2_L001,0,1,0,0
H113_S6_L001,1,1,0,0


In [None]:
# Build the feature matrix X and the CIP label vector y.
#
# `pheno` is indexed by sample name (it was read with index_col=0) while `data`
# carries the sample name in its 'prename' column. Labels are therefore looked up
# by sample name rather than paired by row position, so a difference in row order
# or sample membership between the two CSVs cannot silently mislabel samples.
sample_ids = data['prename']
cip_by_sample = pheno['CIP'].reindex(sample_ids)

# reindex() yields NaN for names that are absent from `pheno`; fail loudly instead
# of training on undefined labels.
unlabeled = cip_by_sample.isna().values
if unlabeled.any():
    raise KeyError(
        f"{unlabeled.sum()} sample(s) in `data` have no CIP label in `pheno`, "
        f"e.g. {sample_ids[unlabeled].tolist()[:5]}"
    )

# NOTE(review): every column except 'prename' is used as a feature. If the CSV
# contains an unnamed row-index column, it is included here too — confirm.
X = data.drop('prename', axis=1).values
# Restore the original label dtype so np.bincount below receives integers.
y = cip_by_sample.astype(pheno['CIP'].dtype).values

print(f"Training on {len(X)} samples with {X.shape[1]} features")
print(f"Class distribution: {np.bincount(y)}")

Training on 809 samples with 60936 features
Class distribution: [443 366]


In [None]:
# Evaluation setup: a shuffled, stratified 5-fold splitter and the baseline
# 200-tree random forest, both driven by the same fixed seed.
SEED = 42
cv = StratifiedKFold(n_splits=5, random_state=SEED, shuffle=True)
rf = RandomForestClassifier(random_state=SEED, n_estimators=200)

In [None]:
# Cross-validated AUC of the baseline forest, with the training part of every
# fold balanced by upsampling; the validation part is left untouched.
PAPER_AUC = 0.96  # reference AUC this notebook compares against

aucs = []
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Pick the minority class from this fold's counts instead of assuming it is
    # label 1 (labels are expected to be binary 0/1). Ties resolve to label 1.
    class_counts = np.bincount(y_train, minlength=2)
    minority = 1 if class_counts[1] <= class_counts[0] else 0
    majority = 1 - minority

    # Upsample the minority class (with replacement) to the majority class size.
    X_res, y_res = resample(
        X_train[y_train == minority], y_train[y_train == minority],
        n_samples=class_counts[majority],
        random_state=42
    )
    # Build both arrays from the pre-update y_train mask before rebinding it.
    X_train, y_train = (
        np.vstack([X_train[y_train == majority], X_res]),
        np.hstack([y_train[y_train == majority], y_res]),
    )

    # Refit on the balanced fold and score the positive-class probability.
    rf.fit(X_train, y_train)
    y_pred = rf.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    aucs.append(auc)
    print(f"Fold {fold}: AUC = {auc:.4f}")

mean_auc = np.mean(aucs)
# np.std uses ddof=0 by default, i.e. the population standard deviation of the fold AUCs.
std_auc = np.std(aucs)
print(f"\nFinal: AUC = {mean_auc:.4f} ± {std_auc:.4f}")
print(f"Paper reported: {PAPER_AUC:.2f}")
print(f"Difference: {abs(mean_auc - PAPER_AUC):.4f}")

Fold 1: AUC = 0.8939
Fold 2: AUC = 0.9411
Fold 3: AUC = 0.9762
Fold 4: AUC = 0.9387
Fold 5: AUC = 0.9652

Final: AUC = 0.9430 ± 0.0284
Paper reported: 0.96
Difference: 0.0170


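 - Our mean AUC (0.9430 ± 0.0284 across the 5 folds) is 0.0170 below the 0.96 reported in the original paper, so the paper's value lies within one standard deviation of our estimate. A plausible explanation for the gap is the smaller dataset used here (809 vs. 987 samples), although this was not tested directly. We therefore consider this a reasonable reproduction of the paper's baseline.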