# **All Models (RF, LR, SVM)**

Reproduces paper's Table 2 results with label-encoded SNP data

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, matthews_corrcoef, classification_report, confusion_matrix
from datetime import datetime
import json

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#CONFIG
EXPERIMENT_ID = f"EXP-BASELINE-{datetime.now().strftime('%Y%m%d_%H%M%S')}"
RANDOM_STATE = 42
N_FOLDS = 5

In [4]:
#paper's reported AUROCs (Table 2)
PAPER_RESULTS = {
    'CIP': {'RF': 0.96, 'LR': 0.94, 'SVM': 0.93},
    'CTX': {'RF': 0.81, 'LR': 0.86, 'SVM': 0.85},
    'CTZ': {'RF': 0.93, 'LR': 0.90, 'SVM': 0.88},
    'GEN': {'RF': 0.95, 'LR': 0.88, 'SVM': 0.87}
}

## **LOAD DATA**

In [5]:
print(f"Starting Experiment: {EXPERIMENT_ID}\n")

data = pd.read_csv("/content/drive/MyDrive/ML-iAMR_Recreation/01_data/raw/giessen/cip_ctx_ctz_gen_multi_data.csv")
pheno = pd.read_csv("/content/drive/MyDrive/ML-iAMR_Recreation/01_data/raw/giessen/cip_ctx_ctz_gen_pheno.csv", index_col=0)

##Drop prename column
X = data.drop('prename', axis=1).values
print(f"Loaded data: {X.shape[0]} samples × {X.shape[1]} features\n")

Starting Experiment: EXP-BASELINE-20260118_203057

Loaded data: 809 samples × 60936 features



## **MODELS**

In [6]:
models = {
    'RF': RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1),
    'LR': LogisticRegression(solver='lbfgs', max_iter=1000, random_state=RANDOM_STATE),
    'SVM': SVC(kernel='linear', probability=True, random_state=RANDOM_STATE)
}

## **TRAINING LOOP**

In [7]:
all_results = []
antibiotics = ['CIP', 'CTX', 'CTZ', 'GEN']

for ab in antibiotics:
    print(f"\n{'='*60}")
    print(f"ANTIBIOTIC: {ab}")
    print(f"{'='*60}")

    y = pheno[ab].values
    pos = y.sum()
    neg = len(y) - pos
    print(f"Class distribution: {pos} Resistant / {neg} Susceptible ({pos/len(y)*100:.1f}% resistant)")

    for model_name, model in models.items():
        print(f"\n--- Training {model_name} ---")

        cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
        fold_metrics = []

        for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

            #train model
            model.fit(X_train, y_train)

            #predict
            y_pred_proba = model.predict_proba(X_val)[:, 1]
            y_pred_class = model.predict(X_val)

            #calculate metrics
            auc = roc_auc_score(y_val, y_pred_proba)
            mcc = matthews_corrcoef(y_val, y_pred_class)

            fold_metrics.append({'fold': fold, 'auc': auc, 'mcc': mcc})
            print(f"  Fold {fold}: AUC={auc:.4f}, MCC={mcc:.4f}")

        #aggregate metrics
        aucs = [m['auc'] for m in fold_metrics]
        mccs = [m['mcc'] for m in fold_metrics]
        mean_auc = np.mean(aucs)
        std_auc = np.std(aucs)
        mean_mcc = np.mean(mccs)

        #compare to paper
        paper_auc = PAPER_RESULTS[ab][model_name]
        diff = mean_auc - paper_auc
        within_threshold = abs(diff) < 0.05

        result = {
            'Experiment_ID': EXPERIMENT_ID,
            'Antibiotic': ab,
            'Model': model_name,
            'Our_AUC_Mean': round(mean_auc, 4),
            'Our_AUC_Std': round(std_auc, 4),
            'Our_MCC_Mean': round(mean_mcc, 4),
            'Paper_AUC': paper_auc,
            'Difference': round(diff, 4),
            'Within_5pct': within_threshold,
            'Status': 'Reproduced' if within_threshold else 'Below target'
        }
        all_results.append(result)

        print(f"{model_name}: AUC = {mean_auc:.4f}±{std_auc:.4f} (Paper: {paper_auc}) [{result['Status']}]")


ANTIBIOTIC: CIP
Class distribution: 366 Resistant / 443 Susceptible (45.2% resistant)

--- Training RF ---
  Fold 1: AUC=0.9066, MCC=0.7520
  Fold 2: AUC=0.9637, MCC=0.8758
  Fold 3: AUC=0.9765, MCC=0.8522
  Fold 4: AUC=0.9388, MCC=0.7884
  Fold 5: AUC=0.9696, MCC=0.8514
RF: AUC = 0.9510±0.0256 (Paper: 0.96) [Reproduced]

--- Training LR ---
  Fold 1: AUC=0.8719, MCC=0.6761
  Fold 2: AUC=0.9512, MCC=0.8630
  Fold 3: AUC=0.9529, MCC=0.8064
  Fold 4: AUC=0.9080, MCC=0.7023
  Fold 5: AUC=0.9201, MCC=0.8080
LR: AUC = 0.9208±0.0300 (Paper: 0.94) [Reproduced]

--- Training SVM ---
  Fold 1: AUC=0.8565, MCC=0.6010
  Fold 2: AUC=0.9483, MCC=0.8379
  Fold 3: AUC=0.9357, MCC=0.7734
  Fold 4: AUC=0.8927, MCC=0.7264
  Fold 5: AUC=0.9201, MCC=0.7831
SVM: AUC = 0.9107±0.0328 (Paper: 0.93) [Reproduced]

ANTIBIOTIC: CTX
Class distribution: 358 Resistant / 451 Susceptible (44.3% resistant)

--- Training RF ---
  Fold 1: AUC=0.8480, MCC=0.6120
  Fold 2: AUC=0.8358, MCC=0.5604
  Fold 3: AUC=0.8354, MCC=

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 1: AUC=0.8049, MCC=0.4500


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 2: AUC=0.7946, MCC=0.5217


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 3: AUC=0.7928, MCC=0.4849


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 4: AUC=0.8123, MCC=0.5382


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 5: AUC=0.7067, MCC=0.3655
LR: AUC = 0.7823±0.0384 (Paper: 0.86) [Below target]

--- Training SVM ---
  Fold 1: AUC=0.7927, MCC=0.4606
  Fold 2: AUC=0.7846, MCC=0.4831
  Fold 3: AUC=0.7701, MCC=0.4235
  Fold 4: AUC=0.8090, MCC=0.5241
  Fold 5: AUC=0.6948, MCC=0.3517
SVM: AUC = 0.7703±0.0398 (Paper: 0.85) [Below target]

ANTIBIOTIC: CTZ
Class distribution: 276 Resistant / 533 Susceptible (34.1% resistant)

--- Training RF ---
  Fold 1: AUC=0.9093, MCC=0.6948
  Fold 2: AUC=0.8744, MCC=0.6493
  Fold 3: AUC=0.8334, MCC=0.5369
  Fold 4: AUC=0.8069, MCC=0.4930
  Fold 5: AUC=0.8025, MCC=0.4753
RF: AUC = 0.8453±0.0410 (Paper: 0.93) [Below target]

--- Training LR ---


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 1: AUC=0.8333, MCC=0.5933
  Fold 2: AUC=0.8090, MCC=0.5029
  Fold 3: AUC=0.7353, MCC=0.4332
  Fold 4: AUC=0.7846, MCC=0.4589


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 5: AUC=0.7635, MCC=0.3553
LR: AUC = 0.7851±0.0342 (Paper: 0.9) [Below target]

--- Training SVM ---
  Fold 1: AUC=0.7766, MCC=0.4706
  Fold 2: AUC=0.7873, MCC=0.5082
  Fold 3: AUC=0.7497, MCC=0.4434
  Fold 4: AUC=0.7316, MCC=0.4479
  Fold 5: AUC=0.7496, MCC=0.4051
SVM: AUC = 0.7589±0.0202 (Paper: 0.88) [Below target]

ANTIBIOTIC: GEN
Class distribution: 188 Resistant / 621 Susceptible (23.2% resistant)

--- Training RF ---
  Fold 1: AUC=0.7923, MCC=0.2861
  Fold 2: AUC=0.7435, MCC=0.2747
  Fold 3: AUC=0.7521, MCC=0.4170
  Fold 4: AUC=0.7304, MCC=0.3502
  Fold 5: AUC=0.8095, MCC=0.3586
RF: AUC = 0.7656±0.0302 (Paper: 0.95) [Below target]

--- Training LR ---


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 1: AUC=0.7496, MCC=0.2298


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 2: AUC=0.6902, MCC=0.2747


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 3: AUC=0.6135, MCC=0.2194


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 4: AUC=0.7133, MCC=0.1711


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 5: AUC=0.7266, MCC=0.2572
LR: AUC = 0.6986±0.0467 (Paper: 0.88) [Below target]

--- Training SVM ---
  Fold 1: AUC=0.7124, MCC=0.3015
  Fold 2: AUC=0.6237, MCC=0.2266
  Fold 3: AUC=0.6280, MCC=0.2474
  Fold 4: AUC=0.6384, MCC=0.1648
  Fold 5: AUC=0.7359, MCC=0.2572
SVM: AUC = 0.6677±0.0470 (Paper: 0.87) [Below target]


## **SAVE RESULTS**

In [8]:
results_df = pd.DataFrame(all_results)

#summary table
print("\n" + "="*80)
print("BASELINE REPRODUCTION SUMMARY")
print("="*80)
print(results_df[['Antibiotic', 'Model', 'Our_AUC_Mean', 'Paper_AUC', 'Difference', 'Status']].to_string(index=False))


BASELINE REPRODUCTION SUMMARY
Antibiotic Model  Our_AUC_Mean  Paper_AUC  Difference       Status
       CIP    RF        0.9510       0.96     -0.0090   Reproduced
       CIP    LR        0.9208       0.94     -0.0192   Reproduced
       CIP   SVM        0.9107       0.93     -0.0193   Reproduced
       CTX    RF        0.8395       0.81      0.0295   Reproduced
       CTX    LR        0.7823       0.86     -0.0777 Below target
       CTX   SVM        0.7703       0.85     -0.0797 Below target
       CTZ    RF        0.8453       0.93     -0.0847 Below target
       CTZ    LR        0.7851       0.90     -0.1149 Below target
       CTZ   SVM        0.7589       0.88     -0.1211 Below target
       GEN    RF        0.7656       0.95     -0.1844 Below target
       GEN    LR        0.6986       0.88     -0.1814 Below target
       GEN   SVM        0.6677       0.87     -0.2023 Below target


In [9]:
#save to CSV
output_file = f"/content/drive/MyDrive/ML-iAMR_Recreation/05_evaluation/results/{EXPERIMENT_ID}_baseline_results.csv"
results_df.to_csv(output_file, index=False)
print(f"\nResults saved to {output_file}")

#save experiment log
log_entry = {
    'experiment_id': EXPERIMENT_ID,
    'timestamp': datetime.now().isoformat(),
    'dataset': 'Giessen (label-encoded)',
    'samples': X.shape[0],
    'features': X.shape[1],
    'models': list(models.keys()),
    'antibiotics': antibiotics,
    'n_folds': N_FOLDS,
    'results_summary': {
        'total_experiments': len(all_results),
        'reproduced': sum(1 for r in all_results if r['Within_5pct']),
        'below_target': sum(1 for r in all_results if not r['Within_5pct'])
    }
}

with open(f"/content/drive/MyDrive/ML-iAMR_Recreation/05_evaluation/results/{EXPERIMENT_ID}_log.json", 'w') as f:
    json.dump(log_entry, f, indent=2)

print(f"\nExperiment log saved")
print(f"\nReproduction rate: {log_entry['results_summary']['reproduced']}/{log_entry['results_summary']['total_experiments']} experiments within 5% of paper")


Results saved to /content/drive/MyDrive/ML-iAMR_Recreation/05_evaluation/results/EXP-BASELINE-20260118_203057_baseline_results.csv

Experiment log saved

Reproduction rate: 4/12 experiments within 5% of paper


# **EXP-005: Class Imbalance Handling for CTZ & GEN**

**`Hypothesis:`** GEN/CTZ underperformance due to severe class imbalance (23%/34% resistant)
Test SMOTE, class weights, and threshold optimization

In [10]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, precision_recall_curve, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from collections import Counter

## **LOAD DATA**

In [None]:
data = pd.read_csv("/content/drive/MyDrive/ML-iAMR_Recreation/01_data/raw/giessen/cip_ctx_ctz_gen_multi_data.csv")
pheno = pd.read_csv("/content/drive/MyDrive/ML-iAMR_Recreation/01_data/raw/giessen/cip_ctx_ctz_gen_pheno.csv", index_col=0)
X = data.drop('prename', axis=1).values

In [11]:
#EXPERIMENT CONFIG
ANTIBIOTICS = ['CTZ', 'GEN']  # Focus on problematic ones
STRATEGIES = {
    'baseline': {'use_smote': False, 'class_weight': None},
    'class_weight': {'use_smote': False, 'class_weight': 'balanced'},
    'smote': {'use_smote': True, 'class_weight': None},
    'smote_balanced': {'use_smote': True, 'class_weight': 'balanced'}
}

In [12]:
results = []

for ab in ANTIBIOTICS:
    print(f"\n{'='*60}")
    print(f"TESTING: {ab}")
    print(f"{'='*60}")

    y = pheno[ab].values
    print(f"Original distribution: {Counter(y)}")
    print(f"Imbalance ratio: {y.sum()}/{len(y)} = {y.sum()/len(y):.2%} resistant")

    for strategy_name, config in STRATEGIES.items():
        print(f"\n--- Strategy: {strategy_name} ---")

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        fold_aucs = []
        fold_f1s = []

        for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

            #apply SMOTE if configured
            if config['use_smote']:
                smote = SMOTE(random_state=42, k_neighbors=3)
                X_train, y_train = smote.fit_resample(X_train, y_train)
                if fold == 1:  #print once
                    print(f"  After SMOTE: {Counter(y_train)}")

            #train RF
            rf = RandomForestClassifier(
                n_estimators=200,
                class_weight=config['class_weight'],
                random_state=42,
                n_jobs=-1
            )
            rf.fit(X_train, y_train)

            #evaluate
            y_pred_proba = rf.predict_proba(X_val)[:, 1]
            y_pred_class = rf.predict(X_val)

            auc = roc_auc_score(y_val, y_pred_proba)
            f1 = f1_score(y_val, y_pred_class)

            fold_aucs.append(auc)
            fold_f1s.append(f1)

        mean_auc = np.mean(fold_aucs)
        std_auc = np.std(fold_aucs)
        mean_f1 = np.mean(fold_f1s)

        #compare to baseline
        if strategy_name == 'baseline':
            baseline_auc = mean_auc
            improvement = 0
        else:
            improvement = mean_auc - baseline_auc

        result = {
            'Antibiotic': ab,
            'Strategy': strategy_name,
            'AUC_Mean': round(mean_auc, 4),
            'AUC_Std': round(std_auc, 4),
            'F1_Mean': round(mean_f1, 4),
            'Improvement_vs_Baseline': round(improvement, 4),
            'Status': 'Succeeded' if improvement > 0.01 else ('Acceptable' if improvement > 0 else 'Failed')
        }
        results.append(result)

        print(f"  AUC: {mean_auc:.4f}±{std_auc:.4f} | F1: {mean_f1:.4f} | Δ: {improvement:+.4f} {result['Status']}")


TESTING: CTZ
Original distribution: Counter({np.int64(0): 533, np.int64(1): 276})
Imbalance ratio: 276/809 = 34.12% resistant

--- Strategy: baseline ---
  AUC: 0.8453±0.0410 | F1: 0.7123 | Δ: +0.0000 Failed

--- Strategy: class_weight ---
  AUC: 0.8441±0.0432 | F1: 0.7156 | Δ: -0.0012 Failed

--- Strategy: smote ---
  After SMOTE: Counter({np.int64(0): 426, np.int64(1): 426})
  AUC: 0.8432±0.0404 | F1: 0.7195 | Δ: -0.0021 Failed

--- Strategy: smote_balanced ---
  After SMOTE: Counter({np.int64(0): 426, np.int64(1): 426})
  AUC: 0.8432±0.0404 | F1: 0.7195 | Δ: -0.0021 Failed

TESTING: GEN
Original distribution: Counter({np.int64(0): 621, np.int64(1): 188})
Imbalance ratio: 188/809 = 23.24% resistant

--- Strategy: baseline ---
  AUC: 0.7656±0.0302 | F1: 0.4702 | Δ: +0.0000 Failed

--- Strategy: class_weight ---
  AUC: 0.7771±0.0269 | F1: 0.4533 | Δ: +0.0115 Succeeded

--- Strategy: smote ---
  After SMOTE: Counter({np.int64(0): 496, np.int64(1): 496})
  AUC: 0.7638±0.0284 | F1: 0.496

## **ANALYSIS**

In [13]:
results_df = pd.DataFrame(results)

print("\n" + "="*80)
print("CLASS IMBALANCE HANDLING RESULTS")
print("="*80)
print(results_df.to_string(index=False))

#find best strategy per antibiotic
print("\n" + "="*80)
print("BEST STRATEGIES")
print("="*80)
for ab in ANTIBIOTICS:
    ab_results = results_df[results_df['Antibiotic'] == ab]
    best = ab_results.loc[ab_results['AUC_Mean'].idxmax()]
    print(f"{ab}: {best['Strategy']} → AUC={best['AUC_Mean']:.4f} (Δ={best['Improvement_vs_Baseline']:+.4f})")

# Save results
results_df.to_csv("/content/drive/MyDrive/ML-iAMR_Recreation/05_evaluation/results/EXP-005_imbalance_results.csv", index=False)
print("\nResults saved to results/EXP-005_imbalance_results.csv")


CLASS IMBALANCE HANDLING RESULTS
Antibiotic       Strategy  AUC_Mean  AUC_Std  F1_Mean  Improvement_vs_Baseline    Status
       CTZ       baseline    0.8453   0.0410   0.7123                   0.0000    Failed
       CTZ   class_weight    0.8441   0.0432   0.7156                  -0.0012    Failed
       CTZ          smote    0.8432   0.0404   0.7195                  -0.0021    Failed
       CTZ smote_balanced    0.8432   0.0404   0.7195                  -0.0021    Failed
       GEN       baseline    0.7656   0.0302   0.4702                   0.0000    Failed
       GEN   class_weight    0.7771   0.0269   0.4533                   0.0115 Succeeded
       GEN          smote    0.7638   0.0284   0.4963                  -0.0018    Failed
       GEN smote_balanced    0.7638   0.0284   0.4963                  -0.0018    Failed

BEST STRATEGIES
CTZ: baseline → AUC=0.8453 (Δ=+0.0000)
GEN: class_weight → AUC=0.7771 (Δ=+0.0115)

Results saved to results/EXP-005_imbalance_results.csv


In [14]:
#RECOMMENDATIONS
print("\n" + "="*80)
print("RECOMMENDATIONS")
print("="*80)

for ab in ANTIBIOTICS:
    ab_results = results_df[results_df['Antibiotic'] == ab]
    best_strategy = ab_results.loc[ab_results['AUC_Mean'].idxmax(), 'Strategy']
    improvement = ab_results.loc[ab_results['AUC_Mean'].idxmax(), 'Improvement_vs_Baseline']

    if improvement > 0.02:
        print(f" {ab}: Use '{best_strategy}' strategy (significant improvement: +{improvement:.4f})")
    elif improvement > 0.01:
        print(f" {ab}: Consider '{best_strategy}' (marginal improvement: +{improvement:.4f})")
    else:
        print(f" {ab}: Class imbalance NOT the primary issue. Investigate feature quality/hyperparameters.")


RECOMMENDATIONS
 CTZ: Class imbalance NOT the primary issue. Investigate feature quality/hyperparameters.
 GEN: Consider 'class_weight' (marginal improvement: +0.0115)
