# **Ensemble Learning - Weighted Average of RF + LR + SVM**
Goal: Combine predictions from best baseline models to improve performance

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
from scipy import optimize
import matplotlib.pyplot as plt
from datetime import datetime

## **LOAD DATA**

In [None]:
data = pd.read_csv("/content/drive/MyDrive/ML-iAMR_Recreation/01_data/raw/giessen/cip_ctx_ctz_gen_multi_data.csv")
pheno = pd.read_csv("/content/drive/MyDrive/ML-iAMR_Recreation/01_data/raw/giessen/cip_ctx_ctz_gen_pheno.csv", index_col=0)
X = data.drop('prename', axis=1).values

In [None]:
EXPERIMENT_ID = f"EXP-008-{datetime.now().strftime('%Y%m%d_%H%M%S')}"
print(f"Experiment: {EXPERIMENT_ID}")

Experiment: EXP-008-20251107_065220


## **BASELINE RESULTS**

In [None]:
BASELINE_RESULTS = {
    'CIP': {'RF': 0.951, 'LR': 0.920, 'SVM': 0.911},
    'CTX': {'RF': 0.840, 'LR': 0.783, 'SVM': 0.770},
    'CTZ': {'RF': 0.845, 'LR': 0.787, 'SVM': 0.759},
    'GEN': {'RF': 0.766, 'LR': 0.700, 'SVM': 0.668}
}

all_results = []

for ab in ['CIP', 'CTX', 'CTZ', 'GEN']:
    print(f"\n{'='*60}")
    print(f"ENSEMBLE FOR {ab}")
    print(f"{'='*60}")

    y = pheno[ab].values

    #models
    rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
    lr = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
    svm = SVC(kernel='linear', probability=True, random_state=42)

    models = {'RF': rf, 'LR': lr, 'SVM': svm}

    #cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    #strategy 1: simple average
    simple_avg_aucs = []

    #strategy 2: optimized weights
    optimized_avg_aucs = []
    best_weights = None

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        #train all models
        predictions = {}
        for name, model in models.items():
            model.fit(X_train, y_train)
            predictions[name] = model.predict_proba(X_val)[:, 1]

        #simple average ensemble
        ensemble_simple = (predictions['RF'] + predictions['LR'] + predictions['SVM']) / 3
        auc_simple = roc_auc_score(y_val, ensemble_simple)
        simple_avg_aucs.append(auc_simple)

        #optimized weights (only on fold 1 to save time)
        if fold == 1:
            def objective(weights):
                w_rf, w_lr, w_svm = weights
                ensemble = w_rf * predictions['RF'] + w_lr * predictions['LR'] + w_svm * predictions['SVM']
                return -roc_auc_score(y_val, ensemble)  #negative for minimization

            #constraints: weights sum to 1, all non-negative
            constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}
            bounds = [(0, 1), (0, 1), (0, 1)]
            initial = [0.33, 0.33, 0.34]

            result = optimize.minimize(objective, initial, bounds=bounds, constraints=constraints)
            best_weights = result.x
            print(f"  Fold {fold} optimal weights: RF={best_weights[0]:.3f}, LR={best_weights[1]:.3f}, SVM={best_weights[2]:.3f}")

        #apply optimized weights
        ensemble_opt = (best_weights[0] * predictions['RF'] +
                       best_weights[1] * predictions['LR'] +
                       best_weights[2] * predictions['SVM'])
        auc_opt = roc_auc_score(y_val, ensemble_opt)
        optimized_avg_aucs.append(auc_opt)

        print(f"  Fold {fold}: Simple={auc_simple:.4f}, Optimized={auc_opt:.4f}")

    #results
    mean_simple = np.mean(simple_avg_aucs)
    std_simple = np.std(simple_avg_aucs)
    mean_opt = np.mean(optimized_avg_aucs)
    std_opt = np.std(optimized_avg_aucs)

    best_baseline = BASELINE_RESULTS[ab]['RF']  #RF is usually best
    improvement_simple = mean_simple - best_baseline
    improvement_opt = mean_opt - best_baseline

    print(f"\nRESULTS FOR {ab}:")
    print(f"  Best baseline (RF):     {best_baseline:.4f}")
    print(f"  Simple average:         {mean_simple:.4f}±{std_simple:.4f} (Δ={improvement_simple:+.4f})")
    print(f"  Optimized weights:      {mean_opt:.4f}±{std_opt:.4f} (Δ={improvement_opt:+.4f})")
    print(f"  Optimal weights: RF={best_weights[0]:.3f}, LR={best_weights[1]:.3f}, SVM={best_weights[2]:.3f}")

    all_results.append({
        'Experiment_ID': EXPERIMENT_ID,
        'Antibiotic': ab,
        'RF_Baseline': best_baseline,
        'Ensemble_Simple_Mean': round(mean_simple, 4),
        'Ensemble_Simple_Std': round(std_simple, 4),
        'Ensemble_Opt_Mean': round(mean_opt, 4),
        'Ensemble_Opt_Std': round(std_opt, 4),
        'Improvement_Simple': round(improvement_simple, 4),
        'Improvement_Opt': round(improvement_opt, 4),
        'Weight_RF': round(best_weights[0], 3),
        'Weight_LR': round(best_weights[1], 3),
        'Weight_SVM': round(best_weights[2], 3),
        'Best_Strategy': 'Optimized' if improvement_opt > improvement_simple else 'Simple'
    })


ENSEMBLE FOR CIP
  Fold 1 optimal weights: RF=0.330, LR=0.330, SVM=0.340
  Fold 1: Simple=0.8875, Optimized=0.8879
  Fold 2: Simple=0.9611, Optimized=0.9615
  Fold 3: Simple=0.9805, Optimized=0.9805
  Fold 4: Simple=0.9355, Optimized=0.9352
  Fold 5: Simple=0.9608, Optimized=0.9605

RESULTS FOR CIP:
  Best baseline (RF):     0.9510
  Simple average:         0.9451±0.0321 (Δ=-0.0059)
  Optimized weights:      0.9451±0.0320 (Δ=-0.0059)
  Optimal weights: RF=0.330, LR=0.330, SVM=0.340

ENSEMBLE FOR CTX


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 1 optimal weights: RF=0.330, LR=0.330, SVM=0.340
  Fold 1: Simple=0.8426, Optimized=0.8427


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 2: Simple=0.8199, Optimized=0.8198


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 3: Simple=0.8302, Optimized=0.8302
  Fold 4: Simple=0.8477, Optimized=0.8475


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 5: Simple=0.7715, Optimized=0.7709

RESULTS FOR CTX:
  Best baseline (RF):     0.8400
  Simple average:         0.8224±0.0272 (Δ=-0.0176)
  Optimized weights:      0.8222±0.0274 (Δ=-0.0178)
  Optimal weights: RF=0.330, LR=0.330, SVM=0.340

ENSEMBLE FOR CTZ


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 1 optimal weights: RF=0.330, LR=0.330, SVM=0.340
  Fold 1: Simple=0.8802, Optimized=0.8795


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 2: Simple=0.8578, Optimized=0.8574


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 3: Simple=0.8112, Optimized=0.8100
  Fold 4: Simple=0.7924, Optimized=0.7920


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 5: Simple=0.7955, Optimized=0.7961

RESULTS FOR CTZ:
  Best baseline (RF):     0.8450
  Simple average:         0.8274±0.0353 (Δ=-0.0176)
  Optimized weights:      0.8270±0.0351 (Δ=-0.0180)
  Optimal weights: RF=0.330, LR=0.330, SVM=0.340

ENSEMBLE FOR GEN


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 1 optimal weights: RF=0.330, LR=0.330, SVM=0.340
  Fold 1: Simple=0.7782, Optimized=0.7779


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 2: Simple=0.7078, Optimized=0.7073


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 3: Simple=0.7006, Optimized=0.7001


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 4: Simple=0.7103, Optimized=0.7093


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 5: Simple=0.7930, Optimized=0.7930

RESULTS FOR GEN:
  Best baseline (RF):     0.7660
  Simple average:         0.7380±0.0393 (Δ=-0.0280)
  Optimized weights:      0.7375±0.0396 (Δ=-0.0285)
  Optimal weights: RF=0.330, LR=0.330, SVM=0.340


## **SAVE RESULTS**

In [None]:
results_df = pd.DataFrame(all_results)

print("ENSEMBLE LEARNING SUMMARY")

print(results_df[['Antibiotic', 'RF_Baseline', 'Ensemble_Opt_Mean',
                   'Improvement_Opt', 'Weight_RF', 'Weight_LR', 'Weight_SVM']].to_string(index=False))

results_df.to_csv(f"/content/drive/MyDrive/ML-iAMR_Recreation/05_evaluation/results/{EXPERIMENT_ID}_ensemble_results.csv", index=False)
print(f"\nResults saved to results/{EXPERIMENT_ID}_ensemble_results.csv")

ENSEMBLE LEARNING SUMMARY
Antibiotic  RF_Baseline  Ensemble_Opt_Mean  Improvement_Opt  Weight_RF  Weight_LR  Weight_SVM
       CIP        0.951             0.9451          -0.0059       0.33       0.33        0.34
       CTX        0.840             0.8222          -0.0178       0.33       0.33        0.34
       CTZ        0.845             0.8270          -0.0180       0.33       0.33        0.34
       GEN        0.766             0.7375          -0.0285       0.33       0.33        0.34

Results saved to results/EXP-008-20251107_065220_ensemble_results.csv


## **FINAL COMPARISON**

In [None]:
print("FINAL MODEL COMPARISON")

final_comparison = pd.DataFrame({
    'Antibiotic': ['CIP', 'CTX', 'CTZ', 'GEN'],
    'Paper_RF': [0.96, 0.81, 0.93, 0.95],
    'Our_RF_Baseline': [r['RF_Baseline'] for r in all_results],
    'Our_Ensemble': [r['Ensemble_Opt_Mean'] for r in all_results],
    'Gap_to_Paper': [0.96 - r['Ensemble_Opt_Mean'] for r in all_results[:1]] +
                     [0.81 - r['Ensemble_Opt_Mean'] for r in all_results[1:2]] +
                     [0.93 - r['Ensemble_Opt_Mean'] for r in all_results[2:3]] +
                     [0.95 - r['Ensemble_Opt_Mean'] for r in all_results[3:]]
})

print(final_comparison.to_string(index=False))

#success criteria
success = sum(final_comparison['Gap_to_Paper'].abs() < 0.05)
print(f"\nModels within 5% of paper: {success}/4")

if success >= 2:
    print("SUCCESS: Sufficient reproduction!")
else:
    print("LIMITED: Consider focusing on CIP/CTX only...")

FINAL MODEL COMPARISON
Antibiotic  Paper_RF  Our_RF_Baseline  Our_Ensemble  Gap_to_Paper
       CIP      0.96            0.951        0.9451        0.0149
       CTX      0.81            0.840        0.8222       -0.0122
       CTZ      0.93            0.845        0.8270        0.1030
       GEN      0.95            0.766        0.7375        0.2125

Models within 5% of paper: 2/4
SUCCESS: Sufficient reproduction for publication!
