# **Training Tier 2 XGBoost Models**

**`Hypothesis:`** "*Does adding mutation features to the AMR gene features improve model performance?*"

In short:

*"Tests whether mutations ADD predictive signal beyond genes alone."*

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, classification_report, average_precision_score, precision_score, recall_score, f1_score
import shap
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Load the Tier 2 feature matrix (AMR genes + mutations); its first CSV column is the row index.
tier2_df = pd.read_csv(
    '/content/drive/MyDrive/amr_features/tier2_amr_genes_plus_mutations.csv',
    index_col=0,
)

# Load the phenotype table. The first CSV column is read as the initial index and is
# discarded when the frame is re-indexed by the 'Isolate' column.
phenotypes = pd.read_csv(
    '/content/drive/MyDrive/data/E.coli/phenotypic.csv',
    index_col=0,
).set_index('Isolate')

# Align samples: keep only the isolates that appear in both tables.
common_samples = tier2_df.index.intersection(phenotypes.index)
X = tier2_df.loc[common_samples]

In [None]:
#train for each drug
# Tier 2 experiment (AMR genes + mutations). For every antibiotic this loop:
#   1. builds binary labels from the phenotype table,
#   2. fits a class-weighted XGBoost classifier on a stratified 80/20 split,
#   3. reports threshold-free (AUROC/AUPRC) and thresholded (precision/recall/F1) metrics,
#   4. ranks features by mean |SHAP| on the test split,
#   5. saves the model, the feature ranking and the SHAP summary plot to Drive.
# A per-drug summary row is collected in `results` and written out after the loop.
results = []

for drug in ['AMX', 'AMC', 'CIP']:
    print(f"\n{'='*80}")
    print(f"TIER 2 MODEL: {drug}")
    print(f"{'='*80}")

    #prepare labels: R -> 1, S and I -> 0; values not in the mapping become NaN and are dropped
    y = phenotypes.loc[common_samples, drug].map({'R': 1, 'S': 0, 'I': 0})
    # map() produces floats as soon as one NaN is present; cast back so the labels
    # have the same integer dtype for every drug
    y = y.dropna().astype(int)
    X_drug = X.loc[y.index]

    print(f"Data prepared for {drug}. Total samples: {len(X_drug)}")
    print(f"Resistance counts (R=1, S/I=0): {y.value_counts()}")

    # The stratified split needs at least two isolates per class. Skip the drug instead
    # of crashing, so the summary for the other drugs is still produced.
    class_counts = y.value_counts()
    if len(class_counts) < 2 or class_counts.min() < 2:
        print(f"Skipping {drug}: fewer than 2 samples in at least one class.")
        continue

    #train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_drug, y, test_size=0.2, random_state=42, stratify=y
    )

    # AUROC / AUPRC are undefined when the test split holds a single class
    if y_test.nunique() < 2:
        print(f"Skipping {drug}: test split contains a single class.")
        continue

    #calculate Class Weights for Imbalance, Resistance (1) is the positive class.
    n_resistant = np.sum(y_train == 1)
    n_susceptible = np.sum(y_train == 0)

    if n_resistant > 0:
        # ratio of negative to positive training samples
        scale_pos_weight = n_susceptible / n_resistant
    else:
        #fallback if no resistant samples are in the training set (rare but safe)
        scale_pos_weight = 1.0

    #train XGBoost Model
    print(f"\n{drug}: Training XGBoost model...")
    print(f" - Train Samples: {len(X_train)} (R={n_resistant}, S={n_susceptible})")
    print(f" - Test Samples: {len(X_test)}")
    print(f" - scale_pos_weight: {scale_pos_weight:.2f}")

    #train model
    model = XGBClassifier(
        max_depth=6,
        n_estimators=100,
        learning_rate=0.1,
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        eval_metric='auc'
    )

    model.fit(X_train, y_train)

    #evaluate
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    # threshold-free metrics use the positive-class probability
    auroc = roc_auc_score(y_test, y_pred_proba)
    auprc = average_precision_score(y_test, y_pred_proba)
    # thresholded metrics use the same hard predictions as the classification report
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"\nPerformance:")
    print(f" Model trained. Test AUROC: {auroc:.4f}")
    print(f"ROC-AUC: {auroc:.3f}")
    print(f"AUPRC: {auprc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['S', 'R']))

    #SHAP analysis
    print("\nComputing SHAP values...")
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    #get top 20 features, ranked by mean absolute SHAP value over the test samples
    # assumes shap_values is a 2-D (samples x features) array for this binary model -- TODO confirm for the installed shap version
    feature_importance = pd.DataFrame({
        'feature': X_drug.columns,
        'shap_importance': np.abs(shap_values).mean(axis=0)
    }).sort_values('shap_importance', ascending=False)

    print(f"\nTop 20 Features:")
    print(feature_importance.head(20).to_string(index=False))

    #save results
    results.append({
        'drug': drug,
        'tier': 'Tier2',
        'auroc': auroc,
        'auprc': auprc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'model_type': 'XGBoost',
        'model_params': model.get_params(),
        'n_features': len(X_drug.columns),
        'n_samples': len(X_drug)
    })

    #save model and SHAP
    model.save_model(f'/content/drive/MyDrive/models/tier2_mutations_genes_model_{drug}.json')
    feature_importance.to_csv(f'/content/drive/MyDrive/amr_features/tier2_mutations_genes_feature_importance_{drug}.csv', index=False)

    # Save SHAP plot; show=False keeps the figure open so it can be written to disk,
    # and plt.close() releases it before the next drug
    shap.summary_plot(shap_values, X_test, show=False)
    plt.savefig(f'/content/drive/MyDrive/amr_features/tier2_mutations_genes_shap_summary_{drug}.png', dpi=300, bbox_inches='tight')
    plt.close()

#save all results
results_df = pd.DataFrame(results)
results_df.to_csv('/content/drive/MyDrive/amr_features/tier2_mutations_genes_results_summary.csv', index=False)
print(results_df)


TIER 2 MODEL: AMX
Data prepared for AMX. Total samples: 1089
Resistance counts (R=1, S/I=0): AMX
1    659
0    430
Name: count, dtype: int64

AMX: Training XGBoost model...
 - Train Samples: 871 (R=527, S=344)
 - Test Samples: 218
 - scale_pos_weight: 0.65

Performance:
 Model trained. Test AUROC: 0.9355
ROC-AUC: 0.936
AUPRC: 0.9682
Precision: 0.9826
Recall: 0.8561
F1 Score: 0.9150

Classification Report:
              precision    recall  f1-score   support

           S       0.82      0.98      0.89        86
           R       0.98      0.86      0.91       132

    accuracy                           0.90       218
   macro avg       0.90      0.92      0.90       218
weighted avg       0.92      0.90      0.90       218


Computing SHAP values...

Top 20 Features:
    feature  shap_importance
      TEM-4         2.069278
   blaTEM-1         0.654300
      OXA-1         0.329998
 ftsI_L192F         0.245078
       sul1         0.227007
  gyrA_V85F         0.199541
 ompC_T155P     

# **Train Tier 3 Models**

Inputs:
- AMR genes
- SNP mutations
- Roary pangenome
- Phenotypic data (R/S/I labels)

In [None]:
# Load the Tier 2 feature matrix (AMR genes + mutations); the Roary pangenome
# features are joined onto it per drug in the training loop.
tier2 = pd.read_csv('/content/drive/MyDrive/amr_features/tier2_amr_genes_plus_mutations.csv', index_col=0)

# Phenotype table, re-indexed by the 'Isolate' column so it can be aligned with the features.
phenotypes = pd.read_csv('/content/drive/MyDrive/data/E.coli/phenotypic.csv', index_col=0).set_index('Isolate')

#align samples
# Use the frame loaded in this cell (not a variable left over from an earlier cell),
# so the cell does not depend on hidden kernel state.
common_samples = tier2.index.intersection(phenotypes.index)

In [None]:
#train for each drug
# Tier 3 experiment (AMR genes + mutations + Roary pangenome). For every antibiotic this loop:
#   1. joins the drug-specific Roary feature file onto the Tier 2 matrix,
#   2. builds binary labels from the phenotype table,
#   3. fits a class-weighted XGBoost classifier on a stratified 80/20 split,
#   4. reports AUROC/AUPRC and precision/recall/F1,
#   5. ranks features by mean |SHAP| on the test split,
#   6. saves the model, the feature ranking and the SHAP summary plot to Drive.
# A per-drug summary row is collected in `results` and written out after the loop.
results = []

for drug in ['AMX', 'AMC', 'CIP']:
    print(f"\n{'='*80}")
    print(f"TIER 3 MODEL: {drug}")
    print(f"{'='*80}")

    roary_tier3 = pd.read_csv(f'/content/drive/MyDrive/pangenome_features/roary_filtered_{drug}_top500.csv', index_col=0)
    # inner join: only isolates present in both the Tier 2 and the Roary matrix are kept
    tier3_combined = pd.concat([tier2, roary_tier3], axis=1, join='inner')
    print(f"\nTier 3 for {drug}: {tier3_combined.shape}")

    # The inner join may have dropped isolates, so restrict to the ones that survived.
    # Filtering common_samples (rather than re-intersecting) keeps its original order,
    # which keeps the seeded train/test split unchanged when nothing was dropped.
    tier3_samples = common_samples[common_samples.isin(tier3_combined.index)]
    X = tier3_combined.loc[tier3_samples]

    #prepare labels: R -> 1, S and I -> 0; values not in the mapping become NaN and are dropped
    y = phenotypes.loc[tier3_samples, drug].map({'R': 1, 'S': 0, 'I': 0})
    # map() produces floats as soon as one NaN is present; cast back so the labels
    # have the same integer dtype for every drug
    y = y.dropna().astype(int)
    X_drug = X.loc[y.index]

    print(f"Data prepared for {drug}. Total samples: {len(X_drug)}")
    print(f"Resistance counts (R=1, S/I=0): {y.value_counts()}")

    # The stratified split needs at least two isolates per class. Skip the drug instead
    # of crashing, so the summary for the other drugs is still produced.
    class_counts = y.value_counts()
    if len(class_counts) < 2 or class_counts.min() < 2:
        print(f"Skipping {drug}: fewer than 2 samples in at least one class.")
        continue

    #train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_drug, y, test_size=0.2, random_state=42, stratify=y
    )

    # AUROC / AUPRC are undefined when the test split holds a single class
    if y_test.nunique() < 2:
        print(f"Skipping {drug}: test split contains a single class.")
        continue

    #calculate Class Weights for Imbalance, Resistance (1) is the positive class.
    n_resistant = np.sum(y_train == 1)
    n_susceptible = np.sum(y_train == 0)

    if n_resistant > 0:
        # ratio of negative to positive training samples
        scale_pos_weight = n_susceptible / n_resistant
    else:
        #fallback if no resistant samples are in the training set (rare but safe)
        scale_pos_weight = 1.0

    #train XGBoost Model
    print(f"\n{drug}: Training XGBoost model...")
    print(f" - Train Samples: {len(X_train)} (R={n_resistant}, S={n_susceptible})")
    print(f" - Test Samples: {len(X_test)}")
    print(f" - scale_pos_weight: {scale_pos_weight:.2f}")

    #train model
    model = XGBClassifier(
        max_depth=6,
        n_estimators=100,
        learning_rate=0.1,
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        eval_metric='auc'
    )

    model.fit(X_train, y_train)

    #evaluate
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    # threshold-free metrics use the positive-class probability
    auroc = roc_auc_score(y_test, y_pred_proba)
    auprc = average_precision_score(y_test, y_pred_proba)
    # thresholded metrics use the same hard predictions as the classification report
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"\nPerformance:")
    print(f" Model trained. Test AUROC: {auroc:.4f}")
    print(f"ROC-AUC: {auroc:.3f}")
    print(f"AUPRC: {auprc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['S', 'R']))

    #SHAP analysis
    print("\nComputing SHAP values...")
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    #get top 20 features, ranked by mean absolute SHAP value over the test samples
    # assumes shap_values is a 2-D (samples x features) array for this binary model -- TODO confirm for the installed shap version
    feature_importance = pd.DataFrame({
        'feature': X_drug.columns,
        'shap_importance': np.abs(shap_values).mean(axis=0)
    }).sort_values('shap_importance', ascending=False)

    print(f"\nTop 20 Features:")
    print(feature_importance.head(20).to_string(index=False))

    #save results
    results.append({
        'drug': drug,
        'tier': 'Tier3',
        'auroc': auroc,
        'auprc': auprc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'model_type': 'XGBoost',
        'model_params': model.get_params(),
        'n_features': len(X_drug.columns),
        'n_samples': len(X_drug)
    })

    #save model and SHAP
    model.save_model(f'/content/drive/MyDrive/models/tier3_mutations_roary_genes_roary_model_{drug}.json')
    feature_importance.to_csv(f'/content/drive/MyDrive/amr_features/tier3_mutations_roary_genes_feature_importance_{drug}.csv', index=False)

    # Save SHAP plot; show=False keeps the figure open so it can be written to disk,
    # and plt.close() releases it before the next drug
    shap.summary_plot(shap_values, X_test, show=False)
    plt.savefig(f'/content/drive/MyDrive/amr_features/tier3_mutations_roary_genes_shap_summary_{drug}.png', dpi=300, bbox_inches='tight')
    plt.close()

#save all results
results_df = pd.DataFrame(results)
# NOTE(review): the file name says "tier2" although these are Tier 3 results; it is kept
# unchanged here because other cells may read this exact path -- rename once confirmed.
results_df.to_csv('/content/drive/MyDrive/amr_features/tier2_mutations_roary_genes_results_summary.csv', index=False)
print(results_df)


TIER 2 MODEL: AMX

Tier 3 for AMX: (1089, 1736)
Data prepared for AMX. Total samples: 1089
Resistance counts (R=1, S/I=0): AMX
1    659
0    430
Name: count, dtype: int64

AMX: Training XGBoost model...
 - Train Samples: 871 (R=527, S=344)
 - Test Samples: 218
 - scale_pos_weight: 0.65

Performance:
 Model trained. Test AUROC: 0.9345
ROC-AUC: 0.935
AUPRC: 0.9670
Precision: 0.9741
Recall: 0.8561
F1 Score: 0.9113

Classification Report:
              precision    recall  f1-score   support

           S       0.81      0.97      0.88        86
           R       0.97      0.86      0.91       132

    accuracy                           0.90       218
   macro avg       0.89      0.91      0.90       218
weighted avg       0.91      0.90      0.90       218


Computing SHAP values...

Top 20 Features:
    feature  shap_importance
      TEM-4         2.019426
   blaTEM-1         0.445004
      OXA-1         0.304583
       tnpR         0.291128
       ybeT         0.207582
blaTEM-1B_1    

### **Results:**
Antimicrobial resistance mechanisms are antibiotic-specific: genes dominate `β-lactam` resistance, while `chromosomal mutations dominate fluoroquinolone resistance` (CIP).