# **Tier 3 Model (Tier 1A + Tier 1C + Novel Genes)**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import shap

In [None]:
# Mount Google Drive at /content/drive; every CSV read and model save in this
# notebook uses paths under /content/drive/MyDrive.
# NOTE(review): google.colab is presumably only importable inside Google Colab,
# so this cell ties the notebook to that environment — confirm if portability matters.
from google.colab import drive
drive.mount('/content/drive')

## **Load Tier 1A + Tier 1C Datasets**

In [None]:
# Feature matrices: the first CSV column is used as the row index.
tier1c = pd.read_csv('/content/drive/MyDrive/amr_features/tier1c_plasmid_replicons.csv', index_col=0)
tier1a = pd.read_csv('/content/drive/MyDrive/amr_features/tier1a_acquired_amr_genes_CORRECTED.csv', index_col=0)

# Phenotype table: read with the first column as index, then re-indexed by
# its 'Isolate' column so rows can be looked up by isolate ID.
phenotypes = pd.read_csv(
    '/content/drive/MyDrive/data/E.coli/phenotypic.csv',
    index_col=0,
).set_index('Isolate')

## **Standardize sample IDs**

In [None]:
def replace_last_underscore_with_hash(s):
    """Return ``str(s)`` with its final underscore swapped for ``#``.

    Only the last ``_`` is affected. When the stringified input contains no
    underscore at all, it is returned unchanged.
    """
    text = str(s)
    head, sep, tail = text.rpartition('_')
    if not sep:
        # No underscore present: nothing to replace.
        return text
    return f"{head}#{tail}"

In [None]:
# Rewrite the row labels of both feature tables with the same ID transformation
# (last '_' -> '#'). Both new indexes are built first, then assigned in place.
tier1a.index, tier1c.index = (
    idx.map(replace_last_underscore_with_hash)
    for idx in (tier1a.index, tier1c.index)
)

## **Unweighted XGBoost Model**

In [None]:
#train an unweighted Tier 3 model (Tier 1A + Tier 1C + novel genes) for each drug
for drug in ['AMX', 'AMC', 'CIP']:
    print("="*50)
    print(f"TIER 3 MODEL: {drug}")
    print("="*50)

    #drug-specific novel-gene feature table. The variable name is historical:
    #it holds the table for the current `drug`, not only for AMX.
    #NOTE(review): unlike tier1a/tier1c, this index is not passed through
    #replace_last_underscore_with_hash - presumably its IDs already match; confirm.
    file_path = f'/content/drive/MyDrive/pangenome_features/roary_filtered_{drug}_top500_decorrelated_v2.csv'
    roary_amx = pd.read_csv(file_path, index_col=0)

    #combine all tiers column-wise; join='inner' keeps only samples present in all three tables
    tier3_features = pd.concat([tier1a, tier1c, roary_amx], axis=1, join='inner')

    print(f"Tier 3 features ({drug}): {tier3_features.shape}")
    print(f"  Tier 1A: {len(tier1a.columns)}")
    print(f"  Tier 1C: {len(tier1c.columns)}")
    print(f"  Novel genes: {len(roary_amx.columns)}")

    #find samples that have both features and a phenotype record
    common_samples = tier3_features.index.intersection(phenotypes.index)

    X = tier3_features.loc[common_samples]

    #map phenotypic data to binary (1=Resistant, 0=Susceptible/Intermediate).
    #Any other value (including missing) maps to NaN and is dropped; the cast back
    #to int is needed because the presence of NaN turns the mapped labels into floats.
    y = phenotypes.loc[common_samples, drug].map({'R': 1, 'S': 0, 'I': 0}).dropna().astype(int)

    #keep only the samples that still have a label
    X = X.loc[y.index]

    print(f"Data prepared for {drug}. Total samples: {len(X)}")
    print(f"Resistance counts (R=1, S/I=0): {y.value_counts()}")

    #stratified split so the ratio of Resistant (1) to Susceptible (0)
    #is the same in both the training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,    #use 20% of data for testing
        random_state=42,  #for reproducibility
        stratify=y        #essential for imbalanced data like AMR
    )

    #train XGBoost (no class weighting)
    model = XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=42,
        eval_metric='logloss'
    )

    model.fit(X_train, y_train)

    #evaluate on the held-out test set
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auroc = roc_auc_score(y_test, y_pred_proba)
    auprc = average_precision_score(y_test, y_pred_proba)

    print(f"\nResults:")
    print(classification_report(y_test, y_pred))
    #AUROC is reported once (it was previously computed and printed twice)
    print(f" Model trained. Test AUROC: {auroc:.4f}")
    print(f"AUPRC: {auprc:.4f}")

    #SHAP feature importance, computed on the test set
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    #rank features by mean absolute SHAP value. The ranking covers ALL Tier 3
    #features (Tier 1A, Tier 1C and novel genes), not only the novel genes,
    #despite the `top_20_novel` name.
    feature_importance = pd.DataFrame({
        'gene': X_train.columns,
        'shap_importance': np.abs(shap_values).mean(axis=0)
    }).sort_values('shap_importance', ascending=False)

    top_20_novel = feature_importance.head(20)
    print(top_20_novel)

    #save model
    model.save_model(f'/content/drive/MyDrive/models/tier3_model_{drug}.json')

TIER 1 MODEL: AMX
Tier 3 features (AMX): (943, 958)
  Tier 1A: 409
  Tier 1C: 60
  Novel genes: 489
Data prepared for AMX. Total samples: 943
Resistance counts (R=1, S/I=0): AMX
1    612
0    331
Name: count, dtype: int64

Results:
              precision    recall  f1-score   support

           0       0.85      0.95      0.90        66
           1       0.97      0.91      0.94       123

    accuracy                           0.93       189
   macro avg       0.91      0.93      0.92       189
weighted avg       0.93      0.93      0.93       189

ROC-AUC: 0.955
 Model trained. Test AUROC: 0.9548
AUPRC: 0.9806
               gene  shap_importance
58            TEM-4         2.263967
204     blaTEM-1B_1         0.415923
469            tnpR         0.397967
39            OXA-1         0.318338
858     group_12614         0.228247
44             PmrE         0.202713
764            ybeT         0.170597
327        blaTEM-1         0.137785
55          SHV-102         0.137067
159  aa

## **Weighted Model**

In [None]:
#train a class-weighted Tier 3 model (Tier 1A + Tier 1C + novel genes) for each drug

#per-drug results are collected here. The dict is created once, before the loop:
#creating it inside the loop wiped the previous drugs' entries on every iteration,
#leaving only the last drug after the loop finished.
data_dict = {}

for drug in ['AMX', 'AMC', 'CIP']:
    print("="*50)
    print(f"TIER 3 MODEL: {drug}")
    print("="*50)

    #drug-specific novel-gene feature table. The variable name is historical:
    #it holds the table for the current `drug`, not only for AMX.
    #NOTE(review): unlike tier1a/tier1c, this index is not passed through
    #replace_last_underscore_with_hash - presumably its IDs already match; confirm.
    file_path = f'/content/drive/MyDrive/pangenome_features/roary_filtered_{drug}_top500_decorrelated_v2.csv'
    roary_amx = pd.read_csv(file_path, index_col=0)

    #combine all tiers column-wise; join='inner' keeps only samples present in all three tables
    tier3_features = pd.concat([tier1a, tier1c, roary_amx], axis=1, join='inner')

    print(f"Tier 3 features ({drug}): {tier3_features.shape}")
    print(f"  Tier 1A: {len(tier1a.columns)}")
    print(f"  Tier 1C: {len(tier1c.columns)}")
    print(f"  Novel genes: {len(roary_amx.columns)}")

    #find samples that have both features and a phenotype record
    common_samples = tier3_features.index.intersection(phenotypes.index)

    X = tier3_features.loc[common_samples]

    #map phenotypic data to binary (1=Resistant, 0=Susceptible/Intermediate).
    #Any other value (including missing) maps to NaN and is dropped; the cast back
    #to int is needed because the presence of NaN turns the mapped labels into floats.
    y = phenotypes.loc[common_samples, drug].map({'R': 1, 'S': 0, 'I': 0}).dropna().astype(int)

    #keep only the samples that still have a label
    X = X.loc[y.index]

    print(f"Data prepared for {drug}. Total samples: {len(X)}")
    print(f"Resistance counts (R=1, S/I=0): {y.value_counts()}")

    #stratified split so the ratio of Resistant (1) to Susceptible (0)
    #is the same in both the training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,    #use 20% of data for testing
        random_state=42,  #for reproducibility
        stratify=y        #essential for imbalanced data like AMR
    )

    #class weight for imbalance: ratio of negatives to positives in the training
    #set, where Resistance (1) is the positive class
    n_resistant = np.sum(y_train == 1)
    n_susceptible = np.sum(y_train == 0)

    if n_resistant > 0:
        scale_pos_weight = n_susceptible / n_resistant
    else:
        #fallback if no resistant samples are in the training set (rare but safe)
        scale_pos_weight = 1.0

    print(f"\n{drug}: Training XGBoost model...")
    print(f" - Train Samples: {len(X_train)} (R={n_resistant}, S={n_susceptible})")
    print(f" - Test Samples: {len(X_test)}")
    print(f" - scale_pos_weight: {scale_pos_weight:.2f}")

    #train XGBoost. `use_label_encoder` is no longer passed: the installed XGBoost
    #ignores it and emits a "Parameters: { "use_label_encoder" } are not used." warning.
    model = XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        max_depth=6,
        learning_rate=0.1,
        n_estimators=100,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='logloss'
    )

    model.fit(X_train, y_train)

    #evaluate on the held-out test set
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auroc = roc_auc_score(y_test, y_pred_proba)
    auprc = average_precision_score(y_test, y_pred_proba)
    #thresholded metrics use the same hard predictions as classification_report
    #below, instead of a separately rounded copy of the probabilities
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    #AUROC is reported once (it was previously computed and printed twice)
    print(f" Model trained. Test AUROC: {auroc:.4f}")
    print(f"AUPRC: {auprc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    #store results for this drug (original keys kept; extra metrics added)
    data_dict[drug] = {
        'model': model,
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'auroc': auroc,
        'auprc': auprc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

    print(f"\nResults:")
    print(classification_report(y_test, y_pred))

    #SHAP feature importance, computed on the test set
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    #rank features by mean absolute SHAP value. The ranking covers ALL Tier 3
    #features (Tier 1A, Tier 1C and novel genes), not only the novel genes,
    #despite the `top_20_novel` name.
    feature_importance = pd.DataFrame({
        'gene': X_train.columns,
        'shap_importance': np.abs(shap_values).mean(axis=0)
    }).sort_values('shap_importance', ascending=False)

    top_20_novel = feature_importance.head(20)
    print(top_20_novel)

    print(f"\n--- Training complete for {drug} ---")

    #save model
    model.save_model(f'/content/drive/MyDrive/models/tier3_weighted_model_{drug}.json')

TIER 1 MODEL: AMX
Tier 3 features (AMX): (943, 958)
  Tier 1A: 409
  Tier 1C: 60
  Novel genes: 489
Data prepared for AMX. Total samples: 943
Resistance counts (R=1, S/I=0): AMX
1    612
0    331
Name: count, dtype: int64

AMX: Training XGBoost model...
 - Train Samples: 754 (R=489, S=265)
 - Test Samples: 189
 - scale_pos_weight: 0.54


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


 Model trained. Test AUROC: 0.9555
ROC-AUC: 0.956
AUPRC: 0.9806
Precision: 0.9735
Recall: 0.8943
F1 Score: 0.9322

Results:
              precision    recall  f1-score   support

           0       0.83      0.95      0.89        66
           1       0.97      0.89      0.93       123

    accuracy                           0.92       189
   macro avg       0.90      0.92      0.91       189
weighted avg       0.92      0.92      0.92       189

               gene  shap_importance
58            TEM-4         1.620436
327        blaTEM-1         0.635433
469            tnpR         0.575004
204     blaTEM-1B_1         0.298469
39            OXA-1         0.217642
858     group_12614         0.172244
764            ybeT         0.166714
44             PmrE         0.156107
778            yfeA         0.122540
491     group_18191         0.122351
667      group_3820         0.115672
743      group_5999         0.110303
159  aac(6')Ib-cr_1         0.088247
625          yiaM_1         0.0

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


 Model trained. Test AUROC: 0.8289
ROC-AUC: 0.829
AUPRC: 0.7194
Precision: 0.5781
Recall: 0.6167
F1 Score: 0.5968

Results:
              precision    recall  f1-score   support

           0       0.82      0.79      0.80       129
           1       0.58      0.62      0.60        60

    accuracy                           0.74       189
   macro avg       0.70      0.70      0.70       189
weighted avg       0.74      0.74      0.74       189

                          gene  shap_importance
58                       TEM-4         0.959371
318                   blaOXA-1         0.407161
120                       mdtM         0.198007
675                     insA_1         0.191861
469                       tnpR         0.163331
39                       OXA-1         0.157075
44                        PmrE         0.156493
416                   Col156_1         0.141250
134                       sul1         0.140465
204                blaTEM-1B_1         0.133576
476                 g

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


 Model trained. Test AUROC: 0.9739
ROC-AUC: 0.974
AUPRC: 0.9778
Precision: 0.9722
Recall: 0.9722
F1 Score: 0.9722

Results:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       153
           1       0.97      0.97      0.97        36

    accuracy                           0.99       189
   macro avg       0.98      0.98      0.98       189
weighted avg       0.99      0.99      0.99       189

                    gene  shap_importance
487           group_9126         0.654164
502                   rz         0.347538
429             IncFIA_1         0.339974
474               yihF_2         0.305462
572                 pemK         0.287020
471                 chpB         0.260242
947                 betU         0.249210
940                 yeeO         0.234203
738                 yedI         0.221822
599                 gatD         0.219948
588                 kfoC         0.211574
435  IncFII(29)_1_pUTI89         0.211004
915

### **Tier 3 Model Comparison (Unweighted vs. Weighted)**

| Drug | Model Version | AUROC | AUPRC | Precision (R=1) | Recall (R=1) | F1-Score (R=1) | Judgment |
| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- |
| **AMX** (Balanced, R:S $\approx$ 1.8:1) | **Unweighted** | 0.9548 | **0.9806** | **0.97** | **0.91** | **0.94** | **Marginally Better** (Higher Recall/F1; Precision is tied) |
| **AMX** | Weighted (0.54) | 0.9555 | 0.9806 | 0.97 | 0.89 | 0.93 | Excellent performance. |
|---|---|---|---|---|---|---|---|
| **AMC** (Imbalanced, R:S $\approx$ 1:2.1) | **Unweighted** | **0.8345** | **0.7349** | **0.70** | 0.62 | **0.65** | **Better** (Higher Precision/F1 at the same Recall) |
| **AMC** | Weighted (2.13) | 0.8289 | 0.7194 | 0.58 | 0.62 | 0.60 | **Worse** (Lower Precision; the extra weight on the minority class did not raise Recall) |
|---|---|---|---|---|---|---|---|
| **CIP** (Highly Imbalanced, R:S $\approx$ 1:4.2) | Unweighted | **0.9739** | **0.9778** | **1.00** | 0.97 | **0.99** | **Worse** (Risk of overfitting minority class) |
| **CIP** | **Weighted (4.24)** | 0.9739 | 0.9778 | 0.97 | **0.97** | **0.97** | **Best Choice** (More balanced high performance) |

***

**`Conclusion and Analysis`**

**`1. Amoxicillin (AMX)`**
Both models are fantastic. The **Unweighted model** is slightly better on recall (0.91 vs. 0.89) and F1-score (0.94 vs. 0.93), while AUROC and AUPRC are essentially identical between the two. Since the resistant class is the majority, class weighting is less critical, and the unweighted approach performs optimally.

**`2. Amoxicillin/Clavulanate (AMC)`**
This is the only drug where performance is noticeably lower than the others.
* The **Weighted model** has an F1-score of **0.60** (Precision 0.58, Recall 0.62).
* The Unweighted model has an F1-score of **0.65** (Precision 0.70, Recall 0.62).

In this case, the **Unweighted model** actually performed better across all key metrics (AUROC, AUPRC, F1), achieving a higher precision without sacrificing Recall (which remained 0.62 in both models). **This suggests the information needed to reliably separate resistant from susceptible isolates is still limited for AMC, and the weighting did not help recover more signal.** Therefore, the **Unweighted model** is preferred for AMC.

**`3. Ciprofloxacin (CIP)`**
Both models are near-perfect, with the unweighted model showing $1.00$ precision for the resistant class. This indicates the $\text{Tier 1A} + \text{Tier 1C} + \text{Novel Genes}$ features are highly effective.
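* The unweighted model's $\text{Precision} = 1.00$ is potentially a sign of **overfitting** or finding an overly simple rule in the test set. Note, however, that with only 36 resistant isolates in the test set, the gap between $1.00$ and $0.97$ precision corresponds to a single false positive, so this difference should be interpreted with caution.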
* The **Weighted model** is more robust, showing a slight dip in precision ($\text{0.97}$) but maintaining excellent recall ($\text{0.97}$) and an F1-score of $0.97$. This balanced high performance makes the **Weighted model** the more reliable choice.

**`Overall Recommendation`**

The choice depends on priority:

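* **For the Best Per-Drug Choice:** Use the **Unweighted model** for **AMX and AMC** (higher F1), and the **Weighted model** for **CIP** (preferred for robustness; its AUROC matches the unweighted model, although its F1 is marginally lower).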
* **For Consistency and Balanced Risk:** If we must pick one approach across all drugs, the **Unweighted model** is technically superior here for two out of three drugs (AMX and AMC), which is unusual but supported by the metrics.

I recommend using the **unweighted models** for **AMX and AMC** and the **weighted model** for **CIP**.