# **`Tier 2 Model (Tier 1A + Tier 1C (Plasmids))`**

## **`3. Tier 1C: Plasmid Replicons (60 features)`**
**`File:`** `tier1c_plasmid_replicons.csv`

**`Composition:`**
- Plasmid incompatibility groups: `IncFII`, `IncFIA`, `IncQ1`, etc.

**`Use for:`**
- Tier 2 models (plasmid-based transmission risk)
- Epidemiological analysis: Tracking horizontal gene transfer
- Feature engineering: Create "plasmid burden" composite features

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import shap

In [None]:
from google.colab import drive
drive.mount('/content/drive')

## **Load Tier 1A and Tier 1C**

In [None]:
tier1a = pd.read_csv('/content/drive/MyDrive/amr_features/tier1a_acquired_amr_genes_CORRECTED.csv', index_col=0)
tier1c = pd.read_csv('/content/drive/MyDrive/amr_features/tier1c_plasmid_replicons.csv', index_col=0)

In [None]:
#load phenotypes
phenotypes = pd.read_csv('/content/drive/MyDrive/data/E.coli/phenotypic.csv', index_col=0)
phenotypes.set_index('Isolate', inplace=True)

## **Standardize IDs**

In [None]:
def replace_last_underscore_with_hash(s):
    s_str = str(s)
    parts = s_str.rsplit('_', 1)
    return '#'.join(parts) if len(parts) > 1 else s_str

In [None]:
tier1a.index = tier1a.index.map(replace_last_underscore_with_hash)
tier1c.index = tier1c.index.map(replace_last_underscore_with_hash)

#combine Tier 1A + Tier 1C
# tier2_features = pd.concat([tier1a, tier1c], axis=1, join='inner')  #An inner join keeps only the index labels that are present in both DataFrames
tier2_features = pd.concat([tier1a, tier1c], axis=1, join='outer')  #preserve every single piece of data from both tier1a and tier1c—even if a row index exists in one table but not the other

print(f"Tier 2 features: {tier2_features.shape}")
print(f"  Tier 1A: {len(tier1a.columns)}")
print(f"  Tier 1C: {len(tier1c.columns)}")

Tier 2 features: (1651, 469)
  Tier 1A: 409
  Tier 1C: 60


In [None]:
#in outer join any cells where a match doesn't exist are filled with NaN so fill those with 0, as
tier2_features.fillna(0, inplace=True)    #in biological data like this, the lack of detection equals zero presence.
tier2_features.isnull().sum().sort_values(ascending=False)

Unnamed: 0,0
pSL483_1,0
AAC(3)-IIa,0
AAC(3)-IV,0
AAC(3)-VIa,0
AAC(6')-IIc,0
...,...
APH(4)-Ia,0
APH(3')-Ia,0
APH(3')-IIa,0
APH(3'')-Ib,0


## **Unweigthed XGBoost Model**

In [None]:
#train Tier 1 models for each drug
for drug in ['AMX', 'AMC', 'CIP']:
    print("="*50)
    print(f"TIER 1 MODEL: {drug}")
    print("="*50)

    #find common samples
    common_samples = tier2_features.index.intersection(phenotypes.index)

    X = tier2_features.loc[common_samples]

    #map phenotypic data to binary (1=Resistant, 0=Susceptible/Intermediate)
    y = phenotypes.loc[common_samples, drug].map({'R': 1, 'S': 0, 'I': 0}).dropna()

    X = X.loc[y.index]

    print(f"Data prepared for {drug}. Total samples: {len(X)}")
    print(f"Resistance counts (R=1, S/I=0): {y.value_counts()}")

    ##split Data into Train and Test Sets, We use stratified split to ensure the ratio of Resistant (1) to Susceptible (0) is the same in both the training and testing sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,    #use 20% of data for testing
        random_state=42,  #for reproducibility
        stratify=y        #essential for imbalanced data like AMR
    )

    #train XGBoost
    model = XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=42,
        eval_metric='logloss'
    )

    model.fit(X_train, y_train)

    #evaluate
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auroc = roc_auc_score(y_test, y_pred_proba)
    auprc = average_precision_score(y_test, y_pred_proba)

    print(f"\nResults:")
    print(classification_report(y_test, y_pred))
    print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.3f}")
    print(f" Model trained. Test AUROC: {auroc:.4f}")
    print(f"AUPRC: {auprc:.4f}")

    # SHAP feature importance
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    # Get top 20 novel genes
    feature_importance = pd.DataFrame({
        'gene': X_train.columns,
        'shap_importance': np.abs(shap_values).mean(axis=0)
    }).sort_values('shap_importance', ascending=False)

    top_20_novel = feature_importance.head(20)
    print(top_20_novel)

    #save model
    model.save_model(f'/content/drive/MyDrive/models/tier2_model_{drug}.json')

TIER 1 MODEL: AMX
Data prepared for AMX. Total samples: 1089
Resistance counts (R=1, S/I=0): AMX
1.0    659
0.0    430
Name: count, dtype: int64

Results:
              precision    recall  f1-score   support

         0.0       0.81      0.99      0.89        86
         1.0       0.99      0.85      0.91       132

    accuracy                           0.90       218
   macro avg       0.90      0.92      0.90       218
weighted avg       0.92      0.90      0.90       218

ROC-AUC: 0.935
 Model trained. Test AUROC: 0.9349
AUPRC: 0.9675
                   gene  shap_importance
58                TEM-4         2.135012
327            blaTEM-1         0.642499
134                sul1         0.284389
311               blaEC         0.282555
39                OXA-1         0.250291
430  IncFIB(AP001918)_1         0.234646
44                 PmrE         0.210759
204         blaTEM-1B_1         0.177517
413        Col(MG828)_1         0.154935
343             catB3.1         0.120090
457

## **Weighted Model**

In [None]:
#train Tier 1 models for each drug
for drug in ['AMX', 'AMC', 'CIP']:
    print("="*50)
    print(f"TIER 1 MODEL: {drug}")
    print("="*50)

    #find common samples
    common_samples = tier2_features.index.intersection(phenotypes.index)

    X = tier2_features.loc[common_samples]

    #map phenotypic data to binary (1=Resistant, 0=Susceptible/Intermediate)
    y = phenotypes.loc[common_samples, drug].map({'R': 1, 'S': 0, 'I': 0}).dropna()

    X = X.loc[y.index]

    print(f"Data prepared for {drug}. Total samples: {len(X)}")
    print(f"Resistance counts (R=1, S/I=0): {y.value_counts()}")

    ##split Data into Train and Test Sets, We use stratified split to ensure the ratio of Resistant (1) to Susceptible (0) is the same in both the training and testing sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,    #use 20% of data for testing
        random_state=42,  #for reproducibility
        stratify=y        #essential for imbalanced data like AMR
    )

    data_dict = {}


    #calculate Class Weights for Imbalance, Resistance (1) is the positive class.
    n_resistant = np.sum(y_train == 1)
    n_susceptible = np.sum(y_train == 0)

    if n_resistant > 0:
        scale_pos_weight = n_susceptible / n_resistant
    else:
        #fallback if no resistant samples are in the training set (rare but safe)
        scale_pos_weight = 1.0

    #train XGBoost Model
    print(f"\n{drug}: Training XGBoost model...")
    print(f" - Train Samples: {len(X_train)} (R={n_resistant}, S={n_susceptible})")
    print(f" - Test Samples: {len(X_test)}")
    print(f" - scale_pos_weight: {scale_pos_weight:.2f}")


    #train XGBoost

    model = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False
    )

    model.fit(X_train, y_train)

    #evaluate Model
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auroc = roc_auc_score(y_test, y_pred_proba)
    auprc = average_precision_score(y_test, y_pred_proba)
    precision = precision_score(y_test, y_pred_proba.round())
    recall = recall_score(y_test, y_pred_proba.round())
    f1 = f1_score(y_test, y_pred_proba.round())


    print(f" Model trained. Test AUROC: {auroc:.4f}")
    print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.3f}")
    print(f"AUPRC: {auprc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    #sore Results
    data_dict[drug] = {
        'model': model,
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'auroc': auroc
    }

    print(f"\nResults:")
    print(classification_report(y_test, y_pred))

    # SHAP feature importance
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    # Get top 20 novel genes
    feature_importance = pd.DataFrame({
        'gene': X_train.columns,
        'shap_importance': np.abs(shap_values).mean(axis=0)
    }).sort_values('shap_importance', ascending=False)

    top_20_novel = feature_importance.head(20)
    print(top_20_novel)

    print(f"\n--- Training complete for {drug} ---")

    #save model
    model.save_model(f'/content/drive/MyDrive/models/tier2_weighted_model_{drug}.json')

TIER 1 MODEL: AMX
Data prepared for AMX. Total samples: 1089
Resistance counts (R=1, S/I=0): AMX
1.0    659
0.0    430
Name: count, dtype: int64

AMX: Training XGBoost model...
 - Train Samples: 871 (R=527, S=344)
 - Test Samples: 218
 - scale_pos_weight: 0.65


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


 Model trained. Test AUROC: 0.9341
ROC-AUC: 0.934
AUPRC: 0.9677
Precision: 0.9825
Recall: 0.8485
F1 Score: 0.9106

Results:
              precision    recall  f1-score   support

         0.0       0.81      0.98      0.88        86
         1.0       0.98      0.85      0.91       132

    accuracy                           0.90       218
   macro avg       0.90      0.91      0.90       218
weighted avg       0.91      0.90      0.90       218

                   gene  shap_importance
58                TEM-4         1.774221
327            blaTEM-1         0.790325
204         blaTEM-1B_1         0.328809
134                sul1         0.258852
430  IncFIB(AP001918)_1         0.241199
311               blaEC         0.227895
44                 PmrE         0.227461
39                OXA-1         0.225019
55              SHV-102         0.117893
457             IncQ1_1         0.100792
120                mdtM         0.099535
413        Col(MG828)_1         0.097294
429            I

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


 Model trained. Test AUROC: 0.8161
ROC-AUC: 0.816
AUPRC: 0.7130
Precision: 0.5748
Recall: 0.7300
F1 Score: 0.6432

Results:
              precision    recall  f1-score   support

         0.0       0.87      0.77      0.81       230
         1.0       0.57      0.73      0.64       100

    accuracy                           0.75       330
   macro avg       0.72      0.75      0.73       330
weighted avg       0.78      0.75      0.76       330

                    gene  shap_importance
58                 TEM-4         1.273356
318             blaOXA-1         0.390970
39                 OXA-1         0.228516
435  IncFII(29)_1_pUTI89         0.143219
401               sul1.1         0.109406
413         Col(MG828)_1         0.109127
382               mdtM.1         0.108122
457              IncQ1_1         0.105942
429             IncFIA_1         0.101902
416             Col156_1         0.099869
258             tet(A)_4         0.096272
327             blaTEM-1         0.094055
134

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


 Model trained. Test AUROC: 0.9464
ROC-AUC: 0.946
AUPRC: 0.9108
Precision: 0.8219
Recall: 0.8333
F1 Score: 0.8276

Results:
              precision    recall  f1-score   support

         0.0       0.95      0.95      0.95       258
         1.0       0.82      0.83      0.83        72

    accuracy                           0.92       330
   macro avg       0.89      0.89      0.89       330
weighted avg       0.92      0.92      0.92       330

                    gene  shap_importance
429             IncFIA_1         1.079219
120                 mdtM         0.583947
430   IncFIB(AP001918)_1         0.306619
416             Col156_1         0.228104
126                 mphA         0.227464
413         Col(MG828)_1         0.217326
205          blaTEM-1C_5         0.199248
435  IncFII(29)_1_pUTI89         0.174440
248               strA_4         0.172102
16              CTX-M-15         0.158666
134                 sul1         0.140887
311                blaEC         0.133088
343

## **`Tier 2 Model Comparison (Unweighted vs. Weighted)`**

The **Weighted Models** generally performed better for the imbalanced classes (AMC and CIP) by improving **Recall** (identifying resistant samples) and **F1-Score**.

| Drug | Model Version | AUROC | AUPRC | Precision (R=1) | Recall (R=1) | F1-Score (R=1) | Judgment |
| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- |
| **AMX** (Balanced, R:S $\approx$ 1.5:1) | **Unweighted** | **0.9349** | **0.9675** | 0.99 | 0.85 | **0.91** | **Marginally Better** (Higher AUROC/Precision) |
| **AMX** | Weighted (0.65) | 0.9341 | 0.9677 | 0.98 | 0.85 | **0.91** | Excellent performance, very close to Unweighted. |
|---|---|---|---|---|---|---|---|
| **AMC** (Imbalanced, R:S $\approx$ 1:2.3) | Unweighted | **0.8171** | **0.7133** | **0.68** | 0.52 | 0.59 | **Worse** (Low Recall for Resistance) |
| **AMC** | **Weighted (2.29)** | 0.8161 | 0.7130 | 0.57 | **0.73** | **0.64** | **Better** (Significantly higher Recall and F1-Score) |
|---|---|---|---|---|---|---|---|
| **CIP** (Highly Imbalanced, R:S $\approx$ 1:3.6) | Unweighted | 0.9514 | 0.9138 | **0.92** | 0.78 | 0.84 | **Worse** (Lower Recall for Resistance) |
| **CIP** | **Weighted (3.62)** | **0.9464** | **0.9108** | 0.82 | **0.83** | **0.83** | **Better** (Higher Recall and F1-Score) |


**`Conclusion on Model Selection`**

The **weighted models** are superior for the practical goal of **AMR prediction**, particularly for **AMC and CIP**.

1.  **AMC:** The weighted model boosts **Recall** for the resistant class from **0.52 to 0.73**, a substantial gain in correctly identifying resistant isolates, which is the priority in diagnostics. This trade-off leads to a higher F1-Score (0.64 vs 0.59).
2.  **CIP:** The weighted model increases **Recall** from **0.78 to 0.83**, resulting in a robust **F1-Score of 0.83**. While the unweighted model achieved a slightly higher AUROC, its high precision came at the cost of missing more resistant samples.

For consistency and to maximize the ability to detect the minority (Resistant) class, the **Weighted Models** are the recommended choice for all three drugs moving forward.


# **`4. Population Structure Markers (69 features)`**
**`File:`** `population_structure_markers.csv`

**`Composition:`**
- **Chromosomal SNPs**: `gyrA_S83L`, `parC_E84V`, `ptsI_V25I`, etc.
- **Regulatory mutations**: `marR_S3N`, `acrR_R45C`, etc.

**`Use for:`**
- Population structure analysis (`phylogenetic context`)
- Clonal complex identification (e.g., `ST131 markers`)
- Confounding variable control in statistical models
- NOT for correlation filtering (`already separated`)

**`Important Note:`**
- Some of these ARE resistance mutations (e.g., `gyrA_S83L` for `fluoroquinolones`), but they're chromosomal and clonally inherited, not acquired via HGT (`horizontal gene transfer`) like Tier 1A genes.

In [3]:
population = pd.read_csv("./population_structure_markers.csv")

In [6]:
population.shape

(1168, 70)

In [4]:
population.head(20)

Unnamed: 0,ISOLATE_ID,acrR_R45C,ampC_C-11T,ampC_C-42T,ampC_T-14TGT,ampC_T-32A,blaTEMp_C141G,blaTEMp_C32T,blaTEMp_G162T,cirA_Q42STOP,...,parE_S458T,pmrB_L10P,pmrB_P94Q,pmrB_V161G,ptsI_V25I,rpoB_Q148L,rpoB_V146F,soxR_G121D,soxR_R20H,uhpT_E350Q
0,11657_5_1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,11657_5_11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,11657_5_13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,11657_5_14,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11657_5_15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,11657_5_16,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,11657_5_17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,11657_5_18,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8,11657_5_2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,11657_5_20,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
population.columns

Index(['ISOLATE_ID', 'acrR_R45C', 'ampC_C-11T', 'ampC_C-42T', 'ampC_T-14TGT',
       'ampC_T-32A', 'blaTEMp_C141G', 'blaTEMp_C32T', 'blaTEMp_G162T',
       'cirA_Q42STOP', 'cyaA_S352T', 'fabI_F203L', 'ftsI_G363S',
       'ftsI_I336IKYRI', 'ftsI_N337NYRIN', 'gyrA_D87G', 'gyrA_D87N',
       'gyrA_D87Y', 'gyrA_G81D', 'gyrA_S83A', 'gyrA_S83L', 'gyrA_S83V',
       'marR_S3N', 'nfsA_E223STOP', 'nfsA_E75STOP', 'nfsA_G126R', 'nfsA_G131D',
       'nfsA_G154E', 'nfsA_H11Y', 'nfsA_Q113STOP', 'nfsA_Q44STOP',
       'nfsA_Q67STOP', 'nfsA_R133S', 'nfsA_R15C', 'nfsA_R203C',
       'nfsA_W159STOP', 'nfsB_W94STOP', 'ompC_Q171STOP', 'ompC_Q82STOP',
       'ompC_R195L', 'ompF_Q88STOP', 'parC_A108T', 'parC_A108V', 'parC_A56T',
       'parC_E84G', 'parC_E84K', 'parC_E84V', 'parC_G78D', 'parC_S57T',
       'parC_S80I', 'parC_S80R', 'parE_D475E', 'parE_E460D', 'parE_E460K',
       'parE_I355T', 'parE_I464F', 'parE_I529L', 'parE_L416F', 'parE_L445H',
       'parE_S458A', 'parE_S458T', 'pmrB_L10P', 'pmrB_P94Q'