# **Novel Genes-Only Model Without Lineage Marker Filtering**


**`Author:`** AMR Prediction Project
**`Date:`** December 2025

## **SETUP AND IMPORTS**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import pickle
import warnings
import json
import requests
from io import StringIO
warnings.filterwarnings('ignore')

#machine learning
from xgboost import XGBClassifier
from sklearn.model_selection import (train_test_split, StratifiedKFold,
                                      cross_val_score)
from sklearn.metrics import (roc_auc_score, roc_curve, auc,
                              precision_recall_curve, average_precision_score,
                              classification_report, confusion_matrix,
                              precision_score, recall_score, f1_score)

#statistical tests
from scipy import stats
from scipy.stats import pearsonr, spearmanr, chi2_contingency, mannwhitneyu
import shap

#set plotting style: seaborn "whitegrid" theme plus notebook-wide matplotlib defaults
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)  #default (width, height) for every new figure
plt.rcParams['font.size'] = 11  #default font size for every new figure

In [None]:
#mount Google Drive at /content/drive (requires the google.colab package, i.e. a Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#set paths (every project folder lives under the mounted Drive root)
BASE_DIR = Path('/content/drive/MyDrive')
DATA_DIR = BASE_DIR / 'amr_features'  #AMR feature tables (e.g. the Tier 2 CSV)
ROARY_DIR = BASE_DIR / 'pangenome_features'  #per-drug roary_filtered_* gene tables
RESULTS_DIR = BASE_DIR / 'results' / 'additional_experiments'  #metrics, predictions and CSV outputs are written here
MODEL_DIR = BASE_DIR / 'models'  #NOTE(review): not referenced by the cells visible in this notebook section

#create the results folder (and missing parents) up front; no error if it already exists
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
#echo the output folder so it can be checked before any experiment writes to it
print(f"Results will be saved to: {RESULTS_DIR}")

Results will be saved to: /content/drive/MyDrive/results/additional_experiments


## **Helper function for ID standardization**

In [None]:
def standardize_sample_id(sample_id):
    """
    Return the sample ID in '<prefix>#<suffix>' form.

    The value is converted to str and stripped of surrounding whitespace.
    IDs that already contain '#' are returned as they are; otherwise the LAST
    underscore is replaced by '#'. IDs with neither character are unchanged.
    """
    cleaned = str(sample_id).strip()
    #nothing to convert: already in '#' form, or there is no underscore to swap
    if '#' in cleaned or '_' not in cleaned:
        return cleaned
    prefix, _, suffix = cleaned.rpartition('_')
    return f"{prefix}#{suffix}"

def fix_sample_ids(df):
    """
    Standardize every index label of `df` with standardize_sample_id.

    The index is replaced on the passed-in object itself (no copy is made);
    the same DataFrame is returned so calls can be chained or reassigned.
    """
    normalized_index = df.index.map(standardize_sample_id)
    df.index = normalized_index
    return df

## **NOVEL GENES-ONLY MODEL**
**`Purpose:`** Test if novel genes alone can predict resistance

In [None]:
def train_novel_genes_only_model(drug='AMX'):
    """
    Fit and evaluate an XGBoost classifier on the novel (Roary) genes alone.

    No Tier 2 features are added, so the scores show what the novel genes can
    predict by themselves.

    Parameters
    ----------
    drug : str
        Phenotype column to predict; also selects the per-drug Roary file.

    Returns
    -------
    tuple or None
        (results dict, fitted model, SHAP feature-importance DataFrame), or
        None when the Roary feature file for `drug` does not exist.

    Side effects: prints a report and writes the test-set probabilities, the
    test labels and the feature-importance table into RESULTS_DIR.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Training Novel Genes-Only Model for {drug}")
    print(f"{banner}")

    #per-drug table of filtered novel genes; bail out early if it was never generated
    roary_file = ROARY_DIR / f'roary_filtered_{drug}_top500_decorrelated_v2.csv'
    if not roary_file.exists():
        print(f"File not found: {roary_file}")
        return None

    gene_table = fix_sample_ids(pd.read_csv(roary_file, index_col=0))

    #phenotype table, indexed by whichever of the two known ID columns is present
    pheno = pd.read_csv(BASE_DIR / 'data/E.coli/phenotypic.csv')
    for id_column in ('Isolate', 'Lane.accession'):
        if id_column in pheno.columns:
            pheno = pheno.set_index(id_column)
            break
    pheno = fix_sample_ids(pheno)

    #keep samples present in both tables; labels outside R/S/I map to NaN and are dropped
    shared_ids = gene_table.index.intersection(pheno.index)
    label_codes = {'R': 1, 'S': 0, 'I': 0}
    y = pheno.loc[shared_ids, drug].map(label_codes).dropna()
    X = gene_table.loc[shared_ids].loc[y.index]

    n_resistant = (y == 1).sum()
    n_susceptible = (y == 0).sum()
    print(f"Data prepared:")
    print(f"  Samples: {len(X)}")
    print(f"  Features (novel genes only): {X.shape[1]}")
    print(f"  Resistant: {n_resistant} ({n_resistant/len(y)*100:.1f}%)")
    print(f"  Susceptible: {n_susceptible} ({n_susceptible/len(y)*100:.1f}%)")

    #stratified 80/20 hold-out split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    #negatives-to-positives ratio of the training fold, passed to XGBoost as the class weight
    train_negatives = (y_train == 0).sum()
    train_positives = (y_train == 1).sum()
    scale_pos_weight = train_negatives / train_positives

    print(f"\nTraining configuration:")
    print(f"  Train: {len(X_train)} samples")
    print(f"  Test: {len(X_test)} samples")
    print(f"  scale_pos_weight: {scale_pos_weight:.2f}")

    xgb_params = dict(
        max_depth=5,
        n_estimators=100,
        learning_rate=0.1,
        scale_pos_weight=scale_pos_weight,
        min_child_weight=3,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42,
        eval_metric='auc',
        verbosity=0,
    )
    model = XGBClassifier(**xgb_params)
    model.fit(X_train, y_train)

    #hold-out evaluation: ranking metrics use probabilities, the rest use hard labels
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    auroc = roc_auc_score(y_test, y_pred_proba)
    auprc = average_precision_score(y_test, y_pred_proba)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    #5-fold stratified CV over ALL aligned samples (the hold-out samples take part in the folds too)
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X, y, cv=folds, scoring='roc_auc', n_jobs=-1)
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()

    print(f"PERFORMANCE (NOVEL GENES ONLY)")
    print(f"AUROC:     {auroc:.4f}")
    print(f"AUPRC:     {auprc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print(f"\n5-Fold CV: {cv_mean:.4f} ± {cv_std:.4f}")

    print(f"\nClassification Report:")
    report_text = classification_report(y_test, y_pred, target_names=['S', 'R'],
                                        zero_division=0)
    print(report_text)

    #rank genes by mean |SHAP| over the hold-out samples
    print(f"\nComputing SHAP values for top features...")
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)
    mean_abs_shap = np.abs(shap_values).mean(axis=0)

    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'shap_importance': mean_abs_shap
    })
    feature_importance = feature_importance.sort_values('shap_importance', ascending=False)

    print(f"\nTop 10 Novel Genes:")
    print(feature_importance.head(10).to_string(index=False))

    #one summary record per drug, collected by the caller
    results = {
        'drug': drug,
        'model_type': 'Novel_Genes_Only',
        'n_features': X.shape[1],
        'n_samples': len(X),
        'auroc': auroc,
        'auprc': auprc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'cv_mean': cv_mean,
        'cv_std': cv_std
    }

    #persist hold-out probabilities and labels for the later statistical tests
    np.save(RESULTS_DIR / f'novel_only_{drug}_predictions.npy', y_pred_proba)
    np.save(RESULTS_DIR / f'novel_only_{drug}_y_test.npy', y_test.values)

    importance_file = RESULTS_DIR / f'novel_only_feature_importance_{drug}.csv'
    feature_importance.to_csv(importance_file, index=False)

    return results, model, feature_importance

In [None]:
#run for all drugs and collect one metrics dict per drug
novel_only_results = []
for drug in ['AMX', 'AMC', 'CIP']:
    result = train_novel_genes_only_model(drug)
    #the function returns None when the drug's feature file is missing; otherwise
    #element 0 of the returned tuple is the metrics dict
    if result:
        novel_only_results.append(result[0])


Training Novel Genes-Only Model for AMX
Data prepared:
  Samples: 1089
  Features (novel genes only): 489
  Resistant: 659 (60.5%)
  Susceptible: 430 (39.5%)

Training configuration:
  Train: 871 samples
  Test: 218 samples
  scale_pos_weight: 0.65
PERFORMANCE (NOVEL GENES ONLY)
AUROC:     0.8878
AUPRC:     0.9394
Precision: 0.9364
Recall:    0.7803
F1 Score:  0.8512

5-Fold CV: 0.9176 ± 0.0146

Classification Report:
              precision    recall  f1-score   support

           S       0.73      0.92      0.81        86
           R       0.94      0.78      0.85       132

    accuracy                           0.83       218
   macro avg       0.83      0.85      0.83       218
weighted avg       0.86      0.83      0.84       218


Computing SHAP values for top features...

Top 10 Novel Genes:
    feature  shap_importance
       tnpR         1.550910
 group_3326         0.415631
        neo         0.238619
group_14256         0.157669
 group_8657         0.157008
group_16388 

In [None]:
print("SUMMARY: Novel Genes-Only Performance")
#tabulate the per-drug metrics and save them as CSV (skipped when no model was trained)
if novel_only_results:
    novel_only_df = pd.DataFrame(novel_only_results)
    print(novel_only_df.to_string(index=False))
    novel_only_df.to_csv(RESULTS_DIR / 'novel_genes_only_summary.csv', index=False)
    print(f"\nResults saved to: {RESULTS_DIR / 'novel_genes_only_summary.csv'}")

SUMMARY: Novel Genes-Only Performance
drug       model_type  n_features  n_samples    auroc    auprc  precision   recall       f1  cv_mean   cv_std
 AMX Novel_Genes_Only         489       1089 0.887773 0.939388   0.936364 0.780303 0.851240 0.917599 0.014611
 AMC Novel_Genes_Only         489       1089 0.824535 0.669613   0.578947 0.676923 0.624113 0.822472 0.040322
 CIP Novel_Genes_Only         495       1089 0.965659 0.952289   0.846154 0.916667 0.880000 0.971863 0.009364

Results saved to: /content/drive/MyDrive/results/additional_experiments/novel_genes_only_summary.csv


## **STATISTICAL SIGNIFICANCE TESTING**

### **DeLong Test**
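The DeLong test is a statistical method used to **compare the performance of two binary classification models** by assessing whether the difference between their Areas Under the Receiver Operating Characteristic (ROC) Curve (AUCs) is statistically significant. It is designed for *correlated* ROC curves, i.e. two models scored on the same test samples, such as nested models. Note that the `delong_test` function below estimates the p-value and confidence interval with a paired bootstrap rather than with DeLong's analytic variance formula.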
- It helps determine **whether one model genuinely outperforms another**: the p-value indicates whether the observed difference could be due to chance alone, which makes the test useful for model selection in machine learning and diagnostics.

**`How it works:`**
- **`Compares AUCs:`** It calculates the difference between the AUCs of two models.
- **`Tests Significance:`** It computes the standard error for this difference and uses it to find a p-value.
- **`Interprets Results:`** A small p-value (e.g., < 0.05) suggests a significant performance difference, meaning one model is better.

In [None]:
def delong_test(y_true, y_pred1, y_pred2, n_bootstraps=2000):
    """
    Bootstrap implementation of DeLong test for comparing two AUROCs

    Samples are resampled with replacement (the same indices for both models,
    so the comparison stays paired) and the AUROC difference is recomputed on
    every resample.

    Parameters
    ----------
    y_true : array-like
        Binary ground-truth labels.
    y_pred1, y_pred2 : array-like
        Scores of model 1 and model 2 for the same samples, in the same order.
    n_bootstraps : int
        Number of bootstrap resamples.

    Returns: (difference, p_value, ci_lower, ci_upper)
        difference is AUROC(model 2) - AUROC(model 1) on the full data;
        p_value is two-tailed and capped at 1; the CI is the 2.5-97.5
        percentile range of the bootstrap differences. p_value and the CI are
        NaN if no resample produced a defined AUROC.

    Raises ValueError when the three inputs differ in length.
    """
    y_true = np.asarray(y_true)
    y_pred1 = np.asarray(y_pred1)
    y_pred2 = np.asarray(y_pred2)

    if not (len(y_true) == len(y_pred1) == len(y_pred2)):
        raise ValueError(
            f"Length mismatch: y_true={len(y_true)}, "
            f"y_pred1={len(y_pred1)}, y_pred2={len(y_pred2)}"
        )

    #original AUROCs
    auc1 = roc_auc_score(y_true, y_pred1)
    auc2 = roc_auc_score(y_true, y_pred2)
    observed_diff = auc2 - auc1

    #bootstrap
    bootstrap_diffs = []
    n_samples = len(y_true)

    #private generator: the legacy RandomState seeded with 42 draws the same
    #indices as np.random.seed(42) + np.random.choice, but leaves the global
    #NumPy random state (used elsewhere in the notebook) untouched
    rng = np.random.RandomState(42)
    for _ in range(n_bootstraps):
        indices = rng.choice(n_samples, size=n_samples, replace=True)

        y_boot = y_true[indices]

        try:
            diff_boot = (roc_auc_score(y_boot, y_pred2[indices])
                         - roc_auc_score(y_boot, y_pred1[indices]))
        except ValueError:
            #AUROC is undefined for a resample that contains a single class
            continue

        #some metric implementations signal the undefined case with NaN instead of raising
        if np.isfinite(diff_boot):
            bootstrap_diffs.append(diff_boot)

    bootstrap_diffs = np.array(bootstrap_diffs)

    if bootstrap_diffs.size == 0:
        return observed_diff, np.nan, np.nan, np.nan

    #two-tailed p-value; differences of exactly 0 are counted on both sides, so
    #the doubled tail can exceed 1 (e.g. identical predictions) and must be capped
    p_value = min(1.0, 2 * min(
        np.mean(bootstrap_diffs <= 0),
        np.mean(bootstrap_diffs >= 0)
    ))

    #95% confidence interval
    ci_lower = np.percentile(bootstrap_diffs, 2.5)
    ci_upper = np.percentile(bootstrap_diffs, 97.5)

    return observed_diff, p_value, ci_lower, ci_upper

### **`McNemar Test`**
**A statistical test for paired dichotomous data.** The McNemar test is a **non-parametric statistical test** for **paired dichotomous data**, used to see if there's a **significant change in proportions** between two related groups (like before/after treatment or matched pairs). It focuses on the **discordant pairs** (where one result changed, e.g., 'yes' to 'no') in a 2x2 table (cells 'b' and 'c') to check for **marginal homogeneity**, essentially testing if the proportions in the 'before' and 'after' categories are the same.

**`How it Works (The 2x2 Table)`**

Imagine a table for 'Before' vs. 'After' a treatment:

|  | After: Yes | After: No |
| --- | --- | --- |
| **Before: Yes** | a (Yes/Yes) | c (Yes/No) |
| **Before: No** | b (No/Yes) | d (No/No) |


- **a & d:** **Concordant pairs** (no change).
- **b & c:** **Discordant pairs** (change occurred).

The test focuses on **b** and **c**, the changes, to see if the number of 'Yes' to 'No' changes (**c**) is different from 'No' to 'Yes' changes (**b**).

**`When to Use It`**

- **Before-and-After Studies:** Same subjects measured twice (e.g., opinion on a policy before and after an event).
- **Matched-Pairs Studies:** Similar individuals exposed to different conditions (e.g., comparing two diagnostic tests on the same patients).

In [None]:
def mcnemar_test(y_true, y_pred1, y_pred2):
    """
    McNemar's test for comparing two classifiers
    Tests if error rates are significantly different

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred1, y_pred2 : array-like
        Predictions of the two classifiers for the same samples. Arrays of any
        floating dtype (float32 as well as float64) are treated as
        probabilities and thresholded at 0.5; other dtypes are used as labels.

    Returns
    -------
    (statistic, p_value)
        Chi-square statistic with continuity correction and its p-value, as
        computed by statsmodels. When the classifiers never disagree in
        correctness (no discordant pairs) the chi-square statistic is a 0/0
        division, so (0.0, 1.0) is returned instead: no evidence of a difference.
    """
    def _as_labels(pred):
        #np.issubdtype covers every float width; `dtype == float` only matches float64
        pred = np.asarray(pred)
        if np.issubdtype(pred.dtype, np.floating):
            return (pred > 0.5).astype(int)
        return pred

    y_true = np.asarray(y_true)
    y_pred1 = _as_labels(y_pred1)
    y_pred2 = _as_labels(y_pred2)

    correct1 = y_pred1 == y_true
    correct2 = y_pred2 == y_true

    #create contingency table
    #both correct, model1 wrong model2 correct, model1 correct model2 wrong, both wrong
    both_correct = np.sum(correct1 & correct2)
    model1_wrong_model2_correct = np.sum(~correct1 & correct2)
    model1_correct_model2_wrong = np.sum(correct1 & ~correct2)
    both_wrong = np.sum(~correct1 & ~correct2)

    if model1_wrong_model2_correct + model1_correct_model2_wrong == 0:
        return 0.0, 1.0

    #imported here so the no-disagreement shortcut above works without statsmodels
    from statsmodels.stats.contingency_tables import mcnemar

    #McNemar's test uses the discordant pairs (the off-diagonal cells)
    contingency = np.array([[both_correct, model1_correct_model2_wrong],
                            [model1_wrong_model2_correct, both_wrong]])

    result = mcnemar(contingency, exact=False, correction=True)

    return result.statistic, result.pvalue

In [None]:
def compare_all_models():
    """
    Compare the saved test-set predictions of the Tier 2, Tier 3 and Novel-Only models

    For every drug the novel-only test labels and probabilities are loaded from
    RESULTS_DIR together with the Tier 2 / Tier 3 probabilities, and
    delong_test is run for Tier 2 vs Tier 3 and for Tier 3 vs Novel-Only.

    Tier predictions are looked up in the parent folder of RESULTS_DIR first
    and then in RESULTS_DIR itself (the location the printed np.save hint
    points to). All prediction arrays are assumed to describe the same test
    samples in the same order as the saved labels -- only their lengths can be
    verified here.

    Returns a DataFrame with one row per comparison (also written to
    RESULTS_DIR / 'statistical_comparisons.csv'), or None when no comparison
    could be performed.
    """
    def _significance_label(p):
        #conventional star notation for the p-value
        return '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'ns'

    def _find_tier_predictions(tier, drug):
        #return the first existing prediction file for this tier/drug, or None
        name = f'{tier}_{drug}_predictions.npy'
        for candidate in (RESULTS_DIR / f'../{name}', RESULTS_DIR / name):
            if candidate.exists():
                return candidate
        return None

    #(printed label, CSV label, key of model 1, key of model 2)
    model_pairs = [
        ('Tier 2 vs Tier 3', 'Tier2 vs Tier3', 'tier2', 'tier3'),
        ('Tier 3 vs Novel-Only', 'Tier3 vs NovelOnly', 'tier3', 'novel'),
    ]

    print("\nComparing All Model Tiers...")
    print("Note: This requires predictions saved from previous training runs")

    all_comparisons = []

    for drug in ['AMX', 'AMC', 'CIP']:
        print(f"\n{drug}:")
        print("-" * 60)

        try:
            #load predictions (these must have been saved during model training)
            novel_pred_file = RESULTS_DIR / f'novel_only_{drug}_predictions.npy'
            y_test_file = RESULTS_DIR / f'novel_only_{drug}_y_test.npy'

            if not all([f.exists() for f in [novel_pred_file, y_test_file]]):
                print(f"  Missing prediction files. Run models first and save predictions.")
                print(f"     np.save(RESULTS_DIR / 'tier2_{drug}_predictions.npy', y_pred_proba)")
                continue

            tier_files = {tier: _find_tier_predictions(tier, drug)
                          for tier in ('tier2', 'tier3')}
            missing_tiers = [tier for tier, path in tier_files.items() if path is None]
            if missing_tiers:
                #say why nothing is compared instead of leaving the section empty
                print(f"  Skipped: no saved predictions for {', '.join(missing_tiers)}.")
                for tier in missing_tiers:
                    print(f"     np.save(RESULTS_DIR / '{tier}_{drug}_predictions.npy', y_pred_proba)")
                continue

            y_true = np.load(y_test_file)
            predictions = {
                'tier2': np.load(tier_files['tier2']),
                'tier3': np.load(tier_files['tier3']),
                'novel': np.load(novel_pred_file),
            }

            #a length mismatch means the models were not scored on the same test split
            for key, pred in predictions.items():
                if len(pred) != len(y_true):
                    raise ValueError(
                        f"{key} predictions have {len(pred)} entries but "
                        f"y_test has {len(y_true)}; models must share the same test split"
                    )

            for printed_label, csv_label, key1, key2 in model_pairs:
                diff, p_value, ci_low, ci_high = delong_test(
                    y_true, predictions[key1], predictions[key2]
                )
                significance = _significance_label(p_value)

                print(f"  {printed_label}:")
                print(f"    ΔAUROC = {diff:+.4f}, p = {p_value:.4f} {significance}")
                print(f"    95% CI: [{ci_low:.4f}, {ci_high:.4f}]")

                all_comparisons.append({
                    'drug': drug,
                    'comparison': csv_label,
                    'delta_auroc': diff,
                    'p_value': p_value,
                    'ci_lower': ci_low,
                    'ci_upper': ci_high,
                    'significant': significance
                })

        except Exception as e:
            #keep going with the remaining drugs; the message identifies what failed
            print(f"  Error: {e}")
            continue

    if all_comparisons:
        comparison_df = pd.DataFrame(all_comparisons)
        comparison_df.to_csv(RESULTS_DIR / 'statistical_comparisons.csv', index=False)

        print("\n" + "="*80)
        print("STATISTICAL COMPARISON SUMMARY")
        print("="*80)
        print(comparison_df.to_string(index=False))
        print(f"\nSaved to: {RESULTS_DIR / 'statistical_comparisons.csv'}")

        return comparison_df
    else:
        print("\nNo comparisons performed. Save predictions during training:")
        print("   np.save(RESULTS_DIR / 'tier2_{drug}_predictions.npy', y_pred_proba)")
        return None

In [None]:
#run statistical tests; stat_results is the comparison DataFrame, or None when nothing could be compared
stat_results = compare_all_models()


Comparing All Model Tiers...
Note: This requires predictions saved from previous training runs

AMX:
------------------------------------------------------------

AMC:
------------------------------------------------------------

CIP:
------------------------------------------------------------

No comparisons performed. Save predictions during training:
   np.save(RESULTS_DIR / 'tier2_{drug}_predictions.npy', y_pred_proba)


## **FEATURE INTERACTION ANALYSIS**
Use SHAP interaction values to find synergistic gene pairs.

In [None]:
def analyze_feature_interactions(drug='AMX', top_n=20, random_state=42):
    """
    Detect important feature interactions using SHAP interaction values

    Trains a quick XGBoost model on the Tier 3 feature set (Tier 2 features
    joined with the drug's novel Roary genes), computes SHAP interaction values
    on at most 100 test samples and ranks every unordered feature pair by its
    mean absolute interaction value.

    Parameters
    ----------
    drug : str
        Phenotype column to model; also selects the Roary feature file.
    top_n : int
        Number of top-ranked pairs to print and to summarise by type.
    random_state : int
        Seed for drawing the SHAP test-sample subset, so repeated runs explain
        the same samples.

    Returns
    -------
    pandas.DataFrame or None
        All feature pairs sorted by descending interaction score, or None when
        the Roary feature file for `drug` is missing. The table is also written
        to RESULTS_DIR as feature_interactions_{drug}.csv.
    """
    print(f"Analyzing Feature Interactions for {drug}")

    roary_file = ROARY_DIR / f'roary_filtered_{drug}_top500_decorrelated_v2.csv'
    if not roary_file.exists():
        print(f"File not found: {roary_file}")
        return None

    #load Tier 3 data
    #the Tier 2 path is built from BASE_DIR rather than DATA_DIR because a later
    #cell of this notebook rebinds DATA_DIR to the phenotype folder
    tier2 = pd.read_csv(BASE_DIR / 'amr_features' / 'tier2_amr_genes_plus_mutations.csv',
                        index_col=0)
    tier2 = fix_sample_ids(tier2)

    roary = pd.read_csv(roary_file, index_col=0)
    roary = fix_sample_ids(roary)

    tier3 = pd.concat([tier2, roary], axis=1, join='inner')

    #load phenotypes (same ID-column fallback as the training functions)
    phenotypes = pd.read_csv(BASE_DIR / 'data/E.coli/phenotypic.csv')
    if 'Isolate' in phenotypes.columns:
        phenotypes = phenotypes.set_index('Isolate')
    elif 'Lane.accession' in phenotypes.columns:
        phenotypes = phenotypes.set_index('Lane.accession')
    phenotypes = fix_sample_ids(phenotypes)

    #prepare data
    common_samples = tier3.index.intersection(phenotypes.index)
    X = tier3.loc[common_samples]
    y = phenotypes.loc[common_samples, drug].map({'R': 1, 'S': 0, 'I': 0}).dropna()
    X = X.loc[y.index]

    # Train a quick model
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

    model = XGBClassifier(
        max_depth=5,
        n_estimators=100,
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        verbosity=0
    )

    model.fit(X_train, y_train)

    #sample subset for computational efficiency
    sample_size = min(100, len(X_test))
    print(f"Computing SHAP interaction values...")
    print(f"  (Using sample of {sample_size} test samples for efficiency)")

    #private seeded generator: reproducible subset, independent of the global NumPy state
    rng = np.random.RandomState(random_state)
    sample_indices = rng.choice(len(X_test), size=sample_size, replace=False)
    X_sample = X_test.iloc[sample_indices]

    # SHAP interaction values
    explainer = shap.TreeExplainer(model)
    shap_interaction = explainer.shap_interaction_values(X_sample)

    # Find top interactions: average |interaction| over the explained samples
    mean_abs_interaction = np.abs(shap_interaction).mean(axis=0)

    #get upper triangle (avoid duplicates: i < j); triu_indices yields the pairs
    #in the same row-major order as a nested i/j loop
    n_features = mean_abs_interaction.shape[0]
    first_idx, second_idx = np.triu_indices(n_features, k=1)

    feature_names = X.columns.to_numpy()
    is_novel = np.asarray(X.columns.isin(roary.columns))

    interactions_df = pd.DataFrame({
        'feature1': feature_names[first_idx],
        'feature2': feature_names[second_idx],
        'interaction_score': mean_abs_interaction[first_idx, second_idx],
        'feature1_type': np.where(is_novel[first_idx], 'Novel', 'Tier2'),
        'feature2_type': np.where(is_novel[second_idx], 'Novel', 'Tier2')
    })

    #sort by interaction score
    interactions_df = interactions_df.sort_values('interaction_score', ascending=False)

    print(f"\nTop {top_n} Feature Interactions:")
    print(interactions_df.head(top_n).to_string(index=False))

    #save
    interactions_df.to_csv(
        RESULTS_DIR / f'feature_interactions_{drug}.csv',
        index=False
    )

    print(f"\nSaved to: {RESULTS_DIR / f'feature_interactions_{drug}.csv'}")

    #analyze interaction types
    print(f"\nInteraction Type Distribution (Top {top_n}):")
    top_interactions = interactions_df.head(top_n)

    first_is_tier2 = top_interactions['feature1_type'] == 'Tier2'
    second_is_tier2 = top_interactions['feature2_type'] == 'Tier2'

    tier2_tier2 = int((first_is_tier2 & second_is_tier2).sum())
    novel_novel = int((~first_is_tier2 & ~second_is_tier2).sum())

    #mixed pairs are whatever remains of the rows actually present (may be fewer than top_n)
    tier2_novel = len(top_interactions) - tier2_tier2 - novel_novel

    print(f"  Tier2 × Tier2: {tier2_tier2}")
    print(f"  Novel × Novel: {novel_novel}")
    print(f"  Tier2 × Novel: {tier2_novel}")

    return interactions_df

In [None]:
#run for all drugs; the returned tables are not kept here (each call writes its own CSV)
print("\nRunning interaction analysis for all drugs...")
for drug in ['AMX', 'AMC', 'CIP']:
    analyze_feature_interactions(drug, top_n=20)


Running interaction analysis for all drugs...
Analyzing Feature Interactions for AMX
Computing SHAP interaction values...
  (Using sample of 100 test samples for efficiency)

Top 20 Feature Interactions:
   feature1    feature2  interaction_score feature1_type feature2_type
      TEM-4 group_26397           0.107106         Tier2         Novel
      OXA-1       TEM-4           0.086660         Tier2         Tier2
 ftsI_L192F        yehM           0.071380         Tier2         Novel
 ompC_Q196E        yedE           0.064727         Tier2         Novel
      TEM-4        sul1           0.054207         Tier2         Tier2
 ompC_Q196E        sopA           0.052433         Tier2         Novel
      TEM-4 group_11074           0.052408         Tier2         Novel
      TEM-4  group_3820           0.051904         Tier2         Novel
       tnpR      yhjK_2           0.044986         Novel         Novel
 gyrA_P215T       TEM-4           0.044978         Tier2         Tier2
 ompC_G133R gr

# **Novel Genes-Only Model With Lineage Marker Filter**


In [None]:
import pandas as pd
import numpy as np
import requests
import time
from pathlib import Path
try:
  from Bio import SeqIO, Entrez
except:
  !pip install biopython
  from Bio import SeqIO, Entrez
from xml.etree import ElementTree as ET
import warnings
warnings.filterwarnings('ignore')

Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86


In [None]:
#paths for this section; BASE_DIR and RESULTS_DIR get the same values as in the setup cell
BASE_DIR = Path('/content/drive/MyDrive')
RESULTS_DIR = BASE_DIR / 'results' / 'additional_experiments'
#NOTE(review): the setup cell bound DATA_DIR to BASE_DIR / 'amr_features'; from this cell on it
#points at the phenotype folder instead, so code that expects the earlier value must not be
#(re)run after this cell
DATA_DIR = BASE_DIR / 'data' / 'E.coli'

In [None]:
def train_novel_genes_filtered(drug='AMX'):
    """
    Train novel genes-only model WITH lineage marker filtering
    This matches Tier 3 filtering

    A novel gene is removed when it is in the hard-coded list of potential
    lineage markers or when its absolute Pearson correlation with any Tier 2
    feature is >= 0.70. Correlations are computed on the samples shared by the
    Roary and Tier 2 tables, matched by sample ID.

    Parameters
    ----------
    drug : str
        Phenotype column to predict; also selects the per-drug Roary file.

    Returns
    -------
    tuple or None
        (results dict, SHAP feature-importance DataFrame, list of removed
        genes), or None when the Roary feature file for `drug` is missing.

    Raises
    ------
    ValueError
        If the Roary and Tier 2 tables share no sample IDs, in which case the
        correlation filter cannot be evaluated.

    Side effects: prints a report and writes the feature-importance table to
    RESULTS_DIR.
    """
    #function-scope imports keep this section runnable without the notebook's first import cell
    from xgboost import XGBClassifier
    from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
    from sklearn.metrics import roc_auc_score, average_precision_score, precision_score, recall_score, f1_score
    import shap

    CORR_THRESHOLD = 0.7

    print(f"\n{'='*60}")
    print(f"Novel Genes-Only Model (WITH Lineage Filter) - {drug}")
    print(f"{'='*60}")

    # Helper functions
    def standardize_sample_id(sample_id):
        #'<prefix>_<suffix>' -> '<prefix>#<suffix>'; IDs already containing '#' are left alone
        sample_id = str(sample_id).strip()
        if '#' in sample_id:
            return sample_id
        parts = sample_id.rsplit('_', 1)
        return f"{parts[0]}#{parts[1]}" if len(parts) == 2 else sample_id

    def fix_sample_ids(df):
        #standardizes the index of df in place and returns df
        df.index = df.index.map(standardize_sample_id)
        return df

    def _centered_with_norms(frame):
        #column-centered values and their Euclidean norms; constant columns get norm 0
        raw = frame.to_numpy(dtype=float)
        centered = raw - raw.mean(axis=0)
        norms = np.sqrt((centered ** 2).sum(axis=0))
        norms[raw.max(axis=0) == raw.min(axis=0)] = 0.0
        return centered, norms

    def _max_abs_pearson(features, reference):
        #for each column of `features`: largest |Pearson r| against any column of `reference`
        if features.shape[1] == 0 or reference.shape[1] == 0:
            return pd.Series(0.0, index=features.columns)
        feat_centered, feat_norms = _centered_with_norms(features)
        ref_centered, ref_norms = _centered_with_norms(reference)
        with np.errstate(divide='ignore', invalid='ignore'):
            corr = (feat_centered.T @ ref_centered) / np.outer(feat_norms, ref_norms)
        #correlations with constant or NaN-containing columns are undefined; they never count as a match
        abs_corr = np.nan_to_num(np.abs(corr), nan=0.0, posinf=0.0, neginf=0.0)
        return pd.Series(abs_corr.max(axis=1), index=features.columns)

    #load data
    roary_file = BASE_DIR / 'pangenome_features' / f'roary_filtered_{drug}_top500_decorrelated_v2.csv'
    if not roary_file.exists():
        print(f"File not found: {roary_file}")
        return None

    roary_df = pd.read_csv(roary_file, index_col=0)
    roary_df = fix_sample_ids(roary_df)

    #load Tier 2 for lineage marker filtering
    tier2_file = BASE_DIR / 'amr_features' / 'tier2_amr_genes_plus_mutations.csv'
    tier2_df = pd.read_csv(tier2_file, index_col=0)
    tier2_df = fix_sample_ids(tier2_df)

    print(f"Loaded data:")
    print(f"  Roary genes: {roary_df.shape}")
    print(f"  Tier 2 (for filtering): {tier2_df.shape}")

    #apply lineage marker filter (ρ ≥ 0.70)
    print(f"\nApplying lineage marker filter (ρ ≥ 0.70)...")

    potential_markers = ['rz', 'yedI', 'nmpC', 'gatD', 'betU', 'yeeO']

    #correlate on the SAME samples in the SAME order: the two tables come from
    #different files, so rows are matched by sample ID rather than by position
    roary_unique = roary_df[~roary_df.index.duplicated(keep='first')]
    tier2_unique = tier2_df[~tier2_df.index.duplicated(keep='first')]
    shared_samples = roary_unique.index.intersection(tier2_unique.index)
    if len(shared_samples) == 0:
        raise ValueError(
            "Roary and Tier 2 tables share no sample IDs; "
            "cannot evaluate the lineage marker filter"
        )

    numeric_kinds = ['number', 'bool']
    roary_for_corr = roary_unique.loc[shared_samples].select_dtypes(include=numeric_kinds)
    tier2_for_corr = tier2_unique.loc[shared_samples].select_dtypes(include=numeric_kinds)

    #one matrix product replaces a Python-level pearsonr call per (gene, Tier 2 feature) pair;
    #genes without a computable correlation get 0 and are therefore kept
    max_corr = (_max_abs_pearson(roary_for_corr, tier2_for_corr)
                .reindex(roary_df.columns)
                .fillna(0.0))

    genes_to_keep = []
    removed_genes = []

    for gene in roary_df.columns:
        if gene in potential_markers or max_corr[gene] >= CORR_THRESHOLD:
            removed_genes.append(gene)
        else:
            genes_to_keep.append(gene)

    print(f"  Kept: {len(genes_to_keep)} genes")
    print(f"  Removed: {len(removed_genes)} genes (lineage markers)")
    print(f"  Removed genes: {removed_genes[:10]}...")

    roary_filtered = roary_df[genes_to_keep]

    #load phenotypes
    #path built from BASE_DIR so it does not depend on which value DATA_DIR currently has
    phenotypes = pd.read_csv(BASE_DIR / 'data' / 'E.coli' / 'phenotypic.csv')
    if 'Isolate' in phenotypes.columns:
        phenotypes = phenotypes.set_index('Isolate')
    elif 'Lane.accession' in phenotypes.columns:
        phenotypes = phenotypes.set_index('Lane.accession')
    phenotypes = fix_sample_ids(phenotypes)

    #align samples; labels outside R/S/I map to NaN and are dropped
    common_samples = roary_filtered.index.intersection(phenotypes.index)
    X = roary_filtered.loc[common_samples]
    y = phenotypes.loc[common_samples, drug].map({'R': 1, 'S': 0, 'I': 0}).dropna()
    X = X.loc[y.index]

    print(f"\nData prepared:")
    print(f"  Samples: {len(X)}")
    print(f"  Features (filtered novel genes): {X.shape[1]}")
    print(f"  Resistant: {(y==1).sum()} ({(y==1).sum()/len(y)*100:.1f}%)")

    #train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    #negatives-to-positives ratio of the training fold
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

    #train model
    model = XGBClassifier(
        max_depth=5,
        n_estimators=100,
        learning_rate=0.1,
        scale_pos_weight=scale_pos_weight,
        min_child_weight=3,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42,
        eval_metric='auc',
        verbosity=0
    )

    model.fit(X_train, y_train)

    #evaluate
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    auroc = roc_auc_score(y_test, y_pred_proba)
    auprc = average_precision_score(y_test, y_pred_proba)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    #cross-validation over all aligned samples
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)


    print(f"PERFORMANCE (FILTERED NOVEL GENES ONLY)")
    print(f"AUROC:     {auroc:.4f}")
    print(f"AUPRC:     {auprc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print(f"\n5-Fold CV: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

    #SHAP analysis: rank genes by mean |SHAP| over the test samples
    print(f"\nComputing SHAP values...")
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'shap_importance': np.abs(shap_values).mean(axis=0)
    }).sort_values('shap_importance', ascending=False)

    print(f"\nTop 10 Novel Genes (after lineage filtering):")
    print(feature_importance.head(10).to_string(index=False))

    #save
    results = {
        'drug': drug,
        'model_type': 'Novel_Genes_Filtered',
        'n_features': X.shape[1],
        'n_features_removed': len(removed_genes),
        'n_samples': len(X),
        'auroc': auroc,
        'auprc': auprc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std()
    }

    feature_importance.to_csv(
        RESULTS_DIR / f'novel_filtered_feature_importance_{drug}.csv',
        index=False
    )

    return results, feature_importance, removed_genes

In [None]:
#run for all drugs and collect one metrics dict per drug
print("TRAINING FILTERED NOVEL GENES-ONLY MODELS")

filtered_results = []
for drug in ['AMX', 'AMC', 'CIP']:
    result = train_novel_genes_filtered(drug)
    #guard against a falsy return before indexing; element 0 of the returned tuple is the metrics dict
    if result:
        filtered_results.append(result[0])

TRAINING FILTERED NOVEL GENES-ONLY MODELS

Novel Genes-Only Model (WITH Lineage Filter) - AMX
Loaded data:
  Roary genes: (1089, 489)
  Tier 2 (for filtering): (1089, 1236)

Applying lineage marker filter (ρ ≥ 0.70)...
  Kept: 179 genes
  Removed: 310 genes (lineage markers)
  Removed genes: ['tnpR', 'group_8657', 'neo', 'yedA_2', 'group_5885', 'group_17014', 'group_18191', 'group_21010', 'group_15443', 'group_14300']...

Data prepared:
  Samples: 1089
  Features (filtered novel genes): 179
  Resistant: 659 (60.5%)
PERFORMANCE (FILTERED NOVEL GENES ONLY)
AUROC:     0.8623
AUPRC:     0.9169
Precision: 0.8750
Recall:    0.7955
F1 Score:  0.8333

5-Fold CV: 0.8883 ± 0.0147

Computing SHAP values...

Top 10 Novel Genes (after lineage filtering):
    feature  shap_importance
 group_3326         1.082576
       pemK         0.292495
       intI         0.242284
       yehM         0.191942
group_11074         0.160231
group_16687         0.136894
 group_8890         0.136159
 group_3820     

In [None]:
#compare with unfiltered
print("COMPARISON: Unfiltered vs Filtered Novel Genes")

comparison_drugs = ['AMX', 'AMC', 'CIP']

#rounded metrics recorded from the unfiltered run; used only for drugs that are
#missing from that run's saved summary CSV
unfiltered_metrics = {
    'AMX': {'auroc': 0.888, 'f1': 0.851, 'n_features': 489},
    'AMC': {'auroc': 0.825, 'f1': 0.624, 'n_features': 489},
    'CIP': {'auroc': 0.966, 'f1': 0.880, 'n_features': 495},
}

#prefer the exact values written by the unfiltered run over the rounded copies above
unfiltered_summary_file = RESULTS_DIR / 'novel_genes_only_summary.csv'
if unfiltered_summary_file.exists():
    saved_summary = pd.read_csv(unfiltered_summary_file).set_index('drug')
    unfiltered_metrics.update(
        saved_summary[['auroc', 'f1', 'n_features']].to_dict('index')
    )
else:
    print(f"  ({unfiltered_summary_file.name} not found; using recorded unfiltered metrics)")

#look results up by drug name instead of list position, so a skipped drug
#cannot shift the remaining rows or raise an IndexError
filtered_metrics = {entry['drug']: entry for entry in filtered_results}

comparison_data = []
for model_label, metrics_by_drug in (('Unfiltered', unfiltered_metrics),
                                     ('Filtered', filtered_metrics)):
    for drug_name in comparison_drugs:
        if drug_name not in metrics_by_drug:
            print(f"  No {model_label.lower()} result for {drug_name}; row omitted")
            continue
        drug_metrics = metrics_by_drug[drug_name]
        comparison_data.append({
            'Drug': drug_name,
            'Model': model_label,
            'AUROC': drug_metrics['auroc'],
            'F1': drug_metrics['f1'],
            'N_Features': drug_metrics['n_features'],
        })

comparison_df = pd.DataFrame(
    comparison_data, columns=['Drug', 'Model', 'AUROC', 'F1', 'N_Features']
)
print(comparison_df.to_string(index=False))

comparison_df.to_csv(RESULTS_DIR / 'novel_genes_comparison.csv', index=False)

COMPARISON: Unfiltered vs Filtered Novel Genes
Drug      Model    AUROC       F1  N_Features
 AMX Unfiltered 0.888000 0.851000         489
 AMC Unfiltered 0.825000 0.624000         489
 CIP Unfiltered 0.966000 0.880000         495
 AMX   Filtered 0.862315 0.833333         179
 AMC   Filtered 0.819256 0.637681         118
 CIP   Filtered 0.940171 0.857143          88
