In [None]:
import csv
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.dummy import DummyClassifier
import xgboost as xgb
import random
import zipfile
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import umap
import seaborn as sns

In [None]:
from google.colab import drive

# Directory under which Google Drive is attached; the data paths used later in
# this notebook live below it.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

In [None]:
# ====================================
# Step 1: Load Data
# ====================================
# Paths of the train / test embedding CSV files on the mounted Drive.
csv_path_train = "/content/drive/MyDrive/AllergenAI-new/CNN/algpred2_train_esm2_1280dim_embeddings.csv"
csv_path_test = "/content/drive/MyDrive/AllergenAI-new/CNN/algpred2_test_esm2_1280dim_embeddings.csv"

# Split name -> CSV path lookup.
embedding_files = dict(train=csv_path_train, test=csv_path_test)

# Read each split into its own DataFrame.
df_train, df_test = (
    pd.read_csv(embedding_files[split]) for split in ("train", "test")
)

# Feature columns are named f0 ... f1279; the target is the "label" column.
feature_cols = ["f%d" % idx for idx in range(1280)]

# Plain NumPy views of the features / labels for the estimators below.
X_train_full, y_train_full = (
    df_train[feature_cols].to_numpy(),
    df_train["label"].to_numpy(),
)
X_test, y_test = (
    df_test[feature_cols].to_numpy(),
    df_test["label"].to_numpy(),
)

print(f"✅ Loaded: Train={X_train_full.shape}, Test={X_test.shape}")

# ====================================
# Step 2: Dummy Classifier Baseline (on Train)
# ====================================
print("\n📉 DummyClassifier (Stratified) on Training Set (CV):\n")

# Seeded 5-fold stratified splitter; the later steps of this cell reuse `cv`.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Baseline whose predictions are drawn from the training class distribution.
dummy = DummyClassifier(strategy="stratified", random_state=42)
dummy_aucs = []

for train_idx, val_idx in cv.split(X_train_full, y_train_full):
    X_fit, y_fit = X_train_full[train_idx], y_train_full[train_idx]
    X_holdout, y_holdout = X_train_full[val_idx], y_train_full[val_idx]

    # Fit on the training part of the fold, score the held-out part.
    dummy.fit(X_fit, y_fit)
    y_dummy_proba = dummy.predict_proba(X_holdout)[:, 1]

    auc = roc_auc_score(y_holdout, y_dummy_proba)
    dummy_aucs.append(auc)

print(f"📊 Dummy ROC-AUC: {np.mean(dummy_aucs):.4f} ± {np.std(dummy_aucs):.4f}")

# ====================================
# Step 3: Cross-Validation on Training Set (XGBoost)
# ====================================
# For every fold of `cv`, train a fresh XGBoost classifier on the training part,
# then report ROC-AUC and a classification report on the held-out part.
print("\n🚀 5-Fold Cross-Validation (XGBoost) on Training Set...\n")
xgb_aucs = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_full, y_train_full)):
    X_train, X_val = X_train_full[train_idx], X_train_full[val_idx]
    y_train, y_val = y_train_full[train_idx], y_train_full[val_idx]

    # `use_label_encoder` is deprecated in recent XGBoost releases and is only
    # reported as an unused parameter there, so it is no longer passed.
    # NOTE(review): assumes labels are already integer-encoded (0/1) — confirm.
    clf = xgb.XGBClassifier(eval_metric="logloss", random_state=42)
    clf.fit(X_train, y_train)

    # Hard labels feed the classification report; the positive-class
    # probability (column 1) feeds ROC-AUC.
    y_pred = clf.predict(X_val)
    y_proba = clf.predict_proba(X_val)[:, 1]

    auc = roc_auc_score(y_val, y_proba)
    xgb_aucs.append(auc)

    print(f"📂 Fold {fold+1} AUC: {auc:.4f}")
    print(classification_report(y_val, y_pred, digits=4))
    print("------")

print(f"\n✅ Mean CV ROC-AUC: {np.mean(xgb_aucs):.4f} ± {np.std(xgb_aucs):.4f}")

# ====================================
# Step 4: Final Test Set Evaluation
# ====================================
# Refit one model on the complete training set and score it once on the
# separate test split.
print("\n🔒 Final Evaluation on Hold-Out Test Set...\n")

# `use_label_encoder` is deprecated in recent XGBoost releases and is only
# reported as an unused parameter there, so it is no longer passed.
# NOTE(review): assumes labels are already integer-encoded (0/1) — confirm.
clf_final = xgb.XGBClassifier(eval_metric="logloss", random_state=42)
clf_final.fit(X_train_full, y_train_full)

# Hard labels for the report; positive-class probability (column 1) for ROC-AUC.
y_test_pred = clf_final.predict(X_test)
y_test_proba = clf_final.predict_proba(X_test)[:, 1]

test_auc = roc_auc_score(y_test, y_test_proba)
print(classification_report(y_test, y_test_pred, digits=4))
print(f"🎯 Final Test ROC-AUC: {test_auc:.4f}")

# ====================================
# Step 5: Y-Scrambling Control
# ====================================
# Repeat the cross-validation with shuffled labels: the features no longer
# carry information about the target, so the score shows what the pipeline
# produces when there is nothing to learn.
print("\n🧪 Y-Scrambling (sanity check) on Training Set...\n")

# Shuffle a copy so the real labels stay intact. A private Random(42) instance
# gives the same permutation as seeding the module-level generator with 42, but
# does not reset the process-wide `random` state as a side effect.
# NOTE(review): random.shuffle swaps elements by index, which is only safe on a
# 1-D array; y_train_full is expected to be a 1-D label vector — confirm.
y_scrambled = y_train_full.copy()
random.Random(42).shuffle(y_scrambled)

scrambled_aucs = []
for train_idx, val_idx in cv.split(X_train_full, y_scrambled):
    X_train, X_val = X_train_full[train_idx], X_train_full[val_idx]
    y_train, y_val = y_scrambled[train_idx], y_scrambled[val_idx]

    # `use_label_encoder` is deprecated in recent XGBoost releases and is only
    # reported as an unused parameter there, so it is no longer passed.
    clf_scrambled = xgb.XGBClassifier(eval_metric="logloss", random_state=42)
    clf_scrambled.fit(X_train, y_train)
    y_proba_scrambled = clf_scrambled.predict_proba(X_val)[:, 1]

    auc = roc_auc_score(y_val, y_proba_scrambled)
    scrambled_aucs.append(auc)

print(
    f"🔀 Y-Scrambled ROC-AUC: {np.mean(scrambled_aucs):.4f} ± {np.std(scrambled_aucs):.4f}"
)
# Reading the result: a scrambled score well above 0.5 would point to leakage
# or an evaluation artefact rather than to genuine signal in the features.
print("👉 This should be near 0.5 if your real model learned something.")

In [None]:
# ====================================
# Step X+: PCA and UMAP with Prediction Results
# ====================================
print("\n🔍 PCA and UMAP Visualization with Predictions...\n")

# Per-sample flag: True where the final model's predicted label equals the
# true test label.
correct = np.equal(y_test_pred, y_test)

# 2-D linear projection of the test features.
pca = PCA(n_components=2, random_state=42)
X_test_pca = pca.fit_transform(X_test)

# 2-D UMAP embedding of the same test features.
reducer = umap.UMAP(n_components=2, random_state=42)
X_test_umap = reducer.fit_transform(X_test)

def _scatter_correctness(coords, is_correct, title, x_label, y_label):
    """Draw a 2-D scatter of `coords`, coloured by prediction correctness.

    Parameters
    ----------
    coords : array whose first two columns are used as x / y positions.
    is_correct : boolean array; True points are drawn green, False points red.
    title, x_label, y_label : text for the figure title and the axis labels.
    """
    _, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(
        x=coords[:, 0],
        y=coords[:, 1],
        hue=is_correct,
        palette={True: "green", False: "red"},
        alpha=0.7,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.legend(title="Prediction Correct")
    ax.grid(True)
    plt.show()


# PCA view of the test set.
_scatter_correctness(
    X_test_pca,
    correct,
    "PCA Projection (Test Set) - Correct vs Incorrect",
    "PC 1",
    "PC 2",
)

# UMAP view of the test set.
_scatter_correctness(
    X_test_umap,
    correct,
    "UMAP Projection (Test Set) - Correct vs Incorrect",
    "UMAP 1",
    "UMAP 2",
)