## Imports

In [13]:
import pandas as pd
import numpy as np
import requests
from io import StringIO
import torch
import esm
from tqdm import tqdm
import os
import csv
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.dummy import DummyClassifier
import xgboost as xgb
import random
import zipfile
import io

# Data curation

## User input: choose which dataset to evaluate 

In [14]:
# Select the dataset to evaluate. Supported values: "benchmark" or "algpred2".
# dataset_name = "benchmark"
dataset_name = "algpred2"

# Fail fast on a typo: the data-curation cells are gated on an exact match of
# this name, so an unknown value would skip them without any message.
if dataset_name not in ("benchmark", "algpred2"):
    raise ValueError(
        f"Unsupported dataset_name {dataset_name!r}; expected 'benchmark' or 'algpred2'."
    )

## Benchmark data

In [15]:
if dataset_name == "benchmark":
    # Download the four Benchmark FASTA files (train/test x positive/negative)
    # and store them as '<dataset>/<dataset>_train.csv' and
    # '<dataset>/<dataset>_test.csv' with columns: id, sequence, label.
    data_dir = dataset_name
    os.makedirs(data_dir, exist_ok=True)

    train_csv_path = os.path.join(data_dir, f"{dataset_name}_train.csv")
    test_csv_path = os.path.join(data_dir, f"{dataset_name}_test.csv")

    # Reset the flag on every run. Testing `"proceed" in locals()` would pick up
    # a stale True left in the kernel by an earlier run and overwrite the files
    # even when the user answers "n".
    proceed = True
    if os.path.exists(train_csv_path) or os.path.exists(test_csv_path):
        user_input = (
            input(
                f"⚠️ Files already exist in '{data_dir}/'. Do you want to overwrite them? (y/n): "
            )
            .strip()
            .lower()
        )
        if user_input != "y":
            print("⏭️  Skipping FASTA parsing and CSV generation.")
            proceed = False

    if proceed:
        # GitHub raw URLs, pinned to a specific commit.
        base_url = "https://raw.githubusercontent.com/Jeffateth/AllergenPredict/7fafbea0ab1646796abe40cafb800c46ba842bda/Benchmark_dataset"

        # filename -> (class label, split)
        datasets = {
            "train_p.fasta": (1, "train"),
            "train_n.fasta": (0, "train"),
            "test_p.fasta": (1, "test"),
            "test_n.fasta": (0, "test"),
        }

        def parse_fasta(fasta_text, label):
            """Parse FASTA text into a list of (id, sequence, label) tuples.

            Args:
                fasta_text: Full text of a FASTA file.
                label: Value attached to every record parsed from this text.

            Returns:
                One tuple per record that has a header line and a non-empty
                sequence. The id is the header line without the leading ">".
                Records with an empty sequence are skipped wherever they occur,
                and text before the first header is ignored.
            """
            sequences = []
            current_id = None
            chunks = []  # sequence lines of the current record; joined once at the end
            for line in fasta_text.splitlines():
                line = line.strip()
                if not line:
                    continue
                if line.startswith(">"):
                    if current_id is not None and chunks:
                        sequences.append((current_id, "".join(chunks), label))
                    current_id = line[1:]
                    chunks = []
                elif current_id is not None:
                    chunks.append(line)
            if current_id is not None and chunks:
                sequences.append((current_id, "".join(chunks), label))
            return sequences

        # Download and parse files
        train_entries = []
        test_entries = []

        for filename, (label, split) in datasets.items():
            url = f"{base_url}/{filename}"
            print(f"⬇️  Downloading {filename} from {url}...")
            # The timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(url, timeout=60)
            response.raise_for_status()  # raise an error for failed downloads

            fasta_text = response.text
            entries = parse_fasta(fasta_text, label)
            if split == "train":
                train_entries.extend(entries)
            else:
                test_entries.extend(entries)

        # Save to CSV inside dataset-named folder
        df_train = pd.DataFrame(train_entries, columns=["id", "sequence", "label"])
        df_test = pd.DataFrame(test_entries, columns=["id", "sequence", "label"])

        df_train.to_csv(train_csv_path, index=False)
        df_test.to_csv(test_csv_path, index=False)

        print(f"✅ Saved training set to '{train_csv_path}'")
        print(f"✅ Saved testing set to '{test_csv_path}'")

## Data from AllergenAI (TODO: adapt this code to the current pipeline)

In [16]:
# NOTE: This cell is intentionally disabled (every line is commented out).
# It converts one-hot encoded sequence files ("pos.txt" / "neg.txt") into a
# single CSV ("converted_onehot_sequences.csv") with columns id, sequence, label.
# It does not yet write the '<dataset>/<dataset>_train.csv' and
# '<dataset>/<dataset>_test.csv' files that the other data cells produce.

# # Standard amino acid order (1-letter codes)
# aa_letters = list("ACDEFGHIKLMNPQRSTVWY")

# # Map one-hot vector to amino acid letter
# onehot_to_aa = {
#     tuple(1 if i == j else 0 for i in range(20)): aa
#     for j, aa in enumerate(aa_letters)
# }

# def load_onehot_file(filepath, label):
#     """Converts one-hot file to list of (sequence, label)"""
#     data = np.loadtxt(filepath)
#     sequences = []
#     current = []

#     for row in data:
#         if np.all(row == 0):
#             if current:
#                 sequences.append(("".join(current), label))
#                 current = []
#         else:
#             aa = onehot_to_aa.get(tuple(int(x) for x in row))
#             if aa:
#                 current.append(aa)
#             else:
#                 raise ValueError(f"Unknown one-hot vector: {row}")

#     if current:
#         sequences.append(("".join(current), label))

#     return sequences

# # === Load both files ===
# positive_sequences = load_onehot_file("pos.txt", label=1)
# negative_sequences = load_onehot_file("neg.txt", label=0)

# # === Combine and save ===
# all_sequences = positive_sequences + negative_sequences
# df = pd.DataFrame(all_sequences, columns=["sequence", "label"])
# df["id"] = range(len(df))

# # Reorder columns if you want: id, sequence, label
# df = df[["id", "sequence", "label"]]

# # Save it back
# df.to_csv("converted_onehot_sequences.csv", index=False)
# print("✅ Saved as 'converted_onehot_sequences.csv'")

## Data from AlgPred 2.0

In [17]:
if dataset_name == "algpred2":
    # Download the four AlgPred 2.0 files (train/validation x positive/negative)
    # and store them as '<dataset>/<dataset>_train.csv' and
    # '<dataset>/<dataset>_test.csv' with columns: id, sequence, label.
    # The AlgPred 2.0 validation files are written to the *_test.csv path.
    data_dir = dataset_name
    os.makedirs(data_dir, exist_ok=True)

    train_csv_path = os.path.join(data_dir, f"{dataset_name}_train.csv")
    test_csv_path = os.path.join(data_dir, f"{dataset_name}_test.csv")

    # Reset the flag on every run. Testing `"proceed" in locals()` would pick up
    # a stale True left in the kernel by an earlier run and overwrite the files
    # even when the user answers "n".
    proceed = True
    if os.path.exists(train_csv_path) or os.path.exists(test_csv_path):
        user_input = (
            input(
                f"⚠️ Files already exist in '{data_dir}/'. Do you want to overwrite them? (y/n): "
            )
            .strip()
            .lower()
        )
        if user_input != "y":
            print("⏭️  Skipping download and parsing.")
            proceed = False

    if proceed:
        # name -> (URL, class label, split)
        datasets = {
            "train_positive": (
                "https://webs.iiitd.edu.in/raghava/algpred2/datasets/train_positive.txt",
                1,
                "train",
            ),
            "train_negative": (
                "https://webs.iiitd.edu.in/raghava/algpred2/datasets/train_negative.txt",
                0,
                "train",
            ),
            "validation_positive": (
                "https://webs.iiitd.edu.in/raghava/algpred2/datasets/validation_positive.txt",
                1,
                "val",
            ),
            "validation_negative": (
                "https://webs.iiitd.edu.in/raghava/algpred2/datasets/validation_negative.txt",
                0,
                "val",
            ),
        }

        def parse_fasta(fasta_text, label):
            """Parse FASTA text into a list of (id, sequence, label) tuples.

            Args:
                fasta_text: Full text of a FASTA file.
                label: Value attached to every record parsed from this text.

            Returns:
                One tuple per record that has a header line and a non-empty
                sequence. The id is the header line without the leading ">".
                Records with an empty sequence are skipped wherever they occur,
                and text before the first header is ignored.
            """
            sequences = []
            current_id = None
            chunks = []  # sequence lines of the current record; joined once at the end
            for line in fasta_text.splitlines():
                line = line.strip()
                if not line:
                    continue
                if line.startswith(">"):
                    if current_id is not None and chunks:
                        sequences.append((current_id, "".join(chunks), label))
                    current_id = line[1:]  # remove ">"
                    chunks = []
                elif current_id is not None:
                    chunks.append(line)
            if current_id is not None and chunks:
                sequences.append((current_id, "".join(chunks), label))
            return sequences

        # Split into train and validation entries
        train_entries = []
        val_entries = []

        for name, (url, label, split) in datasets.items():
            print(f"⬇️  Downloading {name} from {url}...")
            # The timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(url, timeout=60)
            # Without this check an HTTP error page would be parsed as FASTA and
            # silently saved as an empty or garbage CSV.
            response.raise_for_status()
            entries = parse_fasta(response.text, label)
            if split == "train":
                train_entries.extend(entries)
            else:
                val_entries.extend(entries)

        # Convert to DataFrames
        df_train = pd.DataFrame(train_entries, columns=["id", "sequence", "label"])
        df_val = pd.DataFrame(val_entries, columns=["id", "sequence", "label"])

        # Save to CSV inside dataset folder
        df_train.to_csv(train_csv_path, index=False)
        df_val.to_csv(test_csv_path, index=False)

        print(f"✅ Saved training set to '{train_csv_path}'")
        print(f"✅ Saved validation set to '{test_csv_path}'")

⬇️  Downloading train_positive from https://webs.iiitd.edu.in/raghava/algpred2/datasets/train_positive.txt...
⬇️  Downloading train_negative from https://webs.iiitd.edu.in/raghava/algpred2/datasets/train_negative.txt...
⬇️  Downloading validation_positive from https://webs.iiitd.edu.in/raghava/algpred2/datasets/validation_positive.txt...
⬇️  Downloading validation_negative from https://webs.iiitd.edu.in/raghava/algpred2/datasets/validation_negative.txt...
✅ Saved training set to 'algpred2/algpred2_train.csv'
✅ Saved validation set to 'algpred2/algpred2_test.csv'


# ESM-2 embedding extraction

In [18]:
# === CONFIG ===
# Width of one embedding row (ESM-2 T6-8M) and number of sequences per forward pass.
feature_dim = 320
batch_size = 1  # raise if memory allows

# Every input and output file lives in a folder named after the dataset.
data_dir = dataset_name
os.makedirs(data_dir, exist_ok=True)

# Per-split sequence CSVs (inputs).
input_files = dict(
    train=os.path.join(data_dir, f"{dataset_name}_train.csv"),
    test=os.path.join(data_dir, f"{dataset_name}_test.csv"),
)

# Per-split embedding CSVs (outputs).
embedding_files = dict(
    train=os.path.join(data_dir, f"train_{dataset_name}_esm2_embeddings.csv"),
    test=os.path.join(data_dir, f"test_{dataset_name}_esm2_embeddings.csv"),
)

# Check if both embedding files exist
if all(os.path.exists(f) for f in embedding_files.values()):
    print(
        f"✅ ESM2 embedding files already exist in '{data_dir}/'. Skipping embedding generation."
    )
else:
    # --- Load ESM-2 model ---
    model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
    batch_converter = alphabet.get_batch_converter()
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # --- Helper function ---
    def process_file(split_name, input_file):
        """Compute mean-pooled ESM-2 embeddings for one split and save them as CSV.

        Rows are appended to a temporary CSV as they are computed. If that file
        already exists (an interrupted earlier run), ids found in it are skipped.
        When all rows are written the temporary file is renamed to the final
        '<split>_<dataset>_esm2_embeddings.csv'.

        Args:
            split_name: Split label used in file names and messages (e.g. "train").
            input_file: CSV with columns "id", "sequence" and "label".

        Raises:
            ValueError: If the model's embedding width differs from `feature_dim`.
        """
        temp_file = os.path.join(
            data_dir, f"{split_name}_{dataset_name}_esm2_embeddings_temp.csv"
        )
        final_file = os.path.join(
            data_dir, f"{split_name}_{dataset_name}_esm2_embeddings.csv"
        )

        # Load dataset
        df = pd.read_csv(input_file)

        # An empty sequence field is read back as NaN and would crash the batch
        # converter part-way through a long run, so drop such rows up front.
        valid_mask = df["sequence"].notna()
        if not valid_mask.all():
            print(
                f"⚠️  Skipping {int((~valid_mask).sum())} {split_name} rows with an empty sequence."
            )
            df = df[valid_mask]

        sequences = list(df["sequence"])
        labels = list(df["label"])
        ids = list(df["id"])

        # Resume support
        if os.path.exists(temp_file):
            processed_ids = set(pd.read_csv(temp_file, usecols=["id"])["id"])
            print(
                f"🔁 Resuming {split_name} from {temp_file} — {len(processed_ids)} entries already processed."
            )
        else:
            processed_ids = set()

        remaining_data = [
            (seq_id, seq, label)
            for seq_id, seq, label in zip(ids, sequences, labels)
            if seq_id not in processed_ids
        ]

        # Output format
        feature_names = [f"f{k}" for k in range(feature_dim)]
        fieldnames = ["id", "label"] + feature_names
        write_header = not os.path.exists(temp_file)

        print(
            f"⚙️  Extracting embeddings for {split_name} set... ({len(remaining_data)} sequences remaining)"
        )

        with open(temp_file, mode="a", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            if write_header:
                writer.writeheader()

            for start in tqdm(range(0, len(remaining_data), batch_size)):
                batch = remaining_data[start : start + batch_size]
                batch_data = [(seq_id, seq) for seq_id, seq, _ in batch]
                _, _, batch_tokens = batch_converter(batch_data)
                batch_tokens = batch_tokens.to(device)

                with torch.no_grad():
                    # Layer 6 — presumably the final layer of the 6-layer model loaded above.
                    outputs = model(batch_tokens, repr_layers=[6])
                    token_representations = outputs["representations"][6]

                # Guard against a model/config mismatch, which would otherwise
                # truncate the embedding or fail with an IndexError.
                embedding_width = token_representations.shape[-1]
                if embedding_width != feature_dim:
                    raise ValueError(
                        f"Model embedding size {embedding_width} does not match feature_dim={feature_dim}."
                    )

                rows = []
                for j, (seq_id, seq, label) in enumerate(batch):
                    # Average over positions 1..len(seq): this skips position 0
                    # (presumably the start token added by the batch converter)
                    # and any padding after the sequence.
                    representation = token_representations[j, 1 : len(seq) + 1].mean(0)
                    entry = {"id": seq_id, "label": label}
                    # One transfer per row instead of feature_dim `.item()` calls.
                    entry.update(zip(feature_names, representation.tolist()))
                    rows.append(entry)

                writer.writerows(rows)
                # Flush per batch so an interrupted run loses as little as possible.
                f.flush()

        # Final save
        os.replace(temp_file, final_file)
        print(f"✅ Final {split_name} embeddings saved to '{final_file}'")

    # --- Process each split ---
    for split, file in input_files.items():
        # Do not recompute a split whose final file already exists (only the
        # missing ones need the expensive forward passes).
        if os.path.exists(embedding_files[split]):
            print(
                f"✅ {split} embeddings already exist at '{embedding_files[split]}'. Skipping."
            )
            continue
        process_file(split, file)

⚙️  Extracting embeddings for train set... (16120 sequences remaining)


100%|██████████| 16120/16120 [25:20<00:00, 10.60it/s]  


✅ Final train embeddings saved to 'algpred2/train_algpred2_esm2_embeddings.csv'
⚙️  Extracting embeddings for test set... (4030 sequences remaining)


100%|██████████| 4030/4030 [06:08<00:00, 10.93it/s]

✅ Final test embeddings saved to 'algpred2/test_algpred2_esm2_embeddings.csv'





## Model (XGBoost)

In [19]:
# ====================================
# Step 1: Load Data
# ====================================
# The embedding cell writes its CSVs inside the dataset-named folder, so the
# paths must include that folder; bare file names would only work if stale
# copies happened to sit in the current working directory.
embedding_files = {
    "train": os.path.join(dataset_name, f"train_{dataset_name}_esm2_embeddings.csv"),
    "test": os.path.join(dataset_name, f"test_{dataset_name}_esm2_embeddings.csv"),
}

# Load the data
df_train = pd.read_csv(embedding_files["train"])
df_test = pd.read_csv(embedding_files["test"])

# Every column other than "id" and "label" is an embedding feature (f0, f1, ...).
# Deriving the list from the file avoids hard-coding the embedding width.
feature_cols = [col for col in df_train.columns if col not in ("id", "label")]
X_train_full = df_train[feature_cols].values
y_train_full = df_train["label"].values

X_test = df_test[feature_cols].values
y_test = df_test["label"].values

print(f"✅ Loaded: Train={X_train_full.shape}, Test={X_test.shape}")

# ====================================
# Step 2: Dummy Classifier Baseline (on Train)
# ====================================
# Chance-level reference: predictions drawn from the training class distribution.
print("\n📉 DummyClassifier (Stratified) on Training Set (CV):\n")
dummy = DummyClassifier(strategy="stratified", random_state=42)
dummy_aucs = []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, val_idx in cv.split(X_train_full, y_train_full):
    dummy.fit(X_train_full[train_idx], y_train_full[train_idx])
    y_dummy_proba = dummy.predict_proba(X_train_full[val_idx])[:, 1]
    auc = roc_auc_score(y_train_full[val_idx], y_dummy_proba)
    dummy_aucs.append(auc)

print(f"📊 Dummy ROC-AUC: {np.mean(dummy_aucs):.4f} ± {np.std(dummy_aucs):.4f}")

# ====================================
# Step 3: Cross-Validation on Training Set (XGBoost)
# ====================================
# NOTE(review): folds are random stratified splits. If the training set contains
# highly similar sequences, CV scores may be optimistic compared with the
# hold-out test set — TODO confirm.
print("\n🚀 5-Fold Cross-Validation (XGBoost) on Training Set...\n")
xgb_aucs = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_full, y_train_full)):
    X_train, X_val = X_train_full[train_idx], X_train_full[val_idx]
    y_train, y_val = y_train_full[train_idx], y_train_full[val_idx]

    clf = xgb.XGBClassifier(
        use_label_encoder=False, eval_metric="logloss", random_state=42
    )
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_val)
    y_proba = clf.predict_proba(X_val)[:, 1]

    auc = roc_auc_score(y_val, y_proba)
    xgb_aucs.append(auc)

    print(f"📂 Fold {fold+1} AUC: {auc:.4f}")
    print(classification_report(y_val, y_pred, digits=4))
    print("------")

print(f"\n✅ Mean CV ROC-AUC: {np.mean(xgb_aucs):.4f} ± {np.std(xgb_aucs):.4f}")

# ====================================
# Step 4: Final Test Set Evaluation
# ====================================
# Refit on the full training set, then score once on the held-out test file.
print("\n🔒 Final Evaluation on Hold-Out Test Set...\n")
clf_final = xgb.XGBClassifier(
    use_label_encoder=False, eval_metric="logloss", random_state=42
)
clf_final.fit(X_train_full, y_train_full)

y_test_pred = clf_final.predict(X_test)
y_test_proba = clf_final.predict_proba(X_test)[:, 1]

test_auc = roc_auc_score(y_test, y_test_proba)
print(classification_report(y_test, y_test_pred, digits=4))
print(f"🎯 Final Test ROC-AUC: {test_auc:.4f}")

# ====================================
# Step 5: Y-Scrambling Control
# ====================================
# Repeat the CV with permuted labels. A seeded NumPy Generator returns a shuffled
# copy, leaving y_train_full and the global `random` state untouched.
print("\n🧪 Y-Scrambling (sanity check) on Training Set...\n")
y_scrambled = np.random.default_rng(42).permutation(y_train_full)

scrambled_aucs = []
for train_idx, val_idx in cv.split(X_train_full, y_scrambled):
    X_train, X_val = X_train_full[train_idx], X_train_full[val_idx]
    y_train, y_val = y_scrambled[train_idx], y_scrambled[val_idx]

    clf_scrambled = xgb.XGBClassifier(
        use_label_encoder=False, eval_metric="logloss", random_state=42
    )
    clf_scrambled.fit(X_train, y_train)
    y_proba_scrambled = clf_scrambled.predict_proba(X_val)[:, 1]

    auc = roc_auc_score(y_val, y_proba_scrambled)
    scrambled_aucs.append(auc)

print(
    f"🔀 Y-Scrambled ROC-AUC: {np.mean(scrambled_aucs):.4f} ± {np.std(scrambled_aucs):.4f}"
)
print("👉 This should be near 0.5 if your real model learned something.")

✅ Loaded: Train=(16120, 320), Test=(4030, 320)

📉 DummyClassifier (Stratified) on Training Set (CV):

📊 Dummy ROC-AUC: 0.4991 ± 0.0000

🚀 5-Fold Cross-Validation (XGBoost) on Training Set...





📂 Fold 1 AUC: 0.9957
              precision    recall  f1-score   support

           0     0.9818    0.9708    0.9763      1612
           1     0.9712    0.9820    0.9766      1612

    accuracy                         0.9764      3224
   macro avg     0.9765    0.9764    0.9764      3224
weighted avg     0.9765    0.9764    0.9764      3224

------




📂 Fold 2 AUC: 0.9972
              precision    recall  f1-score   support

           0     0.9801    0.9764    0.9782      1612
           1     0.9765    0.9801    0.9783      1612

    accuracy                         0.9783      3224
   macro avg     0.9783    0.9783    0.9783      3224
weighted avg     0.9783    0.9783    0.9783      3224

------




📂 Fold 3 AUC: 0.9978
              precision    recall  f1-score   support

           0     0.9838    0.9789    0.9813      1612
           1     0.9790    0.9839    0.9814      1612

    accuracy                         0.9814      3224
   macro avg     0.9814    0.9814    0.9814      3224
weighted avg     0.9814    0.9814    0.9814      3224

------




📂 Fold 4 AUC: 0.9958
              precision    recall  f1-score   support

           0     0.9795    0.9789    0.9792      1612
           1     0.9789    0.9795    0.9792      1612

    accuracy                         0.9792      3224
   macro avg     0.9792    0.9792    0.9792      3224
weighted avg     0.9792    0.9792    0.9792      3224

------




📂 Fold 5 AUC: 0.9976
              precision    recall  f1-score   support

           0     0.9855    0.9671    0.9762      1612
           1     0.9677    0.9857    0.9766      1612

    accuracy                         0.9764      3224
   macro avg     0.9766    0.9764    0.9764      3224
weighted avg     0.9766    0.9764    0.9764      3224

------

✅ Mean CV ROC-AUC: 0.9968 ± 0.0009

🔒 Final Evaluation on Hold-Out Test Set...





              precision    recall  f1-score   support

           0     0.7369    0.9811    0.8416      2015
           1     0.9718    0.6496    0.7787      2015

    accuracy                         0.8154      4030
   macro avg     0.8543    0.8154    0.8102      4030
weighted avg     0.8543    0.8154    0.8102      4030

🎯 Final Test ROC-AUC: 0.9477

🧪 Y-Scrambling (sanity check) on Training Set...





🔀 Y-Scrambled ROC-AUC: 0.4974 ± 0.0093
👉 This should be near 0.5 if your real model learned something.
