In [None]:
pip install pandas requests

In [1]:
import pandas as pd
import requests
import os # Ensure the script is run in a directory where you have write permissions

# URLs
datasets = {
    "train_positive": ("https://webs.iiitd.edu.in/raghava/algpred2/datasets/train_positive.txt", 1),
    "train_negative": ("https://webs.iiitd.edu.in/raghava/algpred2/datasets/train_negative.txt", 0),
    "test_positive": ("https://webs.iiitd.edu.in/raghava/algpred2/datasets/validation_positive.txt", 1),
    "test_negative": ("https://webs.iiitd.edu.in/raghava/algpred2/datasets/validation_negative.txt", 0)
}

# Fasta parser
def parse_fasta(text, label):
    entries, cur_id, cur_seq = [], None, ""
    for line in text.strip().splitlines():
        line = line.strip()
        if line.startswith(">"):
            if cur_id:
                entries.append((cur_id, cur_seq, label))
            cur_id = line[1:]
            cur_seq = ""
        else:
            cur_seq += line
    if cur_id and cur_seq:
        entries.append((cur_id, cur_seq, label))
    return entries

# Process each
for setname in ["train", "test"]:
    entries = []
    for k, (url, label) in datasets.items():
        if k.startswith(setname):
            print(f"⬇️ Downloading {k}")
            r = requests.get(url)
            entries.extend(parse_fasta(r.text, label))
    df = pd.DataFrame(entries, columns=["id", "sequence", "label"])
    df.to_csv(f"algpred2_{setname}.csv", index=False)
    print(f"✅ Saved {setname} set: {len(df)} entries")
    print("Saved files in:", os.getcwd())


⬇️ Downloading train_positive
⬇️ Downloading train_negative
✅ Saved train set: 16120 entries
Saved files in: /Users/jianzhouyao/AllergenPredict
⬇️ Downloading test_positive
⬇️ Downloading test_negative
✅ Saved test set: 4030 entries
Saved files in: /Users/jianzhouyao/AllergenPredict


In [None]:
!pip install fair-esm torch pandas tqdm

Collecting fair-esm
  Downloading fair_esm-2.0.0-py3-none-any.whl.metadata (37 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [None]:
import pandas as pd
import torch
import esm
from tqdm import tqdm
import os
import csv

for dataset in ["algpred2_train.csv", "algpred2_test.csv"]:
    # Determine base name
    base = os.path.splitext(os.path.basename(dataset))[0]

    # Load dataset
    df = pd.read_csv(dataset)
    sequences = list(df["sequence"])
    labels = list(df["label"])
    ids = list(df["id"])

    # Load ESM-2 model
    model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
    batch_converter = alphabet.get_batch_converter()
    model.eval()

    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Output files (unique per dataset)
    temp_file = f"{base}_esm2_embeddings_temp.csv"
    final_file = f"{base}_esm2_embeddings.csv"

    # Already processed IDs (for resuming)
    if os.path.exists(temp_file):
        processed_ids = set(pd.read_csv(temp_file, usecols=["id"])["id"])
        print(f"🔁 Resuming from {temp_file} — {len(processed_ids)} entries already processed.")
    else:
        processed_ids = set()

    # Filter data
    remaining_data = [(ids[i], sequences[i], labels[i]) for i in range(len(ids)) if ids[i] not in processed_ids]

    # Batch setup
    batch_size = 1
    write_header = not os.path.exists(temp_file)
    feature_dim = 1280
    fieldnames = ["id", "label"] + [f"f{k}" for k in range(feature_dim)]

    print(f"⚙️  Extracting embeddings using ESM-2 for {dataset}... ({len(remaining_data)} sequences remaining)")

    with open(temp_file, mode="a", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if write_header:
            writer.writeheader()

        for i in tqdm(range(0, len(remaining_data), batch_size)):
            batch = remaining_data[i:i + batch_size]
            batch_ids = [x[0] for x in batch]
            batch_seqs = [x[1] for x in batch]
            batch_labels = [x[2] for x in batch]

            batch_data = [(batch_ids[j], batch_seqs[j]) for j in range(len(batch_seqs))]
            _, _, batch_tokens = batch_converter(batch_data)
            batch_tokens = batch_tokens.to(device)

            with torch.no_grad():
                outputs = model(batch_tokens, repr_layers=[33])  # Layer 33 for T33
                token_representations = outputs["representations"][33]

            rows = []
            for j, (_, seq) in enumerate(batch_data):
                representation = token_representations[j, 1:len(seq)+1].mean(0)
                entry = {
                    "id": batch_ids[j],
                    "label": batch_labels[j],
                }
                for k in range(feature_dim):
                    entry[f"f{k}"] = representation[k].item()
                rows.append(entry)

            writer.writerows(rows)

    # Final save
    os.replace(temp_file, final_file)
    print(f"✅ Final embeddings saved to '{final_file}'\n")


🔁 Resuming from algpred2_train_esm2_embeddings_temp.csv — 1716 entries already processed.
⚙️  Extracting embeddings using ESM-2 for algpred2_train.csv... (14404 sequences remaining)


100%|██████████| 14404/14404 [50:54<00:00,  4.72it/s]


✅ Final embeddings saved to 'algpred2_train_esm2_embeddings.csv'

⚙️  Extracting embeddings using ESM-2 for algpred2_test.csv... (4030 sequences remaining)


100%|██████████| 4030/4030 [13:19<00:00,  5.04it/s]

✅ Final embeddings saved to 'algpred2_test_esm2_embeddings.csv'






In [None]:
import pandas as pd

df = pd.read_csv("algpred2_train.csv")
print(df.head())
print(df.shape)  # Rows, features

     id                                           sequence  label
0  P_13  MGKPFTLSLSSLCLLLLSSACFAISSSKLNECQLNNLNALEPDHRV...      1
1  P_14  MGVFTFEDEINSPVAPATLYKALVTDADNVIPKALDSFKSVENVEG...      1
2  P_17  MAEDEDNQQGQGEGLKYLGFVQDAATYAVTTFSNVYLFAKDKSGPL...      1
3  P_46  MGVFNYEVETPSVISAARLFKSYVLDGDKLIPKVAPQAITSVENVG...      1
4  P_47  MGVFNYEVETPSVIPAARLFKSYVLDGDKLIPKVAPQAITSVENVE...      1
(16120, 3)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.dummy import DummyClassifier
import xgboost as xgb
import random

# ====================================
# Step 1: Load Data (Proper Separation)
# ====================================
df_train = pd.read_csv("algpred2_train_esm2_embeddings.csv")
df_test = pd.read_csv("algpred2_test_esm2_embeddings.csv")

feature_cols = [f"f{i}" for i in range(1280)]  # Adjust if using larger model
X_temp = df_train[feature_cols].values
y_temp = df_train["label"].values

X_test = df_test[feature_cols].values
y_test = df_test["label"].values

print(f"📁 Train+Val size: {X_temp.shape}, Test size: {X_test.shape}")


# ====================================
# Step 3: Dummy Classifier Baseline (on Train+Val)
# ====================================
print("\n📉 DummyClassifier (Stratified) on Train+Val:\n")
dummy = DummyClassifier(strategy="stratified", random_state=42)
dummy_aucs = []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, val_idx in cv.split(X_temp, y_temp):
    dummy.fit(X_temp[train_idx], y_temp[train_idx])
    y_dummy_proba = dummy.predict_proba(X_temp[val_idx])[:, 1]
    auc = roc_auc_score(y_temp[val_idx], y_dummy_proba)
    dummy_aucs.append(auc)

print(f"📊 Dummy ROC-AUC: {np.mean(dummy_aucs):.4f} ± {np.std(dummy_aucs):.4f}")

# ====================================
# Step 4: Cross-Validation on Train+Val (XGBoost)
# ====================================
print("\n🚀 5-Fold Cross-Validation (XGBoost) on Train+Val...\n")
xgb_aucs = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_temp, y_temp)):
    X_train, X_val = X_temp[train_idx], X_temp[val_idx]
    y_train, y_val = y_temp[train_idx], y_temp[val_idx]

    clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_val)
    y_proba = clf.predict_proba(X_val)[:, 1]

    auc = roc_auc_score(y_val, y_proba)
    xgb_aucs.append(auc)

    print(f"📂 Fold {fold+1} AUC: {auc:.4f}")
    print(classification_report(y_val, y_pred, digits=4))
    print("------")

print(f"\n✅ Mean CV ROC-AUC: {np.mean(xgb_aucs):.4f} ± {np.std(xgb_aucs):.4f}")

# ====================================
# Step 5: Final Test Set Evaluation
# ====================================
print("\n🔒 Final Evaluation on Hold-Out Test Set...\n")
clf_final = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
clf_final.fit(X_temp, y_temp)

y_test_pred = clf_final.predict(X_test)
y_test_proba = clf_final.predict_proba(X_test)[:, 1]

test_auc = roc_auc_score(y_test, y_test_proba)
print(classification_report(y_test, y_test_pred, digits=4))
print(f"🎯 Final Test ROC-AUC: {test_auc:.4f}")

# ====================================
# Step 6: Y-Scrambling Control
# ====================================
print("\n🧪 Y-Scrambling (sanity check) on Train+Val...\n")
y_temp_scrambled = y_temp.copy()
random.seed(42)
random.shuffle(y_temp_scrambled)

scrambled_aucs = []
for train_idx, val_idx in cv.split(X_temp, y_temp_scrambled):
    X_train, X_val = X_temp[train_idx], X_temp[val_idx]
    y_train, y_val = y_temp_scrambled[train_idx], y_temp_scrambled[val_idx]

    clf_scrambled = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
    clf_scrambled.fit(X_train, y_train)
    y_proba_scrambled = clf_scrambled.predict_proba(X_val)[:, 1]

    auc = roc_auc_score(y_val, y_proba_scrambled)
    scrambled_aucs.append(auc)

print(f"🔀 Y-Scrambled ROC-AUC: {np.mean(scrambled_aucs):.4f} ± {np.std(scrambled_aucs):.4f}")
print("👉 This should be near 0.5 if your real model learned something.")


📁 Train+Val size: (16120, 1280), Test size: (4030, 1280)

📉 DummyClassifier (Stratified) on Train+Val:

📊 Dummy ROC-AUC: 0.4991 ± 0.0000

🚀 5-Fold Cross-Validation (XGBoost) on Train+Val...



Parameters: { "use_label_encoder" } are not used.



📂 Fold 1 AUC: 0.9977
              precision    recall  f1-score   support

           0     0.9900    0.9845    0.9872      1612
           1     0.9846    0.9901    0.9873      1612

    accuracy                         0.9873      3224
   macro avg     0.9873    0.9873    0.9873      3224
weighted avg     0.9873    0.9873    0.9873      3224

------


Parameters: { "use_label_encoder" } are not used.



📂 Fold 2 AUC: 0.9982
              precision    recall  f1-score   support

           0     0.9863    0.9845    0.9854      1612
           1     0.9845    0.9864    0.9854      1612

    accuracy                         0.9854      3224
   macro avg     0.9854    0.9854    0.9854      3224
weighted avg     0.9854    0.9854    0.9854      3224

------


Parameters: { "use_label_encoder" } are not used.



📂 Fold 3 AUC: 0.9988
              precision    recall  f1-score   support

           0     0.9906    0.9833    0.9869      1612
           1     0.9834    0.9907    0.9870      1612

    accuracy                         0.9870      3224
   macro avg     0.9870    0.9870    0.9870      3224
weighted avg     0.9870    0.9870    0.9870      3224

------


Parameters: { "use_label_encoder" } are not used.



📂 Fold 4 AUC: 0.9981
              precision    recall  f1-score   support

           0     0.9839    0.9851    0.9845      1612
           1     0.9851    0.9839    0.9845      1612

    accuracy                         0.9845      3224
   macro avg     0.9845    0.9845    0.9845      3224
weighted avg     0.9845    0.9845    0.9845      3224

------


Parameters: { "use_label_encoder" } are not used.



📂 Fold 5 AUC: 0.9988
              precision    recall  f1-score   support

           0     0.9882    0.9864    0.9873      1612
           1     0.9864    0.9882    0.9873      1612

    accuracy                         0.9873      3224
   macro avg     0.9873    0.9873    0.9873      3224
weighted avg     0.9873    0.9873    0.9873      3224

------

✅ Mean CV ROC-AUC: 0.9983 ± 0.0004

🔒 Final Evaluation on Hold-Out Test Set...



Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0     0.7800    0.9906    0.8728      2015
           1     0.9871    0.7206    0.8330      2015

    accuracy                         0.8556      4030
   macro avg     0.8835    0.8556    0.8529      4030
weighted avg     0.8835    0.8556    0.8529      4030

🎯 Final Test ROC-AUC: 0.9793

🧪 Y-Scrambling (sanity check) on Train+Val...



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



🔀 Y-Scrambled ROC-AUC: 0.4913 ± 0.0076
👉 This should be near 0.5 if your real model learned something.
