In [None]:
import pandas as pd
import requests
from io import StringIO

# AlgPred 2.0 download locations.
# Maps a split name to a (FASTA file URL, class label) pair; the *_positive
# files carry label 1 and the *_negative files carry label 0.
datasets = dict(
    train_positive=(
        "https://webs.iiitd.edu.in/raghava/algpred2/datasets/train_positive.txt",
        1,
    ),
    train_negative=(
        "https://webs.iiitd.edu.in/raghava/algpred2/datasets/train_negative.txt",
        0,
    ),
    validation_positive=(
        "https://webs.iiitd.edu.in/raghava/algpred2/datasets/validation_positive.txt",
        1,
    ),
    validation_negative=(
        "https://webs.iiitd.edu.in/raghava/algpred2/datasets/validation_negative.txt",
        0,
    ),
)

# Parse FASTA format


def parse_fasta(fasta_text, label):
    """Parse FASTA-formatted text into ``(id, sequence, label)`` tuples.

    A record starts at a line beginning with ``>``; the rest of that line
    (without the ``>``) is the record id. All following non-header lines, up
    to the next header, are stripped and concatenated into the sequence.

    Records whose sequence is empty are skipped regardless of where they
    appear in the text, so the first, middle and last records are all treated
    the same way. Any text before the first header is ignored.

    Args:
        fasta_text: The full FASTA content as a single string.
        label: Value stored as the third element of every returned tuple.

    Returns:
        A list of ``(id, sequence, label)`` tuples in file order.
    """
    records = []
    current_id = None
    parts = []  # sequence fragments of the record being read

    for raw_line in fasta_text.strip().splitlines():
        line = raw_line.strip()
        if line.startswith(">"):
            # A new header closes the previous record (if it has a sequence).
            if current_id is not None and parts:
                records.append((current_id, "".join(parts), label))
            current_id = line[1:]  # remove ">"
            parts = []
        elif line:
            # Only non-blank lines are collected, so `parts` is non-empty
            # exactly when the sequence is non-empty.
            parts.append(line)

    # Close the last record using the same rule as for the earlier ones.
    if current_id is not None and parts:
        records.append((current_id, "".join(parts), label))
    return records


# Download and parse all files
all_entries = []
for name, (url, label) in datasets.items():
    print(f"Downloading {name}...")
    # Bound the wait so a stalled server cannot hang the notebook forever.
    response = requests.get(url, timeout=60)
    # Fail loudly on HTTP errors instead of parsing an error page as FASTA,
    # which would silently produce an incomplete dataset.
    response.raise_for_status()
    entries = parse_fasta(response.text, label)
    if not entries:
        raise ValueError(f"No FASTA records found in {name} ({url})")
    all_entries.extend(entries)

# Convert to DataFrame: one row per record, columns id / sequence / label
df = pd.DataFrame(all_entries, columns=["id", "sequence", "label"])
df.to_csv("algpred2_cleaned.csv", index=False)
print("✅ Saved cleaned dataset to 'algpred2_cleaned.csv'")

Downloading train_positive...
Downloading train_negative...
Downloading validation_positive...
Downloading validation_negative...
✅ Saved cleaned dataset to 'algpred2_cleaned.csv'


In [None]:
!pip install fair-esm torch pandas tqdm

Collecting fair-esm
  Downloading fair_esm-2.0.0-py3-none-any.whl.metadata (37 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [None]:
import pandas as pd
import torch
import esm
from tqdm import tqdm
import os
import csv

# Load dataset
# pandas reads an empty CSV field as NaN; a row without a sequence cannot be
# tokenised, so such rows are dropped before anything else happens.
df = pd.read_csv("algpred2_cleaned.csv").dropna(subset=["sequence"])
sequences = list(df["sequence"])
labels = list(df["label"])
ids = list(df["id"])

# Load ESM-2 model
model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
batch_converter = alphabet.get_batch_converter()
model.eval()

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Output files: rows are appended to the temp file while running and the
# temp file is renamed to the final file only after every sequence is done.
temp_file = "algpred2_esm2_embeddings_temp.csv"
final_file = "algpred2_esm2_embeddings.csv"

# Already processed IDs (for resuming).
# A zero-byte temp file (left by a crash before the header was written) is
# treated as "nothing processed yet" instead of being fed to read_csv.
has_partial_output = os.path.exists(temp_file) and os.path.getsize(temp_file) > 0
if has_partial_output:
    processed_ids = set(pd.read_csv(temp_file, usecols=["id"])["id"])
    print(
        f"🔁 Resuming from {temp_file} — {len(processed_ids)} entries already processed."
    )
else:
    processed_ids = set()

# Filter data: keep only the records that are not in the temp file yet
remaining_data = [
    (seq_id, seq, label)
    for seq_id, seq, label in zip(ids, sequences, labels)
    if seq_id not in processed_ids
]

# Batch setup
batch_size = 1
write_header = not has_partial_output
feature_dim = 320  # ESM-2 T6-8M has 320-dim embeddings
feature_names = [f"f{k}" for k in range(feature_dim)]
fieldnames = ["id", "label"] + feature_names

print(
    f"⚙️  Extracting embeddings using ESM-2... ({len(remaining_data)} sequences remaining)"
)

with open(temp_file, mode="a", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    if write_header:
        writer.writeheader()

    for i in tqdm(range(0, len(remaining_data), batch_size)):
        batch = remaining_data[i : i + batch_size]

        # The batch converter takes (name, sequence) pairs.
        batch_data = [(seq_id, seq) for seq_id, seq, _ in batch]
        _, _, batch_tokens = batch_converter(batch_data)
        batch_tokens = batch_tokens.to(device)

        with torch.no_grad():
            outputs = model(batch_tokens, repr_layers=[6])
        token_representations = outputs["representations"][6]

        rows = []
        for j, (seq_id, seq, label) in enumerate(batch):
            # Mean over positions 1..len(seq): position 0 and everything past
            # the sequence are excluded (presumably the special start token
            # and end/padding tokens added by the tokenizer — TODO confirm).
            # One .tolist() call replaces feature_dim separate .item() calls.
            values = token_representations[j, 1 : len(seq) + 1].mean(0).tolist()
            if len(values) != feature_dim:
                raise ValueError(
                    f"Expected {feature_dim} embedding values, got {len(values)}"
                )
            entry = {"id": seq_id, "label": label}
            entry.update(zip(feature_names, values))
            rows.append(entry)

        writer.writerows(rows)
        # Push completed rows to disk so an interrupted run leaves a temp
        # file that the resume logic above can read back.
        f.flush()

# Final save (rename temp file to final)
os.replace(temp_file, final_file)
print(f"✅ Final embeddings saved to '{final_file}'")

🔁 Resuming from algpred2_esm2_embeddings_temp.csv — 9668 entries already processed.
⚙️  Extracting embeddings using ESM-2... (10482 sequences remaining)


100%|██████████| 10482/10482 [03:25<00:00, 50.88it/s]

✅ Final embeddings saved to 'algpred2_esm2_embeddings.csv'





In [None]:
import pandas as pd

# Reload the saved embeddings, then print the first rows followed by the
# frame's (rows, columns) shape on the next line.
df = pd.read_csv("algpred2_esm2_embeddings.csv")
print(df.head(), df.shape, sep="\n")

     id  label        f0        f1        f2        f3        f4        f5  \
0  P_13      1 -0.109098 -0.185716  0.221519  0.117864  0.173010 -0.071297   
1  P_14      1 -0.130675  0.124254  0.189468  0.133837  0.307256  0.215280   
2  P_17      1 -0.047639  0.091741  0.206645  0.098816  0.142606 -0.018435   
3  P_46      1 -0.186664  0.066537  0.203491  0.166804  0.384989 -0.193995   
4  P_47      1 -0.181058  0.059647  0.199809  0.173464  0.381596 -0.214030   

         f6        f7  ...      f310      f311      f312      f313      f314  \
0  0.091311  0.057937  ...  0.112632 -0.199612 -0.013409  0.179182  0.017290   
1  0.131589 -0.020784  ...  0.039018  0.151819 -0.133527  0.132653  0.024618   
2 -0.028555  0.027641  ...  0.051962  0.149907  0.026423  0.055061  0.119498   
3 -0.038884 -0.133102  ...  0.195923 -0.055615 -0.194248  0.147510 -0.090668   
4 -0.020568 -0.122544  ...  0.231452 -0.054589 -0.196715  0.168328 -0.080225   

       f315      f316      f317      f318      f31

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.dummy import DummyClassifier
import xgboost as xgb
import random

# ====================================
# Step 1: Load Data
# ====================================
SEED = 42  # shared seed for the splits, the models and the label shuffle


def make_xgb():
    """Return a fresh XGBoost classifier with the configuration used throughout.

    `use_label_encoder` is deliberately not passed: the installed XGBoost
    reports it as an unused parameter and warns on every fit.
    """
    return xgb.XGBClassifier(eval_metric="logloss", random_state=SEED)


df = pd.read_csv("algpred2_esm2_embeddings.csv")

# Use every embedding column present in the file (f0, f1, ...) instead of a
# hard-coded count, so no feature is silently dropped when the embedding
# width differs from what was assumed here.
feature_cols = [c for c in df.columns if c.startswith("f") and c[1:].isdigit()]
if not feature_cols:
    raise ValueError("No feature columns (f0, f1, ...) found in the embeddings file")
X = df[feature_cols].values
y = df["label"].values

# ====================================
# Step 2: Create Final Test Set (10%)
# ====================================
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=SEED
)
print(f"📁 Train+Val size: {X_temp.shape}, Test size: {X_test.shape}")

# ====================================
# Step 3: Dummy Classifier Baseline (on Train+Val)
# ====================================
print("\n📉 DummyClassifier (Stratified) on Train+Val:\n")
dummy = DummyClassifier(strategy="stratified", random_state=SEED)
dummy_aucs = []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

for train_idx, val_idx in cv.split(X_temp, y_temp):
    dummy.fit(X_temp[train_idx], y_temp[train_idx])
    y_dummy_proba = dummy.predict_proba(X_temp[val_idx])[:, 1]
    auc = roc_auc_score(y_temp[val_idx], y_dummy_proba)
    dummy_aucs.append(auc)

print(f"📊 Dummy ROC-AUC: {np.mean(dummy_aucs):.4f} ± {np.std(dummy_aucs):.4f}")

# ====================================
# Step 4: Cross-Validation on Train+Val (XGBoost)
# ====================================
print("\n🚀 5-Fold Cross-Validation (XGBoost) on Train+Val...\n")
xgb_aucs = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_temp, y_temp)):
    X_train, X_val = X_temp[train_idx], X_temp[val_idx]
    y_train, y_val = y_temp[train_idx], y_temp[val_idx]

    clf = make_xgb()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_val)
    y_proba = clf.predict_proba(X_val)[:, 1]

    auc = roc_auc_score(y_val, y_proba)
    xgb_aucs.append(auc)

    print(f"📂 Fold {fold+1} AUC: {auc:.4f}")
    print(classification_report(y_val, y_pred, digits=4))
    print("------")

print(f"\n✅ Mean CV ROC-AUC: {np.mean(xgb_aucs):.4f} ± {np.std(xgb_aucs):.4f}")

# ====================================
# Step 5: Final Test Set Evaluation
# ====================================
print("\n🔒 Final Evaluation on Hold-Out Test Set...\n")
# Refit on all of Train+Val; the hold-out set is only used for scoring.
clf_final = make_xgb()
clf_final.fit(X_temp, y_temp)

y_test_pred = clf_final.predict(X_test)
y_test_proba = clf_final.predict_proba(X_test)[:, 1]

test_auc = roc_auc_score(y_test, y_test_proba)
print(classification_report(y_test, y_test_pred, digits=4))
print(f"🎯 Final Test ROC-AUC: {test_auc:.4f}")

# ====================================
# Step 6: Y-Scrambling Control
# ====================================
print("\n🧪 Y-Scrambling (sanity check) on Train+Val...\n")
# Permute the labels with a seeded NumPy generator: this returns a shuffled
# copy and leaves y_temp untouched, breaking the feature/label pairing.
y_temp_scrambled = np.random.default_rng(SEED).permutation(y_temp)

scrambled_aucs = []
for train_idx, val_idx in cv.split(X_temp, y_temp_scrambled):
    X_train, X_val = X_temp[train_idx], X_temp[val_idx]
    y_train, y_val = y_temp_scrambled[train_idx], y_temp_scrambled[val_idx]

    clf_scrambled = make_xgb()
    clf_scrambled.fit(X_train, y_train)
    y_proba_scrambled = clf_scrambled.predict_proba(X_val)[:, 1]

    auc = roc_auc_score(y_val, y_proba_scrambled)
    scrambled_aucs.append(auc)

print(
    f"🔀 Y-Scrambled ROC-AUC: {np.mean(scrambled_aucs):.4f} ± {np.std(scrambled_aucs):.4f}"
)
print("👉 This should be near 0.5 if your real model learned something.")

📁 Train+Val size: (18135, 256), Test size: (2015, 256)

📉 DummyClassifier (Stratified) on Train+Val:

📊 Dummy ROC-AUC: 0.4988 ± 0.0048

🚀 5-Fold Cross-Validation (XGBoost) on Train+Val...



Parameters: { "use_label_encoder" } are not used.



📂 Fold 1 AUC: 0.9955
              precision    recall  f1-score   support

           0     0.9779    0.9768    0.9774      1813
           1     0.9769    0.9779    0.9774      1814

    accuracy                         0.9774      3627
   macro avg     0.9774    0.9774    0.9774      3627
weighted avg     0.9774    0.9774    0.9774      3627

------


Parameters: { "use_label_encoder" } are not used.



📂 Fold 2 AUC: 0.9950
              precision    recall  f1-score   support

           0     0.9750    0.9680    0.9715      1813
           1     0.9683    0.9752    0.9717      1814

    accuracy                         0.9716      3627
   macro avg     0.9716    0.9716    0.9716      3627
weighted avg     0.9716    0.9716    0.9716      3627

------


Parameters: { "use_label_encoder" } are not used.



📂 Fold 3 AUC: 0.9961
              precision    recall  f1-score   support

           0     0.9778    0.9724    0.9751      1813
           1     0.9726    0.9779    0.9753      1814

    accuracy                         0.9752      3627
   macro avg     0.9752    0.9752    0.9752      3627
weighted avg     0.9752    0.9752    0.9752      3627

------


Parameters: { "use_label_encoder" } are not used.



📂 Fold 4 AUC: 0.9964
              precision    recall  f1-score   support

           0     0.9788    0.9653    0.9720      1814
           1     0.9657    0.9790    0.9723      1813

    accuracy                         0.9722      3627
   macro avg     0.9722    0.9722    0.9722      3627
weighted avg     0.9722    0.9722    0.9722      3627

------


Parameters: { "use_label_encoder" } are not used.



📂 Fold 5 AUC: 0.9951
              precision    recall  f1-score   support

           0     0.9782    0.9653    0.9717      1814
           1     0.9657    0.9785    0.9721      1813

    accuracy                         0.9719      3627
   macro avg     0.9720    0.9719    0.9719      3627
weighted avg     0.9720    0.9719    0.9719      3627

------

✅ Mean CV ROC-AUC: 0.9956 ± 0.0005

🔒 Final Evaluation on Hold-Out Test Set...



Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0     0.9735    0.9841    0.9788      1008
           1     0.9839    0.9732    0.9785      1007

    accuracy                         0.9787      2015
   macro avg     0.9787    0.9787    0.9787      2015
weighted avg     0.9787    0.9787    0.9787      2015

🎯 Final Test ROC-AUC: 0.9948

🧪 Y-Scrambling (sanity check) on Train+Val...



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



🔀 Y-Scrambled ROC-AUC: 0.5034 ± 0.0048
👉 This should be near 0.5 if your real model learned something.
