## imports

In [212]:
#esm2_env
import csv
import io
import os
import random
import sys
import time
import zipfile
from contextlib import redirect_stdout, redirect_stderr
from io import StringIO

import esm
import ipywidgets as widgets
import numpy as np
import pandas as pd
import requests
import torch
import torch.nn as nn
import torch.optim as optim
import xgboost as xgb
from IPython.display import display, clear_output
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
from transformers import BertModel, BertTokenizer




## User input: Dataset 

In [214]:
# IMPORTANT:
# 1. Run the import cell and this cell.
# 2. Pick values in the widget and press confirm, then run the rest of the
#    notebook (run + arrow down in the cell options of the next cell).

# === Shared appearance for all dropdowns ===
style = {'description_width': '150px'}
layout = widgets.Layout(width='350px')

# === Feedback area + confirm button ===
output = widgets.Output()
submit_button = widgets.Button(button_style='success', description="✅ Confirm Selection")

# === Dropdowns ===
dataset_selector = widgets.Dropdown(
    description="Select Dataset:",
    options=["algpred2", "algpred2_resplit", "iedb", "AllergenAI"],
    value="algpred2",
    style=style,
    layout=layout,
)

transformer_selector = widgets.Dropdown(
    description="Select Transformer:",
    options=["ESM-2_320dim", "ProtBert_1024dim", "ProtT5_1024dim"],
    value="ESM-2_320dim",
    style=style,
    layout=layout,
)

model_selector = widgets.Dropdown(
    description="Select Model:",
    options=["XGBoost", "FFNN", "Ridge"],
    value="XGBoost",
    style=style,
    layout=layout,
)


# === Button callback ===
def on_button_clicked(b):
    """Copy the current dropdown values into notebook globals and echo them.

    ``b`` is the clicked button passed in by ipywidgets; it is not used.
    The globals ``dataset_name``, ``transformer_name`` and ``model_name``
    are what the later cells branch on.
    """
    global dataset_name, transformer_name, model_name
    dataset_name, transformer_name, model_name = (
        dataset_selector.value,
        transformer_selector.value,
        model_selector.value,
    )

    # Print inside the Output widget so the summary replaces the previous one.
    with output:
        clear_output()
        print("✅ Selections made!")
        print(f"Dataset: {dataset_name}")
        print(f"Transformer: {transformer_name}")
        print(f"Model: {model_name}")


submit_button.on_click(on_button_clicked)

# === Display UI ===
ui_children = [
    dataset_selector,
    transformer_selector,
    model_selector,
    submit_button,
    output,
]
display(widgets.VBox(ui_children))


VBox(children=(Dropdown(description='Select Dataset:', layout=Layout(width='350px'), options=('algpred2', 'alg…

# Data curation

## Data from Benchmark

In [200]:
# if dataset_name == "benchmark":
#     print('Benchmark dataset chosen')

#     # Define base folder and output paths
#     data_dir = dataset_name
#     os.makedirs(data_dir, exist_ok=True)

#     train_csv_path = os.path.join(data_dir, f"{dataset_name}_train.csv")
#     test_csv_path = os.path.join(data_dir, f"{dataset_name}_test.csv")

#     # Check for existing files
#     # Skip if files already exist
#     if os.path.exists(train_csv_path) and os.path.exists(test_csv_path):
#         print(f"⏭️  Files already exist in '{data_dir}/'. Skipping FASTA parsing and CSV generation.")
#     else:
#         # Define file-label mapping and GitHub raw URLs
#         base_url = "https://raw.githubusercontent.com/Jeffateth/AllergenPredict/7fafbea0ab1646796abe40cafb800c46ba842bda/Benchmark_dataset"

#         datasets = {
#             "train_p.fasta": (1, "train"),
#             "train_n.fasta": (0, "train"),
#             "test_p.fasta":  (1, "test"),
#             "test_n.fasta":  (0, "test")
#         }

#         # Parse FASTA format
#         def parse_fasta(fasta_text, label):
#             sequences = []
#             current_id = None
#             current_seq = ""
#             for line in fasta_text.strip().splitlines():
#                 line = line.strip()
#                 if line.startswith(">"):
#                     if current_id is not None:
#                         sequences.append((current_id, current_seq, label))
#                     current_id = line[1:]
#                     current_seq = ""
#                 else:
#                     current_seq += line
#             if current_id and current_seq:
#                 sequences.append((current_id, current_seq, label))
#             return sequences

#         # Download and parse files
#         train_entries = []
#         test_entries = []

#         for filename, (label, split) in datasets.items():
#             url = f"{base_url}/{filename}"
#             print(f"⬇️  Downloading {filename} from {url}...")
#             response = requests.get(url)
#             response.raise_for_status()  # raise an error for failed downloads

#             fasta_text = response.text
#             entries = parse_fasta(fasta_text, label)
#             if split == "train":
#                 train_entries.extend(entries)
#             else:
#                 test_entries.extend(entries)

#         # Save to CSV inside dataset-named folder
#         df_train = pd.DataFrame(train_entries, columns=["id", "sequence", "label"])
#         df_test = pd.DataFrame(test_entries, columns=["id", "sequence", "label"])

#         df_train.to_csv(train_csv_path, index=False)
#         df_test.to_csv(test_csv_path, index=False)

#         print(f"✅ Saved training set to '{train_csv_path}'")
#         print(f"✅ Saved testing set to '{test_csv_path}'")
# else: print('Benchmark dataset not chosen')

## Data from IEDB

In [201]:
# Build the IEDB train/test CSVs when that dataset was selected in the widget.
if dataset_name != 'iedb':
    print('IEDB dataset not chosen')
else:
    print('IEDB dataset chosen')

    # === CONFIG ===
    # All files for a dataset are written to a folder named after it.
    data_dir = dataset_name
    os.makedirs(data_dir, exist_ok=True)

    # --- Load original CSV (pinned to a specific commit) ---
    url = "https://raw.githubusercontent.com/Jeffateth/AllergenPredict/b395c3276945b83ecc77513749361d6472706ca5/allergen_data_with_full_sequences.csv"
    df = pd.read_csv(url)

    # --- Keep sequence + label, then prepend a synthetic sequential id ---
    df = df[["full_parent_protein_sequence", "label"]].rename(
        columns={"full_parent_protein_sequence": "sequence"}
    )
    df.insert(0, "id", [f"seq_{i}" for i in range(len(df))])

    # --- Train/test split (80/20, stratified on the label, fixed seed) ---
    train_df, test_df = train_test_split(
        df,
        test_size=0.2,
        stratify=df["label"],
        random_state=42,
    )

    # --- Save splits ---
    train_df.to_csv(os.path.join(data_dir, f"{dataset_name}_train.csv"), index=False)
    test_df.to_csv(os.path.join(data_dir, f"{dataset_name}_test.csv"), index=False)

    print("✅ Data loaded, split, and saved for ESM2 embedding.")


IEDB dataset not chosen


## Data from AllergenAI 

In [202]:
# Build the AllergenAI train/test CSVs from the local one-hot files
# ("pos.txt" / "neg.txt") when that dataset was selected in the widget.
if dataset_name == 'AllergenAI':
    print('AllergenAI dataset chosen')

    # === CONFIG ===
    data_dir = dataset_name
    os.makedirs(data_dir, exist_ok=True)

    train_csv_path = os.path.join(data_dir, f"{dataset_name}_train.csv")
    test_csv_path = os.path.join(data_dir, f"{dataset_name}_test.csv")

    # === Skip if already processed
    if os.path.exists(train_csv_path) and os.path.exists(test_csv_path):
        print(f"⏭️  Found existing train/test files in '{data_dir}/'. Skipping parsing.")
    else:
        # === Standard amino acid order (1-letter codes)
        aa_letters = list("ACDEFGHIKLMNPQRSTVWY")

        # === Map one-hot vector (as a tuple of 20 ints) to amino acid letter
        onehot_to_aa = {
            tuple(1 if i == j else 0 for i in range(20)): aa
            for j, aa in enumerate(aa_letters)
        }

        def load_onehot_file(filepath, label):
            """Convert a one-hot text file into a list of ``(sequence, label)``.

            Each non-zero row is looked up in ``onehot_to_aa`` and appended to
            the current sequence; an all-zero row ends the current sequence.

            Args:
                filepath: path of a whitespace-separated numeric file readable
                    by ``np.loadtxt``.
                label: value attached to every sequence from this file.

            Returns:
                List of ``(sequence_string, label)`` tuples in file order.

            Raises:
                ValueError: if a non-zero row is not one of the 20 one-hot
                    vectors in ``onehot_to_aa``.
            """
            # ndmin=2 keeps a file with a single row two-dimensional; without
            # it loadtxt returns a 1-D array and the loop below would iterate
            # over scalars instead of rows.
            data = np.loadtxt(filepath, ndmin=2)
            sequences = []
            current = []

            for row in data:
                if np.all(row == 0):
                    # Separator row: close the sequence collected so far.
                    if current:
                        sequences.append(("".join(current), label))
                        current = []
                    continue

                aa = onehot_to_aa.get(tuple(int(x) for x in row))
                if aa is None:
                    raise ValueError(f"Unknown one-hot vector: {row}")
                current.append(aa)

            # The file may end without a trailing separator row.
            if current:
                sequences.append(("".join(current), label))

            return sequences

        # === Load both files ===
        positive_sequences = load_onehot_file("pos.txt", label=1)
        negative_sequences = load_onehot_file("neg.txt", label=0)

        # === Combine and format as DataFrame
        all_sequences = positive_sequences + negative_sequences
        df = pd.DataFrame(all_sequences, columns=["sequence", "label"])
        df["id"] = [f"seq_{i}" for i in range(len(df))]
        df = df[["id", "sequence", "label"]]

        # === Split into train/test (80/20 stratified, fixed seed)
        train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

        # === Save CSVs
        train_df.to_csv(train_csv_path, index=False)
        test_df.to_csv(test_csv_path, index=False)

        print(f"✅ Saved training set to '{train_csv_path}'")
        print(f"✅ Saved testing set to '{test_csv_path}'")
else:
    print('AllergenAI dataset not chosen')


AllergenAI dataset not chosen


## Data from AlgPred 2.0

In [203]:
# Download the AlgPred 2.0 FASTA files and store them as train/test CSVs.
# Runs for both "algpred2" and "algpred2_resplit"; the latter is re-split by
# the next cell.
if dataset_name in ["algpred2", "algpred2_resplit"]:
    print('AlgPred 2.0 dataset chosen')
    data_dir = dataset_name
    os.makedirs(data_dir, exist_ok=True)

    train_csv_path = os.path.join(data_dir, f"{dataset_name}_train.csv")
    test_csv_path = os.path.join(data_dir, f"{dataset_name}_test.csv")

    # Skip if both files already exist
    if os.path.exists(train_csv_path) and os.path.exists(test_csv_path):
        print("⏭️  Files already exist. Skipping download and parsing.")
    else:
        # URLs from AlgPred 2.0: name -> (url, label, split)
        datasets = {
            "train_positive": ("https://webs.iiitd.edu.in/raghava/algpred2/datasets/train_positive.txt", 1, "train"),
            "train_negative": ("https://webs.iiitd.edu.in/raghava/algpred2/datasets/train_negative.txt", 0, "train"),
            "validation_positive": ("https://webs.iiitd.edu.in/raghava/algpred2/datasets/validation_positive.txt", 1, "val"),
            "validation_negative": ("https://webs.iiitd.edu.in/raghava/algpred2/datasets/validation_negative.txt", 0, "val")
        }

        def parse_fasta(fasta_text, label):
            """Parse FASTA text into a list of ``(id, sequence, label)`` tuples.

            The id is the header line without its leading ``>``; sequence
            lines are concatenated. Records whose sequence is empty are
            dropped everywhere (previously only a trailing empty record was
            dropped), because an empty sequence cannot be embedded later.

            Args:
                fasta_text: full text of a FASTA file.
                label: value attached to every record of this file.

            Returns:
                List of ``(id, sequence, label)`` tuples in file order.
            """
            sequences = []
            current_id = None
            chunks = []  # sequence lines of the record being read
            for line in fasta_text.strip().splitlines():
                line = line.strip()
                if line.startswith(">"):
                    if current_id is not None and chunks:
                        sequences.append((current_id, "".join(chunks), label))
                    current_id = line[1:]
                    chunks = []
                elif line:
                    chunks.append(line)
            # Flush the last record, which has no following header.
            if current_id is not None and chunks:
                sequences.append((current_id, "".join(chunks), label))
            return sequences

        train_entries = []
        val_entries = []

        for name, (url, label, split) in datasets.items():
            print(f"⬇️  Downloading {name} from {url}...")
            # Bounded wait, and fail loudly on HTTP errors so an error page is
            # never parsed as FASTA and cached as a (near-)empty dataset.
            response = requests.get(url, timeout=60)
            response.raise_for_status()
            entries = parse_fasta(response.text, label)
            if split == "train":
                train_entries.extend(entries)
            else:
                val_entries.extend(entries)

        df_train = pd.DataFrame(train_entries, columns=["id", "sequence", "label"])
        df_val = pd.DataFrame(val_entries, columns=["id", "sequence", "label"])

        # The official validation files are stored as this notebook's test split.
        df_train.to_csv(train_csv_path, index=False)
        df_val.to_csv(test_csv_path, index=False)

        print(f"✅ Saved training set to '{train_csv_path}'")
        print(f"✅ Saved validation set to '{test_csv_path}'")
else:
    print('AlgPred 2.0 dataset not chosen')


AlgPred 2.0 dataset chosen
⏭️  Files already exist. Skipping download and parsing.


## algpred 2 resplitting

In [204]:
# Pool the AlgPred 2.0 train/test CSVs and re-split them 80/20 (stratified).
# The new split overwrites the CSVs it was read from.
if dataset_name == "algpred2_resplit":
    print("🔄 Resplitting AlgPred 2.0 dataset (80/20)...")

    data_dir = dataset_name
    train_csv_path = os.path.join(data_dir, f"{dataset_name}_train.csv")
    test_csv_path = os.path.join(data_dir, f"{dataset_name}_test.csv")

    # Load existing CSVs
    df_train = pd.read_csv(train_csv_path)
    df_test = pd.read_csv(test_csv_path)

    # Combine, then put the rows into a canonical order before shuffling.
    # Because this cell overwrites its own inputs, the row order on disk
    # changes after every run; without the sort, re-running the cell would
    # produce a different train/test membership each time despite the seed.
    combined_df = pd.concat([df_train, df_test], ignore_index=True)
    combined_df = combined_df.sort_values(["id", "sequence", "label"]).reset_index(drop=True)
    combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Resplit 80/20 stratified
    new_train_df, new_test_df = train_test_split(
        combined_df, test_size=0.2, stratify=combined_df["label"], random_state=42
    )

    # Save resplit data (same locations as the inputs)
    new_train_path = train_csv_path
    new_test_path = test_csv_path

    new_train_df.to_csv(new_train_path, index=False)
    new_test_df.to_csv(new_test_path, index=False)

    print(f"✅ New training set saved to '{new_train_path}'")
    print(f"✅ New testing set saved to '{new_test_path}'")
else:
    print("ℹ️  Not running resplit because algpred2_resplit is not the selected dataset.")


ℹ️  Not running resplit because algpred2 is not the selected dataset.


# Embedding generation

## ESM-2 embedding extraction

In [205]:
# Compute one mean-pooled ESM-2 embedding per sequence and store them as CSV
# (columns: id, label, f0..f{feature_dim-1}) for the train and test splits.
if transformer_name == 'ESM-2_320dim':
    print('ESM-2 embedding extraction chosen')
    # === CONFIG ===
    feature_dim = 320           # ESM-2 T6-8M embedding size
    batch_size = 1              # Adjust based on memory
    data_dir = dataset_name     # All files live in a folder named after the dataset

    # --- Ensure directory exists ---
    os.makedirs(data_dir, exist_ok=True)

    # --- Construct dynamic file paths ---
    input_files = {
        "train": os.path.join(data_dir, f"{dataset_name}_train.csv"),
        "test": os.path.join(data_dir, f"{dataset_name}_test.csv")
    }

    # --- Output file paths ---
    embedding_files = {
        "train": os.path.join(data_dir, f"train_{dataset_name}_esm2_embeddings.csv"),
        "test": os.path.join(data_dir, f"test_{dataset_name}_esm2_embeddings.csv")
    }

    # Check if both embedding files exist
    if all(os.path.exists(f) for f in embedding_files.values()):
        print(f"✅ ESM2 embedding files already exist in '{data_dir}/'. Skipping embedding generation.")
    else:
        # --- Load ESM-2 model ---
        model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
        batch_converter = alphabet.get_batch_converter()
        model.eval()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        # --- Helper function ---
        def process_file(split_name, input_file):
            """Embed every sequence of one split and write the result CSV.

            Rows are appended to a ``*_temp.csv`` file as they are computed so
            an interrupted run can resume; ids already present in the temp
            file are skipped. When the loop completes the temp file is renamed
            to the final file.

            Args:
                split_name: "train" or "test"; used in file names and messages.
                input_file: CSV with ``id``, ``sequence`` and ``label`` columns.
            """
            temp_file = os.path.join(data_dir, f"{split_name}_{dataset_name}_esm2_embeddings_temp.csv")
            final_file = os.path.join(data_dir, f"{split_name}_{dataset_name}_esm2_embeddings.csv")

            # A finished split must not be recomputed just because the other
            # split's file is still missing.
            if os.path.exists(final_file):
                print(f"⏭️  '{final_file}' already exists. Skipping {split_name} set.")
                return

            # Load dataset
            df = pd.read_csv(input_file)

            # Resume support
            if os.path.exists(temp_file):
                processed_ids = set(pd.read_csv(temp_file, usecols=["id"])["id"])
                print(f"🔁 Resuming {split_name} from {temp_file} — {len(processed_ids)} entries already processed.")
            else:
                processed_ids = set()

            remaining_data = [
                (seq_id, seq, label)
                for seq_id, seq, label in zip(df["id"], df["sequence"], df["label"])
                if seq_id not in processed_ids
            ]

            # Output format
            feature_names = [f"f{k}" for k in range(feature_dim)]
            fieldnames = ["id", "label"] + feature_names
            # Must be evaluated before open(): mode "a" creates the file.
            write_header = not os.path.exists(temp_file)

            print(f"⚙️  Extracting embeddings for {split_name} set... ({len(remaining_data)} sequences remaining)")

            with open(temp_file, mode="a", newline="") as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                if write_header:
                    writer.writeheader()

                for i in tqdm(range(0, len(remaining_data), batch_size)):
                    batch = remaining_data[i:i + batch_size]
                    batch_ids = [x[0] for x in batch]
                    batch_labels = [x[2] for x in batch]

                    batch_data = [(x[0], x[1]) for x in batch]
                    _, _, batch_tokens = batch_converter(batch_data)
                    batch_tokens = batch_tokens.to(device)

                    with torch.no_grad():
                        outputs = model(batch_tokens, repr_layers=[6])
                        token_representations = outputs["representations"][6]

                    rows = []
                    for j, (_, seq) in enumerate(batch_data):
                        # Average positions 1..len(seq) only: position 0 is
                        # skipped (presumably the token the batch converter
                        # prepends — confirm) and so is everything past the
                        # sequence end.
                        representation = token_representations[j, 1:len(seq)+1].mean(0)
                        # One device->host transfer instead of feature_dim
                        # separate .item() calls.
                        values = representation.cpu().tolist()
                        entry = {
                            "id": batch_ids[j],
                            "label": batch_labels[j],
                        }
                        entry.update(zip(feature_names, values))
                        rows.append(entry)

                    writer.writerows(rows)

            # Final save
            os.replace(temp_file, final_file)
            print(f"✅ Final {split_name} embeddings saved to '{final_file}'")

        # --- Process each split ---
        for split, file in input_files.items():
            process_file(split, file)
else: print('ESM-2 embedding generation not chosen')

ESM-2 embedding generation not chosen


## ProtBert Embeddings 

In [206]:
# Compute one mean-pooled ProtBert embedding per sequence and store them as
# CSV (columns: id, label, f0..f{feature_dim-1}) for the train and test splits.
if transformer_name == "ProtBert_1024dim":
    print('ProtBert embedding extraction chosen')

    # === CONFIG ===
    feature_dim = 1024
    batch_size = 1
    data_dir = dataset_name
    os.makedirs(data_dir, exist_ok=True)

    input_files = {
        "train": os.path.join(data_dir, f"{dataset_name}_train.csv"),
        "test": os.path.join(data_dir, f"{dataset_name}_test.csv")
    }

    embedding_files = {
        "train": os.path.join(data_dir, f"train_{dataset_name}_protbert_embeddings.csv"),
        "test": os.path.join(data_dir, f"test_{dataset_name}_protbert_embeddings.csv")
    }

    if all(os.path.exists(f) for f in embedding_files.values()):
        print(f"✅ ProtBert embedding files already exist in '{data_dir}/'. Skipping generation.")
    else:
        # === Load ProtBert ===
        tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
        model = BertModel.from_pretrained("Rostlab/prot_bert")
        model.eval()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        def process_file(split_name, input_file):
            """Embed every sequence of one split and write the result CSV.

            Rows are appended to a ``*_temp.csv`` file as they are computed so
            an interrupted run can resume; ids already present in the temp
            file are skipped. When the loop completes the temp file is renamed
            to the final file.

            Args:
                split_name: "train" or "test"; used in file names and messages.
                input_file: CSV with ``id``, ``sequence`` and ``label`` columns.
            """
            temp_file = os.path.join(data_dir, f"{split_name}_{dataset_name}_protbert_embeddings_temp.csv")
            final_file = os.path.join(data_dir, f"{split_name}_{dataset_name}_protbert_embeddings.csv")

            # A finished split must not be recomputed just because the other
            # split's file is still missing (each split takes hours).
            if os.path.exists(final_file):
                print(f"⏭️  '{final_file}' already exists. Skipping {split_name}.")
                return

            df = pd.read_csv(input_file)

            if os.path.exists(temp_file):
                processed_ids = set(pd.read_csv(temp_file, usecols=["id"])["id"])
                print(f"🔁 Resuming {split_name} from {temp_file} — {len(processed_ids)} already processed.")
            else:
                processed_ids = set()

            remaining_data = [
                (seq_id, seq, label)
                for seq_id, seq, label in zip(df["id"], df["sequence"], df["label"])
                if seq_id not in processed_ids
            ]

            feature_names = [f"f{k}" for k in range(feature_dim)]
            fieldnames = ["id", "label"] + feature_names
            # Must be evaluated before open(): mode "a" creates the file.
            write_header = not os.path.exists(temp_file)

            print(f"⚙️  Extracting ProtBert embeddings for {split_name}... ({len(remaining_data)} sequences)")

            with open(temp_file, mode="a", newline="") as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                if write_header:
                    writer.writeheader()

                for i in tqdm(range(0, len(remaining_data), batch_size)):
                    batch = remaining_data[i:i + batch_size]
                    batch_ids = [x[0] for x in batch]
                    batch_labels = [x[2] for x in batch]

                    # Preprocess for ProtBert: residues separated by spaces.
                    batch_seqs = [" ".join(x[1]) for x in batch]
                    # No truncation argument: the previous truncation=True had
                    # no max_length, so the tokenizer only emitted a warning
                    # and defaulted to no truncation anyway.
                    encoded_input = tokenizer(batch_seqs, return_tensors='pt', padding=True)
                    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

                    with torch.no_grad():
                        output = model(**encoded_input)
                        hidden = output.last_hidden_state
                        # Attention-mask-weighted mean: padding positions
                        # (present once batch_size > 1) must not dilute the
                        # average. Tokens the tokenizer adds around the
                        # sequence are still included, as before, so results
                        # for batch_size == 1 are unchanged.
                        mask = encoded_input["attention_mask"].unsqueeze(-1).to(hidden.dtype)
                        embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

                    # One device->host transfer per batch instead of
                    # feature_dim separate .item() calls per sequence.
                    batch_values = embeddings.cpu().tolist()

                    rows = []
                    for j, values in enumerate(batch_values):
                        entry = {
                            "id": batch_ids[j],
                            "label": batch_labels[j],
                        }
                        entry.update(zip(feature_names, values))
                        rows.append(entry)

                    writer.writerows(rows)

            os.replace(temp_file, final_file)
            print(f"✅ Final {split_name} embeddings saved to '{final_file}'")

        for split, file in input_files.items():
            process_file(split, file)
else:
    print('ProtBert embedding generation not chosen')


ProtBert embedding extraction chosen


pytorch_model.bin:   6%|5         | 94.4M/1.68G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

⚙️  Extracting ProtBert embeddings for train... (16120 sequences)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 16120/16120 [8:22:02<00:00,  1.87s/it]


✅ Final train embeddings saved to 'algpred2/train_algpred2_protbert_embeddings.csv'
⚙️  Extracting ProtBert embeddings for test... (4030 sequences)


100%|██████████| 4030/4030 [4:07:01<00:00,  3.68s/it]     

✅ Final test embeddings saved to 'algpred2/test_algpred2_protbert_embeddings.csv'





## Prot T5 Embeddings

In [None]:
# Compute one mean-pooled ProtT5 encoder embedding per sequence and store them
# as CSV (columns: id, label, f0..f{feature_dim-1}) for the train/test splits.
if transformer_name == "ProtT5_1024dim":
    print('ProtT5 embedding extraction chosen')

    # === CONFIG ===
    feature_dim = 1024
    batch_size = 1
    data_dir = dataset_name
    os.makedirs(data_dir, exist_ok=True)

    input_files = {
        "train": os.path.join(data_dir, f"{dataset_name}_train.csv"),
        "test": os.path.join(data_dir, f"{dataset_name}_test.csv")
    }

    embedding_files = {
        "train": os.path.join(data_dir, f"train_{dataset_name}_prott5_embeddings.csv"),
        "test": os.path.join(data_dir, f"test_{dataset_name}_prott5_embeddings.csv")
    }

    if all(os.path.exists(f) for f in embedding_files.values()):
        print(f"✅ ProtT5 embedding files already exist in '{data_dir}/'. Skipping generation.")
    else:
        # === Load ProtT5 Encoder ===
        # Imported here because only this branch needs the T5 classes.
        from transformers import T5Tokenizer, T5EncoderModel

        tokenizer = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_uniref50", do_lower_case=False)
        model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_uniref50")
        model.eval()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        def process_file(split_name, input_file):
            """Embed every sequence of one split and write the result CSV.

            Rows are appended to a ``*_temp.csv`` file as they are computed so
            an interrupted run can resume; ids already present in the temp
            file are skipped. When the loop completes the temp file is renamed
            to the final file.

            Args:
                split_name: "train" or "test"; used in file names and messages.
                input_file: CSV with ``id``, ``sequence`` and ``label`` columns.
            """
            temp_file = os.path.join(data_dir, f"{split_name}_{dataset_name}_prott5_embeddings_temp.csv")
            final_file = os.path.join(data_dir, f"{split_name}_{dataset_name}_prott5_embeddings.csv")

            # A finished split must not be recomputed just because the other
            # split's file is still missing.
            if os.path.exists(final_file):
                print(f"⏭️  '{final_file}' already exists. Skipping {split_name}.")
                return

            df = pd.read_csv(input_file)

            if os.path.exists(temp_file):
                processed_ids = set(pd.read_csv(temp_file, usecols=["id"])["id"])
                print(f"🔁 Resuming {split_name} from {temp_file} — {len(processed_ids)} already processed.")
            else:
                processed_ids = set()

            remaining_data = [
                (seq_id, seq, label)
                for seq_id, seq, label in zip(df["id"], df["sequence"], df["label"])
                if seq_id not in processed_ids
            ]

            feature_names = [f"f{k}" for k in range(feature_dim)]
            fieldnames = ["id", "label"] + feature_names
            # Must be evaluated before open(): mode "a" creates the file.
            write_header = not os.path.exists(temp_file)

            print(f"⚙️  Extracting ProtT5 embeddings for {split_name}... ({len(remaining_data)} sequences)")

            with open(temp_file, mode="a", newline="") as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                if write_header:
                    writer.writeheader()

                for i in tqdm(range(0, len(remaining_data), batch_size)):
                    batch = remaining_data[i:i + batch_size]
                    batch_ids = [x[0] for x in batch]
                    batch_labels = [x[2] for x in batch]

                    # Preprocess for ProtT5: residues separated by spaces.
                    batch_seqs = [" ".join(x[1]) for x in batch]
                    # NOTE(review): truncation=True is kept as-is; whether it
                    # actually truncates depends on the tokenizer's configured
                    # maximum length — confirm for long sequences.
                    encoded_input = tokenizer(batch_seqs, return_tensors='pt', padding=True, truncation=True)
                    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

                    with torch.no_grad():
                        output = model(**encoded_input)
                        hidden = output.last_hidden_state
                        # Attention-mask-weighted mean: padding positions
                        # (present once batch_size > 1) must not dilute the
                        # average. Tokens the tokenizer adds to the sequence
                        # are still included, as before, so results for
                        # batch_size == 1 are unchanged.
                        mask = encoded_input["attention_mask"].unsqueeze(-1).to(hidden.dtype)
                        embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

                    # One device->host transfer per batch instead of
                    # feature_dim separate .item() calls per sequence.
                    batch_values = embeddings.cpu().tolist()

                    rows = []
                    for j, values in enumerate(batch_values):
                        entry = {
                            "id": batch_ids[j],
                            "label": batch_labels[j],
                        }
                        entry.update(zip(feature_names, values))
                        rows.append(entry)

                    writer.writerows(rows)

            os.replace(temp_file, final_file)
            print(f"✅ Final {split_name} embeddings saved to '{final_file}'")

        for split, file in input_files.items():
            process_file(split, file)
else:
    print('ProtT5 embedding generation not chosen')


# Concatenate ESM fold data with 3D DSSP data into one vector

In [207]:
# Load the ESM-2 training embeddings and the DSSP feature table.
df_original = pd.read_csv('algpred2/train_algpred2_esm2_embeddings.csv')
df_add = pd.read_csv('dssp_features.csv')

# Inner join: keep only proteins present in both tables
# (df_original 'id' <-> df_add 'protein').
df_merged = df_original.merge(
    df_add,
    how='inner',
    left_on='id',
    right_on='protein',
)

# 'protein' duplicates 'id' after the join, so it is removed.
df_merged = df_merged.drop(columns=['protein'])

# Report the merged shape and let the notebook render the frame.
print(df_merged.shape)
df_merged

(3915, 336)


Unnamed: 0,id,label,f0,f1,f2,f3,f4,f5,f6,f7,...,psi_mean,psi_std,SS_H,SS_E,SS_G,SS_I,SS_B,SS_T,SS_S,SS_-
0,P_13,1,-0.109098,-0.185716,0.221519,0.117864,0.173010,-0.071297,0.091311,0.057937,...,67.815480,75.453660,83,139,24,0,1,57,40,218
1,P_14,1,-0.130675,0.124254,0.189468,0.133837,0.307257,0.215280,0.131589,-0.020784,...,53.922152,94.515936,37,71,3,5,0,19,8,15
2,P_17,1,-0.047639,0.091742,0.206646,0.098816,0.142606,-0.018436,-0.028555,0.027641,...,-22.113768,51.173919,109,0,6,0,0,3,7,13
3,P_46,1,-0.186663,0.066537,0.203491,0.166804,0.384989,-0.193996,-0.038884,-0.133102,...,48.047500,93.955165,32,66,4,6,0,25,11,16
4,P_47,1,-0.181057,0.059647,0.199809,0.173463,0.381596,-0.214031,-0.020569,-0.122544,...,49.051250,93.914859,32,66,4,6,0,26,9,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3910,P_2644,1,-0.135910,0.127915,0.055814,0.096571,-0.036245,0.172999,0.011741,-0.130888,...,-7.963262,63.282675,426,0,19,10,0,45,27,80
3911,P_2649,1,-0.297236,0.020432,-0.007142,-0.062785,0.202298,0.153170,0.047626,0.001273,...,10.300000,76.673221,60,8,3,0,0,11,11,16
3912,P_2655,1,-0.212389,0.150151,0.042394,0.079228,-0.026983,0.218209,0.066965,-0.115486,...,-6.842010,64.090766,423,0,14,5,0,49,27,89
3913,P_2671,1,-0.342869,0.054191,0.031891,-0.067981,0.154429,0.175938,0.044891,0.027024,...,12.296330,78.464713,60,8,0,0,2,16,6,17


# Models

## Ridge Classifier

In [208]:
# Train and evaluate a RidgeClassifier on the precomputed embeddings:
# dummy baseline, 5-fold CV, hold-out test evaluation and a y-scrambling
# sanity check. Everything printed is also written to a log file.
if model_name == 'Ridge':
    log_filename = f"{model_name}_{transformer_name}_{dataset_name}_output.txt"
    log_path = os.path.join("logs", log_filename)
    os.makedirs("logs", exist_ok=True)

    # File-name tag and vector width of the embeddings written by each
    # embedding cell. Previously the esm2 files and 320 features were always
    # used, so the log (named after transformer_name) could describe a
    # transformer whose embeddings were never loaded.
    embedding_specs = {
        "ESM-2_320dim": ("esm2", 320),
        "ProtBert_1024dim": ("protbert", 1024),
        "ProtT5_1024dim": ("prott5", 1024),
    }
    embedding_tag, n_features = embedding_specs[transformer_name]

    class Tee:
        """Write-only stream that forwards everything to several streams."""

        def __init__(self, *streams):
            self.streams = streams

        def write(self, text):
            # Flush after each write so the log stays current if the run dies.
            for stream in self.streams:
                stream.write(text)
                stream.flush()

        def flush(self):
            for stream in self.streams:
                stream.flush()

    # UTF-8 explicitly: the messages contain emoji, which a platform-default
    # encoding may be unable to encode.
    with open(log_path, "w", encoding="utf-8") as log_file:
        tee = Tee(sys.stdout, log_file)
        with redirect_stdout(tee), redirect_stderr(tee):

            print('Ridge Classifier model chosen')
            print(dataset_name, 'dataset chosen')

            data_dir = dataset_name
            embedding_files = {
                "train": os.path.join(data_dir, f"train_{dataset_name}_{embedding_tag}_embeddings.csv"),
                "test": os.path.join(data_dir, f"test_{dataset_name}_{embedding_tag}_embeddings.csv")
            }

            df_train = pd.read_csv(embedding_files["train"])
            df_test = pd.read_csv(embedding_files["test"])

            feature_cols = [f"f{i}" for i in range(n_features)]
            X_train_full = df_train[feature_cols].values
            y_train_full = df_train["label"].values
            X_test = df_test[feature_cols].values
            y_test = df_test["label"].values

            print(f"✅ Loaded: Train={X_train_full.shape}, Test={X_test.shape}")

            print("\n📉 DummyClassifier (Stratified) on Training Set (CV):\n")
            dummy = DummyClassifier(strategy="stratified", random_state=42)
            dummy_aucs = []
            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

            for train_idx, val_idx in cv.split(X_train_full, y_train_full):
                dummy.fit(X_train_full[train_idx], y_train_full[train_idx])
                y_dummy_proba = dummy.predict_proba(X_train_full[val_idx])[:, 1]
                auc = roc_auc_score(y_train_full[val_idx], y_dummy_proba)
                dummy_aucs.append(auc)

            print(f"📊 Dummy ROC-AUC: {np.mean(dummy_aucs):.4f} ± {np.std(dummy_aucs):.4f}")

            print("\n🚀 5-Fold Cross-Validation (RidgeClassifier) on Training Set...\n")
            ridge_aucs = []
            for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_full, y_train_full)):
                X_train, X_val = X_train_full[train_idx], X_train_full[val_idx]
                y_train, y_val = y_train_full[train_idx], y_train_full[val_idx]

                clf = RidgeClassifier()
                clf.fit(X_train, y_train)

                y_pred = clf.predict(X_val)
                # ROC-AUC is a ranking metric and needs continuous scores;
                # feeding it hard 0/1 predictions collapses the curve to a
                # single operating point.
                y_score = clf.decision_function(X_val)
                auc = roc_auc_score(y_val, y_score)
                ridge_aucs.append(auc)

                print(f"📂 Fold {fold+1} AUC: {auc:.4f}")
                print(classification_report(y_val, y_pred, digits=4))
                print("------")

            mean_auc = np.mean(ridge_aucs)
            std_auc = np.std(ridge_aucs, ddof=1)
            se_auc = std_auc / np.sqrt(len(ridge_aucs))
            print(f"\n✅ Mean CV ROC-AUC: {mean_auc:.4f} ± {std_auc:.4f} (SE = {se_auc:.4f})")

            print("\n🔒 Final Evaluation on Hold-Out Test Set...\n")
            clf_final = RidgeClassifier()
            clf_final.fit(X_train_full, y_train_full)

            y_test_pred = clf_final.predict(X_test)
            test_auc = roc_auc_score(y_test, clf_final.decision_function(X_test))

            print(classification_report(y_test, y_test_pred, digits=4))
            print(f"🎯 Final Test ROC-AUC: {test_auc:.4f}")

            print("\n🧪 Y-Scrambling (sanity check) on Training Set...\n")
            # Seeded NumPy permutation (returns a shuffled copy); the stdlib
            # random.shuffle is not meant for NumPy arrays.
            y_scrambled = np.random.default_rng(42).permutation(y_train_full)

            scrambled_aucs = []
            for train_idx, val_idx in cv.split(X_train_full, y_scrambled):
                X_train, X_val = X_train_full[train_idx], X_train_full[val_idx]
                y_train, y_val = y_scrambled[train_idx], y_scrambled[val_idx]

                clf_scrambled = RidgeClassifier()
                clf_scrambled.fit(X_train, y_train)
                y_score_scrambled = clf_scrambled.decision_function(X_val)
                auc = roc_auc_score(y_val, y_score_scrambled)
                scrambled_aucs.append(auc)

            print(f"🔀 Y-Scrambled ROC-AUC: {np.mean(scrambled_aucs):.4f} ± {np.std(scrambled_aucs):.4f}")
            print("👉 This should be near 0.5 if your real model learned something.")
else: print('Ridge model not chosen')

Ridge model not chosen


## XGBoost

In [209]:
if model_name == 'XGBoost':
    # Evaluate an XGBoost classifier on precomputed embeddings: stratified
    # dummy baseline, 5-fold CV, hold-out test evaluation and a y-scrambling
    # sanity check. Everything printed is mirrored into a log file under logs/.

    # `os` is used below but is not among the imports in the notebook's import
    # cell; import it locally so this cell does not depend on another cell.
    import os

    # === Create log file path ===
    log_filename = f"{model_name}_{transformer_name}_{dataset_name}_output.txt"
    log_path = os.path.join("logs", log_filename)
    os.makedirs("logs", exist_ok=True)

    class Tee:
        """Write-only stream that forwards every write to all given streams."""

        def __init__(self, *streams):
            # Streams are written to in the order they were passed in.
            self.streams = streams

        def write(self, text):
            """Write `text` to every stream and flush each one immediately."""
            for stream in self.streams:
                stream.write(text)
                stream.flush()

        def flush(self):
            """Flush every underlying stream."""
            for stream in self.streams:
                stream.flush()

    # The report contains emoji and "±"; with the platform's locale encoding
    # (e.g. cp1252 on Windows) writing them raises UnicodeEncodeError, so the
    # log is opened explicitly as UTF-8.
    with open(log_path, "w", encoding="utf-8") as log_file:
        tee = Tee(sys.stdout, log_file)
        # Both stdout and stderr (library warnings included) go to screen + log.
        with redirect_stdout(tee), redirect_stderr(tee):

            print('XGBoost model chosen')
            print(dataset_name, 'dataset chosen')

            # Step 1: Load Data
            # NOTE(review): the file names and the 320-column feature list are
            # fixed to the ESM-2 embeddings regardless of `transformer_name`,
            # although the log file name includes it — confirm this is intended.
            data_dir = dataset_name
            embedding_files = {
                "train": os.path.join(data_dir, f"train_{dataset_name}_esm2_embeddings.csv"),
                "test": os.path.join(data_dir, f"test_{dataset_name}_esm2_embeddings.csv")
            }

            df_train = pd.read_csv(embedding_files["train"])
            df_test = pd.read_csv(embedding_files["test"])

            feature_cols = [f"f{i}" for i in range(320)]
            X_train_full = df_train[feature_cols].values
            y_train_full = df_train["label"].values
            X_test = df_test[feature_cols].values
            y_test = df_test["label"].values

            print(f"✅ Loaded: Train={X_train_full.shape}, Test={X_test.shape}")

            # Step 2: DummyClassifier
            # Baseline that ignores the features; scored on the same folds as
            # the real model below.
            print("\n📉 DummyClassifier (Stratified) on Training Set (CV):\n")
            dummy = DummyClassifier(strategy="stratified", random_state=42)
            dummy_aucs = []
            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

            for train_idx, val_idx in cv.split(X_train_full, y_train_full):
                dummy.fit(X_train_full[train_idx], y_train_full[train_idx])
                y_dummy_proba = dummy.predict_proba(X_train_full[val_idx])[:, 1]
                auc = roc_auc_score(y_train_full[val_idx], y_dummy_proba)
                dummy_aucs.append(auc)

            print(f"📊 Dummy ROC-AUC: {np.mean(dummy_aucs):.4f} ± {np.std(dummy_aucs):.4f}")

            # Step 3: CV XGBoost
            print("\n🚀 5-Fold Cross-Validation (XGBoost) on Training Set...\n")
            xgb_aucs = []
            for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_full, y_train_full)):
                X_train, X_val = X_train_full[train_idx], X_train_full[val_idx]
                y_train, y_val = y_train_full[train_idx], y_train_full[val_idx]

                clf = xgb.XGBClassifier(eval_metric="logloss", random_state=42)
                clf.fit(X_train, y_train)

                # Hard labels feed the classification report; the positive-class
                # probability feeds ROC-AUC.
                y_pred = clf.predict(X_val)
                y_proba = clf.predict_proba(X_val)[:, 1]

                auc = roc_auc_score(y_val, y_proba)
                xgb_aucs.append(auc)

                print(f"📂 Fold {fold+1} AUC: {auc:.4f}")
                print(classification_report(y_val, y_pred, digits=4))
                print("------")

            # Sample standard deviation (ddof=1) and standard error over folds.
            mean_auc = np.mean(xgb_aucs)
            std_auc = np.std(xgb_aucs, ddof=1)
            se_auc = std_auc / np.sqrt(len(xgb_aucs))
            print(f"\n✅ Mean CV ROC-AUC: {mean_auc:.4f} ± {std_auc:.4f} (SE = {se_auc:.4f})")

            # Step 4: Final Test
            # Refit on the full training set, then score once on the test set.
            print("\n🔒 Final Evaluation on Hold-Out Test Set...\n")
            clf_final = xgb.XGBClassifier(eval_metric="logloss", random_state=42)
            clf_final.fit(X_train_full, y_train_full)

            y_test_pred = clf_final.predict(X_test)
            y_test_proba = clf_final.predict_proba(X_test)[:, 1]

            test_auc = roc_auc_score(y_test, y_test_proba)
            print(classification_report(y_test, y_test_pred, digits=4))
            print(f"🎯 Final Test ROC-AUC: {test_auc:.4f}")

            # Step 5: Y-Scrambling
            # Shuffle a copy of the labels in place (the originals stay intact)
            # and repeat the CV on the permuted labels.
            print("\n🧪 Y-Scrambling (sanity check) on Training Set...\n")
            y_scrambled = y_train_full.copy()
            random.seed(42)
            random.shuffle(y_scrambled)

            scrambled_aucs = []
            for train_idx, val_idx in cv.split(X_train_full, y_scrambled):
                X_train, X_val = X_train_full[train_idx], X_train_full[val_idx]
                y_train, y_val = y_scrambled[train_idx], y_scrambled[val_idx]

                clf_scrambled = xgb.XGBClassifier(eval_metric="logloss", random_state=42)
                clf_scrambled.fit(X_train, y_train)
                y_proba_scrambled = clf_scrambled.predict_proba(X_val)[:, 1]

                auc = roc_auc_score(y_val, y_proba_scrambled)
                scrambled_aucs.append(auc)

            print(f"🔀 Y-Scrambled ROC-AUC: {np.mean(scrambled_aucs):.4f} ± {np.std(scrambled_aucs):.4f}")
            print("👉 This should be near 0.5 if your real model learned something.")
else:
    print('XGBoosted model not chosen')


XGBoost model chosen
algpred2 dataset chosen
✅ Loaded: Train=(16120, 320), Test=(4030, 320)

📉 DummyClassifier (Stratified) on Training Set (CV):

📊 Dummy ROC-AUC: 0.4991 ± 0.0000

🚀 5-Fold Cross-Validation (XGBoost) on Training Set...

📂 Fold 1 AUC: 0.9957
              precision    recall  f1-score   support

           0     0.9818    0.9708    0.9763      1612
           1     0.9712    0.9820    0.9766      1612

    accuracy                         0.9764      3224
   macro avg     0.9765    0.9764    0.9764      3224
weighted avg     0.9765    0.9764    0.9764      3224

------
📂 Fold 2 AUC: 0.9972
              precision    recall  f1-score   support

           0     0.9801    0.9764    0.9782      1612
           1     0.9765    0.9801    0.9783      1612

    accuracy                         0.9783      3224
   macro avg     0.9783    0.9783    0.9783      3224
weighted avg     0.9783    0.9783    0.9783      3224

------
📂 Fold 3 AUC: 0.9978
              precision    recal

## Feed-Forward Neural Network (FFNN)

In [210]:
if model_name == 'FFNN':
    # Evaluate a small feed-forward network on precomputed embeddings:
    # stratified dummy baseline, 5-fold CV and hold-out test evaluation.
    # Everything printed is mirrored into a log file under logs/.

    # `os` is used below but is not among the imports in the notebook's import
    # cell; import it locally so this cell does not depend on another cell.
    import os

    # Define NN model
    class ProteinMLP(nn.Module):
        """MLP 320 -> 256 -> 128 -> 1 with ReLU, dropout 0.3 and a sigmoid output."""

        def __init__(self):
            super().__init__()
            # The final Sigmoid makes the output a probability, which is what
            # nn.BCELoss (used below) expects.
            self.net = nn.Sequential(
                nn.Linear(320, 256),
                nn.ReLU(),
                nn.Dropout(0.3),
                nn.Linear(256, 128),
                nn.ReLU(),
                nn.Dropout(0.3),
                nn.Linear(128, 1),
                nn.Sigmoid()
            )

        def forward(self, x):
            """Return the network output for `x`, one value per input row."""
            return self.net(x)
    # === Create log file path ===
    log_filename = f"{model_name}_{transformer_name}_{dataset_name}_output.txt"
    log_path = os.path.join("logs", log_filename)
    os.makedirs("logs", exist_ok=True)

    class Tee:
        """Write-only stream that forwards every write to all given streams."""

        def __init__(self, *streams):
            # Streams are written to in the order they were passed in.
            self.streams = streams

        def write(self, text):
            """Write `text` to every stream and flush each one immediately."""
            for stream in self.streams:
                stream.write(text)
                stream.flush()

        def flush(self):
            """Flush every underlying stream."""
            for stream in self.streams:
                stream.flush()

    # The report contains emoji and "±"; with the platform's locale encoding
    # (e.g. cp1252 on Windows) writing them raises UnicodeEncodeError, so the
    # log is opened explicitly as UTF-8.
    with open(log_path, "w", encoding="utf-8") as log_file:
        tee = Tee(sys.stdout, log_file)
        # Both stdout and stderr (library warnings included) go to screen + log.
        with redirect_stdout(tee), redirect_stderr(tee):

            print('FFNN model chosen')

            print(dataset_name, 'dataset chosen')


            # ====================================
            # Step 1: Load Data
            # ====================================
            # NOTE(review): the file names and the 320-column feature list are
            # fixed to the ESM-2 embeddings regardless of `transformer_name`,
            # although the log file name includes it — confirm this is intended.
            data_dir = dataset_name
            embedding_files = {
                "train": os.path.join(data_dir, f"train_{dataset_name}_esm2_embeddings.csv"),
                "test": os.path.join(data_dir, f"test_{dataset_name}_esm2_embeddings.csv")
            }

            df_train = pd.read_csv(embedding_files["train"])
            df_test = pd.read_csv(embedding_files["test"])

            # float32 for both features and labels: torch.tensor keeps the
            # NumPy dtype, and nn.BCELoss needs float targets.
            feature_cols = [f"f{i}" for i in range(320)]
            X_train_full = df_train[feature_cols].values.astype(np.float32)
            y_train_full = df_train["label"].values.astype(np.float32)
            X_test = df_test[feature_cols].values.astype(np.float32)
            y_test = df_test["label"].values.astype(np.float32)

            print(f"✅ Loaded: Train={X_train_full.shape}, Test={X_test.shape}")

            # ====================================
            # Step 2: DummyClassifier Baseline
            # ====================================
            print("\n📉 DummyClassifier (Stratified) on Training Set (CV):\n")
            dummy = DummyClassifier(strategy="stratified", random_state=42)
            dummy_aucs = []
            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

            for train_idx, val_idx in cv.split(X_train_full, y_train_full):
                dummy.fit(X_train_full[train_idx], y_train_full[train_idx])
                y_val = y_train_full[val_idx]
                y_dummy_proba = dummy.predict_proba(X_train_full[val_idx])
                # Use the second probability column when there are two;
                # otherwise fall back to an all-zero score.
                if y_dummy_proba.shape[1] == 2:
                    y_dummy_proba = y_dummy_proba[:, 1]
                else:
                    y_dummy_proba = np.zeros_like(y_val)
                auc = roc_auc_score(y_val, y_dummy_proba)
                dummy_aucs.append(auc)

            print(f"📊 Dummy ROC-AUC: {np.mean(dummy_aucs):.4f} ± {np.std(dummy_aucs):.4f}")

            # ====================================
            # Step 3: 5-Fold CV with NN
            # ====================================
            print("\n🚀 5-Fold Cross-Validation (NN) on Training Set...\n")
            nn_aucs = []
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

            # Seed torch so weight initialisation, dropout masks and DataLoader
            # shuffling are repeatable between runs, in line with the
            # random_state=42 used for the other estimators in this notebook.
            # NOTE(review): bit-exact repeatability on GPU is not guaranteed.
            torch.manual_seed(42)

            for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_full, y_train_full)):
                X_train, X_val = X_train_full[train_idx], X_train_full[val_idx]
                y_train, y_val = y_train_full[train_idx], y_train_full[val_idx]

                # Targets get a trailing axis to match the model's (batch, 1) output.
                train_ds = TensorDataset(torch.tensor(X_train), torch.tensor(y_train).unsqueeze(1))
                val_ds = TensorDataset(torch.tensor(X_val), torch.tensor(y_val).unsqueeze(1))
                train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
                val_loader = DataLoader(val_ds, batch_size=64)

                # A fresh model and optimizer per fold.
                model = ProteinMLP().to(device)
                criterion = nn.BCELoss()
                optimizer = optim.Adam(model.parameters(), lr=1e-3)

                # Training loop
                # NOTE(review): CV folds train for 10 epochs while the final
                # model below trains for 20 — confirm the difference is intended.
                model.train()
                for epoch in range(10):
                    for xb, yb in train_loader:
                        xb, yb = xb.to(device), yb.to(device)
                        optimizer.zero_grad()
                        preds = model(xb)
                        loss = criterion(preds, yb)
                        loss.backward()
                        optimizer.step()

                # Validation
                model.eval()
                all_preds = []
                with torch.no_grad():
                    for xb, _ in val_loader:
                        xb = xb.to(device)
                        preds = model(xb).cpu().numpy()
                        # Flatten (batch, 1) -> (batch,) so the metrics receive
                        # a 1-D score vector aligned with y_val.
                        all_preds.extend(preds.ravel())
                all_preds = np.asarray(all_preds)
                preds_bin = (all_preds > 0.5).astype(int)

                auc = roc_auc_score(y_val, all_preds)
                nn_aucs.append(auc)

                print(f"📂 Fold {fold+1} AUC: {auc:.4f}")
                print(classification_report(y_val, preds_bin, digits=4))
                print("------")

            # Sample standard deviation (ddof=1) and standard error over folds,
            # reported in the same format as the XGBoost and Ridge cells.
            mean_auc = np.mean(nn_aucs)
            std_auc = np.std(nn_aucs, ddof=1)
            se_auc = std_auc / np.sqrt(len(nn_aucs))
            print(f"\n✅ Mean CV ROC-AUC: {mean_auc:.4f} ± {std_auc:.4f} (SE = {se_auc:.4f})")

            # ====================================
            # Step 4: Final Test Set Evaluation
            # ====================================
            # Retrain from scratch on the full training set, then score once
            # on the test set.
            print("\n🔒 Final Evaluation on Hold-Out Test Set...\n")
            train_ds = TensorDataset(torch.tensor(X_train_full), torch.tensor(y_train_full).unsqueeze(1))
            test_ds = TensorDataset(torch.tensor(X_test), torch.tensor(y_test).unsqueeze(1))
            train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
            test_loader = DataLoader(test_ds, batch_size=64)

            final_model = ProteinMLP().to(device)
            optimizer = optim.Adam(final_model.parameters(), lr=1e-3)
            criterion = nn.BCELoss()

            final_model.train()
            for epoch in range(20):
                for xb, yb in train_loader:
                    xb, yb = xb.to(device), yb.to(device)
                    optimizer.zero_grad()
                    preds = final_model(xb)
                    loss = criterion(preds, yb)
                    loss.backward()
                    optimizer.step()

            final_model.eval()
            y_test_preds, y_test_probs = [], []
            with torch.no_grad():
                for xb, _ in test_loader:
                    xb = xb.to(device)
                    # Flatten (batch, 1) -> (batch,) before collecting.
                    probs = final_model(xb).cpu().numpy().ravel()
                    preds = (probs > 0.5).astype(int)
                    y_test_preds.extend(preds)
                    y_test_probs.extend(probs)

            test_auc = roc_auc_score(y_test, y_test_probs)
            print(classification_report(y_test, y_test_preds, digits=4))
            print(f"🎯 Final Test ROC-AUC: {test_auc:.4f}")
else: print('FFNN model not chosen')

FFNN model not chosen
