## imports

In [1]:
#esm2_env
import pandas as pd
import numpy as np
import requests
from io import StringIO
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import esm
from tqdm import tqdm
import sys
import os
import csv
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import RidgeClassifier
import xgboost as xgb
import random
import zipfile
import io
import ipywidgets as widgets
from IPython.display import display, clear_output
import time
from contextlib import redirect_stdout, redirect_stderr
from transformers import BertModel, BertTokenizer
import joblib

######################
# REQUIRED PACKAGES
# pandas
# numpy
# requests
# torch
# esm
# tqdm
# scikit-learn
# xgboost
# ipywidgets
# transformers
#####################

## User input: Dataset 

In [30]:
# IMPORTANT: 
# 1. run the import cell and this one. 
# 2. after choosing and pressing confirm in the widget run the rest of the code (run + arrow down in the cell options of the next cell)
# === Layout ===
layout = widgets.Layout(width='350px')
style = {'description_width': '150px'}

# === Dropdowns ===
dataset_selector = widgets.Dropdown(
    options=["algpred2", "algpred2_resplit", "iedb", "AllergenAI"],
    value="algpred2",
    description="Select Dataset:",
    layout=layout,
    style=style
)

transformer_selector = widgets.Dropdown(
    options=["ESM-2_320dim","ProtBert_1024dim","ProtT5_1024dim"],
    value="ESM-2_320dim",
    description="Select Transformer:",
    layout=layout,
    style=style
)

model_selector = widgets.Dropdown(
    options=["XGBoost", "FFNN","Ridge"],
    value="XGBoost",
    description="Select Model:",
    layout=layout,
    style=style
)

# === Output + Button ===
output = widgets.Output()
submit_button = widgets.Button(description="✅ Confirm Selection", button_style='success')

# === Button callback ===
def on_button_clicked(b):
    global dataset_name, transformer_name, model_name  # <-- Add this
    dataset_name = dataset_selector.value
    transformer_name = transformer_selector.value
    model_name = model_selector.value

    with output:
        clear_output()
        print("✅ Selections made!")
        print(f"Dataset: {dataset_name}")
        print(f"Transformer: {transformer_name}")
        print(f"Model: {model_name}")


submit_button.on_click(on_button_clicked)

# === Display UI ===
display(widgets.VBox([
    dataset_selector,
    transformer_selector,
    model_selector,
    submit_button,
    output
]))


VBox(children=(Dropdown(description='Select Dataset:', layout=Layout(width='350px'), options=('algpred2', 'alg…

# Data curation

## Data from IEDB

In [4]:
# if dataset_name == 'iedb':
#     print('IEDB dataset chosen')

#     # === CONFIG ===
#     data_dir = dataset_name
#     os.makedirs(data_dir, exist_ok=True)

#     # --- Load original CSV ---
#     url = "https://raw.githubusercontent.com/Jeffateth/AllergenPredict/b395c3276945b83ecc77513749361d6472706ca5/allergen_data_with_full_sequences.csv"
#     df = pd.read_csv(url)

#     # --- Prepare DataFrame ---
#     df = df[["full_parent_protein_sequence", "label"]].copy()
#     df.rename(columns={"full_parent_protein_sequence": "sequence"}, inplace=True)
#     df["id"] = [f"seq_{i}" for i in range(len(df))]
#     df = df[["id", "sequence", "label"]]

#     # --- Train/test split (80/20 stratified) ---
#     train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

#     # --- Save splits ---
#     train_df.to_csv(os.path.join(data_dir, f"{dataset_name}_train.csv"), index=False)
#     test_df.to_csv(os.path.join(data_dir, f"{dataset_name}_test.csv"), index=False)

#     print("✅ Data loaded, split, and saved for ESM2 embedding.")
# else: print('IEDB dataset not chosen')


IEDB dataset not chosen


## Data from AllergenAI 

In [5]:
# if dataset_name == 'AllergenAI':
#     print('AllergenAI dataset chosen')

#     # === CONFIG ===
#     data_dir = dataset_name
#     os.makedirs(data_dir, exist_ok=True)

#     train_csv_path = os.path.join(data_dir, f"{dataset_name}_train.csv")
#     test_csv_path = os.path.join(data_dir, f"{dataset_name}_test.csv")

#     # === Skip if already processed
#     if os.path.exists(train_csv_path) and os.path.exists(test_csv_path):
#         print(f"⏭️  Found existing train/test files in '{data_dir}/'. Skipping parsing.")
#     else:
#         # === Standard amino acid order (1-letter codes)
#         aa_letters = list("ACDEFGHIKLMNPQRSTVWY")

#         # === Map one-hot vector to amino acid letter
#         onehot_to_aa = {
#             tuple(1 if i == j else 0 for i in range(20)): aa
#             for j, aa in enumerate(aa_letters)
#         }

#         def load_onehot_file(filepath, label):
#             """Converts one-hot file to list of (sequence, label)"""
#             data = np.loadtxt(filepath)
#             sequences = []
#             current = []

#             for row in data:
#                 if np.all(row == 0):
#                     if current:
#                         sequences.append(("".join(current), label))
#                         current = []
#                 else:
#                     aa = onehot_to_aa.get(tuple(int(x) for x in row))
#                     if aa:
#                         current.append(aa)
#                     else:
#                         raise ValueError(f"Unknown one-hot vector: {row}")

#             if current:
#                 sequences.append(("".join(current), label))

#             return sequences

#         # === Load both files ===
#         positive_sequences = load_onehot_file("pos.txt", label=1)
#         negative_sequences = load_onehot_file("neg.txt", label=0)

#         # === Combine and format as DataFrame
#         all_sequences = positive_sequences + negative_sequences
#         df = pd.DataFrame(all_sequences, columns=["sequence", "label"])
#         df["id"] = [f"seq_{i}" for i in range(len(df))]
#         df = df[["id", "sequence", "label"]]

#         # === Split into train/test (80/20 stratified)
#         train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

#         # === Save CSVs
#         train_df.to_csv(train_csv_path, index=False)
#         test_df.to_csv(test_csv_path, index=False)

#         print(f"✅ Saved training set to '{train_csv_path}'")
#         print(f"✅ Saved testing set to '{test_csv_path}'")
# else:
#     print('AllergenAI dataset not chosen')


AllergenAI dataset not chosen


## Data from AlgPred 2.0

In [3]:
if dataset_name in ["algpred2", "algpred2_resplit"]:
    print('AlgPred 2.0 dataset chosen')
    data_dir = dataset_name
    os.makedirs(data_dir, exist_ok=True)

    train_csv_path = os.path.join(data_dir, f"{dataset_name}_train.csv")
    test_csv_path = os.path.join(data_dir, f"{dataset_name}_test.csv")

    # Skip if both files already exist
    if os.path.exists(train_csv_path) and os.path.exists(test_csv_path):
        print("⏭️  Files already exist. Skipping download and parsing.")
    else:
        # URLs from AlgPred 2.0
        datasets = {
            "train_positive": ("https://webs.iiitd.edu.in/raghava/algpred2/datasets/train_positive.txt", 1, "train"),
            "train_negative": ("https://webs.iiitd.edu.in/raghava/algpred2/datasets/train_negative.txt", 0, "train"),
            "validation_positive": ("https://webs.iiitd.edu.in/raghava/algpred2/datasets/validation_positive.txt", 1, "val"),
            "validation_negative": ("https://webs.iiitd.edu.in/raghava/algpred2/datasets/validation_negative.txt", 0, "val")
        }

        def parse_fasta(fasta_text, label):
            sequences = []
            current_id = None
            current_seq = ""
            for line in fasta_text.strip().splitlines():
                line = line.strip()
                if line.startswith(">"):
                    if current_id is not None:
                        sequences.append((current_id, current_seq, label))
                    current_id = line[1:]
                    current_seq = ""
                else:
                    current_seq += line
            if current_id and current_seq:
                sequences.append((current_id, current_seq, label))
            return sequences

        train_entries = []
        val_entries = []

        for name, (url, label, split) in datasets.items():
            print(f"⬇️  Downloading {name} from {url}...")
            response = requests.get(url)
            entries = parse_fasta(response.text, label)
            if split == "train":
                train_entries.extend(entries)
            else:
                val_entries.extend(entries)

        df_train = pd.DataFrame(train_entries, columns=["id", "sequence", "label"])
        df_val = pd.DataFrame(val_entries, columns=["id", "sequence", "label"])

        df_train.to_csv(train_csv_path, index=False)
        df_val.to_csv(test_csv_path, index=False)

        print(f"✅ Saved training set to '{train_csv_path}'")
        print(f"✅ Saved validation set to '{test_csv_path}'")
else:
    print('AlgPred 2.0 dataset not chosen')


AlgPred 2.0 dataset chosen
⬇️  Downloading train_positive from https://webs.iiitd.edu.in/raghava/algpred2/datasets/train_positive.txt...
⬇️  Downloading train_negative from https://webs.iiitd.edu.in/raghava/algpred2/datasets/train_negative.txt...
⬇️  Downloading validation_positive from https://webs.iiitd.edu.in/raghava/algpred2/datasets/validation_positive.txt...
⬇️  Downloading validation_negative from https://webs.iiitd.edu.in/raghava/algpred2/datasets/validation_negative.txt...
✅ Saved training set to 'algpred2/algpred2_train.csv'
✅ Saved validation set to 'algpred2/algpred2_test.csv'


# Embedding generation

## ESM-2 embedding extraction

In [6]:
if transformer_name == 'ESM-2_320dim':
    print('ESM-2 embedding extraction chosen')
    # === CONFIG ===
    feature_dim = 320           # ESM-2 T6-8M embedding size
    batch_size = 1              # Adjust based on memory
    data_dir = "/Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/" + dataset_name     # All files live in a folder named after the dataset

    # --- Ensure directory exists ---
    os.makedirs(data_dir, exist_ok=True)

    # --- Construct dynamic file paths ---
    input_files = {
        "train": os.path.join(data_dir, f"{dataset_name}_train.csv"),
        "test": os.path.join(data_dir, f"{dataset_name}_test.csv")
    }

    # --- Output file paths ---
    embedding_files = {
        "train": os.path.join(data_dir, f"train_{dataset_name}_esm2_embeddings.csv"),
        "test": os.path.join(data_dir, f"test_{dataset_name}_esm2_embeddings.csv")
    }

    # Check if both embedding files exist
    if all(os.path.exists(f) for f in embedding_files.values()):
        print(f"✅ ESM2 embedding files already exist in '{data_dir}/'. Skipping embedding generation.")
    else:
        # --- Load ESM-2 model ---
        model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
        batch_converter = alphabet.get_batch_converter()
        model.eval()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        # --- Helper function ---
        def process_file(split_name, input_file):
            temp_file = os.path.join(data_dir, f"{split_name}_{dataset_name}_esm2_embeddings_temp.csv")
            final_file = os.path.join(data_dir, f"{split_name}_{dataset_name}_esm2_embeddings.csv")

            # Load dataset
            df = pd.read_csv(input_file)
            sequences = list(df["sequence"])
            labels = list(df["label"])
            ids = list(df["id"])

            # Resume support
            if os.path.exists(temp_file):
                processed_ids = set(pd.read_csv(temp_file, usecols=["id"])["id"])
                print(f"🔁 Resuming {split_name} from {temp_file} — {len(processed_ids)} entries already processed.")
            else:
                processed_ids = set()

            remaining_data = [(ids[i], sequences[i], labels[i]) for i in range(len(ids)) if ids[i] not in processed_ids]

            # Output format
            fieldnames = ["id", "label"] + [f"f{k}" for k in range(feature_dim)]
            write_header = not os.path.exists(temp_file)

            print(f"⚙️  Extracting embeddings for {split_name} set... ({len(remaining_data)} sequences remaining)")

            with open(temp_file, mode="a", newline="") as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                if write_header:
                    writer.writeheader()

                for i in tqdm(range(0, len(remaining_data), batch_size)):
                    batch = remaining_data[i:i + batch_size]
                    batch_ids = [x[0] for x in batch]
                    batch_seqs = [x[1] for x in batch]
                    batch_labels = [x[2] for x in batch]

                    batch_data = [(batch_ids[j], batch_seqs[j]) for j in range(len(batch_seqs))]
                    _, _, batch_tokens = batch_converter(batch_data)
                    batch_tokens = batch_tokens.to(device)

                    with torch.no_grad():
                        outputs = model(batch_tokens, repr_layers=[6])
                        token_representations = outputs["representations"][6]

                    rows = []
                    for j, (_, seq) in enumerate(batch_data):
                        representation = token_representations[j, 1:len(seq)+1].mean(0)
                        entry = {
                            "id": batch_ids[j],
                            "label": batch_labels[j],
                        }
                        for k in range(feature_dim):
                            entry[f"f{k}"] = representation[k].item()
                        rows.append(entry)

                    writer.writerows(rows)

            # Final save
            os.replace(temp_file, final_file)
            print(f"✅ Final {split_name} embeddings saved to '{final_file}'")

        # --- Process each split ---
        for split, file in input_files.items():
            process_file(split, file)
else: print('ESM-2 embedding generation not chosen')

ESM-2 embedding extraction chosen
✅ ESM2 embedding files already exist in '/Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/algpred2/'. Skipping embedding generation.


## ProtBert Embeddings 

In [22]:
if transformer_name == "ProtBert_1024dim":
    print('ProtBert embedding extraction chosen')

    

    # === CONFIG ===
    feature_dim = 1024
    batch_size = 1
    data_dir = "/Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/" + dataset_name
    os.makedirs(data_dir, exist_ok=True)

    input_files = {
        "train": os.path.join(data_dir, f"{dataset_name}_train.csv"),
        "test": os.path.join(data_dir, f"{dataset_name}_test.csv")
    }

    embedding_files = {
        "train": os.path.join(data_dir, f"train_{dataset_name}_protbert_embeddings.csv"),
        "test": os.path.join(data_dir, f"test_{dataset_name}_protbert_embeddings.csv")
    }

    if all(os.path.exists(f) for f in embedding_files.values()):
        print(f"✅ ProtBert embedding files already exist in '{data_dir}/'. Skipping generation.")
    else:
        # === Load ProtBert ===
        tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
        model = BertModel.from_pretrained("Rostlab/prot_bert")
        model.eval()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        def process_file(split_name, input_file):
            temp_file = os.path.join(data_dir, f"{split_name}_{dataset_name}_protbert_embeddings_temp.csv")
            final_file = os.path.join(data_dir, f"{split_name}_{dataset_name}_protbert_embeddings.csv")

            df = pd.read_csv(input_file)
            sequences = list(df["sequence"])
            labels = list(df["label"])
            ids = list(df["id"])

            if os.path.exists(temp_file):
                processed_ids = set(pd.read_csv(temp_file, usecols=["id"])["id"])
                print(f"🔁 Resuming {split_name} from {temp_file} — {len(processed_ids)} already processed.")
            else:
                processed_ids = set()

            remaining_data = [(ids[i], sequences[i], labels[i]) for i in range(len(ids)) if ids[i] not in processed_ids]

            fieldnames = ["id", "label"] + [f"f{k}" for k in range(feature_dim)]
            write_header = not os.path.exists(temp_file)

            print(f"⚙️  Extracting ProtBert embeddings for {split_name}... ({len(remaining_data)} sequences)")

            with open(temp_file, mode="a", newline="") as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                if write_header:
                    writer.writeheader()

                for i in tqdm(range(0, len(remaining_data), batch_size)):
                    batch = remaining_data[i:i + batch_size]
                    batch_ids = [x[0] for x in batch]
                    batch_seqs = [x[1] for x in batch]
                    batch_labels = [x[2] for x in batch]

                    # Preprocess for ProtBert
                    batch_seqs = [" ".join(list(seq)) for seq in batch_seqs]
                    encoded_input = tokenizer(batch_seqs, return_tensors='pt', padding=True, truncation=True)
                    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

                    with torch.no_grad():
                        output = model(**encoded_input)
                        embeddings = output.last_hidden_state.mean(dim=1)

                    rows = []
                    for j in range(len(batch)):
                        entry = {
                            "id": batch_ids[j],
                            "label": batch_labels[j],
                        }
                        for k in range(feature_dim):
                            entry[f"f{k}"] = embeddings[j][k].item()
                        rows.append(entry)

                    writer.writerows(rows)

            os.replace(temp_file, final_file)
            print(f"✅ Final {split_name} embeddings saved to '{final_file}'")

        for split, file in input_files.items():
            process_file(split, file)
else:
    print('ProtBert embedding generation not chosen')


ProtBert embedding extraction chosen
✅ ProtBert embedding files already exist in '/Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/algpred2/'. Skipping generation.


## Prot T5 Embeddings

In [12]:
# if transformer_name == "ProtT5_1024dim":
#     print('ProtT5 embedding extraction chosen')

#     # === CONFIG ===
#     feature_dim = 1024
#     batch_size = 1
#     data_dir = dataset_name
#     os.makedirs(data_dir, exist_ok=True)

#     input_files = {
#         "train": os.path.join(data_dir, f"{dataset_name}_train.csv"),
#         "test": os.path.join(data_dir, f"{dataset_name}_test.csv")
#     }

#     embedding_files = {
#         "train": os.path.join(data_dir, f"train_{dataset_name}_{transformer_name}_embeddings.csv"),
#         "test": os.path.join(data_dir, f"test_{dataset_name}_{transformer_name}_embeddings.csv")
#     }

#     if all(os.path.exists(f) for f in embedding_files.values()):
#         print(f"✅ ProtT5 embedding files already exist in '{data_dir}/'. Skipping generation.")
#     else:
#         # === Load ProtT5 Encoder ===
#         from transformers import T5Tokenizer, T5EncoderModel

#         tokenizer = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_uniref50", do_lower_case=False)
#         model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_uniref50")
#         model.eval()
#         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#         model = model.to(device)

#         def process_file(split_name, input_file):
#             temp_file = os.path.join(data_dir, f"{split_name}_{dataset_name}_prott5_embeddings_temp.csv")
#             final_file = os.path.join(data_dir, f"{split_name}_{dataset_name}_prott5_embeddings.csv")

#             df = pd.read_csv(input_file)
#             sequences = list(df["sequence"])
#             labels = list(df["label"])
#             ids = list(df["id"])

#             if os.path.exists(temp_file):
#                 processed_ids = set(pd.read_csv(temp_file, usecols=["id"])["id"])
#                 print(f"🔁 Resuming {split_name} from {temp_file} — {len(processed_ids)} already processed.")
#             else:
#                 processed_ids = set()

#             remaining_data = [(ids[i], sequences[i], labels[i]) for i in range(len(ids)) if ids[i] not in processed_ids]

#             fieldnames = ["id", "label"] + [f"f{k}" for k in range(feature_dim)]
#             write_header = not os.path.exists(temp_file)

#             print(f"⚙️  Extracting ProtT5 embeddings for {split_name}... ({len(remaining_data)} sequences)")

#             with open(temp_file, mode="a", newline="") as f:
#                 writer = csv.DictWriter(f, fieldnames=fieldnames)
#                 if write_header:
#                     writer.writeheader()

#                 for i in tqdm(range(0, len(remaining_data), batch_size)):
#                     batch = remaining_data[i:i + batch_size]
#                     batch_ids = [x[0] for x in batch]
#                     batch_seqs = [x[1] for x in batch]
#                     batch_labels = [x[2] for x in batch]

#                     # Preprocess for ProtT5
#                     batch_seqs = [" ".join(list(seq)) for seq in batch_seqs]
#                     encoded_input = tokenizer(batch_seqs, return_tensors='pt', padding=True, truncation=True)
#                     encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

#                     with torch.no_grad():
#                         output = model(**encoded_input)
#                         embeddings = output.last_hidden_state.mean(dim=1)

#                     rows = []
#                     for j in range(len(batch)):
#                         entry = {
#                             "id": batch_ids[j],
#                             "label": batch_labels[j],
#                         }
#                         for k in range(feature_dim):
#                             entry[f"f{k}"] = embeddings[j][k].item()
#                         rows.append(entry)

#                     writer.writerows(rows)

#             os.replace(temp_file, final_file)
#             print(f"✅ Final {split_name} embeddings saved to '{final_file}'")

#         for split, file in input_files.items():
#             process_file(split, file)
# else:
#     print('ProtT5 embedding generation not chosen')


ProtT5 embedding generation not chosen


# concatanate ESM fold data with 3d DSSP data into vector

In [29]:
import pandas as pd

# === TRAIN SET ===
# Load original training data and DSSP features
df_train_original = pd.read_csv('/Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/algpred2/train_algpred2_protbert_embeddings.csv')
df_train_dssp = pd.read_csv('/Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/dssp_train_features.csv')

# Merge on matching protein IDs
df_train_merged = pd.merge(df_train_original, df_train_dssp, left_on='id', right_on='protein', how='inner')
df_train_merged = df_train_merged.drop(columns=['protein'])

# Save merged training data
train_output_path = '/Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/algpred2/train_merged_with_dssp_protbert_embeddings.csv'
df_train_merged.to_csv(train_output_path, index=False)
print(f"✅ Merged train set saved to: {train_output_path}")
print("Train set shape after merge:", df_train_merged.shape)


# === TEST SET ===
# Load original test data and DSSP features
df_test_original = pd.read_csv('/Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/algpred2/test_algpred2_protbert_embeddings.csv')
df_test_dssp = pd.read_csv('/Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/dssp_test_features.csv')

# Merge on matching protein IDs
df_test_merged = pd.merge(df_test_original, df_test_dssp, left_on='id', right_on='protein', how='inner')
df_test_merged = df_test_merged.drop(columns=['protein'])

# Save merged test data
test_output_path = '/Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/algpred2/test_merged_with_dssp_protbert_embeddings.csv'
df_test_merged.to_csv(test_output_path, index=False)
print(f"✅ Merged test set saved to: {test_output_path}")
print("Test set shape after merge:", df_test_merged.shape)


✅ Merged train set saved to: /Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/algpred2/train_merged_with_dssp_protbert_embeddings.csv
Train set shape after merge: (15311, 1040)
✅ Merged test set saved to: /Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/algpred2/test_merged_with_dssp_protbert_embeddings.csv
Test set shape after merge: (3946, 1040)


# Models

## Ridge Regression

In [15]:
model_name = "Ridge"
if model_name == 'Ridge':
    log_filename = f"{model_name}_{transformer_name}_{dataset_name}_dssp_output.txt"
    log_path = os.path.join("/Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/logs", log_filename)
    os.makedirs("/Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/logs", exist_ok=True)

    class Tee:
        def __init__(self, *streams):
            self.streams = streams
        def write(self, text):
            for stream in self.streams:
                stream.write(text)
                stream.flush()
        def flush(self):
            for stream in self.streams:
                stream.flush()

    with open(log_path, "w") as log_file:
        tee = Tee(sys.stdout, log_file)
        with redirect_stdout(tee), redirect_stderr(tee):

            print('Ridge Classifier model chosen')
            print(dataset_name, 'dataset chosen')

            data_dir = "/Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/" + dataset_name
            embedding_files = {
                "train": os.path.join(data_dir, f"train_merged_with_dssp_ESM-2_320dim.csv"),#train_{dataset_name}_esm2_embeddings.csv
                "test": os.path.join(data_dir, f"test_merged_with_dssp_ESM-2_320dim.csv")#test_{dataset_name}_esm2_embeddings.csv
            }

            df_train = pd.read_csv(embedding_files["train"])
            df_test = pd.read_csv(embedding_files["test"])

            feature_cols = [f"f{i}" for i in range(320)]
            X_train_full = df_train[feature_cols].values
            y_train_full = df_train["label"].values
            X_test = df_test[feature_cols].values
            y_test = df_test["label"].values

            print(f"✅ Loaded: Train={X_train_full.shape}, Test={X_test.shape}")

            print("\n📉 DummyClassifier (Stratified) on Training Set (CV):\n")
            dummy = DummyClassifier(strategy="stratified", random_state=42)
            dummy_aucs = []
            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

            for train_idx, val_idx in cv.split(X_train_full, y_train_full):
                dummy.fit(X_train_full[train_idx], y_train_full[train_idx])
                y_dummy_proba = dummy.predict_proba(X_train_full[val_idx])[:, 1]
                auc = roc_auc_score(y_train_full[val_idx], y_dummy_proba)
                dummy_aucs.append(auc)

            print(f"📊 Dummy ROC-AUC: {np.mean(dummy_aucs):.4f} ± {np.std(dummy_aucs):.4f}")

            print("\n🚀 5-Fold Cross-Validation (RidgeClassifier) on Training Set...\n")
            from sklearn.metrics import (
                roc_auc_score, accuracy_score, recall_score, confusion_matrix, matthews_corrcoef
            )

            def evaluate_model(y_true, y_pred):
                acc = accuracy_score(y_true, y_pred)
                sens = recall_score(y_true, y_pred, pos_label=1)
                spec = recall_score(y_true, y_pred, pos_label=0)
                auc = roc_auc_score(y_true, y_pred)
                mcc = matthews_corrcoef(y_true, y_pred)
                return acc, sens, spec, auc, mcc

            # 5-Fold Cross-Validation with RidgeClassifier
            print("\n🚀 5-Fold Cross-Validation (RidgeClassifier) on Training Set...\n")
            ridge_aucs = []
            for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_full, y_train_full)):
                X_train, X_val = X_train_full[train_idx], X_train_full[val_idx]
                y_train, y_val = y_train_full[train_idx], y_train_full[val_idx]

                clf = RidgeClassifier()
                clf.fit(X_train, y_train)

                y_pred = clf.predict(X_val)
                acc, sens, spec, auc, mcc = evaluate_model(y_val, y_pred)
                ridge_aucs.append(auc)

                print(f"📂 Fold {fold+1} Metrics:")
                print(f" - Accuracy     : {acc:.4f}")
                print(f" - Sensitivity  : {sens:.4f}")
                print(f" - Specificity  : {spec:.4f}")
                print(f" - ROC-AUC      : {auc:.4f}")
                print(f" - MCC          : {mcc:.4f}")
                print("------")

            mean_auc = np.mean(ridge_aucs)
            std_auc = np.std(ridge_aucs, ddof=1)
            se_auc = std_auc / np.sqrt(len(ridge_aucs))
            print(f"\n✅ Mean CV ROC-AUC: {mean_auc:.4f} ± {std_auc:.4f} (SE = {se_auc:.4f})")

            # Final Evaluation on Test Set
            print("\n🔒 Final Evaluation on Hold-Out Test Set...\n")
            clf_final = RidgeClassifier()
            clf_final.fit(X_train_full, y_train_full)

            y_test_pred = clf_final.predict(X_test)
            acc, sens, spec, test_auc, mcc = evaluate_model(y_test, y_test_pred)

            print("🧪 Final Test Set Metrics:")
            print(f" - Accuracy     : {acc:.4f}")
            print(f" - Sensitivity  : {sens:.4f}")
            print(f" - Specificity  : {spec:.4f}")
            print(f" - ROC-AUC      : {test_auc:.4f}")
            print(f" - MCC          : {mcc:.4f}")

            print("\n🧾 Confusion Matrix on Hold-Out Test Set:\n")
            cm = confusion_matrix(y_test, y_test_pred)
            print(cm)


            print("\n🧪 Y-Scrambling (sanity check) on Training Set...\n")
            y_scrambled = y_train_full.copy()
            random.seed(42)
            random.shuffle(y_scrambled)

            scrambled_aucs = []
            for train_idx, val_idx in cv.split(X_train_full, y_scrambled):
                X_train, X_val = X_train_full[train_idx], X_train_full[val_idx]
                y_train, y_val = y_scrambled[train_idx], y_scrambled[val_idx]

                clf_scrambled = RidgeClassifier()
                clf_scrambled.fit(X_train, y_train)
                y_pred_scrambled = clf_scrambled.predict(X_val)
                auc = roc_auc_score(y_val, y_pred_scrambled)
                scrambled_aucs.append(auc)

            print(f"🔀 Y-Scrambled ROC-AUC: {np.mean(scrambled_aucs):.4f} ± {np.std(scrambled_aucs):.4f}")
            print("👉 This should be near 0.5 if your real model learned something.")
else: print('Ridge model not chosen')

Ridge Classifier model chosen
algpred2 dataset chosen
✅ Loaded: Train=(16120, 320), Test=(4030, 320)

📉 DummyClassifier (Stratified) on Training Set (CV):

📊 Dummy ROC-AUC: 0.4991 ± 0.0000

🚀 5-Fold Cross-Validation (RidgeClassifier) on Training Set...


🚀 5-Fold Cross-Validation (RidgeClassifier) on Training Set...

📂 Fold 1 Metrics:
 - Accuracy     : 0.9395
 - Sensitivity  : 0.9479
 - Specificity  : 0.9311
 - ROC-AUC      : 0.9395
 - MCC          : 0.8792
------
📂 Fold 2 Metrics:
 - Accuracy     : 0.9336
 - Sensitivity  : 0.9367
 - Specificity  : 0.9305
 - ROC-AUC      : 0.9336
 - MCC          : 0.8673
------
📂 Fold 3 Metrics:
 - Accuracy     : 0.9423
 - Sensitivity  : 0.9584
 - Specificity  : 0.9262
 - ROC-AUC      : 0.9423
 - MCC          : 0.8851
------
📂 Fold 4 Metrics:
 - Accuracy     : 0.9414
 - Sensitivity  : 0.9535
 - Specificity  : 0.9293
 - ROC-AUC      : 0.9414
 - MCC          : 0.8830
------
📂 Fold 5 Metrics:
 - Accuracy     : 0.9361
 - Sensitivity  : 0.9498
 - Specificit

In [23]:
# Confusion matrix values
TP = 1580  # True Positives
FP = 136    # False Positives
FN = 435   # False Negatives
TN = 1879  # True Negatives (not used for F1, but included for completeness)

# Calculate Precision and Recall
precision = TP / (TP + FP)
recall = TP / (TP + FN)

# Calculate F1 Score
f1_score = 2 * (precision * recall) / (precision + recall)

# Print results
print(f"Precision: {precision:.4f}")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")


Precision: 0.9207
Recall (Sensitivity): 0.7841
F1 Score: 0.8470


## XGBoosted

In [32]:
import os
import sys
import numpy as np
import pandas as pd
from contextlib import redirect_stdout, redirect_stderr
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import (
    roc_auc_score, accuracy_score, recall_score,
    confusion_matrix, matthews_corrcoef, f1_score, precision_score
)
from sklearn.dummy import DummyClassifier
import joblib
import xgboost as xgb
import random

# --- Parameters for grid search (edit as needed) ---
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

model_name = "XGBoost"
if model_name == 'XGBoost':
    # === Create log file path ===
    log_filename = f"{model_name}_{transformer_name}_{dataset_name}_dssp_output.txt"
    log_path = os.path.join("/Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/logs", log_filename)
    os.makedirs("/Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/logs", exist_ok=True)

    class Tee:
        def __init__(self, *streams):
            self.streams = streams
        def write(self, text):
            for stream in self.streams:
                stream.write(text)
                stream.flush()
        def flush(self):
            for stream in self.streams:
                stream.flush()

    def evaluate_model(y_true, y_pred, y_proba):
        acc = accuracy_score(y_true, y_pred)
        sens = recall_score(y_true, y_pred, pos_label=1)
        spec = recall_score(y_true, y_pred, pos_label=0)
        auc = roc_auc_score(y_true, y_proba)
        mcc = matthews_corrcoef(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred)
        return acc, sens, spec, auc, mcc, f1, prec

    with open(log_path, "w") as log_file:
        tee = Tee(sys.stdout, log_file)
        with redirect_stdout(tee), redirect_stderr(tee):

            print('XGBoost model chosen')
            print(dataset_name, 'dataset chosen')

            # Step 1: Load Data
            data_dir = "/Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/" + dataset_name
            embedding_files = {
                "train": os.path.join(data_dir, f"train_merged_with_dssp_protbert_embeddings.csv"),#train_{dataset_name}_esm2_embeddings.csv
                "test": os.path.join(data_dir, f"test_merged_with_dssp_protbert_embeddings.csv")#test_{dataset_name}_esm2_embeddings.csv
            }
            df_train = pd.read_csv(embedding_files["train"])
            df_test = pd.read_csv(embedding_files["test"])

            # All columns from index 2 onward are features
            X_train_full = df_train.iloc[:, 2:].values
            y_train_full = df_train["label"].values
            X_test = df_test.iloc[:, 2:].values
            y_test = df_test["label"].values

            print(f"✅ Loaded: Train={X_train_full.shape}, Test={X_test.shape}")

            # Step 2: DummyClassifier
            print("\n📉 DummyClassifier (Stratified) on Training Set (CV):\n")
            dummy = DummyClassifier(strategy="stratified", random_state=42)
            dummy_aucs = []
            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

            for train_idx, val_idx in cv.split(X_train_full, y_train_full):
                dummy.fit(X_train_full[train_idx], y_train_full[train_idx])
                y_dummy_proba = dummy.predict_proba(X_train_full[val_idx])[:, 1]
                auc = roc_auc_score(y_train_full[val_idx], y_dummy_proba)
                dummy_aucs.append(auc)
            print(f"📊 Dummy ROC-AUC: {np.mean(dummy_aucs):.4f} ± {np.std(dummy_aucs):.4f}")

            # Step 3: Hyperparameter tuning with GridSearchCV
            print("\n🔍 Hyperparameter Tuning with GridSearchCV...\n")
            base_clf = xgb.XGBClassifier(eval_metric="logloss", random_state=42)
            grid_search = GridSearchCV(
                estimator=base_clf,
                param_grid=param_grid,
                scoring='roc_auc',
                cv=cv,
                n_jobs=-1,
                verbose=1
            )
            grid_search.fit(X_train_full, y_train_full)
            print(f"\n🏆 Best Parameters: {grid_search.best_params_}")
            print(f"🏆 Best CV ROC-AUC: {grid_search.best_score_:.4f}")

            # Step 4: Final Test
            print("\n🔒 Final Evaluation on Hold-Out Test Set...\n")
            clf_final = xgb.XGBClassifier(**grid_search.best_params_, eval_metric="logloss", random_state=42)
            clf_final.fit(X_train_full, y_train_full)

            # Save the trained XGBoost model
            model_path = f"{model_name}_{transformer_name}_{dataset_name}_xgboost_model.pkl"
            joblib.dump(clf_final, model_path)
            print(f"✅ Model saved to: {model_path}")

            y_test_pred = clf_final.predict(X_test)
            y_test_proba = clf_final.predict_proba(X_test)[:, 1]

            acc, sens, spec, test_auc, mcc, f1, prec = evaluate_model(y_test, y_test_pred, y_test_proba)
            print("🧪 Final Test Set Metrics:")
            print(f" - Accuracy     : {acc:.4f}")
            print(f" - Sensitivity  : {sens:.4f}")
            print(f" - Specificity  : {spec:.4f}")
            print(f" - ROC-AUC      : {test_auc:.4f}")
            print(f" - MCC          : {mcc:.4f}")
            print(f" - F1-Score     : {f1:.4f}")
            print(f" - Precision    : {prec:.4f}")

            print("\n🧾 Confusion Matrix on Hold-Out Test Set:\n")
            cm = confusion_matrix(y_test, y_test_pred)
            cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
            print(cm_df)

            # Step 5: Y-Scrambling (sanity check)
            print("\n🧪 Y-Scrambling (sanity check) on Training Set...\n")
            y_scrambled = y_train_full.copy()
            random.seed(42)
            random.shuffle(y_scrambled)
            scrambled_aucs = []
            for train_idx, val_idx in cv.split(X_train_full, y_scrambled):
                X_train, X_val = X_train_full[train_idx], X_train_full[val_idx]
                y_train, y_val = y_scrambled[train_idx], y_scrambled[val_idx]

                clf_scrambled = xgb.XGBClassifier(**grid_search.best_params_, eval_metric="logloss", random_state=42)
                clf_scrambled.fit(X_train, y_train)
                y_proba_scrambled = clf_scrambled.predict_proba(X_val)[:, 1]
                auc = roc_auc_score(y_val, y_proba_scrambled)
                scrambled_aucs.append(auc)
            print(f"🔀 Y-Scrambled ROC-AUC: {np.mean(scrambled_aucs):.4f} ± {np.std(scrambled_aucs):.4f}")
            print("👉 This should be near 0.5 if your real model learned something.")
else:
    print('XGBoosted model not chosen')


XGBoost model chosen
algpred2 dataset chosen
✅ Loaded: Train=(15311, 1038), Test=(3946, 1038)

📉 DummyClassifier (Stratified) on Training Set (CV):

📊 Dummy ROC-AUC: 0.4936 ± 0.0001

🔍 Hyperparameter Tuning with GridSearchCV...

Fitting 5 folds for each of 72 candidates, totalling 360 fits

🏆 Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.8}
🏆 Best CV ROC-AUC: 0.9985

🔒 Final Evaluation on Hold-Out Test Set...

✅ Model saved to: XGBoost_ProtBert_1024dim_algpred2_xgboost_model.pkl
🧪 Final Test Set Metrics:
 - Accuracy     : 0.8160
 - Sensitivity  : 0.6526
 - Specificity  : 0.9860
 - ROC-AUC      : 0.9548
 - MCC          : 0.6741
 - F1-Score     : 0.7834
 - Precision    : 0.9799

🧾 Confusion Matrix on Hold-Out Test Set:

          Predicted 0  Predicted 1
Actual 0         1907           27
Actual 1          699         1313

🧪 Y-Scrambling (sanity check) on Training Set...

🔀 Y-Scrambled ROC-AUC: 0.5030 ± 0.0074
👉 This

In [18]:
# Confusion matrix values
TP = 1318  # True Positives
FP = 39    # False Positives
FN = 697   # False Negatives
TN = 1976  # True Negatives (not used for F1, but included for completeness)

# Calculate Precision and Recall
precision = TP / (TP + FP)
recall = TP / (TP + FN)

# Calculate F1 Score
f1_score = 2 * (precision * recall) / (precision + recall)

# Print results
print(f"Precision: {precision:.4f}")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")


Precision: 0.9713
Recall (Sensitivity): 0.6541
F1 Score: 0.7817


## FeedForwardNeuralNetwork

In [16]:
model_name = "FFNN"
if model_name == 'FFNN':
    # Define NN model
    class ProteinMLP(nn.Module):
        def __init__(self):
            super().__init__()
            self.net = nn.Sequential(
                nn.Linear(320, 256),
                nn.ReLU(),
                nn.Dropout(0.3),
                nn.Linear(256, 128),
                nn.ReLU(),
                nn.Dropout(0.3),
                nn.Linear(128, 1),
                nn.Sigmoid()
            )

        def forward(self, x):
            return self.net(x)
    # === Create log file path ===
    log_filename = f"{model_name}_{transformer_name}_{dataset_name}_output.txt"
    log_path = os.path.join("/Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/logs", log_filename)
    os.makedirs("/Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/logs", exist_ok=True)
    from sklearn.metrics import (
    roc_auc_score, accuracy_score, recall_score, confusion_matrix, matthews_corrcoef
)

    def evaluate_model(y_true, y_pred, y_proba):
        acc = accuracy_score(y_true, y_pred)
        sens = recall_score(y_true, y_pred, pos_label=1)
        spec = recall_score(y_true, y_pred, pos_label=0)
        auc = roc_auc_score(y_true, y_proba)
        mcc = matthews_corrcoef(y_true, y_pred)
        return acc, sens, spec, auc, mcc

    class Tee:
        def __init__(self, *streams):
            self.streams = streams

        def write(self, text):
            for stream in self.streams:
                stream.write(text)
                stream.flush()

        def flush(self):
            for stream in self.streams:
                stream.flush()

    with open(log_path, "w") as log_file:
        tee = Tee(sys.stdout, log_file)
        with redirect_stdout(tee), redirect_stderr(tee):

            print('FFNN model chosen')
   
            print(dataset_name, 'dataset chosen')
   

            # ====================================
            # Step 1: Load Data
            # ====================================
            data_dir = "/Users/rikardpettersson/Library/Mobile Documents/com~apple~CloudDocs/Documents/ETH Chemistry Ms/Digital Chemistry/" + dataset_name
            embedding_files = {
                "train": os.path.join(data_dir, f"train_{dataset_name}_esm2_embeddings.csv"),
                "test": os.path.join(data_dir, f"test_{dataset_name}_esm2_embeddings.csv")
            }

            df_train = pd.read_csv(embedding_files["train"])
            df_test = pd.read_csv(embedding_files["test"])

            feature_cols = [f"f{i}" for i in range(320)]
            X_train_full = df_train[feature_cols].values.astype(np.float32)
            y_train_full = df_train["label"].values.astype(np.float32)
            X_test = df_test[feature_cols].values.astype(np.float32)
            y_test = df_test["label"].values.astype(np.float32)

            print(f"✅ Loaded: Train={X_train_full.shape}, Test={X_test.shape}")

            # ====================================
            # Step 2: DummyClassifier Baseline
            # ====================================
            print("\n📉 DummyClassifier (Stratified) on Training Set (CV):\n")
            dummy = DummyClassifier(strategy="stratified", random_state=42)
            dummy_aucs = []
            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

            for train_idx, val_idx in cv.split(X_train_full, y_train_full):
                dummy.fit(X_train_full[train_idx], y_train_full[train_idx])
                y_val = y_train_full[val_idx]
                y_dummy_proba = dummy.predict_proba(X_train_full[val_idx])
                if y_dummy_proba.shape[1] == 2:
                    y_dummy_proba = y_dummy_proba[:, 1]
                else:
                    y_dummy_proba = np.zeros_like(y_val)
                auc = roc_auc_score(y_val, y_dummy_proba)
                dummy_aucs.append(auc)

            print(f"📊 Dummy ROC-AUC: {np.mean(dummy_aucs):.4f} ± {np.std(dummy_aucs):.4f}")

            # ====================================
            # Step 3: 5-Fold CV with NN
            # ====================================
            print("\n🚀 5-Fold Cross-Validation (NN) on Training Set...\n")
            nn_aucs = []
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

            for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_full, y_train_full)):
                X_train, X_val = X_train_full[train_idx], X_train_full[val_idx]
                y_train, y_val = y_train_full[train_idx], y_train_full[val_idx]

                train_ds = TensorDataset(torch.tensor(X_train), torch.tensor(y_train).unsqueeze(1))
                val_ds = TensorDataset(torch.tensor(X_val), torch.tensor(y_val).unsqueeze(1))
                train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
                val_loader = DataLoader(val_ds, batch_size=64)

                model = ProteinMLP().to(device)
                criterion = nn.BCELoss()
                optimizer = optim.Adam(model.parameters(), lr=1e-3)

                # Training loop
                model.train()
                for epoch in range(10):
                    for xb, yb in train_loader:
                        xb, yb = xb.to(device), yb.to(device)
                        optimizer.zero_grad()
                        preds = model(xb)
                        loss = criterion(preds, yb)
                        loss.backward()
                        optimizer.step()

                # Validation
                model.eval()
                all_preds = []
                with torch.no_grad():
                    for xb, _ in val_loader:
                        xb = xb.to(device)
                        preds = model(xb).cpu().numpy()
                        all_preds.extend(preds)
                preds_bin = (np.array(all_preds) > 0.5).astype(int)

                all_preds_array = np.array(all_preds).reshape(-1)
                preds_bin = (all_preds_array > 0.5).astype(int)

                acc, sens, spec, auc, mcc = evaluate_model(y_val, preds_bin, all_preds_array)
                nn_aucs.append(auc)

                print(f"📂 Fold {fold+1} Metrics:")
                print(f" - Accuracy     : {acc:.4f}")
                print(f" - Sensitivity  : {sens:.4f}")
                print(f" - Specificity  : {spec:.4f}")
                print(f" - ROC-AUC      : {auc:.4f}")
                print(f" - MCC          : {mcc:.4f}")
                print("------")

            print(f"\n✅ Mean CV ROC-AUC: {np.mean(nn_aucs):.4f} ± {np.std(nn_aucs):.4f}")

            # ====================================
            # Step 4: Final Test Set Evaluation
            # ====================================
            print("\n🔒 Final Evaluation on Hold-Out Test Set...\n")
            train_ds = TensorDataset(torch.tensor(X_train_full), torch.tensor(y_train_full).unsqueeze(1))
            test_ds = TensorDataset(torch.tensor(X_test), torch.tensor(y_test).unsqueeze(1))
            train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
            test_loader = DataLoader(test_ds, batch_size=64)

            final_model = ProteinMLP().to(device)
            optimizer = optim.Adam(final_model.parameters(), lr=1e-3)
            criterion = nn.BCELoss()

            final_model.train()
            for epoch in range(20):
                for xb, yb in train_loader:
                    xb, yb = xb.to(device), yb.to(device)
                    optimizer.zero_grad()
                    preds = final_model(xb)
                    loss = criterion(preds, yb)
                    loss.backward()
                    optimizer.step()

            final_model.eval()
            y_test_preds, y_test_probs = [], []
            with torch.no_grad():
                for xb, _ in test_loader:
                    xb = xb.to(device)
                    probs = final_model(xb).cpu().numpy()
                    preds = (probs > 0.5).astype(int)
                    y_test_preds.extend(preds)
                    y_test_probs.extend(probs)

            y_test_probs_array = np.array(y_test_probs).reshape(-1)
            y_test_preds_array = np.array(y_test_preds).reshape(-1)

            acc, sens, spec, test_auc, mcc = evaluate_model(y_test, y_test_preds_array, y_test_probs_array)

            print("🧪 Final Test Set Metrics:")
            print(f" - Accuracy     : {acc:.4f}")
            print(f" - Sensitivity  : {sens:.4f}")
            print(f" - Specificity  : {spec:.4f}")
            print(f" - ROC-AUC      : {test_auc:.4f}")
            print(f" - MCC          : {mcc:.4f}")

            print("\n🧾 Confusion Matrix on Hold-Out Test Set:\n")
            cm = confusion_matrix(y_test, y_test_preds_array)
            cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
            print(cm_df)

else: print('FFNN model not chosen')

FFNN model chosen
algpred2 dataset chosen
✅ Loaded: Train=(16120, 320), Test=(4030, 320)

📉 DummyClassifier (Stratified) on Training Set (CV):

📊 Dummy ROC-AUC: 0.4991 ± 0.0000

🚀 5-Fold Cross-Validation (NN) on Training Set...

📂 Fold 1 Metrics:
 - Accuracy     : 0.9696
 - Sensitivity  : 0.9677
 - Specificity  : 0.9715
 - ROC-AUC      : 0.9907
 - MCC          : 0.9392
------
📂 Fold 2 Metrics:
 - Accuracy     : 0.9708
 - Sensitivity  : 0.9733
 - Specificity  : 0.9684
 - ROC-AUC      : 0.9936
 - MCC          : 0.9417
------
📂 Fold 3 Metrics:
 - Accuracy     : 0.9640
 - Sensitivity  : 0.9826
 - Specificity  : 0.9454
 - ROC-AUC      : 0.9944
 - MCC          : 0.9287
------
📂 Fold 4 Metrics:
 - Accuracy     : 0.9665
 - Sensitivity  : 0.9684
 - Specificity  : 0.9646
 - ROC-AUC      : 0.9941
 - MCC          : 0.9330
------
📂 Fold 5 Metrics:
 - Accuracy     : 0.9690
 - Sensitivity  : 0.9684
 - Specificity  : 0.9696
 - ROC-AUC      : 0.9952
 - MCC          : 0.9380
------

✅ Mean CV ROC-AUC: 0

In [24]:
# Confusion matrix values
TP = 1393  # True Positives
FP = 56    # False Positives
FN = 622   # False Negatives
TN = 1959  # True Negatives (not used for F1, but included for completeness)

# Calculate Precision and Recall
precision = TP / (TP + FP)
recall = TP / (TP + FN)

# Calculate F1 Score
f1_score = 2 * (precision * recall) / (precision + recall)

# Print results
print(f"Precision: {precision:.4f}")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")


Precision: 0.9614
Recall (Sensitivity): 0.6913
F1 Score: 0.8043


# Testing Models

In [19]:
# Install if needed
# !pip install pandas numpy torch scikit-learn joblib tqdm

import pandas as pd
import numpy as np
import joblib
import torch
from sklearn.metrics import accuracy_score, roc_auc_score
from esm import pretrained
import os
from tqdm import tqdm

# === Load ESM2 model ===
def load_esm2_model():
    model, alphabet = pretrained.esm2_t6_8M_UR50D()
    model.eval()
    batch_converter = alphabet.get_batch_converter()
    return model, alphabet, batch_converter

# === Load trained bootstrap models ===
def load_bootstrap_models(model_dir, base_model_name, n_models=5):
    models = []
    for i in range(1, n_models + 1):
        model_path = os.path.join(model_dir, f"{base_model_name}_bootstrap_{i}.pkl")
        models.append(joblib.load(model_path))
    return models

# === Predict using bootstrap models ===
def predict_bootstrap(sequence, esm_model, batch_converter, bootstrap_models):
    sequence = sequence.strip().upper()
    valid_aa = set("ACDEFGHIKLMNPQRSTVWY")
    if not all(aa in valid_aa for aa in sequence):
        return None, None  # Invalid sequence

    data = [("protein", sequence)]
    batch_labels, batch_strs, batch_tokens = batch_converter(data)

    with torch.no_grad():
        results = esm_model(batch_tokens, repr_layers=[6], return_contacts=False)
    token_representations = results["representations"][6]
    cls_embedding = token_representations[0, 0, :].numpy().reshape(1, -1)

    probs = []
    preds = []
    for model in bootstrap_models:
        prob = model.predict_proba(cls_embedding)[0][1]
        pred = model.predict(cls_embedding)[0]
        probs.append(prob)
        preds.append(pred)

    return np.array(preds), np.array(probs)

# === Main evaluation ===
# Paths and settings
csv_file = "allergen_data_with_full_sequences.csv"  # Path to your data
model_dir = "."  # Current folder
model_base_name = "XGBoost_ESM-2_320dim_algpred2"
n_bootstrap_models = 5

# Load models
esm_model, alphabet, batch_converter = load_esm2_model()
bootstrap_models = load_bootstrap_models(model_dir, model_base_name, n_models=n_bootstrap_models)

# Load sequences and labels
df = pd.read_csv(csv_file)
sequences = df["full_parent_protein_sequence"].tolist()
true_labels = df["label"].tolist()

# Predict all
corrected_preds = []
corrected_probs = []
corrected_labels = []

for seq, label in tqdm(zip(sequences, true_labels), total=len(sequences), desc="Predicting sequences"):
    preds, probs = predict_bootstrap(seq, esm_model, batch_converter, bootstrap_models)
    if preds is None:
        continue  # Skip invalid sequence and label
    majority_vote = np.round(np.mean(preds))
    mean_prob = np.mean(probs)
    corrected_preds.append(majority_vote)
    corrected_probs.append(mean_prob)
    corrected_labels.append(label)

# Evaluate
y_true = np.array(corrected_labels)
y_pred = np.array(corrected_preds)
y_proba = np.array(corrected_probs)

acc = accuracy_score(y_true, y_pred)
auc = roc_auc_score(y_true, y_proba)

print(f"✅ Accuracy (Bootstrap Models): {acc:.4f}")
print(f"✅ ROC-AUC (Bootstrap Models): {auc:.4f}")


Predicting sequences:   0%|          | 0/11949 [00:00<?, ?it/s]


ValueError: Feature shape mismatch, expected: 334, got 320

In [None]:

acc = accuracy_score(y_true, y_pred)
auc = roc_auc_score(y_true, y_proba)

print(f"✅ Accuracy: {acc:.4f}")
print(f"✅ ROC-AUC: {auc:.4f}")


✅ Accuracy: 0.5790
✅ ROC-AUC: 0.5386


In [None]:
y_pred.shape

(11945,)

In [None]:
# === Load the final original XGBoost model ===
final_model_path = "XGBoost_ESM-2_320dim_algpred2_xgboost_model.pkl"  # Adjust if needed
final_model = joblib.load(final_model_path)

# === Predict all sequences with final model ===
corrected_preds_final = []
corrected_probs_final = []
corrected_labels_final = []

for seq, label in tqdm(zip(sequences, true_labels), total=len(sequences), desc="Predicting with final model"):
    seq = seq.strip().upper()
    valid_aa = set("ACDEFGHIKLMNPQRSTVWY")
    if not all(aa in valid_aa for aa in seq):
        continue  # Skip sequence and label together

    data = [("protein", seq)]
    batch_labels, batch_strs, batch_tokens = batch_converter(data)

    with torch.no_grad():
        results = esm_model(batch_tokens, repr_layers=[6], return_contacts=False)
    token_representations = results["representations"][6]
    cls_embedding = token_representations[0, 0, :].numpy().reshape(1, -1)

    # Predict with the final model
    pred_final = final_model.predict(cls_embedding)[0]
    prob_final = final_model.predict_proba(cls_embedding)[0][1]

    corrected_preds_final.append(pred_final)
    corrected_probs_final.append(prob_final)
    corrected_labels_final.append(label)

# === Evaluate final model ===
y_true_final = np.array(corrected_labels_final)
y_pred_final = np.array(corrected_preds_final)
y_proba_final = np.array(corrected_probs_final)

acc_final = accuracy_score(y_true_final, y_pred_final)
auc_final = roc_auc_score(y_true_final, y_proba_final)

print(f"✅ Final Model Accuracy: {acc_final:.4f}")
print(f"✅ Final Model ROC-AUC: {auc_final:.4f}")


Predicting with final model: 100%|██████████| 11949/11949 [16:35<00:00, 12.00it/s] 

✅ Final Model Accuracy: 0.5540
✅ Final Model ROC-AUC: 0.4697



