## Merge the DSSP structural features with the AlgPred 2.0 full-length protein sequences

In [None]:
# In one‑hot‑encoding.ipynb

import os
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# === Paths ===
# NOTE: DATA_DIR is an absolute, machine-specific location; adjust it when
# running this notebook anywhere else.
DATA_DIR = "/Users/jianzhouyao/AllergenPredict/data"
OUT_DIR = os.path.join(DATA_DIR, "outputs")
os.makedirs(OUT_DIR, exist_ok=True)

# === 1) Load & merge CSVs ===
# Structure: the train and test summaries are stacked into a single table.
structure_train = pd.read_csv(os.path.join(DATA_DIR, "global_3d_summary_train.csv"))
structure_test = pd.read_csv(os.path.join(DATA_DIR, "global_3d_summary_test.csv"))
structure_all = pd.concat([structure_train, structure_test], ignore_index=True)
# Build the join key by removing the literal ".pdb" text from PDB_File.
structure_all["id"] = structure_all["PDB_File"].str.replace(".pdb", "", regex=False)

# Sequence: same stacking for the sequence CSVs.
seq_train = pd.read_csv(os.path.join(DATA_DIR, "algpred2_train_seq.csv"))
seq_test = pd.read_csv(os.path.join(DATA_DIR, "algpred2_test_seq.csv"))
sequence_all = pd.concat([seq_train, seq_test], ignore_index=True)

# Merge on 'id' and drop PDB_File. The inner join keeps only ids that appear
# in both the sequence table and the structure table.
merged = pd.merge(sequence_all, structure_all, on="id", how="inner")
merged = merged.drop(columns=["PDB_File"])
print("Merged shape:", merged.shape)
# A bare `merged.head()` that is not the last expression of the cell is
# discarded, so print the preview explicitly.
print(merged.head())

# === 2) Train/test split ===
# The pooled table is re-split 80/20, stratified on the label, with a fixed
# seed so the split is reproducible.
train_df, test_df = train_test_split(
    merged, test_size=0.2, stratify=merged["label"], random_state=42
)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
print("Train / Test shapes:", train_df.shape, test_df.shape)

# === 3) Normalize structure features (fit on train) ===
struct_cols = [
    "Total_SASA",
    "Radius_of_Gyration",
    "Compactness",
    "Contact_Order",
    "SS_Helix",
    "SS_Strand",
    "SS_Coil",
]
scaler = StandardScaler()
# The scaler statistics come from the training rows only; the test rows are
# transformed with those same statistics. Both results are cast to float32 so
# the structural arrays share the dtype of the one-hot sequence arrays.
X_struct_train = scaler.fit_transform(train_df[struct_cols]).astype(np.float32)
X_struct_test = scaler.transform(test_df[struct_cols]).astype(np.float32)

# save the scaler
with open(os.path.join(OUT_DIR, "scaler.pkl"), "wb") as f:
    pickle.dump(scaler, f)

# === 4) One-hot encode sequences (common max_len) ===
AA_ALPHABET = "ACDEFGHIKLMNPQRSTVWY"
AA_TO_IDX = {aa: i for i, aa in enumerate(AA_ALPHABET)}
# Longest sequence across both splits, so train and test tensors share one
# width. Converted to a plain int for use as an array dimension / slice bound.
max_len = int(
    max(train_df["sequence"].str.len().max(), test_df["sequence"].str.len().max())
)


def encode_seqs(seq_series, length=None, alphabet=None):
    """One-hot encode a collection of amino-acid strings.

    Parameters
    ----------
    seq_series : sequence of str
        Sequences to encode (for example a pandas Series). Must support
        ``len()`` and iteration.
    length : int, optional
        Number of positions in the output. When omitted, the module-level
        ``max_len`` is used, so existing one-argument calls keep producing
        arrays with a common width.
    alphabet : str, optional
        Residue letters; each letter gets one channel, in this order. When
        omitted, the module-level ``AA_ALPHABET`` / ``AA_TO_IDX`` are used.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(seq_series), len(alphabet), length)``.
        Sequences longer than ``length`` are truncated, shorter ones are
        zero-padded, and residues outside the alphabet leave their position
        all-zero.
    """
    # Resolve the defaults lazily so that explicit arguments never touch the
    # module-level state.
    if alphabet is None:
        alphabet, lookup = AA_ALPHABET, AA_TO_IDX
    else:
        lookup = {aa: i for i, aa in enumerate(alphabet)}
    width = int(max_len if length is None else length)

    n = len(seq_series)
    X = np.zeros((n, len(alphabet), width), dtype=np.float32)
    for i, seq in enumerate(seq_series):
        for pos, aa in enumerate(seq[:width]):
            idx = lookup.get(aa)
            if idx is not None:
                X[i, idx, pos] = 1.0
    return X


# Encode both splits with the shared max_len so the two tensors have the same
# number of positions.
X_seq_train = encode_seqs(train_df["sequence"])
X_seq_test = encode_seqs(test_df["sequence"])

# === 5) Extract labels & ids ===
y_train = train_df["label"].values.astype(np.int64)
y_test = test_df["label"].values.astype(np.int64)
# Materialise the ids as fixed-width unicode arrays. An object-dtype array
# would be pickled by np.save, and np.load rejects pickled arrays unless
# allow_pickle=True is passed.
ids_train = train_df["id"].to_numpy(dtype=str)
ids_test = test_df["id"].to_numpy(dtype=str)

# === 6) Save to .npy ===
np.save(os.path.join(OUT_DIR, "X_seq_train.npy"), X_seq_train)
np.save(os.path.join(OUT_DIR, "X_seq_test.npy"), X_seq_test)
np.save(os.path.join(OUT_DIR, "X_struct_train.npy"), X_struct_train)
np.save(os.path.join(OUT_DIR, "X_struct_test.npy"), X_struct_test)
np.save(os.path.join(OUT_DIR, "y_train.npy"), y_train)
np.save(os.path.join(OUT_DIR, "y_test.npy"), y_test)
np.save(os.path.join(OUT_DIR, "ids_train.npy"), ids_train)
np.save(os.path.join(OUT_DIR, "ids_test.npy"), ids_test)

print("✔ Preprocessing complete; files written to", OUT_DIR)

     id                                           sequence  label  Total_SASA  \
0  P_13  MGKPFTLSLSSLCLLLLSSACFAISSSKLNECQLNNLNALEPDHRV...      1   44718.500   
1  P_14  MGVFTFEDEINSPVAPATLYKALVTDADNVIPKALDSFKSVENVEG...      1    9042.496   
2  P_17  MAEDEDNQQGQGEGLKYLGFVQDAATYAVTTFSNVYLFAKDKSGPL...      1   11549.510   
3  P_46  MGVFNYEVETPSVISAARLFKSYVLDGDKLIPKVAPQAITSVENVG...      1    8775.385   
4  P_47  MGVFNYEVETPSVIPAARLFKSYVLDGDKLIPKVAPQAITSVENVE...      1    8923.741   

   Radius_of_Gyration  Compactness  Contact_Order  SS_Helix  SS_Strand  \
0           33.363918    16.844544       0.056430  0.190391   0.249110   
1           15.485167    10.203313       0.126789  0.284810   0.449367   
2           21.802948     6.329419       0.044412  0.833333   0.000000   
3           15.370365    10.409642       0.130406  0.262500   0.412500   
4           15.408467    10.383901       0.127845  0.262500   0.412500   

    SS_Coil  
0  0.560498  
1  0.265823  
2  0.166667  
3  0.325000 

In [8]:
# In preprocess.ipynb

import os
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler

# === Paths ===
# NOTE: DATA_DIR is an absolute, machine-specific location; adjust it when
# running this notebook anywhere else.
DATA_DIR = "/Users/jianzhouyao/AllergenPredict/data"
OUTPUT_DIR = os.path.join(DATA_DIR, "outputs")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# === 1) Load merged data ===
merged_csv = os.path.join(DATA_DIR, "merged_allergen_data.csv")
df = pd.read_csv(merged_csv)

# === 2) Normalize structure features ===
struct_cols = [
    "Total_SASA",
    "Radius_of_Gyration",
    "Compactness",
    "Contact_Order",
    "SS_Helix",
    "SS_Strand",
    "SS_Coil",
]
scaler = StandardScaler()
# NOTE(review): the scaler is fit on every row of df. If a held-out split is
# carved out of these arrays later, its statistics have already been seen by
# the scaler.
df[struct_cols] = scaler.fit_transform(df[struct_cols])

# === Save the scaler for later inference ===
# NOTE(review): this resolves to the same outputs/scaler.pkl path that the
# one-hot-encoding cell writes its train-only scaler to; whichever cell runs
# last overwrites the other's file.
scaler_path = os.path.join(OUTPUT_DIR, "scaler.pkl")
with open(scaler_path, "wb") as f:
    pickle.dump(scaler, f)

# === 3) One-hot-encode sequences ===
AA_ALPHABET = "ACDEFGHIKLMNPQRSTVWY"
AA_TO_IDX = {aa: i for i, aa in enumerate(AA_ALPHABET)}
# Longest sequence in the table, as a plain int so it can be used directly as
# an array dimension and slice bound.
max_len = int(df["sequence"].str.len().max())

# Pre-allocate array: (n_samples, 20 channels, max_len positions).
# Positions past the end of a sequence, and residues that are not in
# AA_ALPHABET, stay all-zero.
X_seq = np.zeros((len(df), len(AA_ALPHABET), max_len), dtype=np.float32)
for i, seq in enumerate(df["sequence"]):
    for pos, aa in enumerate(seq[:max_len]):
        idx = AA_TO_IDX.get(aa)
        if idx is not None:
            X_seq[i, idx, pos] = 1.0

# === 4) Extract labels and normalized structural features ===
y = df["label"].values.astype(np.int64)
X_struct = df[struct_cols].values.astype(np.float32)
# Materialise the ids as a fixed-width unicode array. An object-dtype array
# would be pickled by np.save, and np.load rejects pickled arrays unless
# allow_pickle=True is passed.
ids = df["id"].to_numpy(dtype=str)

# === 5) Save everything as .npy for fast PyTorch loading ===
np.save(os.path.join(OUTPUT_DIR, "X_seq.npy"), X_seq)
np.save(os.path.join(OUTPUT_DIR, "X_struct.npy"), X_struct)
np.save(os.path.join(OUTPUT_DIR, "y.npy"), y)
np.save(os.path.join(OUTPUT_DIR, "ids.npy"), ids)

print(f"Preprocessing done. Files written to {OUTPUT_DIR}/")

Preprocessing done. Files written to /Users/jianzhouyao/AllergenPredict/data/outputs/


In [16]:
import os, numpy as np

# Sanity checks on the artefacts written by the preprocessing cell: every
# expected file is on disk and the arrays agree on the number of samples.
outputs_dir = "/Users/jianzhouyao/AllergenPredict/data/outputs"

# 1. confirm files exist
expected_names = ("X_seq.npy", "X_struct.npy", "y.npy", "ids.npy", "scaler.pkl")
for fn in [f"{outputs_dir}/{name}" for name in expected_names]:
    assert os.path.exists(fn), f"Missing {fn}"

# 2. confirm shapes/types
X_seq = np.load(f"{outputs_dir}/X_seq.npy")
X_struct = np.load(f"{outputs_dir}/X_struct.npy")
y = np.load(f"{outputs_dir}/y.npy")

# The one-hot tensor must be 3-D float32; its first axis defines the sample
# count the other arrays are compared against.
assert X_seq.ndim == 3 and X_seq.dtype == np.float32
n_samples = X_seq.shape[0]
assert X_struct.ndim == 2 and X_struct.shape[0] == n_samples
assert y.ndim == 1 and len(y) == n_samples
print("✅ All outputs present and shapes look good.")

✅ All outputs present and shapes look good.
