In [11]:
import os
import glob
import numpy as np
import pandas as pd
from Bio import PDB

# Three-letter codes of the 20 standard amino acids. The position of a code in
# this list is the column it occupies in the one-hot encoding.
AA_CODES = [
    "ALA", "ARG", "ASN", "ASP", "CYS",
    "GLN", "GLU", "GLY", "HIS", "ILE",
    "LEU", "LYS", "MET", "PHE", "PRO",
    "SER", "THR", "TRP", "TYR", "VAL",
]
# Reverse lookup: three-letter code -> column index in AA_CODES.
AA_TO_INDEX = dict(zip(AA_CODES, range(len(AA_CODES))))

# Function to extract amino acid sequence from PDB file
def extract_sequence_from_pdb(pdb_path):
    """Return the standard amino-acid residue names found in a PDB file.

    Only the FIRST model of the structure is read. Multi-model files (for
    example NMR ensembles) repeat the same residues in every model, so walking
    all models would duplicate the sequence once per model.

    Args:
        pdb_path: Path to the PDB file to parse.

    Returns:
        A list of three-letter residue names, in chain/residue iteration
        order, restricted to names present in ``AA_TO_INDEX``. Residues with
        any other name (waters, ligands, modified residues) are skipped.
        Returns an empty list when the structure contains no model.
    """
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("protein", pdb_path)

    # Take the first model only; a structure with no models yields no sequence.
    first_model = next(iter(structure), None)
    if first_model is None:
        return []

    sequence = []
    for chain in first_model:
        for residue in chain:
            resname = residue.get_resname()
            if resname in AA_TO_INDEX:
                sequence.append(resname)
    return sequence

# One-hot encoding function
def one_hot_encode_sequence(sequence):
    """Encode a sequence of three-letter residue names as a one-hot matrix.

    Args:
        sequence: Sized sequence of residue names.

    Returns:
        An integer array of shape ``(len(sequence), len(AA_CODES))``. Row
        ``i`` has a single 1 in the column given by ``AA_TO_INDEX`` for
        ``sequence[i]``; a name missing from ``AA_TO_INDEX`` leaves its row
        all zeros.
    """
    encoded = np.zeros((len(sequence), len(AA_CODES)), dtype=int)
    for row, name in enumerate(sequence):
        col = AA_TO_INDEX.get(name)
        if col is None:
            # Unknown residue name: keep the row as all zeros.
            continue
        encoded[row, col] = 1
    return encoded

# Root directory of the project and the dataset sub-folders to process.
base_path = "/Users/marcobenavides/Documents/Columbia University/Spring 2025/DL Biomedical Imaging/Project"
dataset_folders = [
    "protein_antigens_non-immunogenic",
    "protein_antigens_immunogenic",
    "bacterial_non_antigens",
    "bacterial_antigens"
]

# For every dataset folder, write one one-hot CSV per PDB file that has a
# matching Voronoi image and does not already have an output CSV.
for folder in dataset_folders:
    pdb_folder = os.path.join(base_path, folder, "pdb_files")
    voronoi_folder = os.path.join(base_path, folder, "voronoi_images")
    output_folder = os.path.join(base_path, folder, "onehot_sequences")
    os.makedirs(output_folder, exist_ok=True)

    pdb_files = glob.glob(os.path.join(pdb_folder, "*.pdb"))

    for pdb_file in pdb_files:
        pdb_name = os.path.splitext(os.path.basename(pdb_file))[0]
        csv_path = os.path.join(output_folder, f"{pdb_name}_onehot.csv")

        # Guard 1: never overwrite an existing encoding.
        if os.path.exists(csv_path):
            print(f"One-hot file already exists for {pdb_name}, skipping.")
            continue

        # Guard 2: a Voronoi image must exist under one of the known naming
        # variations (probed in this order, stopping at the first hit).
        voronoi_names = (
            f"{pdb_name}.png",
            f"AF_REMOTE_{pdb_name}.png",
            f"PDB_{pdb_name}.png",
            f"{pdb_name}.pdb.png",
        )
        has_voronoi = any(
            os.path.exists(os.path.join(voronoi_folder, image_name))
            for image_name in voronoi_names
        )
        if not has_voronoi:
            print(f"Voronoi image not found for {pdb_name}, skipping.")
            continue

        # Guard 3: nothing to encode if no standard residues were found.
        sequence = extract_sequence_from_pdb(pdb_file)
        if not sequence:
            print(f"No valid amino acid sequence found in {pdb_name}")
            continue

        onehot = one_hot_encode_sequence(sequence)
        df = pd.DataFrame(onehot, columns=AA_CODES)
        df.to_csv(csv_path, index=False)
        print(f"Saved one-hot encoding for {pdb_name} to {csv_path}")


One-hot file already exists for AF-K7EMK9-F1, skipping.
One-hot file already exists for 6obd, skipping.
One-hot file already exists for AF-A0A7I2V363-F1, skipping.
One-hot file already exists for AF-F8VP99-F1, skipping.
One-hot file already exists for AF-A0A191URJ7-F1, skipping.
One-hot file already exists for AF-M0R2M5-F1, skipping.
One-hot file already exists for AF-H0YNH3-F1, skipping.
One-hot file already exists for 7p0p, skipping.
One-hot file already exists for AF-Q86SH4-F1, skipping.
One-hot file already exists for AF-A0A096LPD5-F1, skipping.
One-hot file already exists for AF-F5H6I1-F1, skipping.
One-hot file already exists for AF-Q5STR5-F1, skipping.
One-hot file already exists for 2p31, skipping.
One-hot file already exists for AF-C9J8Y3-F1, skipping.
One-hot file already exists for AF-Q5SRE5-F1, skipping.
One-hot file already exists for AF-A0A6Q8PGV7-F1, skipping.
One-hot file already exists for AF-P62829-F1, skipping.
One-hot file already exists for AF-Q9H5J4-F1, skipping.
