In [1]:
import pandas as pd
from peptides import Peptide
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import re
import os, sys
sys.path.append(os.path.abspath("../../etc/"))
import config

In [2]:
df_raw = pd.read_csv("./data/raw_transformation_01.csv")
df_processed = pd.read_csv("./data/01_binding_labels.csv")

In [3]:
def extract_biochemical_features(sequence, col):
    # Handle NaNs or empty strings
    if pd.isna(sequence) or str(sequence).strip() == "":
        return None
    
    # SANITIZATION: Remove spaces and non-AA characters
    # This keeps only A-Z and removes anything else (like spaces or '*')
    clean_seq = re.sub(r'[^A-Z]', '', str(sequence).upper())
    
    # Biopython's instability_index requires at least 2 AAs
    if len(clean_seq) < 2:
        return None

    try:
        p = Peptide(clean_seq)
        analysed_seq = ProteinAnalysis(clean_seq)
        
        features = {
            # 1. Physicochemical
            f"{col}_hydrophobicity": p.hydrophobicity(),
            f"{col}_isoelectric_point": p.isoelectric_point(),
            f"{col}_charge_ph7": p.charge(pH=7.4),
            f"{col}_molecular_weight": p.molecular_weight(),
            
            # 2. Stability & Structure
            f"{col}_aliphatic_index": p.aliphatic_index(),
            f"{col}_instability_index": analysed_seq.instability_index(),
            f"{col}_aromaticity": analysed_seq.aromaticity(),
            
            # 3. Binding Potential
            f"{col}_boman_index": p.boman(),
            f"{col}_flexibility": sum(analysed_seq.flexibility()) / len(clean_seq)
        }
        return features
    except KeyError as e:
        
        print(f"Skipping sequence {clean_seq} due to invalid character: {e}")
        return None



In [4]:
feature_df = []
for seq in config.EXTRACTABLE_BIOSEQUENCE_FEATURES:
    feature_df.append(df_raw[seq].apply(lambda row: extract_biochemical_features(row, seq)).apply(pd.Series))

Skipping sequence HTTPSWWWRCSBORGSTRUCTUREMJHHTTPSWWWRCSBORGSTRUCTUREMJI due to invalid character: 'B'
Skipping sequence SYELTQPPSVSVSPGQTARITCSGDALPKQYAYWYQQKPGQAPVLVIYKDSERPSGIPERFSGSTSGTTVTLTISGVQAEDEADYHCQSADSSGTSRVFGGXGPS due to invalid character: 'X'
Skipping sequence EVQLVESGGGLVQPGRSLRLSCAASGFTFGDYAMHWVRQAPGKGLEWVSGINWNGHSIAYADSVKGRFTISRENAKXSLYLQMNSLRAEDTAFYYCAKDTAAGYGDYVHYWGQGALVTVSS due to invalid character: 'X'


In [5]:
out_df = pd.concat(feature_df,axis=1)

In [6]:
OUTFILE_PATH = "./data/04_antibody_only_physicochemical_properties.csv"
out_df.to_csv(OUTFILE_PATH)

In [7]:
out_df

Unnamed: 0,CDRH3_hydrophobicity,CDRH3_isoelectric_point,CDRH3_charge_ph7,CDRH3_molecular_weight,CDRH3_aliphatic_index,CDRH3_instability_index,CDRH3_aromaticity,CDRH3_boman_index,CDRH3_flexibility,CDRL3_hydrophobicity,...,VL_flexibility,VHorVHH_hydrophobicity,VHorVHH_isoelectric_point,VHorVHH_charge_ph7,VHorVHH_molecular_weight,VHorVHH_aliphatic_index,VHorVHH_instability_index,VHorVHH_aromaticity,VHorVHH_boman_index,VHorVHH_flexibility
0,-1.661538,6.516511,-0.006719,1584.66824,37.692308,45.515385,0.153846,5.419231,0.313815,3.416667e-01,...,0.000000,-3.500000,3.749987,-1.004916,247.20764,0.000000,5.000000,0.000000,7.680000,0.000000
1,-1.378571,6.482197,-0.008492,1545.64654,7.142857,45.614286,0.142857,3.247857,0.364631,-1.544444e+00,...,0.000000,-3.500000,3.749987,-1.004916,247.20764,0.000000,5.000000,0.000000,7.680000,0.000000
2,-0.278571,9.145600,0.989899,1549.70264,35.000000,31.021429,0.285714,0.393571,0.358007,-9.868649e-17,...,0.000000,-3.500000,3.749987,-1.004916,247.20764,0.000000,5.000000,0.000000,7.680000,0.000000
3,-0.654545,6.498637,-0.928173,1191.26864,89.090909,9.090909,0.000000,2.789091,0.180587,-1.000000e+00,...,0.915322,-0.165217,8.204080,0.775477,12428.90994,75.391304,39.674087,0.104348,1.387478,0.917896
4,-0.905556,3.826102,-2.011701,2114.20904,59.444444,46.888889,0.222222,2.469444,0.503837,-5.555556e-02,...,0.913088,-0.332000,5.815560,-1.268049,13760.24904,69.360000,41.116080,0.136000,1.558400,0.927044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17913,0.003571,4.493927,-1.976250,3297.73224,97.500000,24.653571,0.214286,0.968571,0.670091,9.555556e-01,...,0.000000,-3.500000,3.749987,-1.004916,247.20764,0.000000,5.000000,0.000000,7.680000,0.000000
17914,-0.042857,9.171628,0.990631,1580.71964,49.285714,15.772143,0.285714,1.287857,0.354747,-1.950000e+00,...,0.924874,-0.138017,9.055469,2.771024,13364.99324,70.909091,40.834711,0.140496,1.389091,0.917830
17915,-0.541176,9.171063,0.990801,1827.97384,45.882353,50.164706,0.176471,1.893529,0.466490,-1.720000e+00,...,0.919476,-0.195935,8.649007,1.731078,13341.90814,72.845528,5.977236,0.130081,1.053008,0.924520
17916,-0.959091,5.327216,-1.940796,2635.75994,30.909091,65.222727,0.272727,2.305455,0.579176,-1.633333e+00,...,0.918550,-0.440310,6.777655,-1.200915,14517.08294,64.186047,47.575969,0.155039,1.661705,0.926492
