In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
from enum import Enum
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from itertools import product

import os, sys
sys.path.append(os.path.abspath("../../etc/"))
import config

In [2]:
# Load the raw table and the label table; in both files the first CSV column is the index.
df_raw, df_processed = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        './data/raw_transformation_01.csv',
        './data/01_binding_labels.csv',
    )
)

In [3]:
# Inspect column names, dtypes and non-null counts of the raw table.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12918 entries, 0 to 12917
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Name                 12918 non-null  object
 1   Ab or Nb             12918 non-null  object
 2   Binds to             12918 non-null  object
 3   Doesn't Bind to      3201 non-null   object
 4   Neutralising Vs      6373 non-null   object
 5   Not Neutralising Vs  5482 non-null   object
 6   Protein + Epitope    12914 non-null  object
 7   Origin               12913 non-null  object
 8   VHorVHH              12918 non-null  object
 9   VL                   12112 non-null  object
 10  Heavy V Gene         12918 non-null  object
 11  Heavy J Gene         12918 non-null  object
 12  Light V Gene         12112 non-null  object
 13  Light J Gene         12112 non-null  object
 14  CDRH3                12918 non-null  object
 15  CDRL3                12107 non-null  object
dtypes: object

In [4]:
def get_length(col):
    """Return the string length of every value in ``df_raw[col]`` as ints.

    Missing values have no length, so they are reported as 0.
    """
    char_counts = df_raw[col].str.len()
    return char_counts.fillna(0).astype(int)


def lowercase_col_names(col):
    """Lower-case a column name and replace its spaces with underscores."""
    lowered = str.lower(col)
    return lowered.replace(' ', '_')

# Symbols whose per-sequence frequency is computed further down; "" if the config value is falsy.
AMINO_ACIDS: str = config.AMINO_ACID_ALPHABETS or ""
# Names of the df_raw sequence columns to featurize; [] if the config value is falsy.
BIOLOGICAL_FEATURES: list[str] = config.EXTRACTABLE_BIOSEQUENCE_FEATURES or []


def get_naive_biosequence_information(df: pd.DataFrame, df_processed: pd.DataFrame, col: str,
                                      amino_acids=None) -> pd.DataFrame:
    """Append length and per-residue frequency features for one sequence column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the sequence strings in ``df[col]``. Missing values are
        treated as empty sequences.
    df_processed : pd.DataFrame
        Frame the new feature columns are appended to (column-wise concat, so
        rows are aligned on the index). It is not modified in place.
    col : str
        Name of the sequence column; also used as the prefix of the new columns.
    amino_acids : iterable of str, optional
        Symbols to compute frequencies for. Defaults to the module-level
        ``AMINO_ACIDS`` when omitted.

    Returns
    -------
    pd.DataFrame
        ``df_processed`` plus ``{col}_len`` and one
        ``{col}_amino_acid_percentage_{aa}`` column per symbol. Despite the
        name, these values are fractions (count / length), not percentages.
        Empty sequences get length 0 and a frequency of 0 for every symbol.

    Notes
    -----
    If ``df_processed`` already contains columns with these names (for example
    because the notebook cell was re-run), they are replaced rather than
    duplicated.
    """
    import re  # local import: only used to escape symbols for Series.str.count

    print(f"Processing {col}...")

    alphabet = AMINO_ACIDS if amino_acids is None else amino_acids

    sequences = df[col].fillna("")
    lengths = sequences.str.len()
    # Empty sequences have length 0; divide by 1 instead so the frequency is 0 rather than NaN.
    safe_lengths = lengths.replace(0, 1)

    new_cols = {f'{col}_len': lengths}
    for aa in alphabet:
        # Series.str.count interprets its argument as a regex, so escape the symbol
        # to make sure it is always counted literally.
        counts = sequences.str.count(re.escape(aa))
        new_cols[f'{col}_amino_acid_percentage_{aa}'] = counts / safe_lengths

    # Drop stale copies of the columns we are about to add so a repeated call
    # does not leave duplicate column names behind.
    stale = [name for name in new_cols if name in df_processed.columns]
    base = df_processed.drop(columns=stale)

    return pd.concat([base, pd.DataFrame(new_cols)], axis=1)

# Add length and amino-acid frequency columns for every configured sequence column.
# df_processed is rebound on each iteration, so the features accumulate across columns.
for seq in BIOLOGICAL_FEATURES:
    df_processed = get_naive_biosequence_information(df_raw, df_processed, seq)


Processing CDRH3...
Processing CDRL3...
Processing VL...
Processing VHorVHH...


In [5]:
# Summary statistics of the numeric columns after feature extraction.
df_processed.describe()

Unnamed: 0,is_binding_SARS-CoV2_WT,is_neutral_SARS-CoV2_WT,is_nanobody,CDRH3_len,CDRH3_amino_acid_percentage_A,CDRH3_amino_acid_percentage_C,CDRH3_amino_acid_percentage_D,CDRH3_amino_acid_percentage_E,CDRH3_amino_acid_percentage_F,CDRH3_amino_acid_percentage_G,...,VHorVHH_amino_acid_percentage_M,VHorVHH_amino_acid_percentage_N,VHorVHH_amino_acid_percentage_P,VHorVHH_amino_acid_percentage_Q,VHorVHH_amino_acid_percentage_R,VHorVHH_amino_acid_percentage_S,VHorVHH_amino_acid_percentage_T,VHorVHH_amino_acid_percentage_V,VHorVHH_amino_acid_percentage_W,VHorVHH_amino_acid_percentage_Y
count,12918.0,12918.0,12918.0,12918.0,12918.0,12918.0,12918.0,12918.0,12918.0,12918.0,...,12918.0,12918.0,12918.0,12918.0,12918.0,12918.0,12918.0,12918.0,12918.0,12918.0
mean,1.030423,1.095758,0.062007,15.822728,0.104839,0.009948,0.114407,0.023903,0.057473,0.11154,...,0.018768,0.049214,0.027759,0.048904,0.045653,0.11914,0.071675,0.081481,0.029617,0.060832
std,0.356217,0.748488,0.241177,3.93518,0.061707,0.030721,0.061182,0.038948,0.052016,0.071956,...,0.010148,0.097828,0.012796,0.015671,0.016626,0.033797,0.026962,0.022509,0.009736,0.020906
min,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,0.0,13.0,0.0625,0.0,0.071429,0.0,0.0,0.0625,...,0.015748,0.023077,0.016949,0.040984,0.035714,0.106557,0.056911,0.07377,0.02459,0.049587
50%,1.0,1.0,0.0,15.0,0.090909,0.0,0.111111,0.0,0.058824,0.105263,...,0.016807,0.031746,0.025,0.049587,0.048387,0.121951,0.070866,0.084034,0.03125,0.062016
75%,1.0,2.0,0.0,18.0,0.142857,0.0,0.153846,0.052632,0.083333,0.153846,...,0.024793,0.03937,0.033898,0.058824,0.057377,0.137931,0.088,0.094017,0.033613,0.07438
max,2.0,2.0,1.0,63.0,0.5,0.230769,0.5,0.272727,0.363636,0.444444,...,0.064516,0.5,0.081967,0.094828,0.100775,0.208333,0.184,0.140625,0.067797,0.132812


In [6]:
def get_one_hot_epitopes(df, column_name):
    """Encode a free-text epitope column as 0/1 indicator columns.

    Matching is done by case-insensitive substring search on ``df[column_name]``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the epitope annotation column.
    column_name : str
        Name of the column to encode.

    Returns
    -------
    pd.DataFrame
        Integer (0/1) frame indexed like ``df`` with the columns
        ``S_RBD``, ``S_NTD``, ``S_S2``, ``S_S1``, ``N_Protein``,
        ``Other_Spike`` and ``Unknown``:

        * ``S_*``: the value contains the corresponding token.
        * ``N_Protein``: the value contains ``N`` but no ``S``.
        * ``Other_Spike``: the value contains ``S`` but none of the flags above fired.
        * ``Unknown``: the value is missing, is one of the placeholders
          ``UNKNOWN`` / ``TBC`` / ``NAN``, or matched no other flag.

    Notes
    -----
    Missing and placeholder values set only ``Unknown``. Without that guard the
    stringified missing value ("NAN") and "UNKNOWN" would match the ``N`` test
    and be reported as ``N_Protein``.
    """
    raw_col = df[column_name]
    clean_col = raw_col.astype(str).str.upper()

    # Rows without a usable annotation; excluded from every substring test below.
    is_placeholder = raw_col.isna() | clean_col.isin(['UNKNOWN', 'TBC', 'NAN'])

    def has(token):
        # Literal (non-regex) substring test, never true for placeholder rows.
        return clean_col.str.contains(token, regex=False, na=False) & ~is_placeholder

    is_spike = has('S')

    # NOTE(review): substring matching is loose, e.g. a value containing "NON-RBD"
    # would still set S_RBD. Verify against the actual annotation vocabulary.
    mappings = {
        'S_RBD': has('RBD'),
        'S_NTD': has('NTD'),
        'S_S2':  has('S2'),
        'S_S1':  has('S1'),
        'N_Protein': has('N') & ~is_spike,
    }
    encoding_df = pd.DataFrame(mappings).astype(int)

    # Spike annotations that did not hit any of the specific flags above.
    hit_any_specific = encoding_df.any(axis=1)
    encoding_df['Other_Spike'] = (is_spike & ~hit_any_specific).astype(int)

    # Placeholder rows, plus anything that matched no flag at all (Other_Spike included).
    encoding_df['Unknown'] = (is_placeholder | ~encoding_df.any(axis=1)).astype(int)

    return encoding_df

# Build the epitope indicator columns from the raw 'Protein + Epitope' column.
one_hot_df = get_one_hot_epitopes(df_raw, 'Protein + Epitope')

In [10]:
# Attach the epitope indicator columns. Copies that are already present (e.g. because
# this cell was run before) are dropped first, so re-running the cell cannot leave
# duplicate column names in the frame or in the exported CSV.
df_processed = pd.concat(
    [df_processed.drop(columns=one_hot_df.columns, errors='ignore'), one_hot_df],
    axis=1,
)

OUTFILE_PATH = './data/02_naive_processed_features.csv'
# Write everything except the columns named by the config constants below and 'name'.
df_processed.drop(columns=[config.BINDING_TARGET,config.NEUTRAL_TARGET,config.IS_NANOBODY_COL,'name']).to_csv(OUTFILE_PATH)

In [9]:
# Display the assembled table (labels, sequence features and epitope flags).
df_processed

Unnamed: 0,is_binding_SARS-CoV2_WT,is_neutral_SARS-CoV2_WT,is_nanobody,name,CDRH3_len,CDRH3_amino_acid_percentage_A,CDRH3_amino_acid_percentage_C,CDRH3_amino_acid_percentage_D,CDRH3_amino_acid_percentage_E,CDRH3_amino_acid_percentage_F,...,VHorVHH_amino_acid_percentage_V,VHorVHH_amino_acid_percentage_W,VHorVHH_amino_acid_percentage_Y,S_RBD,S_NTD,S_S2,S_S1,N_Protein,Other_Spike,Unknown
0,1,1,0,Curtis_3548_S-2,13,0.076923,0.0,0.230769,0.000000,0.076923,...,0.000000,0.000000,0.000000,1,0,0,0,0,0,0
1,1,0,0,Curtis_3548_S-7,14,0.071429,0.0,0.071429,0.071429,0.000000,...,0.000000,0.000000,0.000000,1,0,0,0,0,0,0
2,1,0,0,Curtis_3548_RBD-15,14,0.071429,0.0,0.000000,0.000000,0.071429,...,0.000000,0.000000,0.000000,1,0,0,0,0,0,0
3,1,1,0,8-D9,11,0.181818,0.0,0.181818,0.000000,0.000000,...,0.095652,0.026087,0.052174,1,0,0,0,0,0,0
4,1,1,0,Sun_1G11,18,0.000000,0.0,0.166667,0.000000,0.055556,...,0.080000,0.032000,0.064000,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12913,1,0,0,BD55-6725,16,0.125000,0.0,0.125000,0.062500,0.125000,...,0.073171,0.024390,0.065041,1,0,0,0,0,0,0
12914,1,0,0,BD55-6726,13,0.153846,0.0,0.076923,0.000000,0.076923,...,0.091667,0.041667,0.041667,1,0,0,0,0,0,0
12915,1,1,0,BD55-6727,17,0.176471,0.0,0.058824,0.117647,0.058824,...,0.056452,0.024194,0.064516,1,0,0,0,0,0,0
12916,1,0,0,BD55-6728,14,0.071429,0.0,0.071429,0.142857,0.071429,...,0.074380,0.024793,0.057851,1,0,0,0,0,0,0
