In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
from enum import Enum
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from itertools import product
from dotenv import load_dotenv

import os, sys
sys.path.append(os.path.abspath("../../etc/"))
import config

In [None]:
df_raw: pd.DataFrame = pd.read_csv('./data/raw_transformation_00.csv')
df_processed = pd.DataFrame()

In [3]:
RAW_FEATURES_SEQUENCE = config.EXTRACTABLE_BIOSEQUENCE_FEATURES

def data_cleaning(df: pd.DataFrame):
    df = df.drop(columns=config.IGNORED_FEATURES)
    has_existing_data = lambda df, col : df[df[col].notna() & (df[col]) != 'ND']
    for bio in RAW_FEATURES_SEQUENCE:
        df = has_existing_data(df, bio)
    return df

df_raw = data_cleaning(df_raw)

In [4]:
df_raw.head()

Unnamed: 0,Name,Ab or Nb,Binds to,Doesn't Bind to,Neutralising Vs,Not Neutralising Vs,Protein + Epitope,Origin,VHorVHH,VL,Heavy V Gene,Heavy J Gene,Light V Gene,Light J Gene,CDRH3,CDRL3
0,Curtis_3548_S-2,Ab,SARS-CoV2_WT;SARS-CoV2_Beta,SARS-CoV2_Omicron-BA1;HKU1,SARS-CoV2_WT (weak),,S; RBD/non-RBD,B-cells; SARS-CoV2 Human Patient,ND,ND,IGHV4-31 (Human),IGHJ4 (Human),ND,ND,ARGSRNDLRDFDY,QSYNSSLSGLVV
1,Curtis_3548_S-7,Ab,SARS-CoV2_WT;SARS-CoV2_Beta,SARS-CoV2_Omicron-BA1;HKU1,,SARS-CoV2_WT,S; non-RBD,B-cells; SARS-CoV2 Human Patient,ND,ND,IGHV4-34 (Human),IGHJ4 (Human),ND,ND,AREPYSSGMGGRDY,QQYGSSPYT
2,Curtis_3548_RBD-15,Ab,SARS-CoV2_WT,SARS-CoV2_Beta;SARS-CoV2_Omicron-BA1;HKU1,,SARS-CoV2_WT,S; iso-RBD,B-cells; SARS-CoV2 Human Patient,ND,ND,IGHV4-59 (Human),IGHJ5 (Human),ND,ND,AKGIYSSSSYWFGP,QAWDSSTVV
3,8-D9,Ab,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,S; RBD,B-cells; SARS-CoV2 Human Vaccinee (BBIBP-CorV),VQLVESGGGLVQPGGSLRLSCAASGLTVSSNYMNWVRQAPGKGLEW...,DIQMTQSPSFLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKL...,IGHV3-66 (Human),IGHJ3 (Human),IGKV1-9 (Human),IGKJ3 (Human),ARDHSGHALDI,QHLNSYPSMYT
4,Sun_1G11,Ab,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,SARS-CoV2_Omicron-BQ1;SARS-CoV2_Omicron-BQ1.1;...,S; RBD,B-cells; SARS-CoV2 Human Patient,EVQLVESGGGLVQPGRSLRLSCAASGFKFDDYAMHWVRQAPGKGLE...,DIQLTQSPSFLSASVGDRVTITCRASQGIDKYLAWYQQKPGQAPKV...,IGHV3-9 (Human),IGHJ4 (Human),IGKV1-9 (Human),IGKJ3 (Human),VKDSNYDSSGYLINNFDY,QQLYTFPVT


In [None]:
NEUTRAL_VS = config.NEUTRAL_YES or ""
NOT_NEUTRAL = config.NOT_NEUTRAL or ""
BINDING_VS = config.BINDING_YES or ""
NOT_BINDING = config.NOT_BINDING or ""

TARGET_NAME = config.TARGET or ""
NON_TARGETS_SARS_CoV2_WT = config.NON_TARGETS or ""

In [6]:

def label_neutralization(row, name: str):
   return 1 if one_hot_encode(row, NEUTRAL_VS, name) else (0 if one_hot_encode(row, NOT_NEUTRAL, name) else 2)

def label_binding(row, name: str):
    return 1 if one_hot_encode(row, BINDING_VS, name) else (0 if one_hot_encode(row, NOT_BINDING, name) else 2)

def label_antibody(row, name: str):
    return 1 if one_hot_encode(row, 'Ab or Nb', name) else 0
    
def get_list_from_cell(cell):
    """Safely converts semicolon-delimited strings to a set of strains."""
    if pd.isna(cell) or cell == '':
        return set()
    return set(str(cell).split(';'))

def one_hot_encode(row, col, name): 
    if pd.isna(row[col]) or row[col] == '':
        return False
    return name in str(row[col])

def label_binding_advanced(row):
    binds_set = get_list_from_cell(row[BINDING_VS])
    not_binds_set = get_list_from_cell(row[NOT_BINDING])
    
    # 1. PRIMARY RULE: Explicit Binder
    if TARGET_NAME in binds_set:
        return 1
    
    # 2. PRIMARY RULE: Explicit Non-Binder
    if TARGET_NAME in not_binds_set:
        return 0
    
    # 3. SECONDARY RULE: Hard Negatives
    # If it binds to MERS, OC43, etc., but NOT Wuhan, we treat it as 0
    if binds_set.intersection(NON_TARGETS_SARS_CoV2_WT):
        return 0
        
    return 2

df_processed[config.BINDING_TARGET] = df_raw.apply(label_binding_advanced, axis=1)
df_processed[config.NEUTRAL_TARGET] = df_raw.apply(lambda row: label_neutralization(row, TARGET_NAME), axis=1)
df_processed['is_nanobody'] = df_raw.apply(lambda row: label_antibody(row, 'Nb'), axis=1)
df_processed['name'] = df_raw['Name']

In [7]:
OUTFILE_PATH_LABELS = "./data/01_binding_labels.csv"
OUTFILE_PATH_RAW = "./data/raw_transformation_01.csv"

df_processed.to_csv(OUTFILE_PATH_LABELS, index=True)
df_raw.to_csv(OUTFILE_PATH_RAW, index=True)

In [8]:
df_processed['is_binding_SARS-CoV2_WT'].value_counts()

is_binding_SARS-CoV2_WT
1    11267
2     1022
0      629
Name: count, dtype: int64

In [9]:
df_processed['is_neutral_SARS-CoV2_WT'].value_counts()

is_neutral_SARS-CoV2_WT
1    5563
2    4296
0    3059
Name: count, dtype: int64

In [10]:
df_raw['Doesn\'t Bind to'].value_counts()

Doesn't Bind to
OC43;HKU1                                                   1051
SARS-CoV1                                                    759
SARS-CoV2_WT;SARS-CoV1;Pangolin-GD                           300
SARS-CoV2_WT                                                  93
229E;HKU1;NL63;OC43                                           70
                                                            ... 
SARS-CoV2_Beta;SARS-CoV2_Gamma;SARS-CoV1                       1
SARS-CoV2_Alpha;SARS-CoV2_Beta;SARS-CoV1                       1
SARS-CoV2_Alpha;SARS-CoV2_Beta;SARS-CoV2_Delta;SARS-CoV1       1
SARS-CoV2_Delta;SARS-CoV1                                      1
SARS-CoV2_Beta;SARS-CoV2_Gamma;SARS-CoV2_Delta;SARS-CoV1       1
Name: count, Length: 182, dtype: int64