In [1]:
import pandas as pd
import random

In [2]:
import os, sys
# Extend the module search path with ../../etc/ (resolved against the current
# working directory) before importing `config`; presumably that is where
# config.py lives — confirm. This relies on the kernel's working directory
# being this notebook's folder.
sys.path.append(os.path.abspath("../../etc/"))
import config

In [3]:
# Load the CoV-AbDab CSV snapshot; `df` is the table that the synthetic
# negatives below are sampled from and later concatenated with.
df = pd.read_csv('../../data/CoV-AbDab_080224.csv')

In [4]:
def generate_hard_negatives(
    df,
    target_count,
    binding_col=config.BINDING_YES,
    neutral_col=config.NEUTRAL_YES,
    seed=None,
):
    """
    Create synthetic "hard negative" rows by shuffling the CDR3 loops of known binders.

    Rows of `df` whose `binding_col` equals `config.TARGET` form the binder pool.
    `target_count` rows are sampled from that pool, their 'CDRH3' (and 'CDRL3',
    when that column exists) strings are randomly permuted, and the copies are
    relabelled as non-binding / non-neutralising. 'ND', NaN and values shorter
    than two characters are left untouched so the Nanobody vs Antibody logic
    that depends on them is preserved.

    Parameters:
        df: pandas DataFrame containing `binding_col` and a 'CDRH3' column.
        target_count: number of synthetic rows to produce. Sampling is done with
            replacement only when the binder pool is smaller than this number.
        binding_col: column holding the positive binding label; set to None on
            the returned rows.
        neutral_col: column holding the positive neutralisation label; set to
            None on the returned rows.
        seed: optional integer. When given, both the row sampling and the
            shuffling are reproducible. When None (default) the global pandas /
            `random` state is used, exactly as before.

    Returns:
        A new DataFrame of `target_count` rows (index labels of the sampled rows
        are kept) with `config.NOT_BINDING` and `config.NOT_NEUTRAL` set to
        `config.TARGET`, `binding_col` / `neutral_col` set to None, and a
        'data_source' column equal to 'synthetic_negative'. `df` is not modified.

    Raises:
        ValueError: if `target_count` is positive but no row of `df` matches
            `config.TARGET` in `binding_col`.
    """
    # Upper bound on re-draws when a shuffle happens to reproduce the input.
    max_reshuffles = 10

    # Sample from existing binders (Class 1) to ensure realistic scaffolds.
    binder_pool = df[df[binding_col] == config.TARGET]
    if target_count > 0 and binder_pool.empty:
        raise ValueError(
            f"No rows with {binding_col!r} == {config.TARGET!r}; cannot generate hard negatives."
        )

    # The replacement decision must be based on the pool that is actually
    # sampled, not on the full frame: the pool can be much smaller than `df`.
    positives = binder_pool.sample(
        n=target_count,
        replace=target_count > len(binder_pool),
        random_state=seed,
    )
    negatives = positives.copy()

    # A private generator keeps seeded runs reproducible without touching the
    # global `random` state; without a seed, fall back to the module itself.
    rng = random.Random(seed) if seed is not None else random

    def safe_shuffle(seq):
        # Return as is if it's ND, NaN, or too short to shuffle meaningfully.
        if pd.isna(seq) or str(seq).upper() == 'ND' or len(str(seq)) < 2:
            return seq

        original = str(seq)
        residues = list(original)

        # With a single distinct character every permutation is identical.
        if len(set(residues)) < 2:
            return original

        # Re-draw a few times so the decoy does not end up identical to the
        # real loop it was derived from (which would be a mislabelled positive).
        shuffled = original
        for _ in range(max_reshuffles):
            rng.shuffle(residues)
            shuffled = "".join(residues)
            if shuffled != original:
                break
        return shuffled

    # Apply shuffling only to the CDR loops:
    # shuffle the 'fingers' but keep the framework intact.
    # NOTE(review): only the CDRH3 / CDRL3 columns are rewritten; any
    # full-length sequence columns in `df` are copied unchanged and so
    # presumably still contain the unshuffled loops — confirm downstream
    # features for synthetic rows are derived from CDRH3 / CDRL3 only.
    negatives['CDRH3'] = negatives['CDRH3'].apply(safe_shuffle)

    if 'CDRL3' in negatives.columns:
        negatives['CDRL3'] = negatives['CDRL3'].apply(safe_shuffle)

    # Mark as non-binding and non-neutralizing.
    negatives[config.NOT_BINDING] = config.TARGET
    negatives[config.NOT_NEUTRAL] = config.TARGET
    negatives[binding_col] = None
    negatives[neutral_col] = None

    # Optional: tag them so they can be tracked in EDA.
    negatives['data_source'] = 'synthetic_negative'

    return negatives


# Build 5000 shuffled-CDR decoy rows from the loaded table.
df_synthetic = generate_hard_negatives(df, target_count=5000)

# Stack the decoys under the original rows and renumber the index 0..N-1,
# discarding the (possibly duplicated) labels carried over from sampling.
df_balanced = pd.concat([df, df_synthetic], axis=0, ignore_index=True)

In [5]:
# Persist the combined (original + synthetic) table to disk.
# NOTE(review): the DataFrame index is written as an unnamed first column
# (no index=False); this is presumably the origin of the "Unnamed: 0" column
# visible in the .info() output further down — confirm that no later notebook
# relies on it before changing.
# NOTE(review): the input is read from ../../data/ but this writes to ./data/
# — confirm the target directory is intended and exists.
df_balanced.to_csv("./data/raw_transformation_00.csv")

In [6]:
# Execute the 01_label_processor notebook from this kernel via IPython's %run.
# Presumably it consumes the CSV written in the previous cell — confirm.
%run "01_label_processor.ipynb"

In [7]:
# Execute the 02_naive_processor notebook; the DataFrame .info() summary shown
# below this cell is printed during that run.
%run "02_naive_processor.ipynb"

<class 'pandas.core.frame.DataFrame'>
Index: 17918 entries, 0 to 17917
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Unnamed: 0           17918 non-null  int64 
 1   Name                 17918 non-null  object
 2   Ab or Nb             17918 non-null  object
 3   Binds to             12918 non-null  object
 4   Doesn't Bind to      8201 non-null   object
 5   Neutralising Vs      6373 non-null   object
 6   Not Neutralising Vs  10482 non-null  object
 7   Protein + Epitope    17913 non-null  object
 8   Origin               17910 non-null  object
 9   VHorVHH              17918 non-null  object
 10  VL                   16671 non-null  object
 11  Heavy V Gene         17918 non-null  object
 12  Heavy J Gene         17918 non-null  object
 13  Light V Gene         16671 non-null  object
 14  Light J Gene         16671 non-null  object
 15  CDRH3                17918 non-null  object
 16  CDRL3    

In [8]:
# Execute the 03_motif_processor notebook from this kernel via IPython's %run.
%run "03_motif_processor.ipynb"

In [9]:
# Execute the 04_biochemical_processor notebook; the "Skipping sequence ..."
# messages below are printed during that run for sequences containing
# characters it reports as invalid (e.g. 'B', 'X').
%run "04_biochemical_processor.ipynb"

Skipping sequence HTTPSWWWRCSBORGSTRUCTUREMJHHTTPSWWWRCSBORGSTRUCTUREMJI due to invalid character: 'B'
Skipping sequence SYELTQPPSVSVSPGQTARITCSGDALPKQYAYWYQQKPGQAPVLVIYKDSERPSGIPERFSGSTSGTTVTLTISGVQAEDEADYHCQSADSSGTSRVFGGXGPS due to invalid character: 'X'
Skipping sequence EVQLVESGGGLVQPGRSLRLSCAASGFTFGDYAMHWVRQAPGKGLEWVSGINWNGHSIAYADSVKGRFTISRENAKXSLYLQMNSLRAEDTAFYYCAKDTAAGYGDYVHYWGQGALVTVSS due to invalid character: 'X'
Skipping sequence EVQLVESGGGLVQPGRSLRLSCAASGFTFGDYAMHWVRQAPGKGLEWVSGINWNGHSIAYADSVKGRFTISRENAKXSLYLQMNSLRAEDTAFYYCAKDTAAGYGDYVHYWGQGALVTVSS due to invalid character: 'X'
