In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the ten training shards (train_0.csv .. train_9.csv) into a single
# frame and keep only the rows that have a `sasa` value.
#
# ignore_index=True: every shard comes back from read_csv with its own
# 0..n-1 index, so a plain concat would leave duplicate row labels in `df`.
# The dropna is chained (not inplace) so re-running this cell always starts
# from the freshly read shards.
shard_frames = [pd.read_csv(f"../nos_data/source_data/train_{i}.csv") for i in range(10)]
df = (
    pd.concat(shard_frames, ignore_index=True)
    .dropna(subset=['sasa'])
    .reset_index(drop=True)
)

In [6]:
# Display the column labels of `df` to see which fields are available.
df.columns

Index(['seq', 'length', 'molecular_weight', 'aromaticity', 'instability_index',
       'isoelectric_point', 'avg_flexibility', 'gravy', 'ss_perc_helix',
       'ss_perc_turn', 'ss_perc_sheet', 'aa_perc_A', 'aa_perc_C', 'aa_perc_D',
       'aa_perc_E', 'aa_perc_F', 'aa_perc_G', 'aa_perc_H', 'aa_perc_I',
       'aa_perc_K', 'aa_perc_L', 'aa_perc_M', 'aa_perc_N', 'aa_perc_P',
       'aa_perc_Q', 'aa_perc_R', 'aa_perc_S', 'aa_perc_T', 'aa_perc_V',
       'aa_perc_W', 'aa_perc_Y', 'sasa', 'HeavyAA', 'LightAA', 'igfold_id',
       'HeavyAA_aligned', 'LightAA_aligned', 'full_seq'],
      dtype='object')

In [7]:
# Pull the two reward columns and the two chain-sequence columns out of `df`.
sasa_rewards = df["sasa"]
ss_perc_sheet_rewards = df["ss_perc_sheet"]
heavy_sequences = df["HeavyAA"]
light_sequences = df["LightAA"]

In [8]:
# Start from an empty DataFrame.
# NOTE(review): no other visible cell reads `seq_reward_df` -- confirm it is still needed.
seq_reward_df = pd.DataFrame()

In [9]:
# Aliases (same objects, no copies) for the heavy-chain sequences and the two reward series.
sequences, sasa_reward, ss_perc_sheet_reward = (
    heavy_sequences,
    sasa_rewards,
    ss_perc_sheet_rewards,
)

In [4]:
# Load the heavy/light annotation table; the lookup cell below matches on its
# HC / LC columns and reads its H_CDR* / L_CDR* columns.
annotated_df = pd.read_csv("../nos_data/source_data/train_annotated_HL.csv")

In [11]:
# Display the column labels of the annotation table.
annotated_df.columns

Index(['Id', 'HC', 'LC', 'hmm_species_x', 'v_gene_heavy', 'v_identity_heavy',
       'j_gene_heavy', 'j_identity_heavy', 'H_FR1', 'H_CDR1', 'H_FR2',
       'H_CDR2', 'H_FR3', 'H_CDR3', 'H_FR4', 'hmm_species_y', 'v_gene_light',
       'v_identity_light', 'j_gene_light', 'j_identity_light', 'L_FR1',
       'L_CDR1', 'L_FR2', 'L_CDR2', 'L_FR3', 'L_CDR3', 'L_FR4'],
      dtype='object')

In [12]:
# Look up the CDR annotations for every (heavy, light) pair taken from `df`.
#
# Produces eight parallel lists -- heavy_chains, hcdr1..3, light_chains,
# lcdr1..3 -- with exactly one entry per input pair, in input order, so they
# stay aligned with the reward series built from the same `df` rows.
#
# Replaces a per-row boolean scan of `annotated_df` wrapped in a bare
# `except:`. That version appended the heavy chain *before* the lookup that
# could fail, leaving the lists with different lengths on any miss, and it
# reported nothing more than "error".
CDR_SOURCE_COLUMNS = ["H_CDR1", "H_CDR2", "H_CDR3", "L_CDR1", "L_CDR2", "L_CDR3"]

# Keep only the first annotation row per (HC, LC) pair -- the same row the
# old code selected with `.values[0]`.
annotation_lookup = annotated_df.drop_duplicates(subset=["HC", "LC"])[
    ["HC", "LC"] + CDR_SOURCE_COLUMNS
]

# list(...) discards the source index so the pairs get a clean 0..n-1 index.
chain_pairs = pd.DataFrame(
    {"HC": list(heavy_sequences), "LC": list(light_sequences)}
)

# A left merge keeps every input pair in its original order; pairs without an
# annotation get NaN in the CDR columns instead of being dropped.
paired_annotations = chain_pairs.merge(
    annotation_lookup,
    on=["HC", "LC"],
    how="left",
    validate="many_to_one",
    indicator=True,
)

unmatched_count = int((paired_annotations["_merge"] == "left_only").sum())
if unmatched_count:
    print(
        f"warning: {unmatched_count} of {len(paired_annotations)} (heavy, light) "
        "pairs have no row in annotated_df; their CDR fields are NaN"
    )

heavy_chains = paired_annotations["HC"].tolist()
hcdr1 = paired_annotations["H_CDR1"].tolist()
hcdr2 = paired_annotations["H_CDR2"].tolist()
hcdr3 = paired_annotations["H_CDR3"].tolist()

light_chains = paired_annotations["LC"].tolist()
lcdr1 = paired_annotations["L_CDR1"].tolist()
lcdr2 = paired_annotations["L_CDR2"].tolist()
lcdr3 = paired_annotations["L_CDR3"].tolist()

In [13]:

# Assemble the per-antibody table (chains, CDRs, raw and z-scored rewards),
# split it 90/10 into train/eval, draw a smaller training subset, and write
# all three to CSV in the current working directory.
SPLIT_SEED = 42          # seed for both the 90/10 split and the subsample
SUBSAMPLE_SIZE = 10000   # target row count for the reduced training file

sasa_values = np.array(sasa_rewards)
sheet_values = np.array(ss_perc_sheet_rewards)

# The chain/CDR lists and the reward arrays are combined by position, so a
# length mismatch means rows would be paired with the wrong rewards.
if len(heavy_chains) != len(sasa_values):
    raise ValueError(
        f"{len(heavy_chains)} annotated sequences but {len(sasa_values)} reward "
        "values; the CDR lookup and the reward series are out of sync"
    )

all_data = pd.DataFrame(
    {
        "HC": heavy_chains,
        "HCDR1": hcdr1,
        "HCDR2": hcdr2,
        "HCDR3": hcdr3,
        "LC": light_chains,
        "LCDR1": lcdr1,
        "LCDR2": lcdr2,
        "LCDR3": lcdr3,
        "sasa": sasa_values,
        "ss_perc_sheet": sheet_values,
    }
)

# Z-score each reward with the population std (np.std, ddof=0).
# NOTE(review): mean/std are taken over all rows, including the ones held out
# for eval below -- confirm that is intended.
all_data["norm_sasa"] = (sasa_values - np.mean(sasa_values)) / np.std(sasa_values)
all_data["norm_ss_perc_sheet"] = (sheet_values - np.mean(sheet_values)) / np.std(sheet_values)

training_df = all_data.sample(frac=0.9, random_state=SPLIT_SEED)
test_df = all_data.drop(training_df.index)

# Seeded so the reduced file is reproducible, and capped so the draw does not
# raise when fewer than SUBSAMPLE_SIZE training rows exist.
training_df_10k = training_df.sample(
    n=min(SUBSAMPLE_SIZE, len(training_df)), random_state=SPLIT_SEED
)

training_df.to_csv("training_data.csv", index=False)
training_df_10k.to_csv("training_data_10k.csv", index=False)

test_df.to_csv("eval_data.csv", index=False)
