## **Split Datasets into Training-Validation-Testing Fractions Based on Clusters**

Because the same gene/protein can carry both damaging and benign mutations, the dataset must be split without regard to the effect of each mutation. Using the full set of variants, we split into **training/validation/testing** sets by cluster, so that every variant belonging to a given cluster lands in the same set; this prevents homologue cross-contamination between the sets.

In [13]:
import os
import pandas as pd
from IPython.display import display

def count_categories(df: pd.DataFrame):
   """Print the number of benign and pathogenic variants in `df`.

   Rows whose `category` value is 'LB/B' are counted as benign and rows whose
   value is 'LP/P' as pathogenic; any other value is ignored.

   Args:
      df: table with a `category` column.

   Returns:
      None. The two counts are written to stdout.

   Raises:
      KeyError: if `df` has no `category` column.
   """
   labels = df['category']

   # vectorised comparison instead of a Python-level loop over every row
   benign = int((labels == 'LB/B').sum())
   pathogenic = int((labels == 'LP/P').sum())

   print('benign: ', benign)
   print('pathogenic: ', pathogenic)

# The project root is taken from the PYTHONPATH environment variable.
ROOT_DIR = os.environ.get('PYTHONPATH')

# Fail early with a readable message: without this check an unset variable
# yields None and os.path.join() raises an opaque TypeError.
if not ROOT_DIR:
   raise RuntimeError('PYTHONPATH is not set; it is used as the root directory for the input and output files')

INFILE_PATH = os.path.join(ROOT_DIR, 'synced_files/all_variants/all_variants_nr.csv')

# the file is tab-separated despite its .csv extension
INFILE_DF = pd.read_csv(INFILE_PATH, sep='\t')

display(INFILE_DF)
count_categories(INFILE_DF)

Unnamed: 0,uniprot,category,cluster,position,WT,Mut
0,P04217,LB/B,A0A1U7UUV9,52,H,R
1,P04217,LB/B,A0A1U7UUV9,395,H,R
2,Q9NQ94,LB/B,V9KAZ0,555,V,M
3,Q9NQ94,LB/B,V9KAZ0,558,A,S
4,P01023,LB/B,A0A091S656,704,R,H
...,...,...,...,...,...,...
133895,Q9Y6X0,LP/P,W5KM97,874,D,G
133896,Q9Y6X9,LP/P,A0A226P8Q3,98,G,R
133897,Q9Y6X9,LP/P,A0A226P8Q3,319,R,H
133898,Q9Y6Y1,LP/P,K7DY74,955,R,W


benign:  82813
pathogenic:  51087


### **Get The Row Index of Every Cluster**

In [14]:
from collections import defaultdict

# Map every cluster seed to the row positions of the variants it contains.
# Positions (0..n-1) are stored rather than index labels because the rows are
# later selected with `.iloc`, which is purely positional.
CLUSTER_IDXS = defaultdict(list)

for row_pos, cluster in enumerate(INFILE_DF['cluster']):
   CLUSTER_IDXS[cluster].append(row_pos)
   

### **Get A List of All Cluster Seeds**

In [15]:
# One entry per cluster, in the insertion order of CLUSTER_IDXS.
CLUSTER_SEEDS = [*CLUSTER_IDXS]

# number of distinct clusters
print(len(CLUSTER_SEEDS))

12734


### **Split Clusters into Train/Validation/Test**

In [23]:
from sklearn.model_selection import train_test_split

def train_val_test(data, train: float, val: float, test: float, random_state=None):
   """Randomly split `data` into training, validation and testing subsets.

   The split is done in two steps: `data` is first divided into a training
   part and a remainder, then the remainder is divided into validation and
   testing parts.

   Args:
      data: sequence to split (anything accepted by `train_test_split`).
      train: fraction of `data` assigned to the training subset.
      val: fraction of `data` assigned to the validation subset.
      test: fraction of `data` assigned to the testing subset.
      random_state: forwarded to both `train_test_split` calls. Leave as
         None for a different shuffle on every call, or pass an int to make
         the split reproducible.

   Returns:
      A `(training, validation, testing)` tuple.

   Raises:
      ValueError: if a fraction is not greater than 0 or the three fractions
         do not add up to 1.
   """
   import math

   # Compare with a tolerance: an exact `!= 1` test rejects valid inputs such
   # as 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999 in floating point.
   if not math.isclose(train + val + test, 1.0, rel_tol=0.0, abs_tol=1e-9):
      raise ValueError('train + validation + test needs to add up to 1')

   if min(train, val, test) <= 0:
      raise ValueError('train, validation and test fractions must all be greater than 0')

   training, remainder = train_test_split(data, test_size=(1 - train), random_state=random_state)

   # share of the remainder (validation + testing) that goes to testing
   test_share = test / (val + test)

   validation, testing = train_test_split(remainder, test_size=test_share, random_state=random_state)

   return training, validation, testing

# Partition the cluster seeds (not the individual variants) 70/15/15 into
# training, validation and testing sets.
training, validation, testing = train_val_test(CLUSTER_SEEDS, 0.7, 0.15, 0.15)

# number of clusters assigned to each set
for cluster_subset in (training, validation, testing):
   print(len(cluster_subset))

8913
1910
1911


## **Get the Row Indices for Each Set and Extract into Separate Dataframes**

In [24]:
def _rows_for_clusters(seeds):
   """Return the sorted row numbers of every variant whose cluster seed is in `seeds`."""
   return sorted(idx for seed in seeds for idx in CLUSTER_IDXS[seed])

training_rows = _rows_for_clusters(training)
validation_rows = _rows_for_clusters(validation)
testing_rows = _rows_for_clusters(testing)

# Sanity check: the three sets must partition the input table, i.e. every row
# is assigned to exactly one set.
assert len(training_rows) + len(validation_rows) + len(testing_rows) == len(INFILE_DF)
assert len(set(training_rows) | set(validation_rows) | set(testing_rows)) == len(INFILE_DF)

training_df = INFILE_DF.iloc[training_rows]
validation_df = INFILE_DF.iloc[validation_rows]
testing_df = INFILE_DF.iloc[testing_rows]

# check how many LB/B and LP/P in each set
for split_df in (training_df, validation_df, testing_df):
   display(split_df)
   count_categories(split_df)

Unnamed: 0,uniprot,category,cluster,position,WT,Mut
0,P04217,LB/B,A0A1U7UUV9,52,H,R
1,P04217,LB/B,A0A1U7UUV9,395,H,R
2,Q9NQ94,LB/B,V9KAZ0,555,V,M
3,Q9NQ94,LB/B,V9KAZ0,558,A,S
4,P01023,LB/B,A0A091S656,704,R,H
...,...,...,...,...,...,...
133895,Q9Y6X0,LP/P,W5KM97,874,D,G
133896,Q9Y6X9,LP/P,A0A226P8Q3,98,G,R
133897,Q9Y6X9,LP/P,A0A226P8Q3,319,R,H
133898,Q9Y6Y1,LP/P,K7DY74,955,R,W


benign:  58391
pathogenic:  36729


Unnamed: 0,uniprot,category,cluster,position,WT,Mut
38,Q5VUY0,LB/B,A0A2Y9SUQ5,252,F,C
39,Q5VUY0,LB/B,A0A2Y9SUQ5,104,P,S
40,Q5VUY0,LB/B,A0A2Y9SUQ5,128,L,M
41,Q5VUY0,LB/B,A0A2Y9SUQ5,186,R,W
42,Q5VUY0,LB/B,A0A2Y9SUQ5,307,M,I
...,...,...,...,...,...,...
133854,Q9Y6H8,LP/P,A0A1S3GWX6,188,N,S
133855,Q9Y6H8,LP/P,A0A1S3GWX6,206,F,I
133887,Q9Y6N9,LP/P,A0A1S3PDH9,104,G,R
133888,Q9Y6N9,LP/P,A0A1S3PDH9,149,E,K


benign:  11898
pathogenic:  7548


Unnamed: 0,uniprot,category,cluster,position,WT,Mut
33,Q86V21,LB/B,A0A2E9W6V8,118,I,V
34,Q86V21,LB/B,A0A2E9W6V8,470,A,V
56,Q16613,LB/B,A0A1U7T3C6,15,R,C
492,Q9NRK6,LB/B,M3W846,150,A,S
493,Q9NRK6,LB/B,M3W846,545,D,N
...,...,...,...,...,...,...
133810,Q9Y619,LP/P,L5MK72,70,A,L
133811,Q9Y619,LP/P,L5MK72,131,K,T
133812,Q9Y619,LP/P,L5MK72,220,G,R
133848,Q9Y6H1,LP/P,A0A1L8HG60,145,R,Q


benign:  12524
pathogenic:  6810


## **Reformat 'change' Column into `position`, `WT`, and `Mut`**

In [21]:
import re

# Three-letter amino-acid codes (upper case) mapped to their one-letter codes.
AA_DICT_LTS = {
   'VAL': 'V',
   'ILE': 'I',
   'LEU': 'L',
   'GLU': 'E',
   'GLN': 'Q',
   'ASP': 'D',
   'ASN': 'N',
   'HIS': 'H',
   'TRP': 'W',
   'PHE': 'F',
   'TYR': 'Y',
   'ARG': 'R',
   'LYS': 'K',
   'SER': 'S',
   'THR': 'T',
   'MET': 'M',
   'ALA': 'A',
   'GLY': 'G',
   'PRO': 'P',
   'CYS': 'C',
   'SEC': 'U',
}

# Columns dropped from each split after the mutation has been reformatted.
cols_to_drop = [
   'gene_name',
   'FTId',
   'change',
   'dbSNP',
]

def format_mutation(df: pd.DataFrame):
   """Return a copy of `df` with `position`, `WT` and `Mut` columns added.

   Each value of the `change` column has every 'p.' removed; the first run of
   digits is taken as the position, and the text before and after it is
   upper-cased and translated to a one-letter code through AA_DICT_LTS.

   Args:
      df: table with a `change` column.

   Returns:
      A new DataFrame with the extra columns; `position` holds the digits as
      a string. `df` itself is not modified.
   """
   positions, wild_types, mutants = [], [], []

   for record in df.itertuples():
      change: str = record.change.replace('p.','')
      digits = re.search(r'(\d+)', change).group(1)
      residues = change.split(digits)

      wt_letter = AA_DICT_LTS[residues[0].upper()]
      mut_letter = AA_DICT_LTS[residues[1].upper()]

      positions.append(digits)
      wild_types.append(wt_letter)
      mutants.append(mut_letter)

   return df.assign(position=positions, WT=wild_types, Mut=mutants)

def _reformat_split(split_df: pd.DataFrame) -> pd.DataFrame:
   """Replace a split's `change` column with `position`, `WT` and `Mut` columns.

   A split without a `change` column is returned unchanged. The table loaded
   at the top of this notebook already has `position`, `WT` and `Mut` and no
   `change` column, so without this guard the cell raises on a fresh run; the
   guard also makes re-running the cell harmless.
   """
   if 'change' not in split_df.columns:
      return split_df

   return (
      format_mutation(split_df)
      .drop(columns=cols_to_drop)
      .rename(columns={'AC': 'uniprot'})
   )

training_df = _reformat_split(training_df)
validation_df = _reformat_split(validation_df)
testing_df = _reformat_split(testing_df)



Unnamed: 0,uniprot,category,disease_name,cluster,position,WT,Mut
0,P04217,LB/B,-,A0A1U7UUV9,52,H,R
1,P04217,LB/B,-,A0A1U7UUV9,395,H,R
2,Q9NQ94,LB/B,-,V9KAZ0,555,V,M
3,Q9NQ94,LB/B,-,V9KAZ0,558,A,S
4,P01023,LB/B,-,A0A091S656,704,R,H
...,...,...,...,...,...,...,...
72158,Q6ZVL8,LB/B,-,Q6ZVL8,105,P,S
72159,Q6ZVL8,LB/B,-,Q6ZVL8,136,C,S
72160,Q8N402,LB/B,-,G3QPU9,97,P,L
72161,Q8N402,LB/B,-,G3QPU9,114,T,S


Unnamed: 0,uniprot,category,disease_name,cluster,position,WT,Mut
46,Q2M2I8,LB/B,-,E7F2L1,509,K,Q
47,Q2M2I8,LB/B,-,E7F2L1,59,I,V
48,Q2M2I8,LB/B,-,E7F2L1,533,Q,H
49,Q2M2I8,LB/B,-,E7F2L1,603,V,A
50,Q2M2I8,LB/B,-,E7F2L1,694,T,M
...,...,...,...,...,...,...,...
72091,Q8IWY8,LB/B,-,I3LVD4,104,R,G
72121,O95229,LB/B,-,A0A2I3MH88,4,A,S
72122,O95229,LB/B,-,A0A2I3MH88,187,R,G
72165,Q9N2K0,LB/B,-,Q5G5C9,81,V,L


Unnamed: 0,uniprot,category,disease_name,cluster,position,WT,Mut
29,Q9NRG9,LP/P,Achalasia-addisonianism-alacrima syndrome (AAA...,A0A1D5PWG4,15,Q,K
30,Q9NRG9,LP/P,Achalasia-addisonianism-alacrima syndrome (AAA...,A0A1D5PWG4,160,H,R
31,Q9NRG9,LP/P,Achalasia-addisonianism-alacrima syndrome (AAA...,A0A1D5PWG4,263,S,P
32,Q9NRG9,LB/B,-,A0A1D5PWG4,108,K,M
44,Q8N5Z0,LB/B,-,I3NB42,243,V,I
...,...,...,...,...,...,...,...
72152,Q6ZTK2,LB/B,-,A0A151N189,345,V,I
72153,Q6ZTK2,LB/B,-,A0A151N189,368,T,A
72154,Q6ZTK2,LB/B,-,A0A151N189,419,W,R
72163,Q96M66,LB/B,-,A0A0D9R751,37,R,H


In [25]:
# output to csv (tab-separated, without the index column)

OUTPUT_PATH = os.path.join(ROOT_DIR, 'synced_files/all_variants/train_test_split')

# to_csv() does not create missing directories, so make sure the target exists
os.makedirs(OUTPUT_PATH, exist_ok=True)

training_df.to_csv(os.path.join(OUTPUT_PATH, 'all_variants_training.csv'), sep='\t', index=False)
validation_df.to_csv(os.path.join(OUTPUT_PATH, 'all_variants_validation.csv'), sep='\t', index=False)
testing_df.to_csv(os.path.join(OUTPUT_PATH, 'all_variants_testing.csv'), sep='\t', index=False)

In [None]:
# output