## **Split Datasets into Training-Validation-Testing Fractions Based on Clusters**

Because the same gene/protein can have both damaging and benign mutations, we need to split the dataset blindly with respect to the effect of the mutation. Using the full set of variants, we split into **training/validation/testing** through their clusters to prevent homologue cross-contamination between the respective sets.

In [66]:
import os
import pandas as pd
from IPython.display import display

def count_categories(df: pd.DataFrame):
   benign = 0
   pathogenic = 0
   
   for row in df.itertuples():
      effect = getattr(row, 'category')
      if effect == 'LB/B':
         benign += 1
      elif effect == 'LP/P':
         pathogenic += 1
   
   print('benign: ', benign)
   print('pathogenic: ', pathogenic)

ROOT_DIR = os.environ.get('PYTHONPATH')

INFILE_PATH = os.path.join(ROOT_DIR, 'synced_files/all_variants/all_variants_nr.csv')

INFILE_DF = pd.read_csv(INFILE_PATH, sep='\t')

display(INFILE_DF)
count_categories(INFILE_DF)

Unnamed: 0,uniprot,category,cluster,position,WT,Mut,source
0,P04217,LB/B,A0A1U7UUV9,52,H,R,humsavar
1,P04217,LB/B,A0A1U7UUV9,395,H,R,humsavar
2,Q9NQ94,LB/B,V9KAZ0,555,V,M,humsavar
3,Q9NQ94,LB/B,V9KAZ0,558,A,S,humsavar
4,P01023,LB/B,A0A091S656,704,R,H,humsavar
...,...,...,...,...,...,...,...
122951,Q9Y6X0,LP/P,W5KM97,874,D,G,uniprot_vep
122952,Q9Y6X9,LP/P,A0A226P8Q3,98,G,R,uniprot_vep
122953,Q9Y6X9,LP/P,A0A226P8Q3,319,R,H,uniprot_vep
122954,Q9Y6Y1,LP/P,K7DY74,955,R,W,uniprot_vep


benign:  74614
pathogenic:  48342


### **Get The Row Index of Every Cluster**

In [67]:
from collections import defaultdict

CLUSTER_IDXS = defaultdict(list)

for row in INFILE_DF.itertuples():
   
   cluster = getattr(row, 'cluster')
   row_idx = int(getattr(row, 'Index'))
   
   CLUSTER_IDXS[cluster].append(row_idx)
   

### **Get A List of All Cluster Seeds**

In [68]:
CLUSTER_SEEDS = list(CLUSTER_IDXS.keys())

print(len(CLUSTER_SEEDS))

11494


### **Split Clusters into Train/Validation/Test**

In [77]:
from sklearn.model_selection import train_test_split

def train_val_test(data, train: float, val: float, test: float):
   
   if train + val + test != 1:
      raise Exception('train + validation + test needs to add up tp 1')
   
   training, test_val = train_test_split(data, test_size=(1 - train))
   
   testSize = test/(val + test)
   
   validation, testing = train_test_split(test_val, test_size= testSize)
   
   return training, validation, testing

# clusters split into training, validation, and testing sets
training, validation, testing = train_val_test(CLUSTER_SEEDS, 0.7, 0.15, 0.15)

print(len(training))
print(len(validation))
print(len(testing))

8045
1724
1725


## **Get the Row Indices for Each Set and Extract into Separate Dataframes**

In [78]:
training_rows = [idx for indices in [CLUSTER_IDXS[seed] for seed in training] for idx in indices]
validation_rows = [idx for indices in [CLUSTER_IDXS[seed] for seed in validation] for idx in indices]
testing_rows = [idx for indices in [CLUSTER_IDXS[seed] for seed in testing] for idx in indices]

training_rows.sort()
validation_rows.sort()
testing_rows.sort()

training_df = INFILE_DF.iloc[training_rows]
validation_df = INFILE_DF.iloc[validation_rows]
testing_df = INFILE_DF.iloc[testing_rows]

# check how many LB/B and LP/P in each set   
display(training_df)
count_categories(training_df)
display(validation_df)
count_categories(validation_df)
display(testing_df)
count_categories(testing_df)

Unnamed: 0,uniprot,category,cluster,position,WT,Mut,source
0,P04217,LB/B,A0A1U7UUV9,52,H,R,humsavar
1,P04217,LB/B,A0A1U7UUV9,395,H,R,humsavar
4,P01023,LB/B,A0A091S656,704,R,H,humsavar
5,P01023,LB/B,A0A091S656,972,C,Y,humsavar
6,P01023,LB/B,A0A091S656,1000,I,V,humsavar
...,...,...,...,...,...,...,...
122938,Q9Y6K9,LP/P,A0A212D8P4,157,Q,P,uniprot_vep
122939,Q9Y6K9,LP/P,A0A212D8P4,166,E,V,uniprot_vep
122940,Q9Y6N7,LP/P,S7MVD6,176,P,S,uniprot_vep
122941,Q9Y6N7,LP/P,S7MVD6,1522,S,L,uniprot_vep


benign:  51372
pathogenic:  33184


Unnamed: 0,uniprot,category,cluster,position,WT,Mut,source
81,P80404,LP/P,A0A2J8S0A4,220,R,K,humsavar
82,P80404,LB/B,A0A2J8S0A4,56,Q,R,humsavar
413,Q8IZY2,LB/B,F7DJZ7,188,E,G,humsavar
414,Q8IZY2,LB/B,F7DJZ7,319,T,A,humsavar
415,Q8IZY2,LB/B,F7DJZ7,395,H,R,humsavar
...,...,...,...,...,...,...,...
122951,Q9Y6X0,LP/P,W5KM97,874,D,G,uniprot_vep
122952,Q9Y6X9,LP/P,A0A226P8Q3,98,G,R,uniprot_vep
122953,Q9Y6X9,LP/P,A0A226P8Q3,319,R,H,uniprot_vep
122954,Q9Y6Y1,LP/P,K7DY74,955,R,W,uniprot_vep


benign:  11882
pathogenic:  7563


Unnamed: 0,uniprot,category,cluster,position,WT,Mut,source
2,Q9NQ94,LB/B,V9KAZ0,555,V,M,humsavar
3,Q9NQ94,LB/B,V9KAZ0,558,A,S,humsavar
83,O95477,LB/B,N1PB02,399,V,A,humsavar
84,O95477,LP/P,N1PB02,587,R,W,humsavar
85,O95477,LP/P,N1PB02,590,W,S,humsavar
...,...,...,...,...,...,...,...
122852,Q9Y5U8,LP/P,A0A182P2E7,72,K,E,uniprot_vep
122942,Q9Y6N9,LP/P,A0A1S3PDH9,31,R,Q,uniprot_vep
122943,Q9Y6N9,LP/P,A0A1S3PDH9,104,G,R,uniprot_vep
122944,Q9Y6N9,LP/P,A0A1S3PDH9,149,E,K,uniprot_vep


benign:  11360
pathogenic:  7595


## **Reformat 'change' Column into `position`, `WT`, and `Mut`**

In [50]:
import re

AA_DICT_LTS = {'VAL':'V', 'ILE':'I', 'LEU':'L', 'GLU':'E', 'GLN':'Q',
'ASP':'D', 'ASN':'N', 'HIS':'H', 'TRP':'W', 'PHE':'F', 'TYR':'Y',
'ARG':'R', 'LYS':'K', 'SER':'S', 'THR':'T', 'MET':'M', 'ALA':'A',
'GLY':'G', 'PRO':'P', 'CYS':'C', 'SEC': 'U'}

cols_to_drop = ['gene_name', 'FTId', 'change', 'dbSNP']

def format_mutation(df: pd.DataFrame):
   
   new_cols = {
      'position': [],
      'WT': [],
      'Mut': []
   }
   
   for row in df.itertuples():
      
      mutation: str = getattr(row, 'change').replace('p.','')
      position = re.search(r'(\d+)',mutation).group(1)
      AA = mutation.split(position)
      WT = AA_DICT_LTS[AA[0].upper()]
      Mut = AA_DICT_LTS[AA[1].upper()]
      
      new_cols['position'].append(position)
      new_cols['WT'].append(WT)
      new_cols['Mut'].append(Mut)
      
   formatted_df = df.assign(**new_cols)
   
   return formatted_df

training_df = format_mutation(training_df).drop(columns=cols_to_drop).rename(columns={'AC': 'uniprot'})
validation_df = format_mutation(validation_df).drop(columns=cols_to_drop).rename(columns={'AC': 'uniprot'})
testing_df = format_mutation(testing_df).drop(columns=cols_to_drop).rename(columns={'AC': 'uniprot'})



AttributeError: 'Pandas' object has no attribute 'change'

In [65]:
# output to csv

OUTPUT_PATH = os.path.join(ROOT_DIR, 'synced_files/all_variants/train_test_split')

training_df.to_csv(OUTPUT_PATH + '/all_variants_training.csv', sep='\t', index=False)
validation_df.to_csv(OUTPUT_PATH + '/all_variants_validation.csv', sep='\t', index=False)
testing_df.to_csv(OUTPUT_PATH + '/all_variants_testing.csv', sep='\t', index=False)

In [None]:
# output