In [9]:
import os
import pandas as pd
from IPython.display import display

HUMSAVAR_PATH = './humsavar_with_cluster.csv'
VEP_PATH = './vep_variants_combined_nr.csv'

HUMSAVAR_DF = pd.read_csv(HUMSAVAR_PATH, sep='\t')
VEP_DF = pd.read_csv(VEP_PATH, sep='\t')

display(HUMSAVAR_DF)
display(VEP_DF)

Unnamed: 0,gene_name,AC,FTId,change,category,dbSNP,disease_name,cluster
0,A1BG,P04217,VAR_018369,p.His52Arg,LB/B,rs893184,-,A0A1U7UUV9
1,A1BG,P04217,VAR_018370,p.His395Arg,LB/B,rs2241788,-,A0A1U7UUV9
2,A1CF,Q9NQ94,VAR_052201,p.Val555Met,LB/B,rs9073,-,V9KAZ0
3,A1CF,Q9NQ94,VAR_059821,p.Ala558Ser,LB/B,rs11817448,-,V9KAZ0
4,A2M,P01023,VAR_000012,p.Arg704His,LB/B,rs1800434,-,A0A091S656
...,...,...,...,...,...,...,...,...
72326,ZXDB,P98169,VAR_033005,p.Asp764Asn,LB/B,rs1057341,-,A0A1U7TCQ3
72327,ZXDB,P98169,VAR_033006,p.Thr791Arg,LB/B,rs1057343,-,A0A1U7TCQ3
72328,ZXDC,Q2QGD7,VAR_057464,p.Pro562Leu,LB/B,rs16837497,-,F6Z495
72329,ZYX,Q15942,VAR_034081,p.His223Leu,LB/B,rs11978404,-,H2RUI5


Unnamed: 0,uniprot,position,WT,Mut,cluster,category
0,A0A075B6H7,39,V,A,D2HJ94,LB/B
1,A0A075B6H7,55,T,S,D2HJ94,LB/B
2,A0A075B6H7,78,S,G,D2HJ94,LB/B
3,A0A075B6H8,27,T,I,A0A0H4LVB5,LB/B
4,A0A075B6I3,61,L,P,G1PYU6,LB/B
...,...,...,...,...,...,...
90078,Q9Y6X9,413,V,F,A0A226P8Q3,LP/P
90079,Q9Y6X9,431,A,V,A0A226P8Q3,LP/P
90080,Q9Y6Y1,955,R,W,K7DY74,LP/P
90081,Q9Y6Y1,1077,Y,C,K7DY74,LP/P


## STANDARDIZE HUMSAVAR COLUMNS

In [10]:
import re

AA_DICT_LTS = {'VAL':'V', 'ILE':'I', 'LEU':'L', 'GLU':'E', 'GLN':'Q',
'ASP':'D', 'ASN':'N', 'HIS':'H', 'TRP':'W', 'PHE':'F', 'TYR':'Y',
'ARG':'R', 'LYS':'K', 'SER':'S', 'THR':'T', 'MET':'M', 'ALA':'A',
'GLY':'G', 'PRO':'P', 'CYS':'C', 'SEC': 'U'}

cols_to_drop = ['gene_name','FTId', 'change', 'dbSNP', 'disease_name']

def format_mutation(df: pd.DataFrame):
   
   new_cols = {
      'position': [],
      'WT': [],
      'Mut': []
   }
   
   for row in df.itertuples():
      
      mutation: str = getattr(row, 'change').replace('p.','')
      position = re.search(r'(\d+)',mutation).group(1)
      AA = mutation.split(position)
      WT = AA_DICT_LTS[AA[0].upper()]
      Mut = AA_DICT_LTS[AA[1].upper()]
      
      new_cols['position'].append(position)
      new_cols['WT'].append(WT)
      new_cols['Mut'].append(Mut)
      
   formatted_df = df.assign(**new_cols)
   
   return formatted_df

HUMSAVAR_DF = format_mutation(HUMSAVAR_DF).drop(columns=cols_to_drop).rename(columns={'AC': 'uniprot'})

# remove uncertain significance
US_mask = HUMSAVAR_DF['category'] != 'US'

HUMSAVAR_DF = HUMSAVAR_DF[US_mask].reset_index().drop(columns=['index'])

display(HUMSAVAR_DF)

Unnamed: 0,uniprot,category,cluster,position,WT,Mut
0,P04217,LB/B,A0A1U7UUV9,52,H,R
1,P04217,LB/B,A0A1U7UUV9,395,H,R
2,Q9NQ94,LB/B,V9KAZ0,555,V,M
3,Q9NQ94,LB/B,V9KAZ0,558,A,S
4,P01023,LB/B,A0A091S656,704,R,H
...,...,...,...,...,...,...
63324,P98169,LB/B,A0A1U7TCQ3,760,N,S
63325,P98169,LB/B,A0A1U7TCQ3,764,D,N
63326,P98169,LB/B,A0A1U7TCQ3,791,T,R
63327,Q2QGD7,LB/B,F6Z495,562,P,L


## COMBINE DATAFRAMES AND REMOVE DUPLICATES

In [15]:
# mark records with their origin databases
HUMSAVAR_DF['source'] = 'humsavar'
VEP_DF['source'] = 'uniprot_vep'


COMBINED_DF = pd.concat([HUMSAVAR_DF, VEP_DF])

print('COMBINED DF')
display(COMBINED_DF)

encountered_dict = {}

def generate_key(tup):
   return getattr(tup, 'uniprot') + str(getattr(tup, 'position')) + getattr(tup, 'WT') + getattr(tup, 'Mut')

boolMask = []

for row in COMBINED_DF.itertuples():
   key = generate_key(row)
   try:
      if encountered_dict[key] == True:
         boolMask.append(False)
   except KeyError:
      encountered_dict[key] = True
      boolMask.append(True)

NR_COMBINED_DF = COMBINED_DF[boolMask]

def count_categories(df: pd.DataFrame):
   benign = 0
   pathogenic = 0
   
   for row in df.itertuples():
      effect = getattr(row, 'category')
      if effect == 'LB/B':
         benign += 1
      elif effect == 'LP/P':
         pathogenic += 1
   
   print('benign: ', benign)
   print('pathogenic: ', pathogenic)

print('NON-REDUNDANT COMBINED DF')
count_categories(NR_COMBINED_DF)
display(NR_COMBINED_DF)

COMBINED DF


Unnamed: 0,uniprot,category,cluster,position,WT,Mut,source
0,P04217,LB/B,A0A1U7UUV9,52,H,R,humsavar
1,P04217,LB/B,A0A1U7UUV9,395,H,R,humsavar
2,Q9NQ94,LB/B,V9KAZ0,555,V,M,humsavar
3,Q9NQ94,LB/B,V9KAZ0,558,A,S,humsavar
4,P01023,LB/B,A0A091S656,704,R,H,humsavar
...,...,...,...,...,...,...,...
90078,Q9Y6X9,LP/P,A0A226P8Q3,413,V,F,uniprot_vep
90079,Q9Y6X9,LP/P,A0A226P8Q3,431,A,V,uniprot_vep
90080,Q9Y6Y1,LP/P,K7DY74,955,R,W,uniprot_vep
90081,Q9Y6Y1,LP/P,K7DY74,1077,Y,C,uniprot_vep


NON-REDUNDANT COMBINED DF
benign:  74614
pathogenic:  48342


Unnamed: 0,uniprot,category,cluster,position,WT,Mut,source
0,P04217,LB/B,A0A1U7UUV9,52,H,R,humsavar
1,P04217,LB/B,A0A1U7UUV9,395,H,R,humsavar
2,Q9NQ94,LB/B,V9KAZ0,555,V,M,humsavar
3,Q9NQ94,LB/B,V9KAZ0,558,A,S,humsavar
4,P01023,LB/B,A0A091S656,704,R,H,humsavar
...,...,...,...,...,...,...,...
90070,Q9Y6X0,LP/P,W5KM97,874,D,G,uniprot_vep
90073,Q9Y6X9,LP/P,A0A226P8Q3,98,G,R,uniprot_vep
90075,Q9Y6X9,LP/P,A0A226P8Q3,319,R,H,uniprot_vep
90080,Q9Y6Y1,LP/P,K7DY74,955,R,W,uniprot_vep


## OUTPUT TO CSV

In [14]:
NR_COMBINED_DF.to_csv('./all_variants_nr.csv', sep='\t', index=False)