In [180]:
import pandas as pd
import numpy as np
import os
from lxml import etree
import xml.etree.ElementTree as ET
import time
import datetime

In [101]:
aatranlation = {'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', 'Cys': 'C',
                'Glu': 'E', 'Gln': 'Q', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
                'Leu': 'L', 'Lys': 'K', 'Met': 'M', 'Phe': 'F', 'Pro': 'P',
                'Ser': 'S', 'Thr': 'T', 'Trp': 'W', 'Tyr': 'Y', 'Val': 'V'}

In [102]:
def readHumanGenes(path):
    human_genes = []
    with open(path, 'r') as filehandle:
        human_genes = filehandle.read().splitlines()
    return human_genes

In [104]:
human_genes = readHumanGenes('../data/UniProtHumanEnzymeGenes.txt')
print(f'{len(human_genes)} number of genes are imported')

4312 number of genes are imported


In [160]:
class variationHandler(object):
    def __init__(self, enzyme_genes):
        self.dictlist = {'interpretation': [], 'gene':[], 'gene_name':[], 'accession':[], 'mutation': [], 'NP': [], 'Chr': [], 'start':[], 'stop':[], 'referenceAllele':[], 'alternateAllele':[]}
        self.enzyme_genes = enzyme_genes
        self.unnecessary_types = ('inversion', 'copy number gain', 'tandem duplication', 'microsatellite', 'copy number loss', 'distinct chromosomes', 'fusion', 'complex', 'duplication', 'translocation')
        self.is_type = False
        self.gene = ""
        self.gene_name = ""
        self.accession = ""
        self.mutation = ""
        self.np_num = ""
        self.change = ""
        self.chr = ""
        self.start_num = ""
        self.stop_num = ""
        self.referenceAllele = ""
        self.alternateAllele = ""
        self.is_GeneList = False
        self.ct_gene = 0
        self.check_grch = False
        self.ct_np = 0
        self.ct_mc = 0  # counter for Molecular Consequence tag
        self.is_haplotype = False  # check if a variation is haplotype or not
        self.is_genotype = False  # check if a variation is genotype or not
        self.is_missense = False
        self.is_conflicting = False
        self.is_not_provided = False
        self.is_interpretations = False
        self.is_interpretation = False
        self.is_description = False
        self.is_desc_hist = False
        self.intpn = []
        self.ct = 0
        self.ct_missense = 0
        self.ct_uncertain = 0
        self.ct_conflicting = 0
        self.ct_not_provided = 0
        
    def start(self, tag, attrs):
        if (tag == 'VariationArchive') and (attrs.get('VariationType').lower() not in self.unnecessary_types):
            self.is_type = True
            if attrs.get('VariationType').lower() == 'haplotype':
                self.is_haplotype = True
            if attrs.get('VariationType').lower() == 'compoundheterozygote':
                self.is_genotype = True
            self.accession = attrs.get('Accession')
        if self.is_type:
            if tag == 'GeneList':
                self.is_GeneList = True
            elif tag == 'Gene' and self.ct_gene == 0:
                self.gene = attrs.get('Symbol')
                self.gene_name = attrs.get('FullName')
                self.ct_gene += 1
            elif tag == 'SequenceLocation' and self.is_GeneList == False and self.check_grch == False:
                if attrs.get('Assembly') == 'GRCh38':
                    self.chr = attrs.get('Chr')
                    self.start_num = attrs.get('start')
                    self.stop_num = attrs.get('stop')
                    self.referenceAllele = attrs.get('referenceAlleleVCF')
                    self.alternateAllele = attrs.get('alternateAlleleVCF')
                    self.check_grch = True
            elif tag == 'ProteinExpression' and self.ct_np == 0:
                self.np_num = attrs.get('sequenceAccessionVersion')
                self.change = attrs.get('change') 
                if self.np_num and self.np_num.startswith('NP'):            
                    self.ct_np += 1
            elif tag == 'MolecularConsequence' and self.ct_mc == 0:
                if attrs.get('Type') and 'missense' in attrs.get('Type').lower():
                    self.is_missense = True
                    self.ct_missense += 1
                self.ct_mc += 1
            elif tag == 'Interpretations':
                self.is_interpretations = True
            elif tag == 'Interpretation':
                self.is_interpretation = True
            elif tag == 'Description':
                self.is_description = True
            elif tag == 'DescriptionHistory':
                self.is_desc_hist = True
            
    def end(self, tag):
        if (tag == 'VariationArchive' and self.is_type) or ((self.is_haplotype or self.is_genotype) and tag == 'SimpleAllele'):
            if len(self.intpn) == 1:
                interpretation = self.intpn[0].lower()
                if "uncertain" in interpretation:
                    self.is_uncertain = True
                    self.ct_uncertain += 1
                elif "conflicting" in interpretation:
                    self.is_conflicting = True
                    self.ct_conflicting += 1
                elif "not provided" in interpretation:
                    self.is_not_provided = True
                    self.ct_not_provided += 1
            elif len(self.intpn) > 1:
                print(f'Interpretaion: {self.intpn}, Accession: {self.accession}, count: {self.ct}')
            
            if (self.gene in self.enzyme_genes) and self.is_missense and (self.is_uncertain or self.is_conflicting or self.is_not_provided):
                try:
                    self.change = self.change.split('p.')[1]
                    before = aatranlation.get(self.change[0:3])
                    after = aatranlation.get(self.change[len(self.change) - 3:len(self.change)])
                except: 
                    before = None
                    after = None
                if before and after:  # check if both have a value in aa dict
                    num = self.change[3:len(self.change) - 3]
                    abbreviated_change = before + num + after
                    fasta = np.nan
                    self.dictlist['interpretation'].append(interpretation)
                    self.dictlist['gene'].append(self.gene)
                    self.dictlist['gene_name'].append(self.gene_name)
                    self.dictlist['accession'].append(self.accession)
                    self.dictlist['mutation'].append(abbreviated_change)
                    self.dictlist['NP'].append(self.np_num)
                    self.dictlist['Chr'].append(self.chr)
                    self.dictlist['start'].append(self.start_num)
                    self.dictlist['stop'].append(self.stop_num)
                    self.dictlist['referenceAllele'].append(self.referenceAllele)
                    self.dictlist['alternateAllele'].append(self.alternateAllele)
            self.ct_gene = 0             
            self.check_grch = False
            self.is_missense = False
            self.is_uncertain = False
            self.is_conflicting = False
            self.is_not_provided = False
            self.ct_np = 0
            self.ct_mc = 0
            self.intpn = []
        if self.is_type:
            if tag == 'GeneList':
                self.is_GeneList = False
            elif tag == 'Interpretations':
                self.is_interpretations = False
            elif tag == 'Interpretation':
                self.is_interpretaion = False
            elif tag == 'Description':
                self.is_description = False
            elif tag == 'DescriptionHistory':
                self.is_desc_hist = False
        if tag == 'VariationArchive':
            self.is_type = False
            self.is_haplotype = False
            self.is_genotype = False
            self.ct +=1
            if self.ct % 10000 == 0:
                print(f'counter: {self.ct}')
                
    def data(self, data):
        if self.is_interpretations and self.is_interpretation and self.is_description and (not self.is_desc_hist):
            self.intpn.append(data)
            
    def close(self):
        print(f"Variations: {self.ct}")
        print(f"Uncertain Significance: {self.ct_uncertain}")
        print(f"Conflicting Report: {self.ct_conflicting}")
        print(f"Not Provided: {self.ct_not_provided}")
        print(f"Missense: {self.ct_missense}")
        print('debug: the file is closed')
        return self.dictlist

In [161]:
# read xml file of variations from ClinVar
# return dataframe and write to a csv file
def readClinVarVariationsXML(input_path, output_path, gene_set):
    print('debug: start parcing')
    parser = etree.XMLParser(target=variationHandler(gene_set))
    data = etree.parse(input_path, parser)
    df = pd.DataFrame(data)
    df.to_csv(output_path, index = False, header = True)
    return df

In [153]:
subnode = '../data/subnode.xml'
temp_path = '../data/temp.csv'
df_temp = readClinVarVariationsXML(subnode, temp_path, human_genes)
df_temp.head()

debug: start parcing
Interpretaion: ['Likely pathogenic', 'Pathogenic3', 'Pathogenic4'], Accession: VCV000101229, count: 0
Variations: 1
Uncertain Significance: 0
Conflicting Report: 0
Not Provided: 0
Missense: 1
debug: the file is closed


Unnamed: 0,interpretation,gene,gene_name,accession,mutation,NP,Chr,start,stop,referenceAllele,alternateAllele


In [162]:
xmlfile = '../data/ClinVarVariationRelease_00-latest_weekly.xml'
out_path = '../data/MM_enzyme.csv'
df_0 = readClinVarVariationsXML(xmlfile, out_path, human_genes)
df_0.head()

debug: start parcing
counter: 10000
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000012173, count: 12712
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000025511, count: 16218
counter: 20000
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000132775, count: 27809
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000132776, count: 27810
counter: 30000
counter: 40000
counter: 50000
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000242384, count: 50634
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000242385, count: 50635
Interpretaion: ['no interpretation for the sin

Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000264658, count: 55788
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000264659, count: 55789
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000264660, count: 55790
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000264668, count: 55791
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000264669, count: 55792
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000264672, count: 55793
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000264675, count: 55794

Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000560198, count: 159068
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000560199, count: 159069
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000560201, count: 159070
counter: 160000
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000617633, count: 166649
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000617662, count: 166656
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000617663, count: 166657
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV0

counter: 350000
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000242472, count: 351712
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000242704, count: 351713
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000242708, count: 351714
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000242737, count: 351715
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000242746, count: 351716
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000242754, count: 351717
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV0

counter: 410000
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000518442, count: 410319
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000561162, count: 412008
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000624606, count: 412890
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000375326, count: 419587
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000375327, count: 419588
counter: 420000
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000631475, count: 421178
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'],

counter: 450000
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000627560, count: 452763
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000627561, count: 452764
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000635090, count: 453985
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000635092, count: 453986
Interpretaion: ['Likely benign', 'no interpretation for the single variant', 'Uncertain significance'], Accession: VCV000684732, count: 456824
Interpretaion: ['no interpretation for the single variant', 'Benign', 'Likely pathogenic'], Accession: VCV000684735, count: 456825
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000684769, coun

counter: 480000
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000264664, count: 481538
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000264674, count: 481857
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000267281, count: 481859
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000267282, count: 481860
counter: 490000
counter: 500000
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000402119, count: 500320
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000430836, count: 502518
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the s

Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000264663, count: 534792
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000264666, count: 534793
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000264667, count: 534794
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000268096, count: 534926
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000268097, count: 534927
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000393289, count: 539501
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000393292, count:

counter: 640000
Interpretaion: ['Uncertain significance', 'Pathogenic', 'Pathogenic'], Accession: VCV000549775, count: 646558
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000559943, count: 646561
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000559944, count: 646562
counter: 650000
counter: 660000
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000818230, count: 662195
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000818231, count: 662196
counter: 670000
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000242721, count: 671735
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], A

Interpretaion: ['Uncertain significance', 'Pathogenic'], Accession: VCV000549768, count: 682189
Interpretaion: ['Conflicting interpretations of pathogenicity', 'Pathogenic', 'Pathogenic'], Accession: VCV000549784, count: 682193
Interpretaion: ['Pathogenic', 'no interpretation for the single variant', 'no interpretation for the single variant', 'Pathogenic'], Accession: VCV000549785, count: 682194
Interpretaion: ['Pathogenic', 'Uncertain significance', 'Pathogenic'], Accession: VCV000549786, count: 682195
Interpretaion: ['Pathogenic', 'Likely benign', 'Uncertain significance'], Accession: VCV000549787, count: 682196
Interpretaion: ['Pathogenic', 'no interpretation for the single variant', 'Pathogenic'], Accession: VCV000549791, count: 682198
Interpretaion: ['Conflicting interpretations of pathogenicity', 'Conflicting interpretations of pathogenicity', 'Uncertain significance'], Accession: VCV000549799, count: 682204
Interpretaion: ['Pathogenic/Likely pathogenic', 'Pathogenic/Likely path

Interpretaion: ['no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000633943, count: 700716
Interpretaion: ['no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000633944, count: 700717
Interpretaion: ['no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000633945, count: 700718
Interpretaion: ['no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000633946, count: 700719
Interpretaion: ['no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000633947, count: 700720
Interpretaion: ['no inter

Interpretaion: ['no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634044, count: 700817
Interpretaion: ['no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634045, count: 700818
Interpretaion: ['no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'no interpretation for the single variant', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634046, count: 700819
Interpretaion: ['no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634047, count: 700820
Interpretaion: ['no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug respo

Interpretaion: ['no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634141, count: 700912
Interpretaion: ['no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634142, count: 700913
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant', 'no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634143, count: 700914
Interpretaion: ['drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634144, count: 700915
Interpretaion: ['drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634145, count: 700916
Interpretaion: ['drug res

Interpretaion: ['drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634243, count: 701014
Interpretaion: ['drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634244, count: 701015
Interpretaion: ['drug response', 'drug response', 'drug response'], Accession: VCV000634245, count: 701016
Interpretaion: ['drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634246, count: 701017
Interpretaion: ['drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634247, count: 701018
Interpretaion: ['drug response', 'drug response', 'drug response'], Accession: VCV000634248, count: 701019
Interpretaion: ['drug response', 'drug response', 'drug response'], Accession: VCV000634249, count: 701020
Interpretaion: ['drug response', 'drug response', 'drug response'], Accessi

Interpretaion: ['drug response', 'no interpretation for the single variant', 'no interpretation for the single variant', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634345, count: 701116
Interpretaion: ['no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634346, count: 701117
Interpretaion: ['drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634347, count: 701118
Interpretaion: ['no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634348, count: 701119
Interpretaion: ['no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634349, count: 701120
Interpretaion: ['drug res

Interpretaion: ['no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634403, count: 701750
Interpretaion: ['no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634404, count: 701751
Interpretaion: ['no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634405, count: 701752
Interpretaion: ['drug response', 'no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634406, count: 701753
Interpretaion: ['drug response', 'no interpretation for the single variant', 'drug response', 'no interpretation for the single variant', 'drug response', 'drug response'], Accession: VCV000634407, count

Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000691964, count: 702171
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000830334, count: 704371
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000830335, count: 704372
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000830336, count: 704373
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000830337, count: 704374
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000830338, count: 704375
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000830339, count:

Unnamed: 0,interpretation,gene,gene_name,accession,mutation,NP,Chr,start,stop,referenceAllele,alternateAllele
0,uncertain significance,HOGA1,4-hydroxy-2-oxoglutarate aldolase 1,VCV000000031,R97C,NP_612422.2,10,97598852,97598852,C,T
1,uncertain significance,AGA,aspartylglucosaminidase,VCV000000222,G60D,NP_000018.2,4,177440375,177440375,C,T
2,uncertain significance,DPYD,dihydropyrimidine dehydrogenase,VCV000000437,R886H,NP_000101.2,1,97098598,97098598,C,T
3,uncertain significance,PTS,6-pyruvoyltetrahydropterin synthase,VCV000000477,R16C,NP_000308.1,11,112226489,112226489,C,T
4,uncertain significance,PROC,"protein C, inactivator of coagulation factors ...",VCV000000661,P210L,NP_000303.1,2,127426178,127426178,C,T


In [141]:
class variationHandlerSpecific(object):
    def __init__(self, accession):
        self.is_accession = False
        self.accession = accession
        self.ct = 0
        print(self.accession)
        
    def start(self, tag, attrs):
        global WFILE
        if (tag == 'VariationArchive') and (attrs.get('Accession') == self.accession):
            self.is_accession = True
            print('The specific variation is found: ' + str(self.ct))
        if self.is_accession:
            if len(attrs.keys()) == 0:
                WFILE.write('<' + tag)
            else:   
                for i, t in enumerate(attrs.keys()):
                    if i == 0:
                        WFILE.write('<' + tag + ' ')
                    elif i != len(attrs.keys()) - 1:
                        WFILE.write(t + '="' + attrs.get(t) + '"' + ' ')
                    else:
                        WFILE.write(t + '="' + attrs.get(t) + '"')
            WFILE.write('>')
            
    def end(self, tag):
        global WFILE
        if self.is_accession:
            WFILE.write('</' + tag + '>')
        if tag == 'VariationArchive':
            if self.ct % 10000 == 0:
                print(self.ct)
            self.ct += 1
        if self.is_accession and tag == 'VariationArchive':
            self.is_accession = False
            WFILE.close()
            print('The subnode file is completed')
            
    def data(self, data):
        global WFILE
        if data is not None:
            if self.is_accession and data != "":
                WFILE.write(data)
            
    def close(self):
        print('The xml file is closed')

In [142]:
# read xml file of variations from ClinVar
# return dataframe and write to a csv file
def readClinVarVariationsXMLSpecific(input_path, accession):
    print('Start parcing')
    parser = etree.XMLParser(target=variationHandlerSpecific(accession))
    etree.parse(input_path, parser)

In [147]:
WFILE = open('../data/subnode.xml', 'w')
xmlfile = '../data/ClinVarVariationRelease_00-latest_weekly.xml'
accession = 'VCV000101229'
readClinVarVariationsXMLSpecific(xmlfile, accession)

Start parcing
VCV000101229
0
10000
20000
The specific variation is found: 26310
The subnode file is completed
30000
The xml file is closed


KeyboardInterrupt: 

In [165]:
# makes dictionary of fasta sequences and np number 
# returns the dictionary
def makeDictOfFasta(dictpath):
    fasta_dict = {}
    for root, d_names, file_names in os.walk(dictpath):
        for filename in file_names:
            fname = os.path.join(root, filename)
            with open(fname) as f:
                print('opened a fasta file')
                np_num = ''
                sequence = ''
                for line in f:
                    if line[0] == '>':
                        if sequence != '':
                            fasta_dict[np_num] = sequence
                            np_num = ''
                            sequence = ''                    
                        i = 1
                        while line[i] != ' ':
                            np_num += line[i]
                            i += 1
                    else:
                        line = line.strip('\n')
                        sequence += line
    print(f'length of fasta dictionary: {len(fasta_dict)}')
    return fasta_dict

In [174]:
df_0 = pd.read_csv('../data/MM_enzyme.csv')
df_0.head()

Unnamed: 0,interpretation,gene,gene_name,accession,mutation,NP,Chr,start,stop,referenceAllele,alternateAllele
0,uncertain significance,HOGA1,4-hydroxy-2-oxoglutarate aldolase 1,VCV000000031,R97C,NP_612422.2,10,97598852,97598852,C,T
1,uncertain significance,AGA,aspartylglucosaminidase,VCV000000222,G60D,NP_000018.2,4,177440375,177440375,C,T
2,uncertain significance,DPYD,dihydropyrimidine dehydrogenase,VCV000000437,R886H,NP_000101.2,1,97098598,97098598,C,T
3,uncertain significance,PTS,6-pyruvoyltetrahydropterin synthase,VCV000000477,R16C,NP_000308.1,11,112226489,112226489,C,T
4,uncertain significance,PROC,"protein C, inactivator of coagulation factors ...",VCV000000661,P210L,NP_000303.1,2,127426178,127426178,C,T


In [167]:
fasta_dict = makeDictOfFasta('../fasta_sequences/')

opened a fasta file
opened a fasta file
opened a fasta file
opened a fasta file
opened a fasta file
opened a fasta file
opened a fasta file
length of fasta dictionary: 113602


In [168]:
# crops fasta sequence
# returns the cropped sequnece with a specified range
def cropFASTA(sequence, location, reference, seqRange):
    if location - 1 < len(sequence) and sequence[location - 1] == reference:
        proteinSeq = sequence[0 if location - 1 - seqRange <= 0 else location - 1 - seqRange : location + seqRange]
        return proteinSeq
    else:
        return None

In [169]:
def addFASTAfromDict(fasta_dict, df):
    none_acc = []
    seq_list = []
    for index, row in df.iterrows():
        mutation = row['mutation']
        try:
            ref = mutation[0]
            location = int(mutation[1:len(mutation)-1])
            np_num = row['NP']  # specify the column of np 
            sequence = fasta_dict.get(np_num)
            seqRange = 10  # range of sequences to take
            seq = cropFASTA(sequence, location, ref, seqRange) if sequence else None
        except:
            seq = None
            accession = row['accession']
            none_acc.append(accession)
        seq_list.append(seq)
    df['FASTA'] = seq_list
    print(f'Unfound Sequences: {len(none_acc)} {none_acc}')
    return df

In [175]:
df_1 = addFASTAfromDict(fasta_dict, df_0)
df_1.to_csv('../data/MM_enzyme.csv', index = False, header = True)
df_1.head()

Unfound Sequences: 43 ['VCV000134028', 'VCV000134108', 'VCV000134670', 'VCV000288381', 'VCV000418418', 'VCV000474413', 'VCV000490097', 'VCV000503859', 'VCV000531064', 'VCV000541685', 'VCV000582159', 'VCV000583086', 'VCV000591262', 'VCV000591672', 'VCV000591831', 'VCV000594518', 'VCV000598475', 'VCV000524323', 'VCV000437115', 'VCV000418344', 'VCV000464734', 'VCV000566152', 'VCV000800863', 'VCV000133712', 'VCV000134623', 'VCV000537105', 'VCV000572650', 'VCV000625941', 'VCV000630885', 'VCV000650338', 'VCV000650666', 'VCV000658054', 'VCV000664737', 'VCV000411042', 'VCV000412448', 'VCV000181879', 'VCV000053180', 'VCV000428273', 'VCV000453687', 'VCV000481269', 'VCV000484704', 'VCV000589537', 'VCV000827598']


Unnamed: 0,interpretation,gene,gene_name,accession,mutation,NP,Chr,start,stop,referenceAllele,alternateAllele,FASTA
0,uncertain significance,HOGA1,4-hydroxy-2-oxoglutarate aldolase 1,VCV000000031,R97C,NP_612422.2,10,97598852,97598852,C,T,SERLEVVSRVRQAMPKNRLLL
1,uncertain significance,AGA,aspartylglucosaminidase,VCV000000222,G60D,NP_000018.2,4,177440375,177440375,C,T,GGSALDAVESGCAMCEREQCD
2,uncertain significance,DPYD,dihydropyrimidine dehydrogenase,VCV000000437,R886H,NP_000101.2,1,97098598,97098598,C,T,LPSFGPYLEQRKKIIAENKIR
3,uncertain significance,PTS,6-pyruvoyltetrahydropterin synthase,VCV000000477,R16C,NP_000308.1,11,112226489,112226489,C,T,GGRRCQAQVSRRISFSASHRL
4,uncertain significance,PROC,"protein C, inactivator of coagulation factors ...",VCV000000661,P210L,NP_000303.1,2,127426178,127426178,C,T,DTEDQEDQVDPRLIDGKMTRR


In [171]:
df_1[['NP', 'gene', 'gene_name', 'FASTA']].head()

Unnamed: 0,NP,gene,gene_name,FASTA
0,NP_612422.2,HOGA1,4-hydroxy-2-oxoglutarate aldolase 1,SERLEVVSRVRQAMPKNRLLL
1,NP_000018.2,AGA,aspartylglucosaminidase,GGSALDAVESGCAMCEREQCD
2,NP_000101.2,DPYD,dihydropyrimidine dehydrogenase,LPSFGPYLEQRKKIIAENKIR
3,NP_000308.1,PTS,6-pyruvoyltetrahydropterin synthase,GGRRCQAQVSRRISFSASHRL
4,NP_000303.1,PROC,"protein C, inactivator of coagulation factors ...",DTEDQEDQVDPRLIDGKMTRR


In [172]:
# make FASTA format text file from dataframe for blast search
def makeFASTAfile(df, output_path):
    subset = df[['NP', 'gene', 'gene_name', 'FASTA']]
    tuples = [tuple(x) for x in subset.values]
    with open(output_path, 'w') as f:
        for tup in tuples:
            line = '>' + '\t'.join(tup[0:3]) + '\n'
            fasta = str(tup[3]) + '\n'
            f.write(line)
            f.write(fasta) 

In [173]:
makeFASTAfile(df_1, '../data/fasta.txt')

In [176]:
def blastLocal(fasta_path, out_path, evalue=10.0, window_size=3):
    cmd1 = '../ncbi/blast/bin/'
    cmd2 = 'blastp' + ' '\
         + '-query ' + fasta_path + ' '\
         + '-db ' + cmd1 + 'pdbaa' + ' '\
         + '-evalue ' + str(evalue) + ' '\
         + '-outfmt ' + '5' + ' '\
         + '-out ' + out_path
    cmd = cmd1 + cmd2
    b_cmd = os.system(cmd)
    print(cmd + ' : ran with exit code %d' %b_cmd)

In [181]:
start = datetime.datetime.now()

fasta_path = '../data/fasta.txt'  # path for the fasta file
out_path = '../data/blast_result.xml'  # path for the output file
evalue = 10.0
size = 3
blastLocal(fasta_path, out_path, evalue, size)

end = datetime.datetime.now()
time = end - start
c = divmod(time.days * 86400 + time.seconds, 60)
print(c)

../ncbi/blast/bin/blastp -query ../data/fasta_10.txt -db ../ncbi/blast/bin/pdbaa -evalue 10.0 -outfmt 5 -out ../data/myOutput.xml : ran with exit code 0
(0, 0)


In [None]:
class BlastHandler(object):
    def __init__(self):
        self.dictlist = {'pdb': [], 'evalue': [], 'hit_from': [], 'hit_to': []}
        self.is_hit_id = False
        self.is_evalue = False
        self.is_hit_from = False
        self.is_hit_to = False
        self.ct_iter = 0
        self.ct = 0
        
    def start(self, tag, attrs):
        if tag == 'Hit':
            self.ct_iter += 1
        if self.ct_iter == 1:
            if tag == 'Hit_id':
                self.is_hit_id = True
            elif tag == 'Hsp_evalue':
                self.is_evalue = True
            elif tag == 'Hsp_hit-from':
                self.is_hit_from = True
            elif tag == 'Hsp_hit-to':
                self.is_hit_to = True
            
    def end(self):
        if tag == 'Iteration':
            self.ct_iter = 0
            self.ct += 1
            if ct % 10000 == 0:
                print(f'coutner: {self.ct}')
        elif tag == 'Hit_id':
            self.is_hit_id = False
        elif tag == 'Hsp_evalue':
            self.is_evalue = False
            elif tag == 'Hsp_hit-from':
                self.is_hit_from = False
            elif tag == 'Hsp_hit-to':
                self.is_hit_to = False
        
    def data(self, data):
        if self.ct_iter == 1:
            if self.is_hit_id:
                self.dictlist['pdb'] = data
            elif self.is_evalue:
                self.dictlist['evalue'] = data
            elif self.is_hit_from:
                self.dictlist['hit_from'] = data
            elif self.is_hit_to:
                self.dictlist['hit_to'] = data