In [1]:
import pandas as pd
import numpy as np
import os
from lxml import etree
import xml.etree.ElementTree as ET

In [2]:
aatranlation = {'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', 'Cys': 'C',
                'Glu': 'E', 'Gln': 'Q', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
                'Leu': 'L', 'Lys': 'K', 'Met': 'M', 'Phe': 'F', 'Pro': 'P',
                'Ser': 'S', 'Thr': 'T', 'Trp': 'W', 'Tyr': 'Y', 'Val': 'V'}

In [3]:
def readHumanGenes(path):
    human_genes = []
    with open(path, 'r') as filehandle:
        human_genes = filehandle.read().splitlines()
    return human_genes

In [4]:
human_genes = readHumanGenes('../data/UniProtHumanEnzymeGenes.txt')
print(human_genes)

['CAMKMT', 'CSNK2A3', 'CPA6', 'AGA', 'CTDSPL', 'CERS5', 'CERS6', 'CDKAL1', 'CYP46A1', 'CYP4A11', 'ADH1B', 'ATP2A1', 'ATP10D', 'CDK5', 'B4GALT4', 'CDK6', 'CDC42', 'ADAM8', 'ARSA', 'CDK7', 'ASAH2', 'ADAMTS3', 'NT5C2', 'ATP11C', 'CTSB', 'CDK8', 'ADAMTS19', 'ATP10B', 'CHPF2', 'AFG1L', 'ADAMTS20', 'ATXN3L', 'ATP23', 'CDKL1', 'ACAD8', 'CDKL3', 'ATP2B2', 'CDKL4', 'ACVR1C', 'ACVRL1', 'C1R', 'ADH4', 'ACY1', 'ACOX1', 'ADAM28', 'B3GNTL1', 'RNPEP', 'ADTRP', 'ALKBH5', 'CDK9', 'ALG12', 'PRSS22', 'ALDH4A1', 'CNOT8', 'ATP10A', 'CYP7A1', 'ADH7', 'ALG1L', 'CAPN9', 'ASAH1', 'CTH', 'AAK1', 'ACLY', 'NT5C3A', 'APOBEC3F', 'ALG2', 'ATP11A', 'ATP11B', 'ALDH7A1', 'ABCD1', 'ARSB', 'ATG10', 'ADO', 'NT5E', 'ATP12A', 'ATXN3', 'GLB1', 'SMPDL3B', 'PRMT2', 'CASP4', 'BLVRA', 'CTSC', 'CDKN3', 'AADAT', 'ANPEP', 'AICDA', 'CASP10', 'ARIH1', 'BAAT', 'CDO1', 'CYP2D7', 'CHAC2', 'ALDH3B1', 'ADH6', 'BCO1', 'BCDIN3D', 'APOBEC3B', 'NT5C1B', 'BST1', 'BCO2', 'ALG1', 'B4GALT3', 'CNP', 'APOBEC3D', 'ADH1C', 'ABHD15', 'B4GALT7', 'ANKIB

In [5]:
class variationHandler(object):
    def __init__(self, enzyme_genes):
        self.dictlist = {'interpretation': [], 'gene':[], 'gene_name':[], 'accession':[], 'mutation': [], 'NP': [], 'Chr': [], 'start':[], 'stop':[], 'referenceAllele':[], 'alternateAllele':[]}
        self.enzyme_genes = enzyme_genes
        self.gene = ""
        self.gene_name = ""
        self.accession = ""
        self.mutation = ""
        self.np_num = ""
        self.change = ""
        self.chr = ""
        self.start_num = ""
        self.stop_num = ""
        self.referenceAllele = ""
        self.alternateAllele = ""
        self.is_GeneList = False
        self.ct_gene = 0
        self.check_grch = False
        self.ct_np = 0
        self.ct_mc = 0  # counter for Molecular Consequence tag
        self.is_haplotype = False  # check if a variation is haplotype or not
        self.is_missense = False
        self.is_conflicting = False
        self.is_not_provided = False
        self.is_interpretations = False
        self.is_interpretation = False
        self.is_description = False
        self.is_desc_hist = False
        self.intpn = []
        self.ct = 0
        self.ct_missense = 0
        self.ct_uncertain = 0
        self.ct_conflicting = 0
        self.ct_not_provided = 0
        
    def start(self, tag, attrs):
        if tag == 'VariationArchive':
            self.accession = attrs.get('Accession')
        elif tag == 'Haplotype':
            self.is_haplotype = True
        elif tag == 'GeneList':
            self.is_GeneList = True
        elif tag == 'Gene' and self.ct_gene == 0:
            self.gene = attrs.get('Symbol')
            self.gene_name = attrs.get('FullName')
            self.ct_gene += 1
        elif tag == 'SequenceLocation' and self.is_GeneList == False and self.check_grch == False:
            if attrs.get('Assembly') == 'GRCh38':
                self.chr = attrs.get('Chr')
                self.start_num = attrs.get('start')
                self.stop_num = attrs.get('stop')
                self.referenceAllele = attrs.get('referenceAlleleVCF')
                self.alternateAllele = attrs.get('alternateAlleleVCF')
                self.check_grch = True
        elif tag == 'ProteinExpression' and self.ct_np == 0:
            self.np_num = attrs.get('sequenceAccessionVersion')
            self.change = attrs.get('change') 
            if self.np_num and self.np_num.startswith('NP'):            
                self.ct_np += 1
        elif tag == 'MolecularConsequence' and self.ct_mc == 0:
            if attrs.get('Type') and 'missense' in attrs.get('Type').lower():
                self.is_missense = True
                self.ct_missense += 1
            self.ct_mc += 1
        elif tag == 'Interpretations':
            self.is_interpretations = True
        elif tag == 'Interpretation':
            self.is_interpretation = True
        elif tag == 'Description':
            self.is_description = True
        elif tag == 'DescriptionHistory':
            self.is_desc_hist = True
            
    def end(self, tag):
        if (tag == 'VariationArchive') or (self.is_haplotype and tag == 'SimpleAllele'):
        
            if len(self.intpn) == 1:
                interpretation = self.intpn[0].lower()
                if "uncertain" in interpretation:
                    self.is_uncertain = True
                    self.ct_uncertain += 1
                elif "conflicting" in interpretation:
                    self.is_conflicting = True
                    self.ct_conflicting += 1
                elif "not provided" in interpretation:
                    self.is_not_provided = True
                    self.ct_not_provided += 1
            elif len(self.intpn) > 1:
                print(f'Interpretaion: {self.intpn}, Accession: {self.accession}, count: {self.ct}')
            
            if (self.gene in self.enzyme_genes) and self.is_missense and (self.is_uncertain or self.is_conflicting or self.is_not_provided):
                try:
                    self.change = self.change.split('p.')[1]
                    before = aatranlation.get(self.change[0:3])
                    after = aatranlation.get(self.change[len(self.change) - 3:len(self.change)])
                except: 
                    before = None
                    after = None
                if before and after:  # check if both have a value in aa dict
                    num = self.change[3:len(self.change) - 3]
                    abbreviated_change = before + num + after
                    fasta = np.nan
                    self.dictlist['interpretation'].append(interpretation)
                    self.dictlist['gene'].append(self.gene)
                    self.dictlist['gene_name'].append(self.gene_name)
                    self.dictlist['accession'].append(self.accession)
                    self.dictlist['mutation'].append(abbreviated_change)
                    self.dictlist['NP'].append(self.np_num)
                    self.dictlist['Chr'].append(self.chr)
                    self.dictlist['start'].append(self.start_num)
                    self.dictlist['stop'].append(self.stop_num)
                    self.dictlist['referenceAllele'].append(self.referenceAllele)
                    self.dictlist['alternateAllele'].append(self.alternateAllele)
            self.is_haplotype = False
            self.ct_gene = 0             
            self.check_grch = False
            self.is_missense = False
            self.is_uncertain = False
            self.is_conflicting = False
            self.is_not_provided = False
            self.ct_np = 0
            self.ct_mc = 0
            self.intpn = []
            if tag == 'VariationArchive':
                self.ct +=1
                if self.ct % 10000 == 0:
                    print(f'counter: {self.ct}')
        elif tag == 'GeneList':
            self.is_GeneList = False
        elif tag == 'Interpretations':
            self.is_interpretations = False
        elif tag == 'Interpretation':
            self.is_interpretaion = False
        elif tag == 'Description':
            self.is_description = False
        elif tag == 'DescriptionHistory':
            self.is_desc_hist = False
                
    def data(self, data):
        if self.is_interpretations and self.is_interpretation and self.is_description and (not self.is_desc_hist):
            self.intpn.append(data)
            
    def close(self):
        print(f"Variations: {self.ct}")
        print(f"Uncertain Significance: {self.ct_uncertain}")
        print(f"Conflicting Report: {self.ct_conflicting}")
        print(f"Not Provided: {self.ct_not_provided}")
        print(f"Missense: {self.ct_missense}")
        print('debug: the file is closed')
        return self.dictlist

In [10]:
# read xml file of variations from ClinVar
# return dataframe and write to a csv file
def readClinVarVariationsXML(input_path, output_path, gene_set):
    print('debug: start parcing')
    parser = etree.XMLParser(target=variationHandler(gene_set))
    data = etree.parse(input_path, parser)
    df = pd.DataFrame(data)
    df.to_csv(output_path, index = False, header = True)
    return df

In [11]:
xmlfile = '../data/ClinVarVariationRelease_00-latest_weekly.xml'
out_path = '../data/MM_enzyme.csv'
df_0 = readClinVarVariationsXML(xmlfile, out_path, human_genes)
df_0.head()

debug: start parcing
Interpretaion: ['no interpretation for the single variant', 'Pathogenic'], Accession: VCV000001981, count: 9424
Interpretaion: ['no interpretation for the single variant', 'Pathogenic'], Accession: VCV000003667, count: 9968
counter: 10000
Interpretaion: ['no interpretation for the single variant', 'Pathogenic'], Accession: VCV000004368, count: 10192
Interpretaion: ['Benign', 'Pathogenic'], Accession: VCV000005706, count: 10611
Interpretaion: ['no interpretation for the single variant', 'Pathogenic, protective, risk factor'], Accession: VCV000008152, count: 11352
Interpretaion: ['no interpretation for the single variant', 'Pathogenic'], Accession: VCV000008241, count: 11384
Interpretaion: ['no interpretation for the single variant', 'Pathogenic'], Accession: VCV000008596, count: 11517
Interpretaion: ['no interpretation for the single variant', 'Pathogenic'], Accession: VCV000011158, count: 12432
Interpretaion: ['Likely benign', 'Pathogenic'], Accession: VCV000011906

Interpretaion: ['no interpretation for the single variant', 'Pathogenic'], Accession: VCV000253111, count: 51511
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000264658, count: 55788
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000264659, count: 55789
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000264660, count: 55790
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000264668, count: 55791
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000264669, count: 55792
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Accession: VCV000264672, count: 55793
Interpretaion: ['no interpret

KeyboardInterrupt: 

In [82]:
class variationHandlerSpecific(object):
    def __init__(self, accession):
        self.is_accession = False
        self.accession = accession
        self.ct = 0
        self.ct_all = 0
        print(self.accession)
        
    def start(self, tag, attrs):
        if (tag == 'VariationArchive') and (attrs.get('Accession') == self.accession):
            self.is_accession = True
        if self.is_accession:
            print(f"{self.ct} tag: {tag}")
            self.ct += 1
            
    def end(self, tag):
        if self.is_accession:
            print(f"{self.ct} tag: /{tag}")
        if tag == 'VariationArchive':
            self.is_accession = False
            self.ct = 0
            if self.ct_all % 10000 == 0:
                print(self.ct_all)
            self.ct_all += 1
            
    def data(self, data):
        if self.is_accession and (data.strip()):
            print(f"    {self.ct} data: {data}")
            
    def close(self):
        print('debug: the xml file is closed')

In [83]:
# read xml file of variations from ClinVar
# return dataframe and write to a csv file
def readClinVarVariationsXMLSpecific(input_path, accession):
    print('debug: start parcing')
    parser = etree.XMLParser(target=variationHandlerSpecific(accession))
    data = etree.parse(input_path, parser)
    df = pd.DataFrame(data)
    return df

In [84]:
xmlfile = '../data/ClinVarVariationRelease_00-latest_weekly.xml'
accession = 'VCV000549768'
readClinVarVariationsXMLSpecific(xmlfile, accession)

debug: start parcing
VCV000549768
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
0 tag: VariationArchive
1 tag: RecordStatus
    2 data: current
2 tag: /RecordStatus
2 tag: Species
    3 data: Homo sapiens
3 tag: /Species
3 tag: InterpretedRecord
4 tag: Genotype
5 tag: SimpleAllele
6 tag: GeneList
7 tag: Gene
8 tag: Location
9 tag: CytogeneticLocation
    10 data: 10q11.21
10 tag: /CytogeneticLocation
10 tag: SequenceLocation
11 tag: /SequenceLocation
11 tag: SequenceLocation
12 tag: /SequenceLocation
12 tag: /Location
12 tag: OMIM
    13 data: 164761
13 tag: /OMIM
13 tag: Haplo

708 tag: ElementValue
    709 data: Hereditary cancer-predisposing syndrome
709 tag: /ElementValue
709 tag: XRef
710 tag: /XRef
710 tag: /Name
710 tag: Name
711 tag: ElementValue
    712 data: Tumor predisposition
712 tag: /ElementValue
712 tag: /Name
712 tag: Name
713 tag: ElementValue
    714 data: Cancer predisposition
714 tag: /ElementValue
714 tag: XRef
715 tag: /XRef
715 tag: /Name
715 tag: AttributeSet
716 tag: Attribute
    717 data: Neoplasm
717 tag: /Attribute
717 tag: /AttributeSet
717 tag: AttributeSet
718 tag: Attribute
    719 data: Hereditary cancer syndrome
719 tag: /Attribute
719 tag: /AttributeSet
719 tag: Citation
720 tag: ID
    721 data: 3075918
721 tag: /ID
721 tag: /Citation
721 tag: Citation
722 tag: ID
    723 data: 25394175
723 tag: /ID
723 tag: /Citation
723 tag: XRef
724 tag: /XRef
724 tag: /Trait
724 tag: /TraitSet
724 tag: TraitSet
725 tag: Trait
726 tag: Name
727 tag: ElementValue
    728 data: AllHighlyPenetrant
728 tag: /ElementValue
728 tag: /Name
728 

989 tag: /XRef
989 tag: XRef
990 tag: /XRef
990 tag: XRef
991 tag: /XRef
991 tag: XRef
992 tag: /XRef
992 tag: /Trait
992 tag: /TraitSet
992 tag: TraitSet
993 tag: Trait
994 tag: Name
995 tag: ElementValue
    996 data: Pheochromocytoma
996 tag: /ElementValue
996 tag: XRef
997 tag: /XRef
997 tag: XRef
998 tag: /XRef
998 tag: XRef
999 tag: /XRef
999 tag: /Name
999 tag: Name
1000 tag: ElementValue
    1001 data: Pheochromocytoma, somatic
1001 tag: /ElementValue
1001 tag: /Name
1001 tag: Name
1002 tag: ElementValue
    1003 data: MAX-Related Hereditary Paraganglioma-Pheochromocytoma Syndrome
1003 tag: /ElementValue
1003 tag: /Name
1003 tag: Name
1004 tag: ElementValue
    1005 data: Chromaffin tumors
1005 tag: /ElementValue
1005 tag: XRef
1006 tag: /XRef
1006 tag: /Name
1006 tag: AttributeSet
1007 tag: Attribute
    1008 data: Hereditary paraganglioma-pheochromocytoma (PGL/PCC) syndromes are characterized by paragangliomas (tumors that arise from neuroendocrine tissues distributed along t

1407 tag: /ProteinChange
1407 tag: HGVSlist
1408 tag: HGVS
1409 tag: NucleotideExpression
1410 tag: Expression
    1411 data: NC_000010.10:g.43596033G
    1411 data: >
    1411 data: A
1411 tag: /Expression
1411 tag: /NucleotideExpression
1411 tag: /HGVS
1411 tag: HGVS
1412 tag: NucleotideExpression
1413 tag: Expression
    1414 data: NC_000010.11:g.43100585G
    1414 data: >
    1414 data: A
1414 tag: /Expression
1414 tag: /NucleotideExpression
1414 tag: /HGVS
1414 tag: HGVS
1415 tag: NucleotideExpression
1416 tag: Expression
    1417 data: NM_020630.5:c.200G
    1417 data: >
    1417 data: A
1417 tag: /Expression
1417 tag: /NucleotideExpression
1417 tag: ProteinExpression
1418 tag: Expression
    1419 data: NP_065681.1:p.Arg67His
1419 tag: /Expression
1419 tag: /ProteinExpression
1419 tag: MolecularConsequence
1420 tag: /MolecularConsequence
1420 tag: /HGVS
1420 tag: HGVS
1421 tag: NucleotideExpression
1422 tag: Expression
    1423 data: NM_020975.6:c.200G
    1423 data: >
    1423 d

680000
debug: the xml file is closed


In [7]:
tree = etree.parse('../data/clinvarVariation_7.xml')
root = tree.getroot()
node = tree.find('VariationArchive[@Accession="VCV000012722"]')
mydata = etree.tostring(node)
myfile = open('../data/subnode.xml', 'wb')
myfile.write(mydata)

42977

In [8]:
# makes dictionary of fasta sequences and np number 
# returns the dictionary
def makeDictOfFasta(dictpath):
    fasta_dict = {}
    for root, d_names, file_names in os.walk(dictpath):
        for filename in file_names:
            fname = os.path.join(root, filename)
            with open(fname) as f:
                print('opened a fasta file')
                np_num = ''
                sequence = ''
                for line in f:
                    if line[0] == '>':
                        if sequence != '':
                            fasta_dict[np_num] = sequence
                            np_num = ''
                            sequence = ''                    
                        i = 1
                        while line[i] != ' ':
                            np_num += line[i]
                            i += 1
                    else:
                        line = line.strip('\n')
                        sequence += line
    print(f'length of fasta dictionary: {len(fasta_dict)}')
    return fasta_dict

In [13]:
df_0 = pd.read_csv('../data/MM_enzyme.csv')
df_0.head()

Unnamed: 0,interpretation,gene,gene_name,accession,mutation,NP,Chr,start,stop,referenceAllele,alternateAllele,FASTA,PDB
0,uncertain significance,HOGA1,4-hydroxy-2-oxoglutarate aldolase 1,VCV000000031,R97C,NP_612422.2,10,97598852,97598852,C,T,,
1,uncertain significance,AGA,aspartylglucosaminidase,VCV000000222,G60D,NP_000018.2,4,177440375,177440375,C,T,,
2,uncertain significance,DPYD,dihydropyrimidine dehydrogenase,VCV000000437,R886H,NP_000101.2,1,97098598,97098598,C,T,,
3,uncertain significance,PTS,6-pyruvoyltetrahydropterin synthase,VCV000000477,R16C,NP_000308.1,11,112226489,112226489,C,T,,
4,uncertain significance,PROC,"protein C, inactivator of coagulation factors ...",VCV000000661,P210L,NP_000303.1,2,127426178,127426178,C,T,,


In [14]:
fasta_dict = makeDictOfFasta('../fasta_sequences/')

opened a fasta file
opened a fasta file
opened a fasta file
opened a fasta file
opened a fasta file
opened a fasta file
opened a fasta file
length of fasta dictionary: 113602


In [15]:
# crops fasta sequence
# returns the cropped sequnece with a specified range
def cropFASTA(sequence, location, reference, seqRange):
    if location - 1 < len(sequence) and sequence[location - 1] == reference:
        proteinSeq = sequence[0 if location - 1 - seqRange <= 0 else location - 1 - seqRange : location + seqRange]
        return proteinSeq
    else:
        return None

In [16]:
def addFASTAfromDict(fasta_dict, df):
    none_acc = []
    seq_list = []
    for index, row in df.iterrows():
        mutation = row['mutation']
        try:
            ref = mutation[0]
            location = int(mutation[1:len(mutation)-1])
            np_num = row['NP']  # specify the column of np 
            sequence = fasta_dict.get(np_num)
            seqRange = 10  # range of sequences to take
            seq = cropFASTA(sequence, location, ref, seqRange) if sequence else None
        except:
            seq = None
            accession = row['accession']
            none_acc.append(accession)
        seq_list.append(seq)
    df['FASTA'] = seq_list
    print(f'Unfound Sequences: {len(none_acc)} {none_acc}')
    return df

In [17]:
df_1 = addFASTAfromDict(fasta_dict, df_0)
df_1.head()

Unfound Sequences: 46 ['VCV000134028', 'VCV000134108', 'VCV000134670', 'VCV000288381', 'VCV000418418', 'VCV000474413', 'VCV000490097', 'VCV000503859', 'VCV000531064', 'VCV000541685', 'VCV000582159', 'VCV000583086', 'VCV000591262', 'VCV000591672', 'VCV000591831', 'VCV000594518', 'VCV000598475', 'VCV000524323', 'VCV000437115', 'VCV000418344', 'VCV000464734', 'VCV000566152', 'VCV000800863', 'VCV000133712', 'VCV000134623', 'VCV000537105', 'VCV000572650', 'VCV000625941', 'VCV000630885', 'VCV000650338', 'VCV000650666', 'VCV000658054', 'VCV000664737', 'VCV000240738', 'VCV000411042', 'VCV000412448', 'VCV000181879', 'VCV000053180', 'VCV000216871', 'VCV000428273', 'VCV000453687', 'VCV000462579', 'VCV000481269', 'VCV000484704', 'VCV000589537', 'VCV000827598']


Unnamed: 0,interpretation,gene,gene_name,accession,mutation,NP,Chr,start,stop,referenceAllele,alternateAllele,FASTA,PDB
0,uncertain significance,HOGA1,4-hydroxy-2-oxoglutarate aldolase 1,VCV000000031,R97C,NP_612422.2,10,97598852,97598852,C,T,SERLEVVSRVRQAMPKNRLLL,
1,uncertain significance,AGA,aspartylglucosaminidase,VCV000000222,G60D,NP_000018.2,4,177440375,177440375,C,T,GGSALDAVESGCAMCEREQCD,
2,uncertain significance,DPYD,dihydropyrimidine dehydrogenase,VCV000000437,R886H,NP_000101.2,1,97098598,97098598,C,T,LPSFGPYLEQRKKIIAENKIR,
3,uncertain significance,PTS,6-pyruvoyltetrahydropterin synthase,VCV000000477,R16C,NP_000308.1,11,112226489,112226489,C,T,GGRRCQAQVSRRISFSASHRL,
4,uncertain significance,PROC,"protein C, inactivator of coagulation factors ...",VCV000000661,P210L,NP_000303.1,2,127426178,127426178,C,T,DTEDQEDQVDPRLIDGKMTRR,


In [43]:
df_1[['NP', 'gene', 'gene_name', 'FASTA']].head()

Unnamed: 0,NP,gene,gene_name,FASTA
0,NP_612422.2,HOGA1,4-hydroxy-2-oxoglutarate aldolase 1,SERLEVVSRVRQAMPKNRLLL
1,NP_000018.2,AGA,aspartylglucosaminidase,GGSALDAVESGCAMCEREQCD
2,NP_000101.2,DPYD,dihydropyrimidine dehydrogenase,LPSFGPYLEQRKKIIAENKIR
3,NP_000308.1,PTS,6-pyruvoyltetrahydropterin synthase,GGRRCQAQVSRRISFSASHRL
4,NP_000303.1,PROC,"protein C, inactivator of coagulation factors ...",DTEDQEDQVDPRLIDGKMTRR


In [18]:
# make FASTA format text file from dataframe for blast search
def makeFASTAfile(df, output_path):
    subset = df[['NP', 'gene', 'gene_name', 'FASTA']]
    tuples = [tuple(x) for x in subset.values]
    with open(output_path, 'w') as f:
        for tup in tuples:
            line = '>' + '\t'.join(tup[0:3]) + '\n'
            fasta = str(tup[3]) + '\n'
            f.write(line)
            f.write(fasta) 

In [19]:
makeFASTAfile(df_1, '../data/fasta.txt')

In [52]:
def blastLocal(fasta_path, out_path, evalue=10.0, window_size=3):
    cmd1 = '../ncbi/blast/bin/'
    cmd2 = 'blastp' + ' '\
         + '-query ' + fasta_path + ' '\
         + '-db ' + cmd1 + 'pdbaa' + ' '\
         + '-evalue ' + str(evalue) + ' '\
         + '-outfmt ' + '5' + ' '\
         + '-out ' + out_path
    cmd = cmd1 + cmd2
    b_cmd = os.system(cmd)
    print(cmd + ' : ran with exit code %d' %b_cmd)

In [53]:
fasta_path = '../data/fasta_10.txt'  # path for the fasta file
out_path = '../data/myOutput.xml'  # path for the output file
evalue = 10.0
size = 3
blastLocal(fasta_path, out_path, evalue, size)

../ncbi/blast/bin/blastp -query ../data/fasta_10.txt -db ../ncbi/blast/bin/pdbaa -evalue 10.0 -outfmt 5 -out ../data/myOutput.xml : ran with exit code 0


In [None]:
class BlastHandler(object):
    def __init__:
        self.dictlist = {}