In [1]:
import pandas as pd
import numpy as np
import os
from lxml import etree
import xml.etree.ElementTree as ET

In [18]:
aatranlation = {'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', 'Cys': 'C',
                'Glu': 'E', 'Gln': 'Q', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
                'Leu': 'L', 'Lys': 'K', 'Met': 'M', 'Phe': 'F', 'Pro': 'P',
                'Ser': 'S', 'Thr': 'T', 'Trp': 'W', 'Tyr': 'Y', 'Val': 'V'}

In [2]:
def readHumanGenes(path):
    human_genes = []
    with open(path, 'r') as filehandle:
        human_genes = filehandle.read().splitlines()
    return human_genes

In [3]:
human_genes = readHumanGenes('../data/UniProtHumanEnzymeGenes.txt')

In [40]:
class variationHandler(object):
    def __init__(self, enzyme_genes):
        self.dictlist = {'interpretation': [], 'gene':[], 'gene_name':[], 'accession':[], 'mutation': [], 'NP': [], 'Chr': [], 'start':[], 'stop':[], 'referenceAllele':[], 'alternateAllele':[], 'FASTA':[], 'PDB': []}
        self.enzyme_genes = enzyme_genes
        self.gene = ""
        self.gene_name = ""
        self.interpretation = ""
        self.accession = ""
        self.mutation = ""
        self.np_num = ""
        self.change = ""
        self.chr = ""
        self.start_num = ""
        self.stop_num = ""
        self.referenceAllele = ""
        self.alternateAllele = ""
        self.is_GeneList = False
        self.ct_gene = 0
        self.check_grch = False
        self.ct_np = 0
        self.ct_mc = 0  # counter for Molecular Consequence tag
        self.is_haplotype = False  # check if a variation is haplotype or not
        self.is_missense = False
        self.is_conflicting = False
        self.is_not_provided = False
        self.is_interpretations = False
        self.is_interpretation = False
        self.is_description = False
        self.is_desc_hist = False
        self.intpn = []
        self.ct = 0
        self.ct_missense = 0
        self.ct_uncertain = 0
        self.ct_conflicting = 0
        self.ct_not_provided = 0
        
    def start(self, tag, attrs):
        if tag == 'VariationArchive':
            self.accession = attrs.get('Accession')
        elif tag == 'Haplotype':
            self.is_haplotype = True
        elif tag == 'GeneList':
            self.is_GeneList = True
        elif tag == 'Gene' and self.ct_gene == 0:
            self.gene = attrs.get('Symbol')
            self.gene_name = attrs.get('FullName')
            self.ct_gene += 1
        elif tag == 'SequenceLocation' and self.is_GeneList == False and self.check_grch == False:
            if attrs.get('Assembly') == 'GRCh38':
                self.chr = attrs.get('Chr')
                self.start_num = attrs.get('start')
                self.stop_num = attrs.get('stop')
                self.referenceAllele = attrs.get('referenceAlleleVCF')
                self.alternateAllele = attrs.get('alternateAlleleVCF')
                self.check_grch = True
        elif tag == 'ProteinExpression' and self.ct_np == 0:
            self.np_num = attrs.get('sequenceAccessionVersion')
            self.change = attrs.get('change') 
            if self.np_num and self.np_num.startswith('NP'):            
                self.ct_np += 1
        elif tag == 'MolecularConsequence' and self.ct_mc == 0:
            if attrs.get('Type') and 'missense' in attrs.get('Type').lower():
                self.is_missense = True
                self.ct_missense += 1
            self.ct_mc += 1
#         elif tag == 'RCVAccession':
#             self.interpretation = attrs.get('Interpretation').lower()
#             if "uncertain" in self.interpretation:
#                 self.is_uncertain = True
#                 self.ct_uncertain += 1
#             elif "conflicting" in self.interpretation:
#                 self.is_conflicting = True
#                 self.ct_conflicting += 1
#             elif "not provided" in self.interpretation:
#                 self.is_not_provided = True
#                 self.ct_not_provided += 1
        elif tag == 'Interpretations':
            self.is_interpretations = True
        elif tag == 'Interpretation':
            self.is_interpretation = True
        elif tag == 'Description':
            self.is_description = True
        elif tag == 'DescriptionHistory':
            self.is_desc_hist = True
            
    def end(self, tag):
        if tag == 'VariationArchive' or (self.is_haplotype and tag == 'SimpleAllele'):
            if (self.gene in self.enzyme_genes) and self.is_missense and (self.is_uncertain or self.is_conflicting or self.is_not_provided):
                try:
                    self.change = self.change.split('p.')[1]
                    before = aatranlation.get(self.change[0:3])
                    after = aatranlation.get(self.change[len(self.change) - 3:len(self.change)])
                except: 
                    before = None
                    after = None
                if before and after:  # check if both have a value in aa dict
                    num = self.change[3:len(self.change) - 3]
                    abbreviated_change = before + num + after
                    fasta = np.nan
                    self.dictlist['interpretation'].append(self.interpretation)
                    self.dictlist['gene'].append(self.gene)
                    self.dictlist['gene_name'].append(self.gene_name)
                    self.dictlist['accession'].append(self.accession)
                    self.dictlist['mutation'].append(abbreviated_change)
                    self.dictlist['NP'].append(self.np_num)
                    self.dictlist['Chr'].append(self.chr)
                    self.dictlist['start'].append(self.start_num)
                    self.dictlist['stop'].append(self.stop_num)
                    self.dictlist['referenceAllele'].append(self.referenceAllele)
                    self.dictlist['alternateAllele'].append(self.alternateAllele)
                    self.dictlist['FASTA'].append(fasta)
                    self.dictlist['PDB'].append(np.nan)
            if len(self.intpn) > 1:
                print(f'Interpretaion: {self.intpn}, Accession: {self.accession}, count: {self.ct}')
            if len(self.intpn) > 0 and 'conflicting' in self.intpn[0].lower():
                self.ct_conflicting += 1
#                print(f'Interpretaion: {self.intpn[0]}, Acdession: {self.accession}, count: {self.ct}')
            self.ct_gene = 0             
            self.check_grch = False
            self.is_missense = False
            self.is_uncertain = False
            self.is_conflicting = False
            self.is_not_provided = False
            self.ct_np = 0
            self.ct_mc = 0
            if tag == 'VariationArchive':
                self.ct +=1
                self.is_description = False
                if self.ct % 10000 == 0:
                    print(self.ct)
            self.intpn = []
        elif tag == 'GeneList':
            self.is_GeneList = False
        elif tag == 'Interpretations':
            self.is_interpretations = False
        elif tag == 'Interpretation':
            self.is_interpretaion = False
        elif tag == 'Description':
            self.is_description = False
        elif tag == 'DescriptionHistory':
            self.is_desc_hist = False
                
    def data(self, data):
        if self.is_interpretations and self.is_interpretation and self.is_description and (not self.is_desc_hist):
            self.intpn.append(data)
            
    def close(self):
        print(f"Variations: {self.ct}")
        print(f"Uncertain Significance: {self.ct_uncertain}")
        print(f"Conflicting Report: {self.ct_conflicting}")
        print(f"Missense: {self.ct_missense}")
        print('debug: the file is closed')
        return self.dictlist

In [28]:
# read xml file of variations from ClinVar
# return dataframe and write to a csv file
def readClinVarVariationsXML(input_path, output_path, gene_set):
    print('debug: start parcing')
    parser = etree.XMLParser(target=variationHandler(gene_set))
    data = etree.parse(input_path, parser)
    df = pd.DataFrame(data)
    df.to_csv(output_path, index = False, header = True)
    return df

In [29]:
xmlfile = '../data/ClinVarVariationRelease_00-latest_weekly.xml'
out_path = '../data/MM_enzyme.csv'
readClinVarVariationsXML(xmlfile, out_path, human_genes)

debug: start parcing
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
Interpretaion: ['Pathogenic', 'Pathogenic'], Acdession: VCV000424711, count: 523053
530000
540000
550000
560000
Interpretaion: ['Pathogenic', 'Pathogenic'], Acdession: VCV000424757, count: 561125
Interpretaion: ['no interpretation for the single variant', 'no interpretation for the single variant'], Acdession: VCV000431012, count: 562783
Interpretaion: ['no interpretation for the single variant', 'Pathogenic'], Acdession: VCV000431013, count: 562784
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
Interpretaion: ['Uncertain significance', 'Pathogenic'], Acdession: VCV000549768, count: 6728

Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000633936, count: 679259
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000633936, count: 679259
Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000633937, count: 679260
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000633937, count: 679260
Interpretaion: ['drug response', 'Benign'], Acdession: VCV000633938, count: 679261
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000633938, count: 679261
Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000633939, count: 679262
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000633939, count: 679262
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000633940, count: 679263
Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000633941, count: 679264

Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634010, count: 679333
Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000634011, count: 679334
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634011, count: 679334
Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000634012, count: 679335
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634012, count: 679335
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634013, count: 679336
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634015, count: 679338
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634017, count: 679340
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634018, count: 679341
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634019, count: 679342
Interpretaion: ['drug response', 'drug respons

Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000634096, count: 679419
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634096, count: 679419
Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000634098, count: 679420
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634098, count: 679420
Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000634099, count: 679421
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634099, count: 679421
Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000634100, count: 679422
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634100, count: 679422
Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000634101, count: 679423
Interpretaion: ['drug response', 'drug response'], Acde

Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634170, count: 679491
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634171, count: 679492
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634172, count: 679493
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634173, count: 679494
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634174, count: 679495
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634175, count: 679496
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634176, count: 679497
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634177, count: 679498
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634178, count: 679499
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634179, count: 679500
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634180, count: 679501
Interpreta

Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000634256, count: 679577
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634256, count: 679577
Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000634257, count: 679578
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634257, count: 679578
Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000634258, count: 679579
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634258, count: 679579
Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000634259, count: 679580
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634259, count: 679580
Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000634260, count: 679581
Interpretaion: ['drug response', 'drug response'], Acde

Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634337, count: 679658
Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000634338, count: 679659
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634338, count: 679659
Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000634339, count: 679660
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634339, count: 679660
Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000634340, count: 679661
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634340, count: 679661
Interpretaion: ['drug response', 'no interpretation for the single variant'], Acdession: VCV000634341, count: 679662
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634341, count: 679662
Interpretaion: ['drug response', 'no interpretation for the single variant'], Acde

Interpretaion: ['drug response', 'drug response'], Acdession: VCV000634427, count: 680079
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000638205, count: 681323
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000638206, count: 681324
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000638207, count: 681325
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000638783, count: 681334
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000638784, count: 681335
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000638785, count: 681336
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000638786, count: 681337
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000638787, count: 681338
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000638788, count: 681339
Interpretaion: ['drug response', 'drug response'], Acdession: VCV000638789, count: 681340
Interpreta

Unnamed: 0,interpretation,gene,gene_name,accession,mutation,NP,Chr,start,stop,referenceAllele,alternateAllele,FASTA,PDB


In [64]:
class variationHandlerSpecific(object):
    def __init__(self, accession):
        self.is_accession = False
        self.accession = accession
        self.ct = 0
        self.ct_all = 0
        print(self.accession)
        
    def start(self, tag, attrs):
        if (tag == 'VariationArchive') and (attrs.get('Accession') == self.accession):
            self.is_accession = True
        if self.is_accession:
            print(f"{self.ct} tag: {tag}")
            self.ct += 1
            
    def end(self, tag):
        if self.is_accession:
            print(f"{self.ct} tag: /{tag}")
        if tag == 'VariationArchive':
            self.is_accession = False
            self.ct = 0
            if self.ct_all % 10000 == 0:
                print(self.ct_all)
            self.ct_all += 1
            
    def data(self, data):
        if self.is_accession and (data.strip()):
            print(f"    {self.ct} data: {data}")
            
    def close(self):
        print('debug: the xml file is closed')

In [65]:
# read xml file of variations from ClinVar
# return dataframe and write to a csv file
def readClinVarVariationsXMLSpecific(input_path, accession):
    print('debug: start parcing')
    parser = etree.XMLParser(target=variationHandlerSpecific(accession))
    data = etree.parse(input_path, parser)
    df = pd.DataFrame(data)
    return df

In [66]:
xmlfile = '../data/ClinVarVariationRelease_00-latest_weekly.xml'
accession = 'VCV000424711'
readClinVarVariationsXMLSpecific(xmlfile, accession)

debug: start parcing
VCV000424711
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
0 tag: VariationArchive
VCV000424711 and VCV000424711: True
1 tag: RecordStatus
VCV000424711 and None: False
    2 data: current
2 tag: /RecordStatus
2 tag: Species
VCV000424711 and None: False
    3 data: Homo sapiens
3 tag: /Species
3 tag: InterpretedRecord
VCV000424711 and None: False
4 tag: Genotype
VCV000424711 and None: False
5 tag: SimpleAllele
VCV000424711 and None: False
6 tag: GeneList
VCV000424711 and None: False
7 tag: Gene
VCV000424711 and None: False
8 tag: Location
VCV000424711 and None: False
9 tag: CytogeneticLocation
VCV000424711 and None: False
    10 data: 12q13.3
10 tag: /Cytogenet

530000
debug: the xml file is closed


KeyboardInterrupt: 

In [17]:
tree = etree.parse('../data/clinvarVariation_7.xml')
root = tree.getroot()
node = tree.find('VariationArchive[@Accession="VCV000012722"]')
mydata = etree.tostring(node)
myfile = open('../data/subnode.xml', 'wb')
myfile.write(mydata)

42977

In [4]:
# makes dictionary of fasta sequences and np number 
# returns the dictionary
def makeDictOfFasta(dictpath):
    fasta_dict = {}
    for root, d_names, file_names in os.walk(dictpath):
        for filename in file_names:
            fname = os.path.join(root, filename)
            with open(fname) as f:
                print('opened a fasta file')
                np_num = ''
                sequence = ''
                for line in f:
                    if line[0] == '>':
                        if sequence != '':
                            fasta_dict[np_num] = sequence
                            np_num = ''
                            sequence = ''                    
                        i = 1
                        while line[i] != ' ':
                            np_num += line[i]
                            i += 1
                    else:
                        line = line.strip('\n')
                        sequence += line
    print(f'length of fasta dictionary: {len(fasta_dict)}')
    return fasta_dict

In [18]:
df_0 = pd.read_csv('../data/MM_enzyme.csv')
df_0.head()

Unnamed: 0,interpretation,gene,gene_name,accession,mutation,NP,Chr,start,stop,referenceAllele,alternateAllele,FASTA,PDB
0,uncertain significance,HOGA1,4-hydroxy-2-oxoglutarate aldolase 1,VCV000000031,R97C,NP_612422.2,10,97598852,97598852,C,T,SERLEVVSRVRQAMPKNRLLL,
1,uncertain significance,AGA,aspartylglucosaminidase,VCV000000222,G60D,NP_000018.2,4,177440375,177440375,C,T,GGSALDAVESGCAMCEREQCD,
2,uncertain significance,DPYD,dihydropyrimidine dehydrogenase,VCV000000437,R886H,NP_000101.2,1,97098598,97098598,C,T,LPSFGPYLEQRKKIIAENKIR,
3,uncertain significance,PTS,6-pyruvoyltetrahydropterin synthase,VCV000000477,R16C,NP_000308.1,11,112226489,112226489,C,T,GGRRCQAQVSRRISFSASHRL,
4,uncertain significance,PROC,"protein C, inactivator of coagulation factors ...",VCV000000661,P210L,NP_000303.1,2,127426178,127426178,C,T,DTEDQEDQVDPRLIDGKMTRR,


In [14]:
fasta_dict = makeDictOfFasta('../fasta_sequences/')

opened a fasta file
opened a fasta file
opened a fasta file
opened a fasta file
opened a fasta file
opened a fasta file
opened a fasta file
length of fasta dictionary: 113602


In [22]:
# crops fasta sequence
# returns the cropped sequnece with a specified range
def cropFASTA(sequence, location, reference, seqRange):
    if location - 1 < len(sequence) and sequence[location - 1] == reference:
        proteinSeq = sequence[0 if location - 1 - seqRange <= 0 else location - 1 - seqRange : location + seqRange]
        return proteinSeq
    else:
        return None

In [39]:
def addFASTAfromDict(fasta_dict, df):
    none_acc = []
    seq_list = []
    for index, row in df.iterrows():
        mutation = row['mutation']
        try:
            ref = mutation[0]
            location = int(mutation[1:len(mutation)-1])
            np_num = row['NP']  # specify the column of np 
            sequence = fasta_dict.get(np_num)
            seqRange = 10  # range of sequences to take
            seq = cropFASTA(sequence, location, ref, seqRange) if sequence else None
        except:
            seq = None
            accession = row['accession']
            none_acc.append(accession)
        seq_list.append(seq)
    df['FASTA'] = seq_list
    print(f'Unfound Sequences: {len(none_acc)} {none_acc}')
    return df

In [40]:
df_1 = addFASTAfromDict(fasta_dict, df_0)
df_1.head()

Unfound Sequences: 38 ['VCV000181879', 'VCV000288381', 'VCV000418418', 'VCV000428273', 'VCV000474413', 'VCV000484704', 'VCV000490097', 'VCV000503859', 'VCV000531064', 'VCV000541685', 'VCV000582159', 'VCV000583086', 'VCV000589537', 'VCV000591262', 'VCV000591672', 'VCV000591831', 'VCV000594518', 'VCV000598475', 'VCV000524323', 'VCV000437115', 'VCV000216871', 'VCV000418344', 'VCV000464734', 'VCV000481269', 'VCV000566152', 'VCV000800863', 'VCV000462579', 'VCV000537105', 'VCV000572650', 'VCV000625941', 'VCV000630885', 'VCV000650338', 'VCV000650666', 'VCV000658054', 'VCV000664737', 'VCV000240738', 'VCV000411042', 'VCV000412448']


Unnamed: 0,interpretation,gene,gene_name,accession,mutation,NP,Chr,start,stop,referenceAllele,alternateAllele,FASTA,PDB
0,uncertain significance,HOGA1,4-hydroxy-2-oxoglutarate aldolase 1,VCV000000031,R97C,NP_612422.2,10,97598852,97598852,C,T,SERLEVVSRVRQAMPKNRLLL,
1,uncertain significance,AGA,aspartylglucosaminidase,VCV000000222,G60D,NP_000018.2,4,177440375,177440375,C,T,GGSALDAVESGCAMCEREQCD,
2,uncertain significance,DPYD,dihydropyrimidine dehydrogenase,VCV000000437,R886H,NP_000101.2,1,97098598,97098598,C,T,LPSFGPYLEQRKKIIAENKIR,
3,uncertain significance,PTS,6-pyruvoyltetrahydropterin synthase,VCV000000477,R16C,NP_000308.1,11,112226489,112226489,C,T,GGRRCQAQVSRRISFSASHRL,
4,uncertain significance,PROC,"protein C, inactivator of coagulation factors ...",VCV000000661,P210L,NP_000303.1,2,127426178,127426178,C,T,DTEDQEDQVDPRLIDGKMTRR,


In [43]:
df_1[['NP', 'gene', 'gene_name', 'FASTA']].head()

Unnamed: 0,NP,gene,gene_name,FASTA
0,NP_612422.2,HOGA1,4-hydroxy-2-oxoglutarate aldolase 1,SERLEVVSRVRQAMPKNRLLL
1,NP_000018.2,AGA,aspartylglucosaminidase,GGSALDAVESGCAMCEREQCD
2,NP_000101.2,DPYD,dihydropyrimidine dehydrogenase,LPSFGPYLEQRKKIIAENKIR
3,NP_000308.1,PTS,6-pyruvoyltetrahydropterin synthase,GGRRCQAQVSRRISFSASHRL
4,NP_000303.1,PROC,"protein C, inactivator of coagulation factors ...",DTEDQEDQVDPRLIDGKMTRR


In [60]:
def makeFASTAfile(df, output_path):
    subset = df[['NP', 'gene', 'gene_name', 'FASTA']]
    tuples = [tuple(x) for x in subset.values]
    with open(output_path, 'w') as f:
        for tup in tuples:
            line = '>' + '\t'.join(tup[0:3]) + '\n'
            fasta = str(tup[3]) + '\n'
            f.write(line)
            f.write(fasta) 

In [61]:
makeFASTAfile(df_1, '../data/fasta.txt')