## DrugBank Parser test
Reference: https://github.com/dhimmel/drugbank/blob/gh-pages/parse.ipynb

In [27]:
import os
import csv
import gzip
import zipfile
import collections
import re
import io
import json
import xml.etree.ElementTree as ET
import time
import requests
import pandas
import pandas as pd
import pubchempy as pcp

In [2]:
# Show the notebook's starting directory, move two levels up, and show the
# result. The relative 'datasets/' paths used by the later cells are resolved
# against the directory that is current after this chdir.
print(os.getcwd())
os.chdir('../../')
print(os.getcwd())

'/home/tasnina/Projects/drug-synergy-prediction/code'

In [4]:
# Dataset locations, expressed relative to the current working directory.
dataset_dir = "datasets/"
processed_drug_feature_path = dataset_dir + 'processed/drug/'

# The DrugBank dump is read directly out of its zip archive: open the archive,
# open the XML member inside it, and parse that stream into an element tree.
xml_file = dataset_dir +"drug-bank/drugbank_all_full_database.xml.zip"
with zipfile.ZipFile(xml_file) as archive, archive.open("full database.xml") as xml_stream:
    tree = ET.parse(xml_stream)

# Top-level element of the parsed document; the later cells iterate over it.
root = tree.getroot()

In [5]:
# Parse the raw DrugBank tree into a list of per-drug records (`rows`).
# Every child of `root` is expected to be a <drug> element in the DrugBank
# namespace; each one becomes an OrderedDict whose list-valued fields
# (groups, atc_codes, categories, aliases) are still Python lists here.
ns = '{http://www.drugbank.ca}'
inchikey_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value"
inchi_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value"

# The element paths do not change between drugs, so expand the namespace
# placeholder once instead of on every loop iteration.
inchi_path = inchi_template.format(ns=ns)
inchikey_path = inchikey_template.format(ns=ns)
groups_path = "{ns}groups/{ns}group".format(ns=ns)
atc_codes_path = "{ns}atc-codes/{ns}atc-code".format(ns=ns)
categories_path = "{ns}categories/{ns}category".format(ns=ns)

# Direct children of <drug> copied verbatim under the same key as their tag.
extra_text_fields = (
    'indication',
    'pharmacodynamics',
    'mechanism-of-action',
    'toxicity',
    'protein-binding',
)

# Elements whose text counts as an alternative name for the drug. Each source
# is listed once; the names are collected into a set, so repeats would add
# nothing.
alias_paths = (
    "{ns}international-brands/{ns}international-brand".format(ns=ns),
    "{ns}synonyms/{ns}synonym[@language='English']".format(ns=ns),
    "{ns}products/{ns}product/{ns}name".format(ns=ns),
)

rows = list()
for drug in root:
    # Sanity check on the input file: only <drug> elements are expected here.
    assert drug.tag == ns + 'drug'

    row = collections.OrderedDict()
    row['type'] = drug.get('type')
    row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drug.findtext(ns + "name")
    row['description'] = drug.findtext(ns + "description")
    row['groups'] = [group.text for group in drug.findall(groups_path)]
    row['atc_codes'] = [code.get('code') for code in drug.findall(atc_codes_path)]
    row['categories'] = [x.findtext(ns + 'category') for x in drug.findall(categories_path)]
    row['inchi'] = drug.findtext(inchi_path)
    row['inchikey'] = drug.findtext(inchikey_path)

    # extract some additional data
    for field in extra_text_fields:
        row[field] = drug.findtext(ns + field)

    # Add drug aliases: the union of all alias sources plus the primary name,
    # de-duplicated and sorted so the output is deterministic.
    aliases = {elem.text for path in alias_paths for elem in drug.findall(path)}
    aliases.add(row['name'])
    row['aliases'] = sorted(aliases)

    rows.append(row)

In [6]:
# Build the DrugBank-ID -> alias-list lookup and save it as indented JSON
# with the keys sorted.
alias_dict = {}
for row in rows:
    alias_dict[row['drugbank_id']] = row['aliases']

with open('./aliases.json', 'w') as alias_file:
    json.dump(alias_dict, alias_file, indent=2, sort_keys=True)

In [7]:
def collapse_list_values(row):
    """Join every list-valued entry of *row* into one '|'-separated string.

    The mapping is modified in place and the same object is returned, which
    lets the function be used directly with ``map``. Values that are not
    lists are left untouched.
    """
    # Pick the affected keys first, then overwrite them; the set of keys
    # itself never changes.
    list_keys = [key for key, value in row.items() if isinstance(value, list)]
    for key in list_keys:
        row[key] = '|'.join(row[key])
    return row

# Flatten the list-valued fields of every parsed record so the records can be
# written out as plain delimited text.
rows = [collapse_list_values(record) for record in rows]

In [8]:
# Columns kept in the drug table, in output order.
columns = [
    'drugbank_id',
    'name',
    'type',
    'groups',
    'atc_codes',
    'categories',
    'inchikey',
    'inchi',
    'description',
]
# Build a frame from all parsed fields, then narrow it to the columns above.
drugbank_df = pandas.DataFrame(rows)[columns]
drugbank_df.head()

Unnamed: 0,drugbank_id,name,type,groups,atc_codes,categories,inchikey,inchi,description
0,DB00001,Lepirudin,biotech,approved,B01AE02,"Amino Acids, Peptides, and Proteins|Anticoagul...",,,Lepirudin is identical to natural hirudin exce...
1,DB00002,Cetuximab,biotech,approved,L01XC06,"Amino Acids, Peptides, and Proteins|Antibodies...",,,Cetuximab is an epidermal growth factor recept...
2,DB00003,Dornase alfa,biotech,approved,R05CB13,"Amino Acids, Peptides, and Proteins|Cough and ...",,,Dornase alfa is a biosynthetic form of human d...
3,DB00004,Denileukin diftitox,biotech,approved|investigational,L01XX29,"ADP Ribose Transferases|Amino Acids, Peptides,...",,,A recombinant DNA-derived cytotoxic protein co...
4,DB00005,Etanercept,biotech,approved|investigational,L04AB01,"Agents reducing cytokine levels|Amino Acids, P...",,,Dimeric fusion protein consisting of the extra...


In [9]:
# Save the drug table as a tab-separated file, creating its folder if needed.
durgbank_drug_info_filepath = os.path.join(processed_drug_feature_path, 'drugbank_drug_info.tsv')

drug_info_dir = os.path.dirname(durgbank_drug_info_filepath)
os.makedirs(drug_info_dir, exist_ok=True)
drugbank_df.to_csv(durgbank_drug_info_filepath, sep='\t', index=False)

# write slim drugbank tsv
# path = os.path.join('data', 'drugbank-slim.tsv')
# drugbank_slim_df.to_csv(path, sep='\t', index=False)

## Extract protein information

In [10]:
# When False, an already existing output file is reused instead of rebuilt.
force_run = False
# Destination of the parsed drug -> protein table.
drug_bank_drug_target_filepath = os.path.join(
    processed_drug_feature_path, 'drugbank-targets_info.tsv'
)


In [11]:
def _collect_article_references(protein):
    """Gather the PubMed IDs and citations attached to one protein element.

    Only articles that carry a non-empty PubMed ID are kept, and the two
    returned lists stay aligned (same length, same order). A missing
    ``references`` or ``articles`` element, or an article without a
    ``pubmed-id`` child, is skipped instead of raising.

    Returns:
        A ``(pmids, citations)`` tuple of two lists of strings.
    """
    pmids = []
    citations = []
    references = protein.find(ns + 'references')
    articles = references.find(ns + 'articles') if references is not None else None
    if articles is None:
        return pmids, citations
    for article in articles:
        pmid = article.findtext(ns + 'pubmed-id')
        if not pmid:
            continue
        pmids.append(pmid)
        # A missing citation is stored as '' so the later '|'.join cannot fail
        # and the citation list stays aligned with the PubMed IDs.
        citations.append(article.findtext(ns + 'citation') or '')
    return pmids, citations


def extract_drugbanktarget_protein():
    """Map each drug's DrugBank ID to the UniProt IDs of its human proteins.

    If the target TSV already exists and ``force_run`` is falsy, the file is
    reused. Otherwise every drug under ``root`` is scanned for its targets,
    enzymes, carriers and transporters. Proteins with exactly one UniProtKB
    identifier are kept, and rows whose organism does not contain 'Humans'
    (or is missing) are dropped before the table is written as TSV.

    Returns:
        pandas.DataFrame: the table read back from the TSV file, so the
        result has the same form whether or not the file was just rebuilt.
    """
    if os.path.exists(drug_bank_drug_target_filepath) and not force_run:
        print('drugbank parsed file for protein target already exists')
    else:
        # Loop-invariant element paths.
        actions_path = '{ns}actions/{ns}action'.format(ns=ns)
        uniprot_path = (
            "{ns}polypeptide/{ns}external-identifiers/"
            "{ns}external-identifier[{ns}resource='UniProtKB']/{ns}identifier"
        ).format(ns=ns)
        # Fixed column order; it also keeps the 'organism' filter below
        # working when no protein row was collected at all.
        output_columns = [
            'drugbank_id', 'name', 'category', 'organism', 'known_action',
            'actions', 'uniprot_id', 'pubmed_ids', 'citations',
        ]

        protein_rows = []
        for drug in root:
            drugbank_id = drug.findtext(ns + "drugbank-id[@primary='true']")
            name = drug.findtext(ns + "name")
            for category in ['target', 'enzyme', 'carrier', 'transporter']:
                proteins = drug.findall('{ns}{cat}s/{ns}{cat}'.format(ns=ns, cat=category))
                for protein in proteins:
                    # Keep only proteins that map to exactly one UniProt ID.
                    uniprot_ids = [polypep.text for polypep in protein.findall(uniprot_path)]
                    if len(uniprot_ids) != 1:
                        continue

                    row = {'drugbank_id': drugbank_id, 'name': name, 'category': category}
                    row['organism'] = protein.findtext('{}organism'.format(ns))
                    row['known_action'] = protein.findtext('{}known-action'.format(ns))
                    # Empty <action/> elements have no text and are skipped.
                    row['actions'] = '|'.join(
                        action.text for action in protein.findall(actions_path) if action.text
                    )
                    row['uniprot_id'] = uniprot_ids[0]

                    pmids, citations = _collect_article_references(protein)
                    if len(pmids) > 0:
                        row['pubmed_ids'] = '|'.join(pmids)
                        row['citations'] = '|'.join(citations)

                    protein_rows.append(row)

        protein_df = pandas.DataFrame(protein_rows, columns=output_columns)

        # filter out the non-human targets; na=False treats a missing organism
        # as non-human instead of producing an invalid (NaN) boolean mask
        protein_df = protein_df[protein_df['organism'].str.contains('Humans', na=False)]

        os.makedirs(os.path.dirname(drug_bank_drug_target_filepath), exist_ok=True)
        protein_df.to_csv(drug_bank_drug_target_filepath, sep='\t', index=False)

    protein_df = pandas.read_csv(drug_bank_drug_target_filepath, sep='\t', index_col=False)
    return protein_df

## DrugbankID to PubchemID mapping


In [13]:
# When False, an already existing output file is reused instead of rebuilt.
force_run = False
# Destination of the DrugBank ID -> PubChem CID table.
drugbank_id_pubchem_id_mapping_file = os.path.join(
    processed_drug_feature_path, 'drugbank_id_pubchem_id.tsv'
)


In [14]:
# Read the DrugBank compound table back from disk and keep only the ID and
# InChI of the drugs that actually have an InChI.
drugbank_df = pandas.read_table(durgbank_drug_info_filepath)
has_inchi = drugbank_df['inchi'].notnull()
drugbank_df = drugbank_df.loc[has_inchi, ['drugbank_id', 'inchi']]

In [20]:
def get_pubchem_cid_from_inchi(inchi):
    """Look up a compound's PubChem CID from its InChI string via pubchempy.

    Up to five attempts are made. A ``PubChemHTTPError`` is reported on
    stdout, followed by a 0.3 s pause and another attempt. The CID of the
    first search hit is printed and returned. ``None`` is returned when the
    search yields no compound or when every attempt failed.
    """
    attempts_left = 5
    while attempts_left > 0:
        attempts_left -= 1
        try:
            matches = pcp.get_compounds(inchi, 'inchi')
        except pcp.PubChemHTTPError:
            print('PubChemHTTPError')
            time.sleep(0.3)
            continue

        if len(matches) == 0:
            print('None')
            return None

        # Use the first compound of the search result.
        cid = matches[0].cid
        print(cid)
        return cid
    return None

In [21]:
def drugbank_id_pubchem_id_map(drugbank_df):
    """Return the DrugBank ID -> PubChem CID table, indexed by 'drugbank_id'.

    If the mapping file exists and ``force_run`` is False, it is read as is.
    Otherwise a 'pubchem_cid' column is added to *drugbank_df* by looking up
    every InChI, rows containing missing values are dropped, and the frame is
    written to the mapping file. Both steps modify *drugbank_df* in place,
    so pass a copy to keep the original. The result is always read back from
    the file.
    """
    mapping_path = drugbank_id_pubchem_id_mapping_file
    cache_ready = os.path.exists(mapping_path) and force_run == False
    if cache_ready:
        print('drugbank_id_pubchem_id_mapping already exists, reading from there')
    else:
        # map DrugBank compounds to pubchem using InChI
        drugbank_df['pubchem_cid'] = drugbank_df['inchi'].apply(get_pubchem_cid_from_inchi)
        drugbank_df.dropna(inplace=True)

        os.makedirs(os.path.dirname(mapping_path), exist_ok=True)
        drugbank_df.to_csv(mapping_path, index=False, sep='\t')

    return pd.read_csv(mapping_path, index_col='drugbank_id', sep='\t')


In [28]:
# Add a 'pubchem_cid' column to protein_df (the DrugBank drug -> UniProt
# target table), then keep only the rows whose drug has a CID in the mapping.
protein_df = extract_drugbanktarget_protein()
drugbankid_pubchemid_mapping_df = drugbank_id_pubchem_id_map(drugbank_df.copy())
print(drugbankid_pubchemid_mapping_df.head())


def _lookup_pubchem_cid(drugbank_id):
    # The string 'NaN' marks DrugBank IDs that are absent from the mapping.
    if drugbank_id in drugbankid_pubchemid_mapping_df.index:
        return drugbankid_pubchemid_mapping_df.at[drugbank_id, 'pubchem_cid']
    return 'NaN'


protein_df['pubchem_cid'] = protein_df['drugbank_id'].astype(str).apply(_lookup_pubchem_cid)

has_cid = protein_df['pubchem_cid'] != 'NaN'
protein_df = protein_df[has_cid]
print(protein_df.columns)
print(protein_df.head())

101041682
657181
5311128
None
5311065
25074887
11979316
101589918
5284373
14257662
448601
16131215
1051
None
135444742
6274
34755
1060
6140
171548
305
5962
6322
54670067
1103
5960
6262
5961
6083
5280934
5951
6137
6057
5280453
5281243
67678
1110
493570
1738118
33032
124886
6323481
750
5283731
1050
586
6106
6305
5862
1130
5280793
5280581
9750
6288
439153
135398658
446284
5950
6287
445354
14985
1054
6112
6306
134601
5280795
4055
5957
145742
190
6267
54687
5361192
60846
5362129
71398
82153
2284
3007
9853654
89594
25137844
3958
59768
387447
5281077
34359
4771
52195
33741
21704
2369
3365
5591
65028
12560
146678237
2519
5314
135398744
71329
4993
5770
447043
5472
5284632
60164
4195
6324659
4679
41781
2771
77993
2368
152946
5749
64143
3762
3476
71415
5362440
101673418
38521
53232
3226
43708
5486971
5391
4121
4649
127151
4197
4842
2479
4463
5361463
5311000
2481
20279
56959
4075
1201549
60854
6540428
54746
5905
2955
441383
1775
247839
54671203
448537
54707177
2812
6116
5333
6234
135409400
2578
53

17925
5463858
101819
14708
448223
31016
62516
62751
68947
62710
62528
10634
91460
136297
25223
85875
92943
15818
5362456
62307
6833
19094
62156
6128
98527
5463874
62529
69249407
13308
32938
10039774
3380
3299
8367
5463863
443408
62436
61810
5284543
12218309
2816
9835303
22308
10007
62281
2441
2811
62258
222865
60977
2707
28204
5359421
105039
3369
5359929
17150
62279
61996
7092657
5362507
73415752
92024769
5826
10836
20056604
30487
4631
5325
5327
138402339
31401
33746
4890
4999
6442177
154059
23925
23994
3033621
4506
6435415
104838
4912
5468
92727
441397
6087
33478
115163
123630
36921
4747
214348
135413535
3652
10026
5284553
6077
26035
3678
23897
11291
4761
62867
9429
941651
5311507
3775
4688
3000540
123619
9427
5288596
15606
7339
445968
17754117
676443
444637
5780
5168640
5287540
139031101
445238
9700
449159
5280961
448580
449190
447600
135415790
637540
445936
168718
446821
449193
23644144
None
17754134
16019963
17754112
None
447195
4469371
65302
135403646
6323454
5459319
7997
10161010

5288266
61207
446858
5988
2084
10442
14253
445355
445555
8153
87087973
57484478
447742
49866981
444756
448993
49867319
447765
8955
449409
446082
5460580
135488885
7478
135483765
1677
445093
5288792
135403828
445905
446958
446127
447923
6915836
8019
5289313
449547
None
117765
6327654
5287596
87901
None
5288206
447414
447196
446491
131704220
441443
70119
546
445062
6369389
446871
131153
446325
447546
1003
None
447961
448658
82400
135460989
446807
440046
1491
656974
5288987
150238
65533
54325748
6369390
93176
445278
5740
4324
5326813
1714210
5282253
8988
5880
5287408
7752
135398635
1475
5281897
5311365
446242
5288604
3481670
448057
65077
644175
157684
54560057
68525
9543528
5288631
4630909
447502
6399008
5289501
5289108
447561
5496910
448797
6323416
446696
712421
4635111
65355
446414
449164
161647
39040
445478
84815
445393
9543424
9570
186
445966
17753927
447096
10635
10214
135678100
17753872
446493
5287884
6120722
447537
784625
445344
445109
92753
5287678
447958
97576
449096
5723
447229


447411
None
5287870
5289266
446658
447022
17754068
439507
131704265
65309
14233317
12587
700
444041
5288685
439398
49866821
76839
167842
444763
1759
446300
135445363
6133
5289179
2429
None
None
1511
448202
446411
444456
5005498
6398483
656932
4380
445319
9957008
5288799
135398637
135398576
448726
449293
71070
448819
445694
4470577
None
448333
5388928
439190
6323246
174251
447980
444679
4476947
5287447
131704267
17753826
None
657140
449534
94214
135440058
1535935
448487
79084
16757
7249
183145
448974
1478
135451908
446054
448610
17754095
12192007
439650
446181
5289026
5287890
637542
446969
444503
5288116
121904
444170
5459784
447976
49
70914
135398638
444397
75791
5288859
4629337
135412779
2758978
84836
445981
5288577
10305301
4369103
394347
447280
8186
111509
135488881
447325
448872
44308490
165491
49874333
447957
447594
136775370
135398661
347402
46936889
5805259
5287684
5289339
8391
16131414
5288635
17429
5288588
2353
6454902
445461
49866868
131704268
9958
135415794
439427
6327749
13

9867758
6918218
47576
100427
11561674
53497429
6450818
636427
3089902
6918446
9829419
5310985
10102486
9939865
9871747
300471
None
10313457
9886917
9842252
5318517
56841552
100094
148189
5311449
9873
6439330
3038505
69532
12594
68470595
148202
132971
9837656
3081349
192711
9838899
9830969
71587684
3037115
3038506
5310967
54682876
6918653
9853710
12041
3000926
163829
10409873
9942725
6918835
9868037
135398510
11661758
11270783
9901946
11159621
11703255
42642648
72941990
11483754
513956
11658655
9874874
11455973
3025986
135494187
5281232
400769
11656518
447715
9949641
23653584
11977989
25058411
11154555
11700696
44224257
11318905
16666708
44195571
11640390
21302490
11234794
6439522
56842195
25147644
11984561
131704298
135484546
3057996
9547261
11962412
None
11688894
6477182
24995524
23635314
10073773
60149
6440717
5340
4184
5323
12035
4567
27400
6518
104850
11370864
24951021
472335
124093
135430970
44224276
2929
275196
5352062
123628
130731
11683005
6918572
135409409
9927346
6851740
9887

46937065
5287652
14368760
11840917
448295
9819610
10411846
24880024
15602701
444607
448912
2724883
444238
46937066
199994
17749728
46937067
446967
13472
253602
440114
1792
448285
448297
6914609
124969
445144
42627755
445824
5287692
5287703
4290313
11149707
10221335
4369084
6852125
2256
11505541
444853
16070039
12148754
25174099
673481
444224
3034285
46937068
16741245
16086422
9886249
16741237
776315
736027
16122554
16122553
46937069
263366
12000133
135874085
42628075
448271
255
24892822
24748047
46937071
10174090
9824562
53320410
9862248
16122556
44129642
46937073
1121782
44631903
45273683
87762754
5287737
46937074
5287739
446532
54929
5287741
324690
446965
54727972
6540255
135781631
25271580
11302
25138289
46937075
447950
445332
135566394
11790
5287785
22489160
448943
2398
448642
2352168
9934347
5287789
6914595
11393519
5287794
11235729
5287796
12598931
24178119
24178120
24178121
11957417
444703
444704
135530418
462919
3332
7075
5287834
1757
135566413
10198228
9543498
44176354
232446


11963565
24832027
12229396
5289112
7020128
9547984
115069
4369560
139033481
23629653
446756
46937145
24875319
11516136
1122713
44141871
24892821
11610553
11581936
657061
15991588
54688400
11955614
2059510
1587957
449244
4369443
135431471
1854
1530
667639
25243828
49867507
5289162
194777
52945396
52942978
446744
6857724
25011727
449242
449243
5496653
449245
446757
446309
9804302
9912519
46937149
5289192
5289197
23586040
5327066
5327067
445481
1028
13949633
42608447
656902
52940551
10019998
564920
5289221
23654841
448968
439530
54736482
119828
18719622
9952008
25058141
11160307
25102779
24180585
24886843
23640750
46937152
10150441
9998128
151506
PubChemHTTPError
PubChemHTTPError
PubChemHTTPError
PubChemHTTPError
PubChemHTTPError
6419766
6438583
131704309
5327076
445987
16045340
16045418
16750071
10401956
10224714
11373270
11840979
185914
5289278
18690028
12195517
1453694
1548970
5894
5289290
5289293
446676
425841
2730282
5494443
136711420
5487313
24856357
16058682
122335
447359
447360
47

54693473
3830
449093
9962735
9989226
142154
165675
444294
5357696
24456
42956
460
9913767
49806720
65944
5199
54676478
5284529
15907
104845
10041070
36324
71674
15443
6433554
21651
31343
33309
8480
5287879
21743
5977
6212
43231
2871
8293
71335
29112
3058754
3037
3039
456389
9917862
9832750
135434844
6918632
71188
6450531
5859
135449328
3334
72734332
3346
114811
59364992
145994607
9454
8089
86418
5360807
71777
6441712
60651
204108
68602
251636
441145
16760141
65452
26383
6436061
66826
3034287
9820838
72493
60605
40854
7108
12901
8573
4823
24751
3050408
9802884
6436031
6433107
9298
6758
5104
6445091
638072
6634
13755
5334
5338
59757
101290202
656958
24860548
5282521
72157
5853
9832301
5280440
127053480
5707
2999413
3086576
3092
71310
5484735
16752
5284595
56208
24455
1711973
697993
19395
91663250
13986
2480
6472
5311053
3032611
25077495
56206
439369
10831
10579
71823
38081
5311100
56928061
21389
127516
68595
10074640
9906614
66252
8005
157385
24352
56052
71969
3085092
71327
8310
26533
25

9838712
42613186
104762
2214
6918494
10220323
259846
18376177
51167
25022354
24996872
16213540
160565
29626
9811551
46208367
6918461
10224267
5311505
9549213
46208720
9878913
89497391
9895468
16041426
9804992
9929901
PubChemHTTPError
PubChemHTTPError
PubChemHTTPError
PubChemHTTPError
PubChemHTTPError
9826744
90274764
400010
24946920
25027363
51352628
10407120
89508529
58310140
5639
9888484
59745
10192617
5280804
6433109
666418
443363
44467821
11948288
6918502
5318980
119026214
57327016
53354764
135398645
6918315
44251769
11210478
11256720
5469318
6444692
9908900
46191454
46241268
11180124
25118925
9833519
55362
5074
52949124
16741
10409068
147004
73611
11842633
25167777
55652
49871973
PubChemHTTPError
PubChemHTTPError
PubChemHTTPError
PubChemHTTPError
PubChemHTTPError
6918848
5312153
6050
4744
25242323
4820
24831714
3081185
11687907
2815031
9889366
10026128
11633038
59657596
57339445
11961293
9554809
91694
25134326
9810929
6914657
9935681
44557636
3276
71354
58298318
57413968
11282283


6097185
None
24987688
11824
4382
39985
2541
6918726
500902
None
44608289
None
60943
11719003
66599893
44137686
44512434
11703306
25195253
57398987
44200882
10264356
122262
441404
9416
65157
76971566
90489020
29936
5311501
11809740
9818306
222757
9403
165688
13791
91885410
10246085
2450
154911
7239
8042
101272215
61906
62582
23990
876
11430
5113032
91668
161046
9818878
6351
None
5362485
6604889
15671
21854223
71219
71105
22609888
23581869
145994608
8896
124081896
8925
20353
86472
54686350
3053
5281628
1832
9852188
57394020
57394021
23665763
92135969
11808
71544786
8400
60063
5443
None
2997
44404226
121596705
5799
6662
60870
6918151
73412
222284
4671
28281
92850
134694335
943
11601669
132282518
132282519
119034
54688261
34230
3000706
51351746
9865808
4553
3634
6438151
65808
71225
3949
73078
213040
6918097
6450807
11399764
114743
9939298
5282768
125098
38258
11177
16124
92141
None
11954171
185457
2286
7184
23973
9064
6441454
9354
6213
None
91667979
2734086
76967405
6438744
None
65146
8903

Index(['drugbank_id', 'name', 'category', 'organism', 'known_action',
       'actions', 'uniprot_id', 'pubmed_ids', 'citations', 'pubchem_cid'],
      dtype='object')
   drugbank_id         name category organism known_action    actions  \
30     DB00006  Bivalirudin   target   Humans          yes  inhibitor   
31     DB00006  Bivalirudin   enzyme   Humans      unknown  inhibitor   
32     DB00007   Leuprolide   target   Humans          yes    agonist   
33     DB00007   Leuprolide   enzyme   Humans           no  substrate   
55     DB00014    Goserelin   target   Humans          yes    agonist   

   uniprot_id                                         pubmed_ids  \
30     P00734  11060732|11504570|11833835|11923794|11929334|1...   
31     P05164                                           18701766   
32     P30968        16809153|10687850|9625809|15758569|10394541   
33     P08684                                           17922273   
55     P22888                         17139284|1701642

In [34]:
# Reduce the target table to (pubchem_cid, drug_name, uniprot_id) rows and
# save it as the final drug -> target map.
drugbank_drug_target_final_mapping_file = os.path.join(processed_drug_feature_path, 'drug_target_map_drugbank.tsv')
drug_target_df = (
    protein_df[['pubchem_cid', 'name', 'uniprot_id']]
    .rename(columns={'name': 'drug_name'})
)
drug_target_df.dropna(inplace=True)

# Cast through int before converting to text so each CID is stored as the
# string form of an integer.
drug_target_df['pubchem_cid'] = drug_target_df['pubchem_cid'].astype(int).map(str)
print(drug_target_df.head())

# write to file
target_map_dir = os.path.dirname(drugbank_drug_target_final_mapping_file)
os.makedirs(target_map_dir, exist_ok=True)
drug_target_df.to_csv(drugbank_drug_target_final_mapping_file, sep='\t', index=False)

   pubchem_cid    drug_name uniprot_id
30   101041682  Bivalirudin     P00734
31   101041682  Bivalirudin     P05164
32      657181   Leuprolide     P30968
33      657181   Leuprolide     P08684
55     5311128    Goserelin     P22888


In [35]:
# Read the saved drug -> target map back from disk and preview it.
drug_target_df = pandas.read_csv(drugbank_drug_target_final_mapping_file, sep='\t')

print(drug_target_df.head())

   pubchem_cid    drug_name uniprot_id
0    101041682  Bivalirudin     P00734
1    101041682  Bivalirudin     P05164
2       657181   Leuprolide     P30968
3       657181   Leuprolide     P08684
4      5311128    Goserelin     P22888
