## DrugBank Parser test
https://github.com/dhimmel/drugbank/blob/gh-pages/parse.ipynb

In [1]:
import os
import csv
import gzip
import zipfile
import collections
import re
import io
import json
import xml.etree.ElementTree as ET

import requests
import pandas
import pandas as pd

In [2]:
os.getcwd()

'/data/jeff-law/projects/2020-03-covid-19/SARS-CoV-2-network-analysis/src/jupyter-notebooks'

In [3]:
xml_file = "/data/jeff-law/projects/2020-03-covid-19/SARS-CoV-2-network-analysis/datasets/drug-targets/drugbank-v5.1.6/additional-data/drugbank_all_full_database.xml.zip"
with zipfile.ZipFile(xml_file) as myzip:
    with myzip.open("full database.xml") as f:
        tree = ET.parse(f)
root = tree.getroot()

In [4]:
ns = '{http://www.drugbank.ca}'
inchikey_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value"
inchi_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value"

rows = list()
for i, drug in enumerate(root):
    row = collections.OrderedDict()
    assert drug.tag == ns + 'drug'
    row['type'] = drug.get('type')
    row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drug.findtext(ns + "name")
    row['description'] = drug.findtext(ns + "description")
    row['groups'] = [group.text for group in
        drug.findall("{ns}groups/{ns}group".format(ns = ns))]
    row['atc_codes'] = [code.get('code') for code in
        drug.findall("{ns}atc-codes/{ns}atc-code".format(ns = ns))]
    row['categories'] = [x.findtext(ns + 'category') for x in
        drug.findall("{ns}categories/{ns}category".format(ns = ns))]
    row['inchi'] = drug.findtext(inchi_template.format(ns = ns))
    row['inchikey'] = drug.findtext(inchikey_template.format(ns = ns))
    # extract some additional data
    row['indication'] = drug.findtext(ns + "indication")
    row['pharmacodynamics'] = drug.findtext(ns + "pharmacodynamics")
    row['mechanism-of-action'] = drug.findtext(ns + "mechanism-of-action")
    row['toxicity'] = drug.findtext(ns + "toxicity")
    row['protein-binding'] = drug.findtext(ns + "protein-binding")
    
    # Add drug aliases
    aliases = {
        elem.text for elem in 
        drug.findall("{ns}international-brands/{ns}international-brand".format(ns = ns)) +
        drug.findall("{ns}synonyms/{ns}synonym[@language='English']".format(ns = ns)) +
        drug.findall("{ns}international-brands/{ns}international-brand".format(ns = ns)) +
        drug.findall("{ns}products/{ns}product/{ns}name".format(ns = ns))

    }
    aliases.add(row['name'])
    row['aliases'] = sorted(aliases)

    rows.append(row)

In [5]:
alias_dict = {row['drugbank_id']: row['aliases'] for row in rows}
with open('./aliases.json', 'w') as fp:
    json.dump(alias_dict, fp, indent=2, sort_keys=True)

In [6]:
def collapse_list_values(row):
    for key, value in row.items():
        if isinstance(value, list):
            row[key] = '|'.join(value)
    return row

rows = list(map(collapse_list_values, rows))

In [7]:
columns = ['drugbank_id', 'name', 'type', 'groups', 'atc_codes', 'categories', 'inchikey', 'inchi', 'description']
drugbank_df = pandas.DataFrame.from_dict(rows)[columns]
drugbank_df.head()

Unnamed: 0,drugbank_id,name,type,groups,atc_codes,categories,inchikey,inchi,description
0,DB00001,Lepirudin,biotech,approved,B01AE02,"Amino Acids, Peptides, and Proteins|Anticoagul...",,,Lepirudin is identical to natural hirudin exce...
1,DB00002,Cetuximab,biotech,approved,L01XC06,"Amino Acids, Peptides, and Proteins|Antibodies...",,,Cetuximab is an epidermal growth factor recept...
2,DB00003,Dornase alfa,biotech,approved,R05CB13,"Amino Acids, Peptides, and Proteins|Cough and ...",,,Dornase alfa is a biosynthetic form of human d...
3,DB00004,Denileukin diftitox,biotech,approved|investigational,L01XX29,"ADP Ribose Transferases|Amino Acids, Peptides,...",,,A recombinant DNA-derived cytotoxic protein co...
4,DB00005,Etanercept,biotech,approved|investigational,L04AB01,"Agents reducing cytokine levels|Amino Acids, P...",,,Dimeric fusion protein consisting of the extra...


In [8]:
drugbank_slim_df = drugbank_df[
    drugbank_df.groups.map(lambda x: 'approved' in x) &
    drugbank_df.inchi.map(lambda x: x is not None) &
    drugbank_df.type.map(lambda x: x == 'small molecule')
]
drugbank_slim_df.head()

Unnamed: 0,drugbank_id,name,type,groups,atc_codes,categories,inchikey,inchi,description
5,DB00006,Bivalirudin,small molecule,approved|investigational,B01AE06,"Amino Acids, Peptides, and Proteins|Anticoagul...",OIRCOABEOLEUMC-GEJPAHFPSA-N,InChI=1S/C98H138N24O33/c1-5-52(4)82(96(153)122...,Bivalirudin is a synthetic 20 residue peptide ...
6,DB00007,Leuprolide,small molecule,approved|investigational,L02AE51|L02AE02,Adrenal Cortex Hormones|Agents Causing Muscle ...,GFIJNRVAKGFPGQ-LIJARHBVSA-N,InChI=1S/C59H84N16O12/c1-6-63-57(86)48-14-10-2...,Leuprolide belongs to the general class of dru...
13,DB00014,Goserelin,small molecule,approved,L02AE03,"Adrenal Cortex Hormones|Amino Acids, Peptides,...",BLCLNMBMMGCOAS-URPVMXJPSA-N,InChI=1S/C59H84N18O14/c1-31(2)22-40(49(82)68-3...,"Goserelin is a synthetic hormone. In men, it s..."
25,DB00027,Gramicidin D,small molecule,approved,R02AB30,"Amino Acids, Peptides, and Proteins|Anti-Bacte...",NDAYQJDHGXTBJL-MWWSRJDJSA-N,InChI=1S/C96H135N19O16/c1-50(2)36-71(105-79(11...,Gramcidin D is a heterogeneous mixture of thre...
33,DB00035,Desmopressin,small molecule,approved,H01BA02,"Agents that produce hypertension|Amino Acids, ...",NFLWUMRGJYTJIN-PNIOQBSNSA-N,InChI=1S/C46H64N14O12S2/c47-35(62)15-14-29-40(...,"Desmopressin (dDAVP), a synthetic analogue of ..."


In [9]:
# write drugbank tsv
path = os.path.join('data', 'drugbank.tsv')
drugbank_df.to_csv(path, sep='\t', index=False)

# write slim drugbank tsv
path = os.path.join('data', 'drugbank-slim.tsv')
drugbank_slim_df.to_csv(path, sep='\t', index=False)

In [10]:
# also get the extra information per drug
columns = ['drugbank_id', 'name', 'indication', 'pharmacodynamics', 'mechanism-of-action', 'toxicity', 'protein-binding']
drugbank_df2 = pandas.DataFrame.from_dict(rows)[columns]
drugbank_df2.head()

Unnamed: 0,drugbank_id,name,indication,pharmacodynamics,mechanism-of-action,toxicity,protein-binding
0,DB00001,Lepirudin,For the treatment of heparin-induced thrombocy...,Lepirudin is used to break up clots and to red...,Lepirudin forms a stable non-covalent complex ...,"In case of overdose (eg, suggested by excessiv...",
1,DB00002,Cetuximab,"Cetuximab, used in combination with irinotecan...","Used in the treatment of colorectal cancer, ce...",Cetuximab binds to the epidermal growth factor...,Pulmonary Toxicity\r\n\r\nInterstitial lung di...,
2,DB00003,Dornase alfa,Used as adjunct therapy in the treatment of cy...,Cystic fibrosis (CF) is a disease characterize...,Dornase alfa is a biosynthetic form of human D...,Adverse reactions occur at a frequency of < 1/...,
3,DB00004,Denileukin diftitox,For treatment of cutaneous T-cell lymphoma,Denileukin diftitox (Ontak) directs the cytoci...,Denileukin diftitox binds to the high-affinity...,,
4,DB00005,Etanercept,Etanercept is indicated for the treatment of m...,Etanercept binds specifically to tumor necrosi...,There are two distinct receptors for TNF (TNFR...,,


In [11]:
path = os.path.join('data', 'drugbank-toxicity.tsv')
drugbank_df2.to_csv(path, sep='\t', index=False)

In [35]:
df = pd.read_csv(path, sep='\t')
df.head()

Unnamed: 0,drugbank_id,name,type,groups,atc_codes,categories,inchikey,inchi,description
0,DB00006,Bivalirudin,small molecule,approved|investigational,B01AE06,"Amino Acids, Peptides, and Proteins|Anticoagul...",OIRCOABEOLEUMC-GEJPAHFPSA-N,InChI=1S/C98H138N24O33/c1-5-52(4)82(96(153)122...,Bivalirudin is a synthetic 20 residue peptide ...
1,DB00007,Leuprolide,small molecule,approved|investigational,L02AE51|L02AE02,Adrenal Cortex Hormones|Agents Causing Muscle ...,GFIJNRVAKGFPGQ-LIJARHBVSA-N,InChI=1S/C59H84N16O12/c1-6-63-57(86)48-14-10-2...,Leuprolide belongs to the general class of dru...
2,DB00014,Goserelin,small molecule,approved,L02AE03,"Adrenal Cortex Hormones|Amino Acids, Peptides,...",BLCLNMBMMGCOAS-URPVMXJPSA-N,InChI=1S/C59H84N18O14/c1-31(2)22-40(49(82)68-3...,"Goserelin is a synthetic hormone. In men, it s..."
3,DB00027,Gramicidin D,small molecule,approved,R02AB30,"Amino Acids, Peptides, and Proteins|Anti-Bacte...",NDAYQJDHGXTBJL-MWWSRJDJSA-N,InChI=1S/C96H135N19O16/c1-50(2)36-71(105-79(11...,Gramcidin D is a heterogeneous mixture of thre...
4,DB00035,Desmopressin,small molecule,approved,H01BA02,"Agents that produce hypertension|Amino Acids, ...",NFLWUMRGJYTJIN-PNIOQBSNSA-N,InChI=1S/C46H64N14O12S2/c47-35(62)15-14-29-40(...,"Desmopressin (dDAVP), a synthetic analogue of ..."


In [38]:
drugbank_df['type'].value_counts()

small molecule    11355
biotech            2120
Name: type, dtype: int64

In [36]:
df[df['drugbank_id'] == "DB15091"]

Unnamed: 0,drugbank_id,name,type,groups,atc_codes,categories,inchikey,inchi,description
2579,DB15091,Upadacitinib,small molecule,approved|investigational,,Antirheumatic Agents|BCRP/ABCG2 Inhibitors|BCR...,WYQFJHHDOKWSHR-MNOVXSKESA-N,InChI=1S/C17H19F3N6O/c1-2-10-7-25(16(27)24-9-1...,Upadacitinib is an oral Janus kinase (JAK)1-se...


## Extract protein information

In [43]:
protein_rows = list()
for i, drug in enumerate(root):
    drugbank_id = drug.findtext(ns + "drugbank-id[@primary='true']")
    name= drug.findtext(ns + "name")
    for category in ['target', 'enzyme', 'carrier', 'transporter']:
        proteins = drug.findall('{ns}{cat}s/{ns}{cat}'.format(ns=ns, cat=category))
        for protein in proteins:
            row = {'drugbank_id': drugbank_id, 'name': name, 'category': category}
            row['organism'] = protein.findtext('{}organism'.format(ns))
            row['known_action'] = protein.findtext('{}known-action'.format(ns))
            actions = protein.findall('{ns}actions/{ns}action'.format(ns=ns))
            row['actions'] = '|'.join(action.text for action in actions)
            uniprot_ids = [polypep.text for polypep in protein.findall(
                "{ns}polypeptide/{ns}external-identifiers/{ns}external-identifier[{ns}resource='UniProtKB']/{ns}identifier".format(ns=ns))]            
            if len(uniprot_ids) != 1:
                continue
            row['uniprot_id'] = uniprot_ids[0]
            pmids = []
            citations = []
            references = protein.find('{http://www.drugbank.ca}references')
            if references is not None:
                for article in references.find('{http://www.drugbank.ca}articles'):
                    pmid = article.find('{http://www.drugbank.ca}pubmed-id').text
                    text = article.find('{http://www.drugbank.ca}citation').text
    #                 evidence.append({'source': 'DrugBank', 'pmid': pmid, 'text': text})
                    if pmid is not None:
                        pmids.append(pmid)
                        citations.append(text)
            if len(pmids) > 0:
                row['pubmed_ids'] = '|'.join(pmids)
                row['citations'] = '|'.join(citations)
        
#             pmids = protein.findall("{ns}references/{ns}articles/{ns}article/{ns}pubmed-id".format(ns=ns))
#             if pmids is not None:
#                 row['pubmed_ids'] = '|'.join(pmid.text for pmid in pmids if pmid.text is not None)
            protein_rows.append(row)
#     break

protein_df = pandas.DataFrame.from_dict(protein_rows)
protein_df.head()

Unnamed: 0,drugbank_id,name,category,organism,known_action,actions,uniprot_id,pubmed_ids,citations
0,DB00001,Lepirudin,target,Humans,yes,inhibitor,P00734,10505536|10912644|11055889|11467439|11807012|1...,Turpie AG: Anticoagulants in acute coronary sy...
1,DB00002,Cetuximab,target,Humans,yes,antagonist,P00533,10480573|10601294|10628369|11408594|11431346|1...,"Hosokawa N, Yamamoto S, Uehara Y, Hori M, Tsuc..."
2,DB00002,Cetuximab,target,Humans,unknown,,O75015,16336752,"Snyder LC, Astsaturov I, Weiner LM: Overview o..."
3,DB00002,Cetuximab,target,Humans,unknown,,P00736,17139284|17016423,"Overington JP, Al-Lazikani B, Hopkins AL: How ..."
4,DB00002,Cetuximab,target,Humans,unknown,,P02745,17139284|17016423,"Overington JP, Al-Lazikani B, Hopkins AL: How ..."


In [44]:
path = os.path.join('data', 'drugbank-targets.tsv')
protein_df.to_csv(path, sep='\t', index=False)

In [32]:
protein_df.nunique()

actions          104
category           4
drugbank_id     7836
known_action       3
organism         541
uniprot_id      4981
dtype: int64

In [29]:
protein_df[protein_df['organism'] == 'Humans'].nunique()

actions          103
category           4
drugbank_id     6203
known_action       3
organism           1
uniprot_id      3001
dtype: int64

In [30]:
protein_df[protein_df['organism'] == 'Humans']['category'].value_counts()

target         14333
enzyme          4813
transporter     2718
carrier          689
Name: category, dtype: int64

In [None]:
evidence = []                                                                                                                                                                
references = target.find('{http://www.drugbank.ca}references')                                                                                                               
for article in references.find('{http://www.drugbank.ca}articles'):                                                                                                          
    pmid = article.find('{http://www.drugbank.ca}pubmed-id').text                                                                                                            
    text = article.find('{http://www.drugbank.ca}citation').text                                                                                                             
    evidence.append({'source': 'DrugBank', 'pmid': pmid, 'text': text})

## Try the CSV downloaded from drugbank(?)

In [12]:
drugbank_csv = "/data/jeff-law/projects/2020-03-covid-19/SARS-CoV-2-network-analysis/datasets/testing/drugbank/drug-links.csv"
df = pd.read_csv(drugbank_csv)
df.head()

Unnamed: 0,DrugBank ID,Name,CAS Number,Drug Type,KEGG Compound ID,KEGG Drug ID,PubChem Compound ID,PubChem Substance ID,ChEBI ID,PharmGKB ID,...,GenBank ID,DPD ID,RxList Link,Pdrhealth Link,Wikipedia ID,Drugs.com Link,NDC ID,ChemSpider ID,BindingDB ID,TTD ID
0,DB00001,Lepirudin,138068-37-8,BiotechDrug,,D06880,,46507011.0,,PA450195,...,,11916.0,http://www.rxlist.com/cgi/generic/lepirudin.htm,,Lepirudin,http://www.drugs.com/cdi/lepirudin.html,,,,DAP000541
1,DB00002,Cetuximab,205923-56-4,BiotechDrug,,D03455,,46507042.0,,PA10040,...,J00228,13175.0,http://www.rxlist.com/cgi/generic3/erbitux.htm,,Cetuximab,http://www.drugs.com/cdi/cetuximab.html,,,,DNC000788
2,DB00003,Dornase alfa,143831-71-4,BiotechDrug,,,,46507792.0,,PA10318,...,M55983,650.0,http://www.rxlist.com/cgi/generic/pulmozyme.htm,,Dornase_alfa,http://www.drugs.com/cdi/dornase-alfa.html,,,,DAP000981
3,DB00004,Denileukin diftitox,173146-27-5,BiotechDrug,,,,46506950.0,,PA164750594,...,V01536,,http://www.rxlist.com/cgi/generic2/denileukin.htm,,Denileukin_diftitox,http://www.drugs.com/cdi/denileukin-diftitox.html,,,,DAP001098
4,DB00005,Etanercept,185243-69-0,BiotechDrug,C07897,D00742,,46506732.0,,PA449515,...,M32315,12032.0,http://www.rxlist.com/cgi/generic/etanercept.htm,,Etanercept,http://www.drugs.com/cdi/etanercept.html,,,,DNC000605


In [15]:
print(df['DrugBank ID'].nunique())
print(df['Name'].nunique())
print(df['UniProt ID'].nunique())

13475
13475
66


In [18]:
print(df.columns)

Index(['DrugBank ID', 'Name', 'CAS Number', 'Drug Type', 'KEGG Compound ID',
       'KEGG Drug ID', 'PubChem Compound ID', 'PubChem Substance ID',
       'ChEBI ID', 'PharmGKB ID', 'HET ID', 'UniProt ID', 'UniProt Title',
       'GenBank ID', 'DPD ID', 'RxList Link', 'Pdrhealth Link', 'Wikipedia ID',
       'Drugs.com Link', 'NDC ID', 'ChemSpider ID', 'BindingDB ID', 'TTD ID'],
      dtype='object')
