# Convert the DrugBank XML database to JSON and extract features

Run using Python 3 to avoid a non-ascii character error when writing to file with the csv module.

In [120]:
import os
import csv
import gzip
import collections
import re
import io
import json
import xml.etree.ElementTree as ET
import requests
import pandas
import xmltodict
import json

In [3]:
# Locations of the raw DrugBank XML dump and of its JSON conversion.
xml_path, json_path = (
    "data/full_database.xml",
    "data/full_database.json",
)

In [2]:
# Convert the DrugBank XML dump at `xml_path` into JSON at `json_path`.
#
# The XML is opened in binary mode and the file object itself is handed to
# xmltodict, so the document does not have to be held in memory as one big
# string in addition to the parsed result.
with open(xml_path, "rb") as f:
    db = xmltodict.parse(f, encoding="utf-8")

# Write the JSON straight to the output file rather than building the whole
# serialised document as an intermediate string first.
with open(json_path, "w", encoding="utf-8") as outfile:
    json.dump(db, outfile, indent=4)

## Collect all the desired features

In [173]:
# Property kinds to pull from each drug's experimental properties.
desired_props_exp = {
    "Water Solubility",
    "Melting Point",
    "Boiling Point",
    "logP",
    "logS",
    "Hydrophobicity",
    "Isoelectric Point",
    "caco2 Permeability",
    "pKa",
    "Molecular Weight",
    "Radioactivity",
}

# Property kinds to pull from each drug's calculated properties.
desired_props_calc = {
    "logP",
    "logS",
    "Water Solubility",
    "Molecular Weight",
    "Monoisotopic Weight",
    "Polar Surface Area (PSA)",
    "Refractivity",
    "Polarizability",
    "Rotatable Bond Count",
    "H Bond Acceptor Count",
    "H Bond Donor Count",
    "pKa (strongest acidic)",
    "pKa (strongest basic)",
    "Physiological Charge",
    "Number of Rings",
    "Bioavailability",
    "Rule of Five",
    "Ghose Filter",
    "MDDR-Like Rule",
    "Veber's Rule",
}

def getProperties(desired_props, props, row):
    """Copy the numeric value of each desired property into ``row``.

    Every name in ``desired_props`` that is not already a key of ``row`` is
    first added with the value ``None``. Then, for each entry of ``props``
    whose ``'kind'`` is in ``desired_props``, the first number found in the
    entry's ``'value'`` text is stored in ``row`` under that kind as a float.
    If the same kind occurs more than once, the last parsable entry wins.

    Entries that lack ``'kind'``/``'value'``, whose value is not text, or
    whose value contains no number are skipped individually, so one bad
    entry does not prevent the remaining ones from being extracted.

    Args:
        desired_props: Collection of property names to extract.
        props: A list of mappings with ``'kind'`` and ``'value'`` keys, a
            single such mapping (xmltodict represents a lone child element
            as a mapping rather than a one-item list), or ``None`` when the
            drug has no properties.
        row: Dict that is updated in place.

    Returns:
        None. The result is written into ``row``.
    """
    for prop in desired_props:
        row.setdefault(prop, None)

    # A lone property arrives as a single mapping; treat it as a 1-item list.
    if isinstance(props, dict):
        props = [props]
    # Missing or unexpected input: nothing to extract, defaults stay in place.
    if not isinstance(props, (list, tuple)):
        return

    for prop in props:
        try:
            kind = prop['kind']
            value = prop['value']
        except (KeyError, TypeError):
            continue
        if kind not in desired_props or not isinstance(value, str):
            continue
        # Values may carry units or conditions; keep only the first number.
        match = re.search(r"[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?", value)
        if match is not None:
            row[kind] = float(match.group(0))

In [12]:
# Load the JSON version of the database into `data`.
with open(json_path) as json_file:
    data = json.loads(json_file.read())

In [174]:
def _child(node, key):
    """Return ``node[key]`` if ``node`` is a dict containing ``key``, else ``None``."""
    return node.get(key) if isinstance(node, dict) else None


# Build one feature row per drug: name, state, the four level codes of the
# drug's first ATC classification, and the numeric experimental and
# calculated properties.
rows = []

# Iterate over whatever the dump contains instead of a hard-coded drug count,
# which breaks (or silently truncates) as soon as the database size changes.
drug_records = data['drugbank']['drug']
if isinstance(drug_records, dict):
    # A lone <drug> element is represented as a mapping, not a list.
    drug_records = [drug_records]

for drug in drug_records:
    row = {}
    row['name'] = drug['name']
    row['state'] = drug.get('state', None)

    # A drug may carry several ATC codes (list), exactly one (mapping) or
    # none at all; only the first one is used.
    atc_code = _child(_child(drug, 'atc-codes'), 'atc-code')
    if isinstance(atc_code, list):
        atc_code = atc_code[0] if atc_code else None

    row['level4'] = None
    row['level3'] = None
    row['level2'] = None
    row['level1'] = None
    levels = _child(atc_code, 'level')
    if isinstance(levels, list):
        # Entries are assigned in order, first to 'level4' and last to
        # 'level1'; stop at the first malformed entry and keep what was
        # read so far.
        for column, level in zip(('level4', 'level3', 'level2', 'level1'), levels):
            try:
                row[column] = level['@code']
            except (KeyError, TypeError):
                break

    exp_props = _child(_child(drug, 'experimental-properties'), 'property')
    getProperties(desired_props_exp, exp_props, row)

    # Calculated properties are applied second, so for kinds present in both
    # sets (e.g. logP) a calculated value overwrites the experimental one.
    calc_props = _child(_child(drug, 'calculated-properties'), 'property')
    getProperties(desired_props_calc, calc_props, row)

    rows.append(row)

In [175]:
# Turn the per-drug rows into a table, save it as CSV (the index is written
# as the first column) and display it.
drugbank_df = pandas.DataFrame(rows)
drugbank_df.to_csv(path_or_buf="data/full_database.csv")
drugbank_df

Unnamed: 0,name,state,level4,level3,level2,level1,Hydrophobicity,Boiling Point,Molecular Weight,Isoelectric Point,...,Polar Surface Area (PSA),Veber's Rule,pKa (strongest basic),Ghose Filter,Monoisotopic Weight,MDDR-Like Rule,Polarizability,H Bond Acceptor Count,Physiological Charge,Rule of Five
0,Lepirudin,solid,B01AE,B01A,B01,B,,,,,...,,,,,,,,,,
1,Cetuximab,liquid,L01FE,L01F,L01,L,-0.413,,145781.6000,8.48,...,,,,,,,,,,
2,Dornase alfa,liquid,R05CB,R05C,R05,R,-0.083,,29253.9000,4.58,...,,,,,,,,,,
3,Denileukin diftitox,liquid,L01XX,L01X,L01,L,-0.301,,57647.3000,5.45,...,,,,,,,,,,
4,Etanercept,liquid,L04AB,L04A,L04,L,-0.529,,51234.9000,7.89,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15230,AUM-601,,,,,,,,,,...,,,,,,,,,,
15231,FN-1501,,,,,,,,431.5040,,...,,,,,431.218206,,,,,
15232,Tinengotinib,,,,,,,,394.8600,,...,,,,,394.130887,,,,,
15233,Lipotecan,,,,,,,,850.7100,,...,,,,,850.183062,,,,,


## Use the rows which have the least missing features

In [181]:
threshold = 10
# Keep only the rows that have fewer than `threshold` missing values.
df = drugbank_df.dropna(axis=0, thresh=len(drugbank_df.columns) - threshold + 1)
# Among those rows, keep only the columns with fewer than 1000 missing values.
df = df.dropna(axis="columns", thresh=len(df.index) - 1000 + 1)

In [182]:
df.isna().sum()

name                          0
state                       549
level4                       35
level3                       35
level2                       35
level1                       35
Molecular Weight              0
logP                          0
Water Solubility              5
logS                         27
Bioavailability               0
pKa (strongest acidic)      394
Refractivity                  0
Number of Rings               0
H Bond Donor Count            0
Rotatable Bond Count          0
Polar Surface Area (PSA)      0
pKa (strongest basic)       110
Ghose Filter                  0
Monoisotopic Weight           0
MDDR-Like Rule                0
Polarizability                0
H Bond Acceptor Count         0
Physiological Charge          0
Rule of Five                  0
dtype: int64

## Extract drug-drug interaction information and dump it to a JSON file

In [183]:
# Save the filtered feature table (index included) for later use.
df.to_csv(path_or_buf='data/filtered_dataset.csv')

In [184]:
# Map each drug kept in the filtered table to the names of the drugs it
# interacts with, restricted to drugs that are themselves in the table.
interactions = {}
# get the set of drugs in the filtered df
drugs = set(df["name"])

# Iterate over the records actually present instead of a hard-coded count.
drug_records = data['drugbank']['drug']
if isinstance(drug_records, dict):
    # A lone <drug> element is represented as a mapping, not a list.
    drug_records = [drug_records]

for drug in drug_records:
    name = drug.get("name", None)
    if name not in drugs:
        continue

    container = drug.get('drug-interactions')
    partners = container.get('drug-interaction') if isinstance(container, dict) else None
    if isinstance(partners, dict):
        # Exactly one interaction is represented as a single mapping; wrap it
        # so it is not silently dropped.
        partners = [partners]
    elif not isinstance(partners, list):
        # No interactions recorded for this drug.
        partners = []

    interactions[name] = [
        partner['name']
        for partner in partners
        if isinstance(partner, dict) and partner.get('name') in drugs
    ]

In [185]:
# Serialise the interaction map with 4-space indentation and save it.
json_obj = json.dumps(obj=interactions, indent=4)

with open("data/interactions.json", mode="w") as out_fh:
    out_fh.write(json_obj)