## Drugmonizome ETL : ATC Codes

##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu
##### Adapted from : https://github.com/dhimmel/drugbank/blob/gh-pages/parse.ipynb

#### Data source : https://www.drugbank.ca/releases/latest

In [None]:
#%%appyter init
# Initialise appyter, handing it a zero-argument callable that returns this notebook's
# global namespace.
# NOTE(review): presumably this is what makes the `%%appyter` cell magics below
# available — confirm against the appyter documentation.
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
import os
import sys
import datetime
import zipfile
import collections
import xml.etree.ElementTree as ET

import pandas as pd
import numpy as np
import drugmonizome.utility_functions as uf

In [None]:
# Development convenience: with autoreload in mode 2, edits to imported modules are
# picked up live instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

### Notebook Information

In [None]:
# Record when, and under which interpreter, this run happened.
run_date = datetime.date.today()
python_version = sys.version
print('This notebook was run on:', run_date, '\nPython version:', python_version)

### Create Output Path

In [None]:
# Base name shared by every exported file, and the folder those files are written to.
output_name = 'atc_codes'
path = 'output/drugmonizome_atc_codes'
# exist_ok=True keeps this cell safe to re-run and removes the gap between checking
# for the folder and creating it.
os.makedirs(path, exist_ok=True)

### Initializing Notebook

In [None]:
%%appyter hide_code

# Declares the 'data' section of the appyter input form, titled 'Upload Data'.
# The fields declared in the following cell attach to it through section='data'.
{% do SectionField(
    name='data',
    title='Upload Data',
    img='load_icon.png'
) %}

In [None]:
%%appyter code_eval

# Help text shown in the 'data' section, pointing users at the DrugBank download page.
{% do DescriptionField(
    name='description',
    text='The example below was sourced from <a href="https://www.drugbank.ca/releases/latest" target="_blank">drugbank.ca</a>. If clicking on the example does not work, it should be downloaded directly from the source website.',
    section='data'
) %}

# Upload field for the zipped DrugBank XML; its value is substituted into the
# zipfile.ZipFile call in the 'Load data' cell.
# NOTE(review): in the constraint pattern the dot before 'zip' is not escaped, so it
# matches any character there — confirm whether a literal dot was intended.
{% set xml_file = FileField(
    constraint='.*\.xml.zip$',
    name='atc codes', 
    label='ATC Codes (xml.zip)', 
    default='drugbank_all_full_database.xml.zip',
    examples={
        'drugbank_all_full_database.xml.zip': 'https://www.drugbank.ca/releases/5-1-7/downloads/all-full-database'
    },
    section='data'
) %}

# Choice of identifier for the exported small molecules; its raw value is compared
# against 'InChI Key' in the indexing cell, with every other value falling back to names.
{% set entity_type = ChoiceField(
    name='entity_type',
    label='Choose identifier type for exported small molecules',
    choices=[
        'Name',
        'InChI Key',
    ],
    default='Name',
    section='data'
) %}

### Load data

In [None]:
%%appyter code_exec

# Open the uploaded archive and parse its 'full database.xml' member directly from
# the zip stream, without extracting it to disk first.
with zipfile.ZipFile({{xml_file}}) as zipf:
    with zipf.open('full database.xml') as f:
        tree = ET.parse(f)
# Root element of the parsed document; its children are iterated as drugs further down.
root = tree.getroot()

### Matching ATC codes to small molecules

In [None]:
# Every element in the DrugBank export is namespaced; ElementTree spells that as a
# '{uri}' prefix on each tag.
ns = '{http://www.drugbank.ca}'
# Path from a <drug> element to the value of its calculated 'InChIKey' property.
inchikey_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value"

# The search paths do not change between drugs, so they are expanded once up front.
inchikey_path = inchikey_template.format(ns = ns)
atc_code_path = "{ns}atc-codes/{ns}atc-code".format(ns = ns)
# Each source of alternative names is listed exactly once (the international-brand
# path used to be searched twice, which only repeated work since aliases form a set).
alias_paths = [
    "{ns}international-brands/{ns}international-brand".format(ns = ns),
    "{ns}synonyms/{ns}synonym[@language='English']".format(ns = ns),
    "{ns}products/{ns}product/{ns}name".format(ns = ns),
]

# One ordered record per <drug> child of the document root.
rows = list()
for drug in root:
    row = collections.OrderedDict()
    assert drug.tag == ns + 'drug'
    row['type'] = drug.get('type')
    row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drug.findtext(ns + "name").lower()
    row['atc_codes'] = [code.get('code') for code in drug.findall(atc_code_path)]
    row['inchi_key'] = drug.findtext(inchikey_path)

    # Add drug aliases. Elements with no text report None, which sorted() cannot
    # order against strings, so they are left out.
    aliases = {
        elem.text
        for alias_path in alias_paths
        for elem in drug.findall(alias_path)
        if elem.text is not None
    }
    aliases.add(row['name'])
    row['aliases'] = sorted(aliases)

    rows.append(row)

In [None]:
# Columns carried forward, in display order; the per-drug 'aliases' list is left behind.
columns = ['drugbank_id', 'name', 'inchi_key', 'type', 'atc_codes']
drugbank_df = pd.DataFrame.from_dict(rows).loc[:, columns]
drugbank_df.head(n=10)

In [None]:
# Pipeline, top to bottom:
#   1. keep entries whose type is 'small molecule'
#   2. give every ATC code its own row (drugs with an empty code list become NaN)
#   3. discard rows holding any missing value (this also removes drugs with no InChI key)
#   4. shorten each code to its first five characters, the fourth-level ATC group
#   5. discard rows whose shortened code is the empty string
df_atc = (
    drugbank_df
    .loc[drugbank_df['type'] == 'small molecule']
    .explode('atc_codes')
    .dropna()
    .assign(atc_codes=lambda frame: frame['atc_codes'].map(lambda code: code[:5]))
    .loc[lambda frame: frame['atc_codes'] != '']
)

In [None]:
# Preview the first ten (drug, shortened ATC code) rows after filtering.
df_atc.head(10)

In [None]:
# Cutting each code down to five characters can leave one drug with several identical
# rows (codes that differed only after the fifth character), so repeated rows are dropped.
df_atc = df_atc.drop_duplicates()

### Index dataframe by user selected small molecule identifier

In [None]:
%%appyter code_exec

{% if entity_type.raw_value == 'InChI Key' %}
# Index small molecules by InChI Key
df_output = df_atc[['inchi_key','atc_codes']]
df_output.set_index('inchi_key', inplace = True)

{% else %}
# Index small molecules by name
df_output = df_atc[['name','atc_codes']]
df_output.set_index('name', inplace = True)

{% endif %}

## Analyze Data

### Export Edge List

In [None]:
# Pass the identifier-indexed (drug, ATC code) pairs to the project save helper,
# requesting ext 'tsv' with gzip compression under `path`.
# NOTE(review): the final file name and layout are decided inside uf.save_data — confirm there.
uf.save_data(df_output, path, output_name + '_edge_list', 
        ext='tsv', compression='gzip')

### Create Binary Matrix

In [None]:
# NOTE(review): presumably pivots the edge list into a 0/1 drug-by-ATC-code table;
# the orientation is defined in uf.binary_matrix — confirm there.
binary_matrix = uf.binary_matrix(df_output)
binary_matrix.head()

In [None]:
# Dimensions of the binary matrix, shown as a quick size check.
binary_matrix.shape

In [None]:
# Pass the binary matrix to the project save helper with 'npz' compression and an
# unsigned 8-bit dtype.
# NOTE(review): the on-disk format is decided inside uf.save_data — confirm there.
uf.save_data(binary_matrix, path, output_name + '_binary_matrix', 
            compression='npz', dtype=np.uint8)

### Create Drug and Attribute Set Library

In [None]:
# Export the set library in 'drug' mode from the binary matrix.
# NOTE(review): what the 'drug' mode selects is defined in uf.save_setlib — confirm there.
uf.save_setlib(binary_matrix, 'drug', path, output_name + '_drug_setlibrary')

In [None]:
# Export the set library in 'attribute' mode from the binary matrix.
# NOTE(review): what the 'attribute' mode selects is defined in uf.save_setlib — confirm there.
uf.save_setlib(binary_matrix, 'attribute', path, output_name + '_attribute_setlibrary')

### Create Attribute Similarity Matrix

In [None]:
# Jaccard similarity requested on the transposed binary matrix.
# NOTE(review): presumably the helper compares rows, so transposing makes the
# attributes the compared items; the effect of sparse=True is also defined in
# uf.similarity_matrix — confirm there.
attribute_similarity_matrix = uf.similarity_matrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

In [None]:
# Pass the attribute similarity matrix to the project save helper with 'npz'
# compression, symmetric=True and a 32-bit float dtype.
# NOTE(review): how symmetric=True affects storage is defined in uf.save_data — confirm there.
uf.save_data(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

### Create Drug Similarity Matrix

In [None]:
# Jaccard similarity requested on the binary matrix as-is (no transpose).
# NOTE(review): presumably the helper compares rows, making the drugs the compared
# items; the effect of sparse=True is defined in uf.similarity_matrix — confirm there.
drug_similarity_matrix = uf.similarity_matrix(binary_matrix, 'jaccard', sparse=True)
drug_similarity_matrix.head()

In [None]:
# Pass the drug similarity matrix to the project save helper with 'npz' compression,
# symmetric=True and a 32-bit float dtype.
# NOTE(review): how symmetric=True affects storage is defined in uf.save_data — confirm there.
uf.save_data(drug_similarity_matrix, path,
            output_name + '_drug_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

### Create download folder with all outputs

In [None]:
# NOTE(review): presumably bundles everything under `path` into the archive that the
# download link below points to — confirm the output location in uf.archive.
uf.archive(path)

### Link to the output folder: [Download](./output_archive.zip)