## ATC DRUG-SET LIBRARY
### Drug-set labels: ATC Codes
#### ALL DATABASES ACCESSED 08/2019
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu
##### Adapted from : https://github.com/dhimmel/drugbank/blob/gh-pages/parse.ipynb

In [1]:
import os
import csv
import collections
import re
import io
import json
import xml.etree.ElementTree as ET
import pandas as pd

In [2]:
# export_script is imported by temporarily switching the working directory to
# ../../scripts (presumably because that folder is not on the import path).
# The try/finally guarantees the kernel returns to the notebook folder even if
# the import raises, so a failed import cannot leave later relative paths broken.
# NOTE(review): the star import is presumably the source of gmt_formatter and
# library_counts used further down - confirm and consider explicit imports.
os.chdir('../../scripts')
try:
    from export_script import *
finally:
    os.chdir('../notebooks/ATC')

### Matching ATC codes to drug names
#### Database for ATC Codes and paired drugs : https://www.drugbank.ca/releases/latest

In [3]:
# Load the full DrugBank XML export into memory and keep a handle on its
# top-level element, whose children are iterated in the next cell.
drugbank_xml_path = 'input/full database.xml'
tree = ET.parse(drugbank_xml_path)
root = tree.getroot()

In [4]:
ns = '{http://www.drugbank.ca}'

# Namespaced element paths are the same for every drug, so they are built once
# here instead of being re-formatted on every loop iteration.
groups_path = "{ns}groups/{ns}group".format(ns = ns)
atc_codes_path = "{ns}atc-codes/{ns}atc-code".format(ns = ns)
categories_path = "{ns}categories/{ns}category".format(ns = ns)

# Places where alternative drug names are looked up. Each path appears once:
# aliases are collected into a set, so repeating a path would add nothing.
alias_paths = [
    "{ns}international-brands/{ns}international-brand".format(ns = ns),
    "{ns}synonyms/{ns}synonym[@language='English']".format(ns = ns),
    "{ns}products/{ns}product/{ns}name".format(ns = ns),
]

# One ordered record per child element of the XML root.
rows = list()
for drug in root:
    # Every child of the root is expected to be a <drug> element.
    assert drug.tag == ns + 'drug'

    row = collections.OrderedDict()
    row['type'] = drug.get('type')
    row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drug.findtext(ns + "name")
    row['groups'] = [group.text for group in drug.findall(groups_path)]
    row['atc_codes'] = [code.get('code') for code in drug.findall(atc_codes_path)]
    row['categories'] = [x.findtext(ns + 'category') for x in drug.findall(categories_path)]

    # Add drug aliases. Elements without text are skipped because a None entry
    # would make sorted() fail when compared against the string aliases.
    aliases = {
        elem.text
        for path in alias_paths
        for elem in drug.findall(path)
        if elem.text is not None
    }
    if row['name'] is not None:
        aliases.add(row['name'])
    row['aliases'] = sorted(aliases)

    rows.append(row)

In [5]:
def collapse_list_values(row):
    """Join every list-valued entry of ``row`` into one '|'-separated string.

    The mapping is modified in place and the same object is returned, so the
    function can be used with ``map``. Non-list values are left untouched.
    """
    list_keys = [key for key, value in row.items() if isinstance(value, list)]
    for key in list_keys:
        row[key] = '|'.join(row[key])
    return row

# Flatten the list-valued fields of every parsed record into '|'-joined strings
rows = [collapse_list_values(record) for record in rows]

In [6]:
# Turn the parsed records into a table restricted to a fixed set of columns,
# then preview the first ten rows.
columns = ['drugbank_id', 'name', 'type', 'groups', 'atc_codes']
drugbank_df = pd.DataFrame(rows).loc[:, columns]
drugbank_df.head(10)

Unnamed: 0,drugbank_id,name,type,groups,atc_codes
0,DB00001,Lepirudin,biotech,approved,B01AE02
1,DB00002,Cetuximab,biotech,approved,L01XC06
2,DB00003,Dornase alfa,biotech,approved,R05CB13
3,DB00004,Denileukin diftitox,biotech,approved|investigational,L01XX29
4,DB00005,Etanercept,biotech,approved|investigational,L04AB01
5,DB00006,Bivalirudin,small molecule,approved|investigational,B01AE06
6,DB00007,Leuprolide,small molecule,approved|investigational,L02AE51|L02AE02
7,DB00008,Peginterferon alfa-2a,biotech,approved|investigational,L03AB11|L03AB61
8,DB00009,Alteplase,biotech,approved,B01AD02|S01XA13
9,DB00010,Sermorelin,biotech,approved|withdrawn,V04CD03|H01AC04


In [7]:
# Keep only the rows whose type is 'small molecule'
small_molecule_mask = drugbank_df['type'] == 'small molecule'
drugbank_df = drugbank_df[small_molecule_mask]

# Expand the '|'-joined ATC codes so that each code gets its own row,
# still paired with the DrugBank ID it came from
atc_lists = drugbank_df['atc_codes'].str.split('|').tolist()
atc_wide = pd.DataFrame(atc_lists, index=drugbank_df['drugbank_id'])
df_atc = atc_wide.stack().reset_index()[[0, 'drugbank_id']]
df_atc.columns = ['atc_codes', 'DrugBank ID']

# Keep the first five characters of each code, i.e. the fourth-level ATC
# group that drugs are grouped under
df_atc['atc_codes'] = df_atc['atc_codes'].str[:5]

# Discard rows left with an empty code (drugs without any ATC code)
has_code = df_atc['atc_codes'] != ''
df_atc = df_atc[has_code]

In [8]:
# Preview the first ten (ATC code, DrugBank ID) pairs
df_atc.head(10)

Unnamed: 0,atc_codes,DrugBank ID
0,B01AE,DB00006
1,L02AE,DB00007
2,L02AE,DB00007
3,L02AE,DB00014
4,R02AB,DB00027
5,H01BA,DB00035
6,H01CC,DB00050
8,J01XX,DB00080
9,L04AD,DB00091
10,S01XA,DB00091


### Mapping DrugBank IDs to InChI Keys

In [9]:
# Read only the two columns needed to translate DrugBank IDs into
# Standard InChI Keys from the tab-separated metadata file
drugbank_mapping = pd.read_csv(
    '../../metadata/drugmonizome_metadata.tsv',
    sep='\t',
    usecols=['DrugBank ID', 'Standard InChI Key'],
)

In [10]:
# Preview the DrugBank ID -> Standard InChI Key lookup table
drugbank_mapping.head()

Unnamed: 0,DrugBank ID,Standard InChI Key
0,DB00006,OIRCOABEOLEUMC-GEJPAHFPSA-N
1,DB00007,GFIJNRVAKGFPGQ-LIJARHBVSA-N
2,DB00014,BLCLNMBMMGCOAS-URPVMXJPSA-N
3,DB00027,NDAYQJDHGXTBJL-MWWSRJDJSA-N
4,DB00035,NFLWUMRGJYTJIN-PNIOQBSNSA-N


In [11]:
# Attach the Standard InChI Key to every (ATC code, DrugBank ID) pair.
# The join key is named explicitly and the right-hand frame is narrowed to its
# two original columns, so re-running this cell (after df_atc has already
# gained the InChI column) still joins on 'DrugBank ID' only and produces the
# same three columns. Pairs whose DrugBank ID is absent from the mapping are
# dropped by the inner join.
df_atc = drugbank_mapping.merge(
    df_atc[['atc_codes', 'DrugBank ID']],
    on='DrugBank ID',
    how='inner',
)

In [12]:
# Preview the merged table (DrugBank ID, Standard InChI Key, ATC code)
df_atc.head(3)

Unnamed: 0,DrugBank ID,Standard InChI Key,atc_codes
0,DB00006,OIRCOABEOLEUMC-GEJPAHFPSA-N,B01AE
1,DB00007,GFIJNRVAKGFPGQ-LIJARHBVSA-N,L02AE
2,DB00007,GFIJNRVAKGFPGQ-LIJARHBVSA-N,L02AE


### Creating drugsetlibrary and exporting

In [13]:
# (ATC code, Standard InChI Key) pairs, one per row of the merged table
id_list = tuple(zip(df_atc['atc_codes'].tolist(), df_atc['Standard InChI Key'].tolist()))

# Group the InChI Keys under their ATC code. defaultdict is taken from the
# collections module imported at the top of the notebook rather than relying
# on a name leaked in by a star import.
drugsetlibrary = collections.defaultdict(list)
for atc_code, inchi_key in id_list:
    drugsetlibrary[atc_code].append(inchi_key)

# Remove duplicate members and drop sets with fewer than 5 unique drugs.
# Members are sorted so the exported library is identical from run to run
# (plain set iteration order for strings can change between kernel sessions);
# key=str keeps the sort well-defined even if a non-string value is present.
unique_members = ((atc_code, set(inchi_keys)) for atc_code, inchi_keys in drugsetlibrary.items())
drugsetlibrary = {
    atc_code: sorted(members, key=str)
    for atc_code, members in unique_members
    if len(members) >= 5
}

In [14]:
# Number of ATC drug sets that passed the minimum-size filter
len(drugsetlibrary)

308

In [15]:
# Move to the data folder before exporting; the export call below passes a
# bare file name, so presumably the file is written relative to this
# directory - confirm against gmt_formatter.
# NOTE: this changes the kernel's working directory, so earlier cells that use
# relative paths will not work if re-run after this point.
os.chdir('../../data/ATC')

In [16]:
# Export drugsetlibrary in gmt format
# gmt_formatter is not defined in this notebook; presumably it comes from the
# export_script star import - confirm its signature and output location there.
gmt_formatter(drugsetlibrary, 'ATC_drugsetlibrary.gmt')

### Library counts

In [17]:
# Report summary statistics for the library (unique drugs, terms,
# associations and average drugs per term, as shown in the output below).
# library_counts is presumably provided by the export_script star import.
library_counts(drugsetlibrary)

2233 unique drugs
308 unique association terms
3054 unique associations
9.915584415584416 average drugs per term
