# DrugBank

Created by: Charles Dai <br>
Credit to: Moshe Silverstein

Data Source: https://www.drugbank.ca/releases/latest

In [1]:
# appyter init: hand appyter a callable that returns this notebook's globals()
# (the default argument binds the builtin `globals`; the lambda simply calls it).
# NOTE(review): presumably this is what enables the %%appyter cell magics used
# in later cells — confirm against the appyter documentation.
from appyter import magic
magic.init(lambda _=globals: _())

In [2]:
import sys
import os
from datetime import date
import zipfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [3]:
# Import only the two names this notebook needs rather than `import *`, so the
# origin of `Network` and `clustergrammer_widget` is explicit and no other
# names leak into the notebook namespace.
from clustergrammer_widget import Network, clustergrammer_widget

# Shared clustergrammer network object; a later cell loads the attribute
# similarity matrix into it and renders the widget.
net = Network(clustergrammer_widget)

In [4]:
%load_ext autoreload
%autoreload 2

### Notebook Information

In [5]:
# Record the execution date and the interpreter version for provenance.
print(f'This notebook was run on: {date.today()} \nPython version: {sys.version}')

This notebook was run on: 2020-06-30 
Python version: 3.8.0 (default, Oct 28 2019, 16:14:01) 
[GCC 8.3.0]


# Initialization

In [6]:
%%appyter code_eval

# Appyter ChoiceField offering the four DrugBank protein identifier groups
# ('Target' by default). Its value is bound to the template variable `group`,
# which later cells interpolate into the output file names and directory.
# Note the field is registered under name='identifier' in section 'data'.
{% set group = ChoiceField(
    name='identifier',
    label='Protein Identifier Group',
    choices=['Target', 'Enzyme', 'Carrier', 'Transporter'],
    default='Target',
    section='data'
) %}

### Load Mapping Dictionaries

In [7]:
# Fetch the two lookup tables used further down: symbol_lookup is passed to
# uf.map_symbols and geneid_lookup to uf.gene_list.
symbol_lookup, geneid_lookup = lookup.get_lookups()

Gathering sources: 100%|██████████| 3/3 [00:09<00:00,  3.21s/it]


### Output Path

In [8]:
%%appyter code_exec

output_name = 'drugbank_' + '{{group}}'.lower()

path = 'Output/DrugBank-' + '{{group}}'
if not os.path.exists(path):
    os.makedirs(path)

```python
output_name = 'drugbank_' + 'Target'.lower()
path = 'Output/DrugBank-' + 'Target'
if not os.path.exists(path):
    os.makedirs(path)
```

In [9]:
%%appyter hide_code
# Declares the 'data' form section ("Load Data"); the input fields in this
# notebook attach themselves to it via section='data'.
{% do SectionField(
    name='data',
    title='Load Data',
    subtitle='Upload Files from the DrugBank Protein Identifiers Datasets',
) %}

# Load Data

In [10]:
%%appyter code_exec

with zipfile.ZipFile({{FileField(
    constraint='.*\.csv.zip$',
    name='drug_identifiers', 
    label='Drug Identifiers Dataset (csv.zip)', 
    default='Input/DrugBank/drugbank_all_target_polypeptide_ids.csv.zip',
    section='data')
}}) as zipf:
  with zipf.open('all.csv') as f:
    df = pd.read_csv(f, usecols=['Gene Name', 'Drug IDs', 'Species'], index_col=0)

```python
with zipfile.ZipFile('Input/DrugBank/drugbank_all_target_polypeptide_ids.csv.zip') as zipf:
  with zipf.open('all.csv') as f:
    df = pd.read_csv(f, usecols=['Gene Name', 'Drug IDs', 'Species'], index_col=0)
```

In [11]:
df.head()

Unnamed: 0_level_0,Species,Drug IDs
Gene Name,Unnamed: 1_level_1,Unnamed: 2_level_1
ftsI,Haemophilus influenzae (strain ATCC 51907 / DS...,DB00303
HDC,Humans,DB00114; DB00117
GLS2,Humans,DB00142
F13A1,Humans,DB01839; DB02340; DB11300; DB11311; DB11571; D...
NOS2,Humans,DB00125; DB00155; DB01017; DB01110; DB01234; D...


In [12]:
df.shape

(5220, 2)

# Load Drug Metadata

In [13]:
%%appyter code_exec

# Read the drug metadata table directly from the uploaded zip file. Only the
# 'DrugBank ID' and 'Name' columns are loaded; index_col=0 makes the first of
# them the index, so the frame can be used as an ID -> name lookup later on.
drug_meta = pd.read_csv({{FileField(
    constraint='.*\.zip$',
    name='drug_metadata', 
    label='External Drug Links (csv.zip)', 
    default='Input/DrugBank/drugbank_all_drug_links.csv.zip',
    section='data')
}}, usecols=['DrugBank ID', 'Name'], index_col=0)

```python

drug_meta = pd.read_csv('Input/DrugBank/drugbank_all_drug_links.csv.zip', usecols=['DrugBank ID', 'Name'], index_col=0)
```

In [14]:
drug_meta.head()

Unnamed: 0_level_0,Name
DrugBank ID,Unnamed: 1_level_1
DB00001,Lepirudin
DB00002,Cetuximab
DB00003,Dornase alfa
DB00004,Denileukin diftitox
DB00005,Etanercept


In [15]:
drug_meta.shape

(13563, 1)

# Pre-process Data

## Get Relevant Data

In [16]:
# Get Relevant Species
df = df[np.logical_or.reduce([
    df['Species'] == 'Humans',
    df['Species'] == 'Mouse',
    df['Species'] == 'Rat'
])].drop('Species', axis=1)
df.head()

Unnamed: 0_level_0,Drug IDs
Gene Name,Unnamed: 1_level_1
HDC,DB00114; DB00117
GLS2,DB00142
F13A1,DB01839; DB02340; DB11300; DB11311; DB11571; D...
NOS2,DB00125; DB00155; DB01017; DB01110; DB01234; D...
HSD17B2,DB00157; DB13952; DB13953; DB13954; DB13955; D...


## Split Drug ID List

In [17]:
# Split each '; '-separated ID string into a list, then give every drug ID its
# own row. The vectorised .str.split leaves missing values as NaN (calling
# x.split() on a NaN float would raise AttributeError), so the dropna() that
# follows can actually remove rows without any drug ID.
df['Drug IDs'] = df['Drug IDs'].str.split('; ')
df = df.explode('Drug IDs').dropna()
df.head()

Unnamed: 0_level_0,Drug IDs
Gene Name,Unnamed: 1_level_1
HDC,DB00114
HDC,DB00117
GLS2,DB00142
F13A1,DB01839
F13A1,DB02340


In [18]:
df.shape

(16509, 1)

## Map Drug IDs to Names

In [19]:
# Look up every DrugBank ID in the metadata table and store the drug name in
# its place; IDs missing from drug_meta become NaN.
df['Drug IDs'] = df['Drug IDs'].map(drug_meta['Name'])
# Relabel the column and the index to reflect their new content.
df = df.rename(columns={'Drug IDs': 'Drug Name'}).rename_axis('Gene Symbol')
df.head()

Unnamed: 0_level_0,Drug Name
Gene Symbol,Unnamed: 1_level_1
HDC,Pyridoxal phosphate
HDC,Histidine
GLS2,Glutamic acid
F13A1,Propylene glycol
F13A1,N-Acetyl-Serine


# Filter Data

## Map Gene Symbols to Up-to-date Approved Gene Symbols

In [20]:
# uf.map_symbols is an external helper; per the section heading it maps the
# gene symbols to up-to-date approved symbols using symbol_lookup.
# NOTE(review): remove_duplicates=True presumably drops repeated gene/drug
# pairs after mapping — confirm in harmonizome.utility_functions.
df = uf.map_symbols(df, symbol_lookup, remove_duplicates=True)
df.shape

100%|██████████| 16509/16509 [00:00<00:00, 386114.06it/s]


(16123, 1)

# Analyze Data

## Create Binary Matrix

In [21]:
# Build the association matrix from the gene/drug pairs. In the displayed
# result the rows are gene symbols, the columns are drug names and the
# entries are booleans.
binary_matrix = uf.binary_matrix(df)
binary_matrix.head()

Drug Name,(+)-2-(4-biphenyl)propionic acid,(+)-Rutamarin alcohol,"(1'R,2'S)-9-(2-Hydroxy-3'-Keto-Cyclopenten-1-yl)Adenine",(1-Benzyl-5-methoxy-2-methyl-1H-indol-3-yl)acetic acid,"(1-HYDROXYHEPTANE-1,1-DIYL)BIS(PHOSPHONIC ACID)","(1-HYDROXYNONANE-1,1-DIYL)BIS(PHOSPHONIC ACID)",(1-Methyl-1h-Imidazol-2-Yl)-(3-Methyl-4-{3-[(Pyridin-3-Ylmethyl)-Amino]-Propoxy}-Benzofuran-2-Yl)-Methanone,"(10R)-10-methyl-3-(6-methylpyridin-3-yl)-9,10,11,12-tetrahydro-8H-[1,4]diazepino[5',6':4,5]thieno[3,2-f]quinolin-8-one","(11S)-8-CHLORO-11-[1-(METHYLSULFONYL)PIPERIDIN-4-YL]-6-PIPERAZIN-1-YL-11H-BENZO[5,6]CYCLOHEPTA[1,2-B]PYRIDINE","(13R,15S)-13-METHYL-16-OXA-8,9,12,22,24-PENTAAZAHEXACYCLO[15.6.2.16,9.1,12,15.0,2,7.0,21,25]HEPTACOSA-1(24),2,4,6,17(25),18,20-HEPTAENE-23,26-DIONE",...,"{4-[(CARBOXYMETHOXY)CARBONYL]-3,3-DIOXIDO-1-OXONAPHTHO[1,2-D]ISOTHIAZOL-2(1H)-YL}ACETIC ACID","{4-[2,2-BIS(5-METHYL-1,2,4-OXADIAZOL-3-YL)-3-PHENYLPROPYL]PHENYL}SULFAMIC ACID",{4-[2-BENZYL-3-METHOXY-2-(METHOXYCARBONYL)-3-OXOPROPYL]PHENYL}SULFAMIC ACID,{4-[3-(4-acetyl-3-hydroxy-2-propylphenoxy)propoxy]phenoxy}acetic acid,"{4-[3-(6,7-Diethoxy-Quinazolin-4-Ylamino)-Phenyl]-Thiazol-2-Yl}-Methanol","{[(2,6-difluorophenyl)carbonyl]amino}-N-(4-fluorophenyl)-1H-pyrazole-3-carboxamide","{[2-(1h-1,2,3-Benzotriazol-1-Yl)-2-(3,4-Difluorophenyl)Propane-1,3-Diyl]Bis[4,1-Phenylene(Difluoromethylene)]}Bis(Phosphonic Acid)","{[2-Amino-4-oxo-6,7-di(sulfanyl-κS)-3,5,5a,8,9a,10-hexahydro-4H-pyrano[3,2-g]pteridin-8-yl]methyl dihydrogenato(2-) phosphate}(dioxo)sulfanylmolybdenum","{[5-(5-nitro-2-furyl)-1,3,4-oxadiazol-2-yl]thio}acetic acid",{[7-(Difluoro-Phosphono-Methyl)-Naphthalen-2-Yl]-Difluoro-Methyl}-Phosphonic Acid
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
A2M,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
AADACL2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
AADAT,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
AANAT,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [22]:
binary_matrix.shape

(2774, 5660)

In [23]:
# Persist the binary matrix under `path` as '<output_name>_binary_matrix',
# requesting npz compression and a uint8 dtype from uf.save_data.
uf.save_data(binary_matrix, path, output_name + '_binary_matrix', 
            compression='npz', dtype=np.uint8)

## Create Gene List

In [24]:
# Derive the gene list from the binary matrix using geneid_lookup; the
# displayed result is indexed by Gene Symbol and carries a 'Gene ID' column.
gene_list = uf.gene_list(binary_matrix, geneid_lookup)
gene_list.head()

100%|██████████| 2774/2774 [00:00<00:00, 209405.70it/s]


Unnamed: 0_level_0,Gene ID
Gene Symbol,Unnamed: 1_level_1
A1BG,1
A2M,2
AADACL2,344752
AADAT,51166
AANAT,15


In [25]:
gene_list.shape

(2774, 1)

In [26]:
# Persist the gene list under `path` as '<output_name>_gene_list', requesting
# a gzip-compressed tsv; index=False is passed through to uf.save_data.
uf.save_data(gene_list, path, output_name + '_gene_list',
            ext='tsv', compression='gzip', index=False)

## Create Attribute List

In [27]:
# Derive the attribute (drug name) list from the binary matrix; the displayed
# result is an index of drug names with no data columns.
attribute_list = uf.attribute_list(binary_matrix)
attribute_list.head()

(+)-2-(4-biphenyl)propionic acid
(+)-Rutamarin alcohol
"(1'R,2'S)-9-(2-Hydroxy-3'-Keto-Cyclopenten-1-yl)Adenine"
(1-Benzyl-5-methoxy-2-methyl-1H-indol-3-yl)acetic acid
"(1-HYDROXYHEPTANE-1,1-DIYL)BIS(PHOSPHONIC ACID)"


In [28]:
attribute_list.shape

(5660, 0)

In [29]:
# Persist the attribute list under `path` as '<output_name>_attribute_list',
# requesting a gzip-compressed tsv from uf.save_data.
uf.save_data(attribute_list, path, output_name + '_attribute_list',
            ext='tsv', compression='gzip')

## Create Gene and Attribute Set Libraries

In [30]:
# Write the 'gene' / 'up' set library for the binary matrix under `path` as
# '<output_name>_gene_up_set'.
# NOTE(review): the exact meaning of the 'gene' and 'up' arguments is defined
# in harmonizome.utility_functions — confirm there.
uf.save_setlib(binary_matrix, 'gene', 'up', path, output_name + '_gene_up_set')

100%|██████████| 5660/5660 [00:00<00:00, 36503.22it/s]


In [31]:
# Write the 'attribute' / 'up' set library for the binary matrix under `path`
# as '<output_name>_attribute_up_set'.
# NOTE(review): the exact meaning of the 'attribute' and 'up' arguments is
# defined in harmonizome.utility_functions — confirm there.
uf.save_setlib(binary_matrix, 'attribute', 'up', path, 
                           output_name + '_attribute_up_set')

100%|██████████| 2774/2774 [00:00<00:00, 32229.74it/s]


## Create Attribute Similarity Matrix

In [32]:
# Attribute-by-attribute similarity: the matrix is transposed so that drugs
# are the rows handed to uf.similarity_matrix, with the 'jaccard' metric and
# sparse=True requested. The displayed result is a square drug x drug frame.
attribute_similarity_matrix = uf.similarity_matrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

Unnamed: 0,(+)-2-(4-biphenyl)propionic acid,(+)-Rutamarin alcohol,"(1'R,2'S)-9-(2-Hydroxy-3'-Keto-Cyclopenten-1-yl)Adenine",(1-Benzyl-5-methoxy-2-methyl-1H-indol-3-yl)acetic acid,"(1-HYDROXYHEPTANE-1,1-DIYL)BIS(PHOSPHONIC ACID)","(1-HYDROXYNONANE-1,1-DIYL)BIS(PHOSPHONIC ACID)",(1-Methyl-1h-Imidazol-2-Yl)-(3-Methyl-4-{3-[(Pyridin-3-Ylmethyl)-Amino]-Propoxy}-Benzofuran-2-Yl)-Methanone,"(10R)-10-methyl-3-(6-methylpyridin-3-yl)-9,10,11,12-tetrahydro-8H-[1,4]diazepino[5',6':4,5]thieno[3,2-f]quinolin-8-one","(11S)-8-CHLORO-11-[1-(METHYLSULFONYL)PIPERIDIN-4-YL]-6-PIPERAZIN-1-YL-11H-BENZO[5,6]CYCLOHEPTA[1,2-B]PYRIDINE","(13R,15S)-13-METHYL-16-OXA-8,9,12,22,24-PENTAAZAHEXACYCLO[15.6.2.16,9.1,12,15.0,2,7.0,21,25]HEPTACOSA-1(24),2,4,6,17(25),18,20-HEPTAENE-23,26-DIONE",...,"{4-[(CARBOXYMETHOXY)CARBONYL]-3,3-DIOXIDO-1-OXONAPHTHO[1,2-D]ISOTHIAZOL-2(1H)-YL}ACETIC ACID","{4-[2,2-BIS(5-METHYL-1,2,4-OXADIAZOL-3-YL)-3-PHENYLPROPYL]PHENYL}SULFAMIC ACID",{4-[2-BENZYL-3-METHOXY-2-(METHOXYCARBONYL)-3-OXOPROPYL]PHENYL}SULFAMIC ACID,{4-[3-(4-acetyl-3-hydroxy-2-propylphenoxy)propoxy]phenoxy}acetic acid,"{4-[3-(6,7-Diethoxy-Quinazolin-4-Ylamino)-Phenyl]-Thiazol-2-Yl}-Methanol","{[(2,6-difluorophenyl)carbonyl]amino}-N-(4-fluorophenyl)-1H-pyrazole-3-carboxamide","{[2-(1h-1,2,3-Benzotriazol-1-Yl)-2-(3,4-Difluorophenyl)Propane-1,3-Diyl]Bis[4,1-Phenylene(Difluoromethylene)]}Bis(Phosphonic Acid)","{[2-Amino-4-oxo-6,7-di(sulfanyl-κS)-3,5,5a,8,9a,10-hexahydro-4H-pyrano[3,2-g]pteridin-8-yl]methyl dihydrogenato(2-) phosphate}(dioxo)sulfanylmolybdenum","{[5-(5-nitro-2-furyl)-1,3,4-oxadiazol-2-yl]thio}acetic acid",{[7-(Difluoro-Phosphono-Methyl)-Naphthalen-2-Yl]-Difluoro-Methyl}-Phosphonic Acid
(+)-2-(4-biphenyl)propionic acid,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(+)-Rutamarin alcohol,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(1'R,2'S)-9-(2-Hydroxy-3'-Keto-Cyclopenten-1-yl)Adenine",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(1-Benzyl-5-methoxy-2-methyl-1H-indol-3-yl)acetic acid,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(1-HYDROXYHEPTANE-1,1-DIYL)BIS(PHOSPHONIC ACID)",0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Persist the attribute similarity matrix under `path`, requesting npz
# compression and float32 values; symmetric=True is passed to uf.save_data.
# NOTE(review): presumably symmetric=True lets the helper store only one
# triangle of the matrix — confirm in harmonizome.utility_functions.
uf.save_data(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

In [38]:
# Render an interactive clustergrammer heatmap of the attribute similarity
# matrix. A copy is loaded so the widget never works on the matrix object that
# was just saved; the former `.iloc[:, :]` full slice was a no-op and is gone.
net.load_df(attribute_similarity_matrix.copy())
# The full drug x drug matrix is too large to display, so only the top 300
# rows ranked by row sum are kept before clustering.
net.filter_N_top('row', rank_type='sum', N_top=300)
net.cluster()
net.widget()

clustergrammer_widget(network='{"row_nodes": [{"name": "3-bromo-5-phenyl-N-(pyrimidin-5-ylmethyl)pyrazolo[1,5-…

## Create Gene Similarity Matrix

In [35]:
# Gene-by-gene similarity: the binary matrix is passed untransposed (genes as
# rows) to uf.similarity_matrix with the 'jaccard' metric and sparse=True.
# The displayed result is a square gene x gene frame.
gene_similarity_matrix = uf.similarity_matrix(binary_matrix, 'jaccard', sparse=True)
gene_similarity_matrix.head()

Unnamed: 0,A1BG,A2M,AADACL2,AADAT,AANAT,AARS1,AARS2,AASS,ABAT,ABCA1,...,YES1,YWHAB,YWHAE,YWHAH,YWHAQ,YWHAZ,ZAP70,ZFY,ZNF160,ZYX
A1BG,1.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.086957,0.0
AADACL2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AADAT,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.2,0.25,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AANAT,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Persist the gene similarity matrix under `path`, requesting npz compression
# and float32 values; symmetric=True is passed to uf.save_data.
# NOTE(review): presumably symmetric=True lets the helper store only one
# triangle of the matrix — confirm in harmonizome.utility_functions.
uf.save_data(gene_similarity_matrix, path, 
            output_name + '_gene_similarity_matrix',
            compression='npz', symmetric=True, dtype=np.float32)

## Create Gene-Attribute Edge List

In [37]:
# Build the gene-attribute edge list from the binary matrix (the helper also
# prints the number of associations), then persist it under `path` as a
# gzip-compressed tsv named '<output_name>_edge_list'.
edge_list = uf.edge_list(binary_matrix)
uf.save_data(edge_list, path, output_name + '_edge_list', 
        ext='tsv', compression='gzip')

The number of statisticaly relevent gene-attribute associations is: 16123


KeyboardInterrupt: 

# Create Downloadable Save File

In [None]:
# Bundle everything written to `path` into a downloadable archive.
# NOTE(review): the markdown link that follows points at ./output_archive.zip;
# presumably that is the file uf.archive creates — confirm.
uf.archive(path)

### Link to download output files: [click here](./output_archive.zip)