# DrugBank

Created by: Charles Dai <br>
Credit to: Moshe Silverstein

Data Source: https://www.drugbank.ca/releases/latest

In [None]:
# appyter init
# Hand appyter a zero-argument callable that returns this notebook's global
# namespace: the lambda's default argument binds the `globals` builtin and the
# body calls it.
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
import sys
import os
from datetime import date
import zipfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [None]:
# The star import is what brings `Network` and `clustergrammer_widget` into
# scope for the line below (neither name is defined elsewhere in this notebook).
from clustergrammer_widget import *
# Shared network object; it is loaded and rendered in the attribute similarity
# section further down.
net = Network(clustergrammer_widget)

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

### Notebook Information

In [None]:
# Record when the notebook was run and with which interpreter.
print(
    'This notebook was run on:', date.today(),
    '\nPython version:', sys.version,
)

# Initialization

In [None]:
%%appyter hide_code

{# Form section named 'data'; the description and file fields below attach to it via section='data'. #}
{% do SectionField(
    name='data',
    title='Upload Data',
    img='load_icon.png'
) %}

{# Form section named 'settings'; the identifier-group choice field attaches to it via section='settings'. #}
{% do SectionField(
    name='settings',
    title='Settings',
    img='setting_icon.png'
) %}

In [None]:
%%appyter code_eval

{# Explanatory note attached to the 'data' section. #}
{% do DescriptionField(
    name='description',
    text='The examples below were sourced from <a href="https://www.drugbank.ca/releases/latest" target="_blank">www.drugbank.ca</a>. The download requires a login so clicking on the examples may not work, in which case they should be downloaded directly from the source website.',
    section='data'
) %}

{# File input for the polypeptide ID archive; the template variable df_file is consumed by the Load Data cell. #}
{# NOTE(review): in the constraint regex the dot before "zip" is unescaped and so matches any character - confirm whether '\.csv\.zip$' was intended. #}
{% set df_file = FileField(
    constraint='.*\.csv.zip$',
    name='drug_identifiers', 
    label='Drug Identifiers Dataset (csv.zip)', 
    default='Input/DrugBank/drugbank_all_target_polypeptide_ids.csv.zip',
    examples={
        'drugbank_all_target_polypeptide_ids.csv.zip': 'https://www.drugbank.ca/releases/5-1-7/downloads/target-all-polypeptide-ids'
    },
    section='data'
) %}

{# File input for the drug links archive; the template variable drug_meta is consumed by the Load Drug Metadata cell. #}
{# NOTE(review): the label says csv.zip but the constraint only requires a .zip suffix - confirm which is intended. #}
{% set drug_meta = FileField(
    constraint='.*\.zip$',
    name='drug_metadata', 
    label='External Drug Links (csv.zip)', 
    default='Input/DrugBank/drugbank_all_drug_links.csv.zip',
    examples={
        'drugbank_all_drug_links.csv.zip': 'https://www.drugbank.ca/releases/5-1-7/downloads/all-drug-links'
    },
    section='data'
) %}

In [None]:
%%appyter code_eval

{# Choice of protein identifier group; the template variable group is used by the Output Path cell to build the output file prefix and the output directory name. #}
{% set group = ChoiceField(
    name='identifier',
    label='Protein Identifier Group',
    description='This will be used for the output file names.',
    choices=['Target', 'Enzyme', 'Carrier', 'Transporter'],
    default='Target',
    section='settings'
) %}

### Load Mapping Dictionaries

In [None]:
# Fetch the two lookup tables used later: symbol_lookup is passed to
# uf.map_symbols and geneid_lookup to uf.gene_list.
# NOTE(review): presumably gene symbol -> approved symbol and symbol -> gene ID
# mappings; confirm in harmonizome.lookup.
symbol_lookup, geneid_lookup = lookup.get_lookups()

### Output Path

In [None]:
%%appyter code_exec

# Prefix shared by every output file name; the quoted placeholder is filled in
# from the identifier-group form field when the template is rendered.
# Plain concatenation is deliberate: an f-string would treat the doubled
# braces as escapes.
output_name = 'drugbank_' + '{{group}}'.lower()

# Directory that receives every file written by the cells below.
path = 'Output/DrugBank-' + '{{group}}'
# exist_ok=True creates the directory (and any missing parents) and is a no-op
# when it already exists, avoiding the check-then-create race.
os.makedirs(path, exist_ok=True)

# Load Data

In [None]:
%%appyter code_exec

with zipfile.ZipFile({{df_file}}) as zipf:
  with zipf.open('all.csv') as f:
    df = pd.read_csv(f, usecols=['Gene Name', 'Drug IDs', 'Species'], index_col=0)

In [None]:
df.head()

In [None]:
df.shape

# Load Drug Metadata

In [None]:
%%appyter code_exec

drug_meta = pd.read_csv(
    {{drug_meta}}, 
    usecols=['DrugBank ID', 'Name'], index_col=0
)

In [None]:
drug_meta.head()

In [None]:
drug_meta.shape

# Pre-process Data

## Get Relevant Data

In [None]:
# Keep only the rows whose species is one of the three of interest, then
# discard the species column since it is no longer needed.
kept_species = ['Humans', 'Mouse', 'Rat']
is_kept = df['Species'].isin(kept_species)
df = df[is_kept].drop(columns='Species')
df.head()

## Split Drug ID List

In [None]:
df['Drug IDs'] = df['Drug IDs'].map(lambda x: x.split('; '))
df = df.explode('Drug IDs').dropna()
df.head()

In [None]:
df.shape

## Map Drug IDs to Names

In [None]:
# Look up the name for every drug ID: reindexing the metadata by the ID column
# yields one row per row of df, in the same order (unknown IDs become NaN).
names_in_row_order = drug_meta.reindex(df['Drug IDs'])
# Re-key the looked-up names by df's own index so the assignment lines up
# row for row.
df['Drug IDs'] = names_in_row_order.set_index(df.index)
# The single remaining column now holds names rather than IDs.
df.columns = ['Drug Name']
df.index.name = 'Gene Symbol'
df.head()

# Filter Data

## Map Gene Symbols to Up-to-date Approved Gene Symbols

In [None]:
# Run the table through uf.map_symbols with the symbol lookup.
# NOTE(review): presumably this replaces outdated gene symbols in the index
# with current approved ones and, with remove_duplicates=True, drops repeated
# rows - confirm in harmonizome.utility_functions.
df = uf.map_symbols(df, symbol_lookup, remove_duplicates=True)
df.shape

# Analyze Data

## Create Binary Matrix

In [None]:
binary_matrix = uf.binary_matrix(df)
binary_matrix.head()

In [None]:
binary_matrix.shape

In [None]:
uf.save_data(binary_matrix, path, output_name + '_binary_matrix', 
            compression='npz', dtype=np.uint8)

## Create Gene List

In [None]:
gene_list = uf.gene_list(binary_matrix, geneid_lookup)
gene_list.head()

In [None]:
gene_list.shape

In [None]:
uf.save_data(gene_list, path, output_name + '_gene_list',
            ext='tsv', compression='gzip', index=False)

## Create Attribute List

In [None]:
attribute_list = uf.attribute_list(binary_matrix)
attribute_list.head()

In [None]:
attribute_list.shape

In [None]:
uf.save_data(attribute_list, path, output_name + '_attribute_list',
            ext='tsv', compression='gzip')

## Create Gene and Attribute Set Libraries

In [None]:
uf.save_setlib(binary_matrix, 'gene', 'up', path, output_name + '_gene_up_set')

In [None]:
uf.save_setlib(binary_matrix, 'attribute', 'up', path, 
                           output_name + '_attribute_up_set')

## Create Attribute Similarity Matrix

In [None]:
attribute_similarity_matrix = uf.similarity_matrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

In [None]:
uf.save_data(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

In [None]:
net.load_df(attribute_similarity_matrix.iloc[:,:].copy())
net.filter_N_top('row', rank_type='sum', N_top=300)
net.cluster()
net.widget()

## Create Gene Similarity Matrix

In [None]:
gene_similarity_matrix = uf.similarity_matrix(binary_matrix, 'jaccard', sparse=True)
gene_similarity_matrix.head()

In [None]:
uf.save_data(gene_similarity_matrix, path, 
            output_name + '_gene_similarity_matrix',
            compression='npz', symmetric=True, dtype=np.float32)

## Create Gene-Attribute Edge List

In [None]:
# Derive the gene-attribute edge list from the binary matrix and write it as
# a gzip-compressed TSV.
edge_list = uf.edge_list(binary_matrix)
uf.save_data(
    edge_list,
    path,
    output_name + '_edge_list',
    ext='tsv',
    compression='gzip',
)

# Create Downloadable Save File

In [None]:
# Bundle the output directory for download.
# NOTE(review): the download link below points at ./output_archive.zip -
# confirm that uf.archive writes a file with that name.
uf.archive(path)

### Link to download output files: [click here](./output_archive.zip)