## Drugmonizome ETL : DrugBank Proteins

##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

#### Data source : https://www.drugbank.ca/releases/latest#protein-identifiers

In [None]:
#%%appyter init
# Initialise appyter for this notebook. The lambda's default argument captures
# the `globals` builtin, so calling it returns this notebook's global namespace,
# which is what gets handed to `magic.init`.
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
import os
import sys
import zipfile
import datetime

import pandas as pd
import numpy as np
import drugmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [None]:
# Enable IPython's autoreload extension (mode 2) so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Notebook Information

In [None]:
# Record the run date and the Python interpreter version for provenance.
print('This notebook was run on:', datetime.date.today(), '\nPython version:', sys.version)

### Initializing Notebook

In [None]:
%%appyter hide_code

{# Declares the 'data' section ("Upload Data"); the input fields declared in this notebook attach to it via section='data'. #}
{% do SectionField(
    name='data',
    title='Upload Data',
    img='load_icon.png'
) %}

In [None]:
%%appyter code_eval

{# Explanatory text for the 'data' section: states where the example files come from and what to do if the example link fails. #}
{% do DescriptionField(
    name='description',
    text='The example below was sourced from <a href="https://www.drugbank.ca/releases/latest#protein-identifiers" target="_blank">drugbank.ca</a>. If clicking on the example does not work, it should be downloaded directly from the source website.',
    section='data'
) %}

{# Upload field for the DrugBank target polypeptide identifiers archive.
   Both dots in the constraint are escaped so that only file names ending in
   the literal suffix ".csv.zip" are accepted (an unescaped dot would match
   any character). #}
{% set data_file = FileField(
    constraint='.*\.csv\.zip$',
    name='protein_identifiers',
    label='Protein Dataset (csv.zip)',
    default='drugbank_all_target_polypeptide_ids.csv.zip',
    examples={
        'drugbank_all_target_polypeptide_ids.csv.zip': 'https://www.drugbank.ca/releases/5-1-7/downloads/target-all-polypeptide-ids'
    },
    section='data'
) %}

{# Upload field for the DrugBank vocabulary archive (drug ID -> name / InChI Key).
   The default must name the vocabulary example declared below; pointing it at
   the target-polypeptide archive would make the metadata loading cell look for
   'drugbank vocabulary.csv' inside the wrong zip file.
   Both dots in the constraint are escaped so only the literal ".csv.zip"
   suffix is accepted. #}
{% set metadata_file = FileField(
    constraint='.*\.csv\.zip$',
    name='drug_metadata',
    label='Drug Metadata (csv.zip)',
    default='drugbank_all_drugbank_vocabulary.csv.zip',
    examples={
        'drugbank_all_drugbank_vocabulary.csv.zip': 'https://www.drugbank.ca/releases/5-1-7/downloads/all-drugbank-vocabulary'
    },
    section='data'
) %}

{# Lets the user pick which identifier ('Name' or 'InChI Key') indexes the exported small molecules; its raw_value is checked in the indexing cell further down. #}
{% set entity_type = ChoiceField(
    name='entity_type',
    label='Choose identifier type for exported small molecules',
    choices=[
        'Name',
        'InChI Key',
    ],
    default='Name',
    section='data'
) %}

### Load Gene Mapping Dictionaries

In [None]:
# Fetch the two gene lookup objects from the harmonizome helper module.
# Only `symbol_lookup` is used by the cells shown in this notebook.
symbol_lookup, geneid_lookup = lookup.get_lookups()

### Create Output Path

In [None]:
%%appyter code_exec

# The dataset label is the third underscore-separated token of the uploaded
# file name (e.g. 'target' for 'drugbank_all_target_polypeptide_ids.csv.zip').
# NOTE: a file name with fewer than three tokens raises IndexError here.
dataset_label = {{ data_file }}.split('_')[2]

output_name = 'drugbank_' + dataset_label
path = 'output/drugmonizome_drugbank_' + dataset_label

# exist_ok=True creates the directory tree if missing and is a no-op when it
# already exists, avoiding the separate check-then-create step.
os.makedirs(path, exist_ok=True)

### Load Data

In [None]:
%%appyter code_exec

# Read only the three needed columns from 'all.csv' inside the uploaded archive.
with zipfile.ZipFile({{ data_file }}) as archive, archive.open('all.csv') as csv_handle:
    df = pd.read_csv(csv_handle, usecols=['Gene Name', 'Drug IDs', 'Species'])

# Keep rows whose Species mentions 'Humans' (a missing Species counts as
# non-human), then discard the Species column.
is_human = df['Species'].str.contains('Humans', na=False)
df = df.loc[is_human].drop(columns='Species')
df.head()

In [None]:
# (rows, columns) after the human-only filter.
df.shape

### Matching Gene Symbols to Approved Entrez Gene Symbols

In [None]:
# Pass the frame through the project helper together with the symbol lookup.
# NOTE(review): presumably this rewrites 'Gene Name' to approved symbols and may
# drop unmapped rows — confirm in drugmonizome.utility_functions.map_symbols.
df = uf.map_symbols(df, symbol_lookup)
df.shape

### Splitting Drug IDs Column

In [None]:
# Each 'Drug IDs' cell is a '; '-delimited string. The vectorised .str.split
# turns it into a list and leaves missing values as NaN instead of raising
# AttributeError, as calling x.split on a float NaN would.
df['Drug IDs'] = df['Drug IDs'].str.split('; ')

# One row per (gene, drug ID) pair; rows with any missing value are dropped.
df = df.explode('Drug IDs').dropna()
df.head()

### Loading Drug Metadata

In [None]:
%%appyter code_exec

# Read the ID, name and InChI Key columns from 'drugbank vocabulary.csv'
# inside the uploaded metadata archive.
with zipfile.ZipFile({{ metadata_file }}) as archive, archive.open('drugbank vocabulary.csv') as csv_handle:
    drugbank_mapping = pd.read_csv(
        csv_handle,
        usecols=['DrugBank ID', 'Common name', 'Standard InChI Key'],
    )

In [None]:
# Lower-case the drug names and keep only entries that have an InChI Key.
drugbank_mapping = (
    drugbank_mapping
    .assign(**{'Common name': drugbank_mapping['Common name'].str.lower()})
    .dropna(subset=['Standard InChI Key'])
)

drugbank_mapping.head()

### Mapping Drug IDs 

In [None]:
# Inner join: only rows whose drug ID is present in the filtered vocabulary
# are kept.
df = pd.merge(
    df,
    drugbank_mapping,
    how='inner',
    left_on='Drug IDs',
    right_on='DrugBank ID',
)

In [None]:
# Preview the merged gene / drug table.
df.head()

### Index dataframe by user-selected small molecule identifier

In [None]:
%%appyter code_exec

{% if entity_type.raw_value == 'InChI Key' %}
# Index small molecules by InChI Key
df_output = df[['Standard InChI Key','Gene Name']]
df_output.set_index('Standard InChI Key', inplace = True)

{% else %}
# Index small molecules by name
df_output = df[['Common name','Gene Name']]
df_output.set_index('Common name', inplace = True)

{% endif %}

In [None]:
# Preview the identifier-indexed edge list.
df_output.head()

## Analyze Data

### Export Edge List

In [None]:
# Hand the edge list to the project save helper, targeting `path` with the
# '_edge_list' suffix; ext and compression are passed as 'tsv' and 'gzip'.
uf.save_data(df_output, path, output_name + '_edge_list', 
        ext='tsv', compression='gzip')

### Create Binary Matrix

In [None]:
# Build the matrix from the indexed edge list with the project helper.
# NOTE(review): presumably rows are small molecules and columns are genes —
# confirm in drugmonizome.utility_functions.binary_matrix.
binary_matrix = uf.binary_matrix(df_output)
binary_matrix.head()

In [None]:
# Dimensions of the binary matrix.
binary_matrix.shape

In [None]:
# Save the binary matrix through the project helper, requesting 'npz'
# compression and an unsigned 8-bit dtype.
uf.save_data(binary_matrix, path, output_name + '_binary_matrix', 
            compression='npz', dtype=np.uint8)

### Create Drug and Attribute Set Library

In [None]:
# Export the set library in 'drug' mode via the project helper.
# NOTE(review): the exact orientation of 'drug' vs 'attribute' sets is defined
# in uf.save_setlib — confirm there.
uf.save_setlib(binary_matrix, 'drug', path, output_name + '_drug_setlibrary')

In [None]:
# Export the set library in 'attribute' mode via the project helper.
# NOTE(review): the exact orientation of 'drug' vs 'attribute' sets is defined
# in uf.save_setlib — confirm there.
uf.save_setlib(binary_matrix, 'attribute', path, output_name + '_attribute_setlibrary')

### Create Attribute Similarity Matrix

In [None]:
# Similarity is computed on the transposed binary matrix with the 'jaccard'
# metric and sparse=True, so the compared entities are the original columns.
attribute_similarity_matrix = uf.similarity_matrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

In [None]:
# Save the attribute similarity matrix through the project helper, flagged as
# symmetric, with 'npz' compression and 32-bit floats.
uf.save_data(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

### Create Drug Similarity Matrix

In [None]:
# Similarity is computed on the binary matrix as-is (not transposed) with the
# 'jaccard' metric and sparse=True, so the compared entities are its rows.
drug_similarity_matrix = uf.similarity_matrix(binary_matrix, 'jaccard', sparse=True)
drug_similarity_matrix.head()

In [None]:
# Save the drug similarity matrix through the project helper, flagged as
# symmetric, with 'npz' compression and 32-bit floats.
uf.save_data(drug_similarity_matrix, path,
            output_name + '_drug_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

### Create download folder with all outputs

In [None]:
# Bundle everything written under `path` using the project helper.
# NOTE(review): presumably this produces the output_archive.zip linked in the
# final heading — confirm in uf.archive.
uf.archive(path)

### Link to the output folder: [Download](./output_archive.zip)