## Drugmonizome ETL: DrugCentral

##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

#### Data Source: http://drugcentral.org/download

In [None]:
#%%appyter init
# Initialise appyter's `magic` module for this notebook; the `%%appyter ...`
# cell magics used in later cells depend on this cell having run.
from appyter import magic
# The lambda's default argument is the builtin `globals`; calling the lambda
# therefore returns this notebook's global namespace.
# NOTE(review): presumably appyter evaluates rendered cells in that namespace — confirm.
magic.init(lambda _=globals: _())

In [None]:
import os
import sys
import zipfile
import datetime

import pandas as pd
import numpy as np
import drugmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the local helper packages
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Notebook Information

In [None]:
# Record when, and under which interpreter, this notebook was executed.
print(f"This notebook was run on: {datetime.date.today()} \nPython version: {sys.version}")

### Initializing Notebook

In [None]:
%%appyter hide_code

{# Declares the 'data' section of the appyter form; the input fields defined
   in the next cell attach to it through section='data'. #}
{% do SectionField(
    name='data',
    title='Upload Data',
    img='load_icon.png'
) %}

In [None]:
%%appyter code_eval

{# Explanatory text attached to the 'data' section of the form. #}
{% do DescriptionField(
    name='description',
    text='The example below was sourced from <a href="http://drugcentral.org/download" target="_blank">http://drugcentral.org/</a>. If clicking on the example does not work, it should be downloaded directly from the source website.',
    section='data'
) %}

{# Drug-target interaction upload. Both dots of the extension are escaped so
   the constraint only accepts names ending in the literal ".tsv.gz"; an
   unescaped dot would match any character in that position. #}
{% set data_file = FileField(
    constraint='.*\.tsv\.gz$',
    name='drug-target-interactions',
    label='Drug-target interaction data (tsv.gz)',
    default='drug.target.interaction.tsv.gz',
    examples={
        'drug.target.interaction.tsv.gz': 'http://unmtid-shinyapps.net/download/drug.target.interaction.tsv.gz'
    },
    section='data'
) %}

{# Small molecule metadata upload (plain, uncompressed TSV). #}
{% set metadata_file = FileField(
    constraint='.*\.tsv$',
    name='small_molecule_metadata',
    label='Small molecule metadata (tsv)',
    default='structures.smiles.tsv',
    examples={
        'structures.smiles.tsv': 'http://unmtid-shinyapps.net/download/structures.smiles.tsv'
    },
    section='data'
) %}

{# Identifier used to index the exported small molecules; a later cell
   branches on whether the raw value equals 'InChI Key'. #}
{% set entity_type = ChoiceField(
    name='entity_type',
    label='Choose identifier type for exported small molecules',
    choices=[
        'Name',
        'InChI Key',
    ],
    default='Name',
    section='data'
) %}

### Load Gene Mapping Dictionaries

In [None]:
# Load the gene lookup tables through the project's harmonizome helper.
# Of the two returned objects only `symbol_lookup` is used by the cells shown
# in this notebook; `geneid_lookup` is unpacked but not referenced afterwards.
symbol_lookup, geneid_lookup = lookup.get_lookups()

### Create Output Path

In [None]:
%%appyter code_exec

# Prefix for every exported file name and the folder that receives them.
output_name = 'drugcentral'
path = 'output/drugmonizome_drugcentral'
# exist_ok=True creates the folder (and missing parents) when absent and is a
# no-op when it already exists, so the cell is safe to re-run and has no
# check-then-create race. It also fails loudly if `path` exists but is not a
# directory, instead of deferring the error to the first save.
os.makedirs(path, exist_ok=True)

### Load Drug-Target Interaction Data

In [None]:
%%appyter code_exec

# Read only the three columns this notebook uses from the uploaded
# tab-separated interaction file, then preview the first rows.
df_data = pd.read_csv(
    {{data_file}},
    sep='\t',
    usecols=['GENE', 'DRUG_NAME', 'ORGANISM'],
)
df_data.head()

In [None]:
# Dimensions (rows, columns) of the interaction table as loaded, before any filtering.
df_data.shape

### Splitting GENE Column

In [None]:
# Retain only interactions whose target organism is human. The explicit copy
# detaches the result from the unfiltered frame, so the column assignment made
# on it further down does not trigger pandas' SettingWithCopyWarning.
df_data = df_data[df_data['ORGANISM'] == 'Homo sapiens'].copy()

In [None]:
# A single row can list several targets joined by '|'; such rows have to be
# expanded into one row per target. Show two examples of this pattern.
df_data.loc[df_data['GENE'].eq('CACNA1C|CACNA1D')].head(2)

In [None]:
# Expand rows that list several '|'-separated targets into one row per gene.
# - Rows without a GENE value are dropped first: they carry no target, and a
#   per-element `x.split('|')` would raise AttributeError on the missing value.
# - `.str.split` is the vectorised equivalent of the per-element split.
# - `assign` builds a new frame instead of writing into a filtered slice,
#   which avoids SettingWithCopyWarning and keeps the cell re-runnable.
df_data = (
    df_data
    .dropna(subset=['GENE'])
    .assign(GENE=lambda frame: frame['GENE'].str.split('|'))
    .explode('GENE')
)
df_data.head()

### Loading Small Molecule Metadata

In [None]:
%%appyter code_exec

# Read the structure identifier (InChIKey) and drug name (INN) columns from
# the uploaded tab-separated metadata file, then preview the first rows.
df_meta = pd.read_csv(
    {{metadata_file}},
    sep='\t',
    usecols=['InChIKey', 'INN'],
)
df_meta.head()

In [None]:
# Dimensions (rows, columns) of the metadata table as loaded.
df_meta.shape

### Match Metadata to Small Molecule Names

In [None]:
# Give the metadata's INN column the same name as the interaction table's
# drug-name column, then join the two tables on the column(s) they share
# (pandas' default: inner join on all common column names).
df_meta = df_meta.rename(columns={'INN': 'DRUG_NAME'})
df_data = pd.merge(df_data, df_meta)

In [None]:
# Preview the interaction table after the metadata merge (it now also carries InChIKey).
df_data.head()

### Index DataFrame by User-Selected Small Molecule Identifier

In [None]:
%%appyter code_exec

{% if entity_type.raw_value == 'InChI Key' %}
# Index small molecules by InChI Key
df_output = df_data[['InChIKey','GENE']]
df_output.set_index('InChIKey', inplace = True)

{% else %}
# Index small molecules by name
df_output = df_data[['DRUG_NAME','GENE']]
df_output.set_index('DRUG_NAME', inplace = True)

{% endif %}

### Matching Gene Symbols to Approved Entrez Gene Symbols

In [None]:
# Pass the edge list through the project's symbol-mapping helper together with
# the symbol lookup loaded earlier, then preview the result.
# NOTE(review): presumably this normalises gene symbols to approved ones and
# discards unmappable rows — confirm against uf.map_symbols.
df_output = uf.map_symbols(df_output, symbol_lookup)
df_output.head()

## Analyze Data

### Export Edge List

In [None]:
# Hand the edge list to the project's save helper, targeting the output folder
# under the name '<output_name>_edge_list'.
# NOTE(review): presumably ext/compression yield a gzip-compressed TSV — confirm in uf.save_data.
uf.save_data(df_output, path, output_name + '_edge_list', 
        ext='tsv', compression='gzip')

### Create Binary Matrix

In [None]:
# Build a matrix from the edge list with the project helper and preview it.
# NOTE(review): presumably rows are small molecules, columns are genes and
# values are 0/1 membership flags — confirm against uf.binary_matrix.
binary_matrix = uf.binary_matrix(df_output)
binary_matrix.head()

In [None]:
# Dimensions of the binary matrix.
binary_matrix.shape

In [None]:
# Save the binary matrix under '<output_name>_binary_matrix', requesting 'npz'
# compression and an unsigned 8-bit dtype from the project helper.
# NOTE(review): uint8 assumes the matrix only holds small non-negative integers — confirm.
uf.save_data(binary_matrix, path, output_name + '_binary_matrix', 
            compression='npz', dtype=np.uint8)

### Create Drug and Attribute Set Libraries

In [None]:
# Write the set library in 'drug' mode under '<output_name>_drug_setlibrary'.
# NOTE(review): which axis of the matrix becomes the set label in 'drug' mode is decided inside uf.save_setlib — confirm.
uf.save_setlib(binary_matrix, 'drug', path, output_name + '_drug_setlibrary')

In [None]:
# Write the set library in 'attribute' mode under '<output_name>_attribute_setlibrary'.
# NOTE(review): which axis of the matrix becomes the set label in 'attribute' mode is decided inside uf.save_setlib — confirm.
uf.save_setlib(binary_matrix, 'attribute', path, output_name + '_attribute_setlibrary')

### Create Attribute Similarity Matrix

In [None]:
# Compute a 'jaccard' similarity matrix on the transposed binary matrix, so
# the helper compares what are columns (attributes) in binary_matrix.
# NOTE(review): assumes uf.similarity_matrix compares rows of its input — confirm.
attribute_similarity_matrix = uf.similarity_matrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

In [None]:
# Save the attribute similarity matrix under
# '<output_name>_attribute_similarity_matrix' as float32 with 'npz' compression.
# NOTE(review): symmetric=True presumably lets the helper store only one triangle — confirm in uf.save_data.
uf.save_data(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

### Create Drug Similarity Matrix

In [None]:
# Compute a 'jaccard' similarity matrix on the binary matrix as-is (not
# transposed), i.e. between its rows.
# NOTE(review): assumes rows of binary_matrix are small molecules and that the helper compares rows — confirm.
drug_similarity_matrix = uf.similarity_matrix(binary_matrix, 'jaccard', sparse=True)
drug_similarity_matrix.head()

In [None]:
# Save the drug similarity matrix under '<output_name>_drug_similarity_matrix'
# as float32 with 'npz' compression.
# NOTE(review): symmetric=True presumably lets the helper store only one triangle — confirm in uf.save_data.
uf.save_data(drug_similarity_matrix, path,
            output_name + '_drug_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

### Create download folder with all outputs

In [None]:
# Bundle the output folder with the project's archive helper.
# NOTE(review): presumably this writes the output_archive.zip that the download link below points to — confirm in uf.archive.
uf.archive(path)

### Link to the output folder: [Download](./output_archive.zip)