## Drugmonizome ETL: Drug Repurposing Hub

##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

#### Data Source: https://clue.io/data/REP#REP

In [None]:
#%%appyter init
# Bootstrap appyter's notebook magics. The callable handed to magic.init returns
# this notebook's globals() whenever it is invoked.
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
import os
import sys
import datetime
import zipfile

import pandas as pd
import numpy as np
import drugmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [None]:
# Load IPython's autoreload extension; mode 2 reloads imported modules before each
# cell executes, so edits to helper modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

### Notebook Information

In [None]:
# Record when and with which interpreter this notebook was executed.
run_date = datetime.date.today()
python_version = sys.version
print('This notebook was run on:', run_date, '\nPython version:', python_version)

### Initializing Notebook

In [None]:
%%appyter hide_code

{# Declares the 'data' section of the input form; the fields declared in this notebook attach to it via section='data'. #}
{% do SectionField(
    name='data',
    title='Upload Data',
    img='load_icon.png'
) %}

In [None]:
%%appyter code_eval

{# Help text displayed in the 'data' section. The visible link text now matches the href it points to. #}
{% do DescriptionField(
    name='description',
    text='The example below was sourced from <a href="https://clue.io/repurposing#download-data" target="_blank">https://clue.io/repurposing#download-data</a>. If clicking on the example does not work, it should be downloaded directly from the source website.',
    section='data'
) %}

{# Annotation table upload: the loading cell reads the `pert_iname` column and the selected attribute-group column from this file. #}
{% set data_file = FileField(
    constraint='.*\.txt$',
    name='drug_attribute', 
    label='Drug Attribute Data (.txt)', 
    default='repurposing_drugs_20200324.txt',
    examples={
        'repurposing_drugs_20200324.txt': 'https://s3.amazonaws.com/data.clue.io/repurposing/downloads/repurposing_drugs_20200324.txt'
    },
    section='data'
) %}

{# Metadata table upload: the loading cell reads the `pert_iname` and `InChIKey` columns from this file. #}
{% set metadata_file = FileField(
    constraint='.*\.txt$',
    name='drug_metadata', 
    label='Drug Metadata (.txt)', 
    default='repurposing_samples_20200324.txt',
    examples={
        'repurposing_samples_20200324.txt': 'https://s3.amazonaws.com/data.clue.io/repurposing/downloads/repurposing_samples_20200324.txt'
    },
    section='data'
) %}

{# Selects which column indexes the exported small molecules: 'InChI Key' uses `InChIKey`, anything else uses `pert_iname`. #}
{% set entity_type = ChoiceField(
    name='entity_type',
    label='Choose identifier type for exported small molecules',
    choices=[
        'Name',
        'InChI Key',
    ],
    default='Name',
    section='data'
) %}

{# Attribute group to extract. Note the field is registered under the name 'identifier' but bound to the template variable `group`; the value also becomes part of the output folder and file names. #}
{% set group = ChoiceField(
    name='identifier',
    label='Choose attribute group',
    description='This will be used for the output file names.',
    choices=['target',
             'moa'
            ],
    default='target',
    section='data'
) %}

In [None]:
%%appyter markdown

{% if group.value == 'target' %}
### Load Gene Mapping Dictionaries
{% else %} 
{% endif %}

In [None]:
%%appyter code_exec

{% if group.value == 'target' %}
# Lookup tables are only loaded for the 'target' attribute group.
# symbol_lookup is later passed to uf.map_symbols; geneid_lookup is not used in the visible cells.
symbol_lookup, geneid_lookup = lookup.get_lookups()
{% else %} 
{% endif %}

### Create Output Path

In [None]:
%%appyter code_exec

output_name = 'drugrepurposinghub_' + '{{ group }}'
path = 'output/drugmonizome_drugrepurposinghub_' + '{{ group }}'
if not os.path.exists(path):
    os.makedirs(path)

### Load Data

In [None]:
%%appyter code_exec
df_data = pd.read_table({{data_file}},
                        delimiter = '\t',
                        encoding ='latin-1',
                        skiprows = 9,
                        usecols = ['pert_iname','{{group}}'])
df_data['pert_iname'] = df_data['pert_iname'].str.lower()
df_data.head()

In [None]:
# (rows, columns) of the freshly loaded annotation table, before the attribute column is split.
df_data.shape

In [None]:
%%appyter markdown
### Splitting {{group}} column

In [None]:
%%appyter code_exec
df_data['{{ group }}'].dropna(inplace = True)
df_data['{{ group }}'] = df_data['{{ group }}'].map(lambda x: x.split('|'))
df_data = df_data.explode('{{ group }}').dropna()
df_data.shape

### Load Small Molecule Metadata

In [None]:
%%appyter code_exec
df_meta = pd.read_table({{metadata_file}},
                        delimiter = '\t',
                        encoding ='latin-1',
                        skiprows = 9,
                        usecols = ['pert_iname','InChIKey'])
df_meta.head()

### Match metadata to df_data

In [None]:
# df_data's drug names were lower-cased when the annotation table was loaded, while
# df_meta still carries the original casing. Normalise the join key on the metadata
# side so drugs with upper-case characters in their name are not silently dropped
# by the (default inner) merge.
df_meta_lower = df_meta.assign(pert_iname=df_meta['pert_iname'].str.lower())
df_data = df_data.merge(df_meta_lower, on = 'pert_iname')
df_data.head()

### Index dataframe by user selected small molecule identifier

In [None]:
%%appyter code_exec

{% if entity_type.value == 'InChI Key' %}
# Index small molecules by InChI Key
df_output = df_data[['InChIKey','{{ group }}']]
df_output.set_index('InChIKey', inplace = True)

{% else %}
# Index small molecules by name
df_output = df_data[['pert_iname','{{ group }}']]
df_output.set_index('pert_iname', inplace = True)

{% endif %}

In [None]:
%%appyter markdown
{% if group.value == 'target' %}
### Matching Gene Symbols to Approved Entrez Gene Symbols
{% else %}
{% endif %}

In [None]:
%%appyter code_exec
{% if group.value == 'target' %}
# Only runs for the 'target' attribute group, using the lookup loaded earlier.
# NOTE(review): presumably maps target symbols to approved gene symbols - confirm in uf.map_symbols.
df_output = uf.map_symbols(df_output, symbol_lookup)
df_output.shape
{% else %}
{% endif %}

## Analyze Data

### Export Edge List

In [None]:
# Write the drug-attribute edge list into the output folder as a gzip-compressed TSV.
uf.save_data(
    df_output,
    path,
    output_name + '_edge_list',
    ext='tsv',
    compression='gzip',
)

### Create Binary Matrix

In [None]:
# Build the matrix from the indexed edge list.
# NOTE(review): presumably rows are small molecules and columns are attributes - confirm in uf.binary_matrix.
binary_matrix = uf.binary_matrix(df_output)
binary_matrix.head()

In [None]:
# Dimensions of the binary matrix built in the previous cell.
binary_matrix.shape

In [None]:
# Persist the binary matrix into the output folder as npz, stored as unsigned 8-bit integers.
uf.save_data(
    binary_matrix,
    path,
    output_name + '_binary_matrix',
    compression='npz',
    dtype=np.uint8,
)

### Create Drug and Attribute Set Library

In [None]:
# Export the 'drug' set library derived from the binary matrix.
uf.save_setlib(
    binary_matrix,
    'drug',
    path,
    output_name + '_drug_setlibrary',
)

In [None]:
# Export the 'attribute' set library derived from the binary matrix.
uf.save_setlib(
    binary_matrix,
    'attribute',
    path,
    output_name + '_attribute_setlibrary',
)

### Create Attribute Similarity Matrix

In [None]:
# Jaccard similarity computed on the transposed binary matrix.
# NOTE(review): presumably the transpose puts attributes on the rows so similarities are attribute-by-attribute - confirm in uf.similarity_matrix.
attribute_similarity_matrix = uf.similarity_matrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

In [None]:
# Persist the attribute similarity matrix as npz with float32 values; symmetric=True is
# forwarded to uf.save_data.
uf.save_data(
    attribute_similarity_matrix,
    path,
    output_name + '_attribute_similarity_matrix',
    compression='npz',
    symmetric=True,
    dtype=np.float32,
)

### Create Drug Similarity Matrix

In [None]:
# Jaccard similarity computed on the binary matrix as-is (no transpose).
# NOTE(review): presumably this yields drug-by-drug similarities - confirm in uf.similarity_matrix.
drug_similarity_matrix = uf.similarity_matrix(binary_matrix, 'jaccard', sparse=True)
drug_similarity_matrix.head()

In [None]:
# Persist the drug similarity matrix as npz with float32 values; symmetric=True is
# forwarded to uf.save_data.
uf.save_data(
    drug_similarity_matrix,
    path,
    output_name + '_drug_similarity_matrix',
    compression='npz',
    symmetric=True,
    dtype=np.float32,
)

### Create download folder with all outputs

In [None]:
# Hand the output folder to the archive helper.
# NOTE(review): presumably this produces the archive that the download link below points to - confirm in uf.archive.
uf.archive(path)

### Link to the output folder: [Download](./output_archive.zip)