## Drugmonizome ETL : CREEDS

##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

#### Data source : https://amp.pharm.mssm.edu/CREEDS/

In [None]:
#%%appyter init
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
import os
import sys
import zipfile
import datetime
import json

import pandas as pd
import numpy as np
import drugmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [None]:
%load_ext autoreload
%autoreload 2

### Notebook Information

In [None]:
# Record the run date and interpreter version alongside the notebook output.
print(
    'This notebook was run on:', datetime.date.today(),
    '\nPython version:', sys.version,
)

### Initializing Notebook

In [None]:
%%appyter hide_code

# Declares the 'data' section of the input form (title 'Upload Data'); the
# input fields of this notebook attach to it through section='data'.
{% do SectionField(
    name='data',
    title='Upload Data',
    img='load_icon.png'
) %}

In [None]:
%%appyter code_eval

# Help text for the 'data' section; it links to the CREEDS downloads page.
{% do DescriptionField(
    name='description',
    text='The example below was sourced from <a href="https://amp.pharm.mssm.edu/CREEDS/#downloads" target="_blank">https://amp.pharm.mssm.edu/CREEDS/</a>. If clicking on the example does not work, it should be downloaded directly from the source website.',
    section='data'
) %}

# File input for the drug-induced signatures. The constraint pattern only
# accepts names ending in .json, and the default/example is the CREEDS
# single-drug perturbation file. The field is bound to the template
# variable data_file, which the Load Data cell passes to open().
{% set data_file = FileField(
    constraint='.*\.json$',
    name='signatures', 
    label='Drug-induced signatures (.json)', 
    default='single_drug_perturbations-v1.0.json',
    examples={
        'single_drug_perturbations-v1.0.json': 'https://amp.pharm.mssm.edu/CREEDS/download/single_drug_perturbations-v1.0.json'
    },
    section='data'
) %}

# Identifier used for the exported small molecules: 'Name' (the default) or
# 'InChI Key'. Bound to the template variable entity_type, which selects the
# branch taken in the indexing cell.
{% set entity_type = ChoiceField(
    name='entity_type',
    label='Choose identifier type for exported small molecules',
    choices=[
        'Name',
        'InChI Key',
    ],
    default='Name',
    section='data'
) %}

### Load Gene Mapping Dictionaries

In [None]:
# Load the two gene lookup tables from the harmonizome lookup module.
# symbol_lookup is later passed to uf.map_symbols; geneid_lookup is not used
# by any other cell visible in this notebook.
symbol_lookup, geneid_lookup = lookup.get_lookups()

### Create Output Path

In [None]:
%%appyter code_exec

# Base name shared by every exported file, plus one output directory per
# direction of regulation.
output_name = 'CREEDS'
path_down = 'output/drugmonizome_CREEDS_downregulated'
path_up = 'output/drugmonizome_CREEDS_upregulated'

# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(path_down, exist_ok=True)
os.makedirs(path_up, exist_ok=True)

### Load Data

In [None]:
%%appyter code_exec

# Parse the uploaded CREEDS signature file. The encoding is pinned to UTF-8,
# the standard JSON encoding, so parsing does not depend on the platform's
# default locale encoding.
with open({{data_file}}, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Show the fields available on the first signature record.
data[0].keys()

### Extract human drug-gene interactions

In [None]:
# Collect (drugbank_id, gene) pairs from the human signatures only, keeping
# down- and up-regulated genes in separate edge lists.
edgelist_down = []
edgelist_up = []

for signature in data:
    if signature['organism'] != 'human':
        continue

    drugbank_id = signature['drugbank_id']
    # Only the first element of each gene record is kept
    # (presumably the gene symbol; the remainder is ignored here).
    edgelist_down.extend(
        (drugbank_id, record[0]) for record in signature['down_genes']
    )
    edgelist_up.extend(
        (drugbank_id, record[0]) for record in signature['up_genes']
    )

edge_columns = ['drugbank_id', 'gene']
df_down = pd.DataFrame(data=edgelist_down, columns=edge_columns)
df_up = pd.DataFrame(data=edgelist_up, columns=edge_columns)

### Map DrugBank IDs to Names and InChIKeys

In [None]:
# Fetch the tab-separated DrugBank mapping file from the Drugmonizome repository
drugbank_mapping_url = 'https://raw.githubusercontent.com/MaayanLab/Drugmonizome/master/drugsetlibraries/metadata/mapping_files/drugbank.tsv'
drugbank_mapping = pd.read_csv(drugbank_mapping_url, sep='\t')

In [None]:
# Attach the DrugBank mapping columns to every edge. No join key is given, so
# pandas joins on the columns the two frames share and, with the default
# inner join, keeps only rows that have a match in the mapping file.
df_down = pd.merge(df_down, drugbank_mapping)
df_up = pd.merge(df_up, drugbank_mapping)

### Index Dataframe by selected small molecule identifier

In [None]:
%%appyter code_exec

{% if entity_type.raw_value == 'InChI Key' %}
# Index small molecules by InChI Key
df_down = df_down[['inchi_key','gene']]
df_down.set_index('inchi_key', inplace = True)

df_up = df_up[['inchi_key','gene']]
df_up.set_index('inchi_key', inplace = True)

{% else %}
# Index small molecules by name
df_down = df_down[['name','gene']]
df_down.set_index('name', inplace = True)

df_up = df_up[['name','gene']]
df_up.set_index('name', inplace = True)

{% endif %}

### Matching Gene Symbols to Approved Entrez Gene Symbols

In [None]:
# Run the down-regulated edge list through uf.map_symbols with the symbol
# lookup table, then display the shape of the result.
# NOTE(review): presumably genes without an approved symbol are dropped or
# renamed here - confirm in drugmonizome.utility_functions.
df_down = uf.map_symbols(df_down, symbol_lookup)
df_down.shape

In [None]:
# Run the up-regulated edge list through uf.map_symbols with the symbol
# lookup table, then display the shape of the result.
# NOTE(review): presumably genes without an approved symbol are dropped or
# renamed here - confirm in drugmonizome.utility_functions.
df_up = uf.map_symbols(df_up, symbol_lookup)
df_up.shape

## Analyze Data

### Export Edge List

In [None]:
# Export the down-regulated edge list to path_down, requesting a
# gzip-compressed TSV named after output_name + '_edge_list'.
# The final file name and layout are decided by uf.save_data.
uf.save_data(df_down, path_down, output_name + '_edge_list', 
        ext='tsv', compression='gzip')

In [None]:
# Export the up-regulated edge list to path_up, requesting a
# gzip-compressed TSV named after output_name + '_edge_list'.
# The final file name and layout are decided by uf.save_data.
uf.save_data(df_up, path_up, output_name + '_edge_list', 
        ext='tsv', compression='gzip')

### Create Binary Matrix

In [None]:
# Build the binary matrix from the down-regulated edge list and display its shape.
# NOTE(review): presumably rows are small molecules and columns are genes,
# with 1 marking an edge - confirm in uf.binary_matrix.
binary_matrix_down = uf.binary_matrix(df_down)
binary_matrix_down.shape

In [None]:
# Build the binary matrix from the up-regulated edge list.
# NOTE(review): presumably rows are small molecules and columns are genes,
# with 1 marking an edge - confirm in uf.binary_matrix.
binary_matrix_up = uf.binary_matrix(df_up)
# Display the shape of the up-regulated matrix built on the line above,
# mirroring what the down-regulated cell does for its own matrix.
binary_matrix_up.shape

In [None]:
# Export the down-regulated binary matrix to path_down, requesting npz
# compression and an unsigned 8-bit dtype for the stored values.
uf.save_data(binary_matrix_down, path_down, output_name + '_binary_matrix', 
            compression='npz', dtype=np.uint8)

In [None]:
# Export the up-regulated binary matrix to path_up, requesting npz
# compression and an unsigned 8-bit dtype for the stored values.
uf.save_data(binary_matrix_up, path_up, output_name + '_binary_matrix', 
            compression='npz', dtype=np.uint8)

### Create Drug and Attribute Set Library

In [None]:
# Write the drug set library for each direction of regulation into its own
# output directory (down-regulated first, then up-regulated).
for matrix, out_dir in ((binary_matrix_down, path_down),
                        (binary_matrix_up, path_up)):
    uf.save_setlib(matrix, 'drug', out_dir, output_name + '_drug_setlibrary')

In [None]:
# Write the attribute set library for each direction of regulation into its
# own output directory (down-regulated first, then up-regulated).
for matrix, out_dir in ((binary_matrix_down, path_down),
                        (binary_matrix_up, path_up)):
    uf.save_setlib(matrix, 'attribute', out_dir, output_name + '_attribute_setlibrary')

### Create Attribute Similarity Matrix

In [None]:
# Jaccard similarity computed on the transposed binary matrices, so the
# comparison runs over the other axis than in the drug similarity cell.
# sparse=True is forwarded to uf.similarity_matrix as-is.
# NOTE(review): presumably this yields a gene-by-gene matrix - confirm
# against the orientation produced by uf.binary_matrix.
attribute_similarity_matrix_down = uf.similarity_matrix(binary_matrix_down.T, 'jaccard', sparse=True)
attribute_similarity_matrix_up = uf.similarity_matrix(binary_matrix_up.T, 'jaccard', sparse=True)

In [None]:
# Export both attribute similarity matrices (down-regulated first, then
# up-regulated), flagged as symmetric and stored as float32 in npz form.
for similarity, out_dir in ((attribute_similarity_matrix_down, path_down),
                            (attribute_similarity_matrix_up, path_up)):
    uf.save_data(
        similarity,
        out_dir,
        output_name + '_attribute_similarity_matrix',
        compression='npz',
        symmetric=True,
        dtype=np.float32,
    )

### Create Drug Similarity Matrix

In [None]:
# Jaccard similarity computed on the binary matrices as built (no transpose).
# sparse=True is forwarded to uf.similarity_matrix as-is.
# NOTE(review): presumably this yields a drug-by-drug matrix - confirm
# against the orientation produced by uf.binary_matrix.
drug_similarity_matrix_down = uf.similarity_matrix(binary_matrix_down, 'jaccard', sparse=True)
drug_similarity_matrix_up = uf.similarity_matrix(binary_matrix_up, 'jaccard', sparse=True)

In [None]:
# Export both drug similarity matrices (down-regulated first, then
# up-regulated), flagged as symmetric and stored as float32 in npz form.
for similarity, out_dir in ((drug_similarity_matrix_down, path_down),
                            (drug_similarity_matrix_up, path_up)):
    uf.save_data(
        similarity,
        out_dir,
        output_name + '_drug_similarity_matrix',
        compression='npz',
        symmetric=True,
        dtype=np.float32,
    )

### Create downloadable zip archive with all outputs

In [None]:
# Bundle every file under the two output directories into one archive.
paths = [path_down, path_up]

# The context manager closes the archive on exit. Closing is what writes the
# ZIP central directory, so an archive that is left open is not yet a valid
# zip file and the download link below would serve an incomplete file.
with zipfile.ZipFile("output_archive.zip", "w") as zipf:
    for path in paths:
        for root, _, files in os.walk(path):
            for f in files:
                zipf.write(os.path.join(root, f))

### Link to the output archive: [Download](./output_archive.zip)