## Drugmonizome ETL : STITCH
### Each chemical-protein interaction is scored on a scale from 150 to 1000 based on the confidence of the interaction

### The combined-score cut-off for the interaction list is chosen by the user (500, 600, 700, 800, or 900)

##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

#### Data source : http://stitch.embl.de/cgi/download.pl?UserId=ptwsbc4REEdD&sessionId=lWA43YSvJUJa&species_text=Homo+sapiens

In [None]:
# appyter init
# magic.init receives a zero-argument callable. The lambda's default argument
# binds the built-in `globals`, so calling it returns this notebook's global
# namespace. Presumably this is what registers the %%appyter cell magics used
# in later cells (confirm in the appyter docs).
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
import os
import sys
import datetime
import zipfile

import pandas as pd
import numpy as np
import drugmonizome.utility_functions as uf
import harmonizome.lookup as lookup

### Notebook Information

In [None]:
# Record the run date and interpreter version next to the outputs for provenance.
print(f"This notebook was run on: {datetime.date.today()} \nPython version: {sys.version}")

### Load Gene Mapping Dictionaries

In [None]:
# Fetch the two gene lookups from the harmonizome helper module.
# symbol_lookup is passed to uf.map_symbols further down; geneid_lookup is not
# referenced anywhere else in the visible notebook.
symbol_lookup, geneid_lookup = lookup.get_lookups()

### Initializing Notebook

In [None]:
%%appyter hide_code

{# Declares the 'data' section of the input form; the fields in the following cell attach to it through section='data'. #}
{% do SectionField(
    name='data',
    title='Upload Data',
    img='load_icon.png'
) %}

In [None]:
%%appyter code_eval

{# Explanatory text displayed in the 'data' section of the form. #}
{% do DescriptionField(
    name='description',
    text='The example below was sourced from <a href="http://stitch.embl.de/cgi/download.pl?UserId=ptwsbc4REEdD&sessionId=dcHW1Qojl0oP&species_text=Homo+sapiens" target="_blank">http://stitch.embl.de/</a>. If clicking on the example does not work, it should be downloaded directly from the source website.',
    section='data'
) %}

{# Gzipped TSV of protein-chemical links. Both dots of the extension are escaped so the constraint only accepts names that literally end in ".tsv.gz". #}
{% set links_file = FileField(
    constraint='.*\.tsv\.gz$',
    name='Protein-Chemical Links',
    label='Protein-Chemical Links (tsv.gz)',
    default='9606.protein_chemical.links.v5.0.tsv.gz',
    examples={
        '9606.protein_chemical.links.v5.0.tsv.gz': 'http://stitch.embl.de/download/protein_chemical.links.v5.0/9606.protein_chemical.links.v5.0.tsv.gz'
    },
    section='data'
) %}

{# Gzipped TSV mapping gene names to STRING protein IDs; same literal ".tsv.gz" constraint as above. #}
{% set metadata_file = FileField(
    constraint='.*\.tsv\.gz$',
    name='Human Entrez to STRING',
    label='Mapping File of STRING Protein IDs to Human Entrez Gene Symbols (tsv.gz)',
    default='human.name_2_string.tsv.gz',
    examples={
        'human.name_2_string.tsv.gz': 'https://string-db.org/mapping_files/STRING_display_names/human.name_2_string.tsv.gz'
    },
    section='data'
) %}

{# Identifier used to index small molecules in the exported tables; a later cell branches on the 'InChI Key' value. #}
{% set entity_type = ChoiceField(
    name='entity_type',
    label='Choose identifier type for exported small molecules',
    choices=[
        'Name',
        'InChI Key',
    ],
    default='Name',
    section='data'
) %}

{# Minimum combined_score an interaction must reach to be kept; also embedded in the output directory name. #}
{% set score_cutoff = ChoiceField(
    name='score_cutoff',
    label='Choose confidence score cutoff for protein-chemical interactions',
    choices=[
        500,
        600,
        700,
        800,
        900,
    ],
    default=500,
    section='data'
) %}

### Create Output Path

In [None]:
%%appyter code_exec

output_name = 'STITCH'
path = 'output/drugmonizome_STITCH_' + str({{score_cutoff.raw_value}})
if not os.path.exists(path):
    os.makedirs(path)

### Load Protein-Chemical Links Data

In [None]:
%%appyter code_exec

# Read the file chosen in the 'Protein-Chemical Links' field as tab-separated text.
# Later cells rely on its 'chemical' and 'combined_score' columns.
df_data = pd.read_csv({{links_file}}, sep='\t')
df_data.head()

In [None]:
# (rows, columns) of the raw links table, before any merging or filtering.
df_data.shape

### Load Protein Metadata

In [None]:
%%appyter code_exec

# Read the file chosen in the 'Human Entrez to STRING' field.
# header=0 combined with names= discards the file's own first row as the header
# and applies the three names below; usecols then keeps only gene_name and protein.
df_meta = pd.read_csv({{metadata_file}},
            sep='\t',
            header = 0,
            names = ['NCBI_taxid','gene_name','protein'],
            usecols = ['gene_name','protein'])
df_meta.head()

In [None]:
# (rows, columns) of the gene-name / STRING-protein mapping table.
df_meta.shape

### Match Entrez Gene Symbols to STRING protein IDs

In [None]:
# No `on=` or `how=` is given, so pandas performs an inner join on the column
# names the two frames share; rows without a match in df_meta are dropped.
# NOTE(review): the shared key is presumably just 'protein' - confirm against
# the header of the links file.
df_data = df_data.merge(df_meta)
df_data.head()

In [None]:
# (rows, columns) after attaching gene names; compare with the raw shape above
# to see how many links were lost in the join.
df_data.shape

### Match STITCH chemical IDs to DrugBank IDs
#### Source files / scripts for mapping files: https://github.com/MaayanLab/Drugmonizome/tree/master/drugsetlibraries/metadata

In [None]:
# Converting STITCH compound identifier to PubChem ID
# Strip the 4-character prefix and parse the remaining digits as an integer so
# the column can be matched against pubchem_id in the merge further down.
# Vectorised string slicing replaces the per-row Python lambda.
# Note: the cell is not re-runnable on its own, because 'chemical' is
# overwritten with integers; re-run from the load cell if needed.
df_data['chemical'] = df_data['chemical'].str[4:].astype('int64')
df_data.head()

In [None]:
# Import DrugBank mapping file
# Fetched over the network from the Drugmonizome GitHub repository on every run.
# Later cells use its pubchem_id column as a join key and read 'name' and
# 'inchi_key' from the merged frame - presumably both come from this file;
# confirm against its header.
drugbank_mapping = pd.read_csv('https://raw.githubusercontent.com/MaayanLab/Drugmonizome/master/drugsetlibraries/metadata/mapping_files/pubchem.tsv',
                               sep = '\t')
drugbank_mapping.head()

In [None]:
# Join interactions to the mapping on the numeric PubChem id. `how` is left at
# pandas' default (inner), so chemicals missing from the mapping file are dropped.
df_data = df_data.merge(drugbank_mapping, left_on = 'chemical', right_on = 'pubchem_id')
df_data.head()

### Filtering dataframe by user-specified cut-off and removing duplicates

In [None]:
%%appyter code_exec
df_data = df_data.loc[df_data['combined_score'] >= {{score_cutoff.raw_value}}]
df_data.drop_duplicates()
df_data.head()

In [None]:
# Remove exact duplicate rows. Rebinding the returned frame avoids mutating, in
# place, a frame that was produced by a .loc filter.
df_data = df_data.drop_duplicates()

In [None]:
# Display the filtered, de-duplicated table (by default pandas truncates long
# frames when rendering and reports the full row/column count underneath).
df_data

### Index dataframe by user selected small molecule identifier

In [None]:
%%appyter code_exec

{% if entity_type.raw_value == 'InChI Key' %}
# Index small molecules by InChI Key
df_output = df_data[['inchi_key','gene_name']]
df_output.set_index('inchi_key', inplace = True)

{% else %}
# Index small molecules by name
df_output = df_data[['name','gene_name']]
df_output['name'] = df_output['name'].str.lower()
df_output.set_index('name', inplace = True)

{% endif %}

### Matching Gene Symbols to Approved Entrez Gene Symbols

In [None]:
# uf.map_symbols is defined in drugmonizome.utility_functions, outside this
# notebook. Per the section heading it is expected to replace gene_name values
# with approved symbols using symbol_lookup.
# NOTE(review): confirm whether rows with no approved symbol are dropped or kept.
df_output = uf.map_symbols(df_output, symbol_lookup)
df_output.head()

## Analyze Data

### Export Edge List

In [None]:
# Write the mapped edge list into the output directory through the project
# helper, requesting a gzip-compressed tsv.
uf.save_data(df_output, path, f'{output_name}_edge_list', ext='tsv', compression='gzip')

### Create Binary Matrix

In [None]:
# Convert the edge list into a matrix via the project helper.
# NOTE(review): presumably rows are small molecules, columns are genes and
# cells are 0/1 membership - confirm in utility_functions.binary_matrix.
binary_matrix = uf.binary_matrix(df_output)
binary_matrix.head()

In [None]:
# Dimensions of the binary matrix.
binary_matrix.shape

In [None]:
# Persist the binary matrix through the project helper, requesting npz
# compression and an unsigned 8-bit dtype.
uf.save_data(binary_matrix, path, f'{output_name}_binary_matrix', compression='npz', dtype=np.uint8)

### Create Drug and Attribute Set Library

In [None]:
# Export a set library from the binary matrix in 'drug' mode.
# NOTE(review): what 'drug' selects is decided inside uf.save_setlib - confirm there.
uf.save_setlib(binary_matrix, 'drug', path, f'{output_name}_drug_setlibrary')

In [None]:
# Export a set library from the binary matrix in 'attribute' mode.
# NOTE(review): what 'attribute' selects is decided inside uf.save_setlib - confirm there.
uf.save_setlib(binary_matrix, 'attribute', path, f'{output_name}_attribute_setlibrary')

### Create Attribute Similarity Matrix

In [None]:
# Jaccard similarity computed on the transposed binary matrix, so the axis that
# held attributes becomes the one uf.similarity_matrix compares.
# NOTE(review): presumably the helper compares rows - confirm in utility_functions.
attribute_similarity_matrix = uf.similarity_matrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

In [None]:
# Persist the attribute similarity matrix through the project helper as npz,
# flagged symmetric and stored as 32-bit floats.
uf.save_data(
    attribute_similarity_matrix,
    path,
    f'{output_name}_attribute_similarity_matrix',
    compression='npz',
    symmetric=True,
    dtype=np.float32,
)

### Create Drug Similarity Matrix

In [None]:
# Jaccard similarity computed on the binary matrix in its original orientation
# (no transpose), in contrast to the attribute similarity cell above.
drug_similarity_matrix = uf.similarity_matrix(binary_matrix, 'jaccard', sparse=True)
drug_similarity_matrix.head()

In [None]:
# Persist the drug similarity matrix through the project helper as npz,
# flagged symmetric and stored as 32-bit floats.
uf.save_data(
    drug_similarity_matrix,
    path,
    f'{output_name}_drug_similarity_matrix',
    compression='npz',
    symmetric=True,
    dtype=np.float32,
)

### Create download folder with all outputs

In [None]:
# Bundle the output directory with the project helper.
# NOTE(review): the download link below points at ./output_archive.zip -
# confirm that this is the file name uf.archive produces.
uf.archive(path)

### Link to the output folder: [Download](./output_archive.zip)