## Drugmonizome ETL: SIDER 
##### Author: Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu
#### Data Source: http://sideeffects.embl.de/download/

In [None]:
#%%appyter init
# NOTE(review): presumably this registers the `%%appyter` cell magics used by later cells,
# and the marker comment above is read by appyter itself — confirm against the appyter docs.
from appyter import magic
# Pass a zero-argument callable that returns this notebook's globals() when invoked.
magic.init(lambda _=globals: _())

In [None]:
import os
import sys
import datetime
import zipfile

import pandas as pd
import numpy as np
import drugmonizome.utility_functions as uf

In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# (e.g. drugmonizome.utility_functions) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Notebook Information

In [None]:
# Record the execution date and interpreter version for provenance.
run_date = datetime.date.today()
python_version = sys.version
print('This notebook was run on:', run_date, '\nPython version:', python_version)

### Initializing Notebook

In [None]:
%%appyter hide_code

{# Declares the 'data' form section; the fields in the following cell attach to it via section='data'. #}
{% do SectionField(
    name='data',
    title='Upload Data',
    img='load_icon.png'
) %}

In [None]:
%%appyter code_eval

{# Help text shown above the upload field. The load cell picks the column layout from the
   file name, so the text tells users to keep the original SIDER file names. #}
{% do DescriptionField(
    name='description',
    text='The examples below were sourced from <a href="http://sideeffects.embl.de/download/" target="_blank">http://sideeffects.embl.de/download/</a>. If clicking on an example does not work, the file should be downloaded directly from the source website. Uploaded files must keep their original name (meddra_all_se.tsv.gz or meddra_all_indications.tsv.gz).',
    section='data'
) %}

{# Input file. Both files understood by the load cell are offered as examples;
   the constraint escapes both dots so only a literal ".tsv.gz" suffix matches. #}
{% set data_file = FileField(
    constraint='.*\.tsv\.gz$',
    name='drug_attribute', 
    label='Drug Attribute Data (.tsv.gz)', 
    default='meddra_all_se.tsv.gz',
    examples={
        'meddra_all_se.tsv.gz': 'http://sideeffects.embl.de/media/download/meddra_all_se.tsv.gz',
        'meddra_all_indications.tsv.gz': 'http://sideeffects.embl.de/media/download/meddra_all_indications.tsv.gz'
    },
    section='data'
) %}


{# Identifier used to index small molecules in every exported file. #}
{% set entity_type = ChoiceField(
    name='entity_type',
    label='Choose identifier type for exported small molecules',
    choices=[
        'Name',
        'InChI Key',
    ],
    default='Name',
    section='data'
) %}

{# Attribute type; only used to build the output folder and file names. #}
{% set group = ChoiceField(
    name='identifier',
    label='Choose attribute type',
    description='This will be used for the output file names.',
    choices=['Side Effects',
             'Indications'
            ],
    default='Side Effects',
    section='data'
) %}

### Create Output Path

In [None]:
%%appyter code_exec

output_name = 'SIDER_' + '{{ group }}'.lower()
path = 'output/drugmonizome_SIDER_' + '{{ group }}'.lower()
if not os.path.exists(path):
    os.makedirs(path)

### Load Data

In [None]:
%%appyter code_exec
{% if data_file.raw_value == 'meddra_all_se.tsv.gz' %}
df = pd.read_csv({{data_file}}, 
                 delimiter = '\t',
                 names = ['STITCH_ID_FLAT','STITCH_ID_STEREO','UMLS_ID_Label',
                          'MedDRA_Concept_Type', 'UMLS_ID_MedDRA', 'Attribute'],
                usecols = ['STITCH_ID_FLAT','MedDRA_Concept_Type','Attribute'])
df['Attribute'] = df['Attribute'].str.lower()
df.dropna(inplace = True)

df.head()


{% elif data_file.raw_value == 'meddra_all_indications.tsv.gz' %}
df = pd.read_csv({{data_file}}, 
                 delimiter = '\t',
                 names = ['STITCH_ID_FLAT','UMLS_ID_Label','Detection_Method',
                          'MedDRA_Concept_Name','MedDRA_Concept_Type','UMLS_ID_MedDRA',
                          'Attribute'],
                usecols = ['STITCH_ID_FLAT', 'MedDRA_Concept_Type', 'Attribute'])
df['Attribute'] = df['Attribute'].str.lower()
df.dropna(inplace = True)

df.head()

{% endif %}

In [None]:
# (rows, columns) of the loaded table after incomplete rows were dropped
df.shape

In [None]:
# Keep only MedDRA preferred terms: rows whose concept type contains "LLT" are removed,
# after which the concept-type column is no longer needed.
is_llt = df['MedDRA_Concept_Type'].str.contains("LLT")
df = df.loc[~is_llt].drop(columns='MedDRA_Concept_Type')

### Mapping STITCH IDs to DrugBank IDs (via PubChem IDs)

In [None]:
def stitch_to_pubchem(stitch_id):
    """Drop the first four characters of a STITCH ID and return the rest as an int."""
    return int(stitch_id[4:])

# Converting STITCH IDs to Pubchem IDs
df['STITCH_ID_FLAT'] = df['STITCH_ID_FLAT'].apply(stitch_to_pubchem)

In [None]:
# Tab-separated mapping table from the Drugmonizome repository; merged on `pubchem_id` below.
PUBCHEM_MAPPING_URL = 'https://raw.githubusercontent.com/MaayanLab/Drugmonizome/master/drugsetlibraries/metadata/mapping_files/pubchem.tsv'

drugbank_mapping = pd.read_csv(PUBCHEM_MAPPING_URL, sep='\t')
drugbank_mapping.head()

In [None]:
# Attach the mapping-table columns to every row whose PubChem ID is present in the mapping
# (pandas' default inner join drops unmatched compounds), then remove repeated rows.
df = (
    df.merge(drugbank_mapping, left_on='STITCH_ID_FLAT', right_on='pubchem_id')
      .drop_duplicates()
)
df.head()

### Index dataframe by user-selected small molecule identifier

In [None]:
%%appyter code_exec

{% if entity_type.raw_value == 'InChI Key' %}
# Index small molecules by InChI Key
df_output = df[['inchi_key','Attribute']]
df_output.set_index('inchi_key', inplace = True)

{% else %}
# Index small molecules by name
df_output = df[['name','Attribute']]
df_output['name'] = df_output['name'].str.lower()
df_output.set_index('name', inplace = True)

{% endif %}

In [None]:
# Preview the edge list: indexed by the chosen small-molecule identifier, one 'Attribute' per row
df_output.head()

## Analyze Data

### Export Edge List

In [None]:
# Write the edge list as a gzip-compressed TSV into the output folder.
edge_list_name = output_name + '_edge_list'
uf.save_data(df_output, path, edge_list_name, ext='tsv', compression='gzip')

### Create Binary Matrix

In [None]:
# NOTE(review): uf.binary_matrix lives in drugmonizome.utility_functions (not shown here);
# presumably it turns the edge list into a small-molecule x attribute 0/1 matrix — confirm.
binary_matrix = uf.binary_matrix(df_output)
binary_matrix.head()

In [None]:
# Dimensions of the binary matrix
binary_matrix.shape

In [None]:
# Persist the binary matrix with npz compression, stored as unsigned 8-bit integers.
binary_matrix_name = output_name + '_binary_matrix'
uf.save_data(
    binary_matrix,
    path,
    binary_matrix_name,
    compression='npz',
    dtype=np.uint8,
)

### Create Drug and Attribute Set Library

In [None]:
# Export the 'drug' set library built from the binary matrix.
drug_setlib_name = output_name + '_drug_setlibrary'
uf.save_setlib(binary_matrix, 'drug', path, drug_setlib_name)

In [None]:
# Export the 'attribute' set library built from the binary matrix.
attribute_setlib_name = output_name + '_attribute_setlibrary'
uf.save_setlib(binary_matrix, 'attribute', path, attribute_setlib_name)

### Create Attribute Similarity Matrix

In [None]:
# The binary matrix is transposed before the call, so the comparison runs along its other axis.
# NOTE(review): presumably this yields pairwise Jaccard similarity between attributes —
# confirm the row/column convention in uf.similarity_matrix.
attribute_similarity_matrix = uf.similarity_matrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

In [None]:
# Persist the attribute similarity matrix (npz, flagged symmetric, 32-bit floats).
attribute_similarity_name = output_name + '_attribute_similarity_matrix'
uf.save_data(
    attribute_similarity_matrix,
    path,
    attribute_similarity_name,
    compression='npz',
    symmetric=True,
    dtype=np.float32,
)

### Create Drug Similarity Matrix

In [None]:
# Same call as for the attribute similarity matrix, but on the untransposed binary matrix.
# NOTE(review): presumably this yields pairwise Jaccard similarity between small molecules —
# confirm the row/column convention in uf.similarity_matrix.
drug_similarity_matrix = uf.similarity_matrix(binary_matrix, 'jaccard', sparse=True)
drug_similarity_matrix.head()

In [None]:
# Persist the drug similarity matrix (npz, flagged symmetric, 32-bit floats).
drug_similarity_name = output_name + '_drug_similarity_matrix'
uf.save_data(
    drug_similarity_matrix,
    path,
    drug_similarity_name,
    compression='npz',
    symmetric=True,
    dtype=np.float32,
)

### Create download folder with all outputs

In [None]:
# NOTE(review): presumably bundles everything under `path` into the archive linked below —
# confirm the archive name and location in uf.archive.
uf.archive(path)

### Link to the output folder: [Download](./output_archive.zip)