## Drugmonizome ETL: PharmGKB

##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

#### Data Source: https://www.pharmgkb.org/downloads

In [None]:
#%%appyter init
from appyter import magic
# Initialise appyter's notebook magics. The lambda receives `globals` as a
# default argument and simply calls it, so appyter is handed a callable that
# returns this notebook's global namespace.
# NOTE(review): presumably appyter uses it to run rendered cells in this
# namespace — confirm against the appyter documentation.
magic.init(lambda _=globals: _())

In [None]:
import os
import sys
import zipfile
import datetime

import pandas as pd
import numpy as np
import drugmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [None]:
# Load IPython's autoreload extension and switch it to mode 2, so imported
# modules are reloaded before each cell executes and edits to helper modules
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Notebook Information

In [None]:
# Record the execution date and the interpreter version for provenance.
run_info = ('This notebook was run on:', datetime.date.today(), '\nPython version:', sys.version)
print(*run_info)

### Initializing Notebook

In [None]:
%%appyter hide_code

{# Declares the 'data' section of the appyter form; the fields declared in this notebook attach to it through section='data'. #}
{% do SectionField(
    name='data',
    title='Upload Data',
    img='load_icon.png'
) %}

In [None]:
%%appyter code_eval

{# Explanatory text for the 'data' section: tells the user where the example file comes from and what to do if the example link fails. #}
{% do DescriptionField(
    name='description',
    text='The example below was sourced from <a href="https://www.pharmgkb.org/downloads" target="_blank">https://www.pharmgkb.org/</a>. If clicking on the example does not work, it should be downloaded directly from the source website.',
    section='data'
) %}

{# File field for the PharmGKB relationships archive. The constraint regex only accepts
   file names ending in ".zip"; the default and the single example are both relationships.zip.
   The rendered value is later passed to zipfile.ZipFile. #}
{% set data_file = FileField(
    constraint='.*\.zip$',
    name='drug-attribute-relationships', 
    label='Drug-attribute relationship data (.zip)', 
    default='relationships.zip',
    examples={
        'relationships.zip': 'https://s3.pgkb.org/data/relationships.zip'
    },
    section='data'
) %}

{# Chooses which identifier indexes the exported small molecules: 'Name' (default) or 'InChI Key'.
   The indexing cell branches on entity_type.raw_value == 'InChI Key'. #}
{% set entity_type = ChoiceField(
    name='entity_type',
    label='Choose identifier type for exported small molecules',
    choices=[
        'Name',
        'InChI Key',
    ],
    default='Name',
    section='data'
) %}

{# Chooses which PharmGKB entity type is kept as the attribute: 'gene' (default) or 'variant'.
   NOTE(review): the field is registered as name='identifier' while the template variable is `group`.
   NOTE(review): the description says the choice is used for the output file names, but the output
   cell hard-codes output_name = 'pharmgkb', so gene and variant runs write to the same file names —
   confirm whether the group should be part of the output name. #}
{% set group = ChoiceField(
    name='identifier',
    label='Choose attribute group',
    description='This will be used for the output file names.',
    choices=['gene',
             'variant'
            ],
    default='gene',
    section='data'
) %}

In [None]:
%%appyter markdown

{% if group.value == 'gene' %}
### Load Gene Mapping Dictionaries
{% else %} 
{% endif %}

In [None]:
%%appyter code_exec

{% if group.value == 'gene' %}
# Load the two lookup objects returned by lookup.get_lookups(). Only
# symbol_lookup is used later in this notebook (passed to uf.map_symbols).
# NOTE(review): judging by the names these map gene symbols and gene IDs —
# confirm in harmonizome.lookup.
symbol_lookup, geneid_lookup = lookup.get_lookups()
{% else %} 
{% endif %}

### Create Output Path

In [None]:
%%appyter code_exec

# Prefix shared by every exported file and the folder they are written to.
output_name = 'pharmgkb'
path = 'output/drugmonizome_pharmgkb'
# exist_ok=True makes directory creation idempotent and avoids the race
# between a separate existence check and the makedirs call.
os.makedirs(path, exist_ok=True)

### Load Drug-Attribute Relationship Data

In [None]:
%%appyter code_exec

# Read only the entity and association columns of relationships.tsv
# straight out of the uploaded zip archive.
with zipfile.ZipFile({{data_file}}) as archive, archive.open('relationships.tsv') as tsv_handle:
    df = pd.read_csv(
        tsv_handle,
        sep='\t',
        usecols=[
            'Entity1_type', 'Entity1_name',
            'Entity2_type', 'Entity2_id',
            'Association',
        ],
    )
df.head()

In [None]:
# (rows, columns) of the relationships table before any filtering.
df.shape

In [None]:
%%appyter code_exec
{% if group.value == 'variant' %}
# Retaining relevant associations
df = df[(df['Entity1_type'] == 'Haplotype') | (df['Entity1_type'] == 'Variant')]
df = df[df['Entity2_type'] == 'Chemical']
df = df[df['Association'] == 'associated']
df.head()
{% elif group.value == 'gene' %}
df = df[(df['Entity1_type'] == 'Gene')]
df = df[df['Entity2_type'] == 'Chemical']
df = df[df['Association'] == 'associated']
df.head()
{% endif %}

### Match PharmGKB chemical IDs to DrugBank drugs
#### Source files / scripts for mapping files: https://github.com/MaayanLab/Drugmonizome/tree/master/drugsetlibraries/metadata

In [None]:
# Import DrugBank mapping file
# Load the PharmGKB mapping table published in the Drugmonizome repository
# and lower-case the drug names in its 'name' column.
drugbank_mapping = pd.read_csv(
    'https://raw.githubusercontent.com/MaayanLab/Drugmonizome/master/drugsetlibraries/metadata/mapping_files/pharmgkb.tsv',
    sep='\t',
).assign(name=lambda mapping: mapping['name'].str.lower())
drugbank_mapping.head()

In [None]:
# Inner-join the relationships onto the mapping table by PharmGKB chemical
# ID; rows whose Entity2_id has no match in the mapping are dropped.
df = pd.merge(
    df,
    drugbank_mapping,
    how='inner',
    left_on='Entity2_id',
    right_on='pharmgkb_id',
)
df.head()

### Index dataframe by user-selected small molecule identifier

In [None]:
%%appyter code_exec

{% if entity_type.raw_value == 'InChI Key' %}
# Index small molecules by InChI Key
df_output = df[['inchi_key','Entity1_name']]
df_output.set_index('inchi_key', inplace = True)

{% else %}
# Index small molecules by name
df_output = df[['name','Entity1_name']]
df_output.set_index('name', inplace = True)

{% endif %}

In [None]:
%%appyter markdown
{% if group.value == 'gene' %}
### Matching Gene Symbols to Approved Entrez Gene Symbols
{% else %}
{% endif %}

In [None]:
%%appyter code_exec
{% if group.value == 'gene' %}
# Pass the edge list and the symbol lookup to uf.map_symbols and keep its
# result, then show the resulting dimensions.
# NOTE(review): presumably this replaces or filters gene names against the
# approved-symbol lookup — confirm in drugmonizome.utility_functions.
df_output = uf.map_symbols(df_output, symbol_lookup)
df_output.shape
{% else %}
{% endif %}

## Analyze Data

### Export Edge List

In [None]:
# Export the drug-attribute edge list, requesting a gzip-compressed
# .tsv file in the output folder.
uf.save_data(
    df_output,
    path,
    output_name + '_edge_list',
    compression='gzip',
    ext='tsv',
)

### Create Binary Matrix

In [None]:
# Build a matrix from the edge list with uf.binary_matrix and preview it.
# NOTE(review): presumably rows are drugs and columns are attributes with
# 0/1 membership — confirm in drugmonizome.utility_functions.
binary_matrix = uf.binary_matrix(df_output)
binary_matrix.head()

In [None]:
# Dimensions of the binary matrix.
binary_matrix.shape

In [None]:
# Export the binary matrix, requesting npz compression and unsigned
# 8-bit storage.
uf.save_data(
    binary_matrix,
    path,
    output_name + '_binary_matrix',
    dtype=np.uint8,
    compression='npz',
)

### Create Drug and Attribute Set Library

In [None]:
# Write the 'drug' set library derived from the binary matrix to the output folder.
uf.save_setlib(binary_matrix, 'drug', path, output_name + '_drug_setlibrary')

In [None]:
# Write the 'attribute' set library derived from the binary matrix to the output folder.
uf.save_setlib(binary_matrix, 'attribute', path, output_name + '_attribute_setlibrary')

### Create Attribute Similarity Matrix

In [None]:
# Jaccard similarity computed on the transposed binary matrix, then previewed.
# NOTE(review): the transpose presumably makes attributes the compared axis,
# giving attribute-vs-attribute similarity — confirm in uf.similarity_matrix.
attribute_similarity_matrix = uf.similarity_matrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

In [None]:
# Export the attribute similarity matrix, requesting npz compression,
# float32 storage and the symmetric flag.
uf.save_data(
    attribute_similarity_matrix,
    path,
    output_name + '_attribute_similarity_matrix',
    dtype=np.float32,
    symmetric=True,
    compression='npz',
)

### Create Drug Similarity Matrix

In [None]:
# Jaccard similarity computed on the binary matrix as-is (no transpose), then previewed.
# NOTE(review): presumably this yields drug-vs-drug similarity — confirm in
# uf.similarity_matrix.
drug_similarity_matrix = uf.similarity_matrix(binary_matrix, 'jaccard', sparse=True)
drug_similarity_matrix.head()

In [None]:
# Export the drug similarity matrix, requesting npz compression,
# float32 storage and the symmetric flag.
uf.save_data(
    drug_similarity_matrix,
    path,
    output_name + '_drug_similarity_matrix',
    dtype=np.float32,
    symmetric=True,
    compression='npz',
)

### Create download folder with all outputs

In [None]:
# Bundle the output folder for download.
# NOTE(review): the link below points at ./output_archive.zip, so
# uf.archive presumably writes that file — confirm in
# drugmonizome.utility_functions.
uf.archive(path)

### Link to the output folder: [Download](./output_archive.zip)