## Drug Repurposing Hub Indication Drug-Set Library
### Drug-set labels: Indications
#### ALL DATABASES ACCESSED 03/01/20
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import pandas as pd
import numpy as np
import csv
from collections import defaultdict
import os
import json

In [2]:
# The export helpers live in ../../scripts, which is not on the import path, so
# the working directory is switched there just long enough to import them.
# The starting directory is restored in `finally` so that a failed import does
# not leave the kernel stranded in the scripts directory (which would break
# every relative path used by the cells below).
_notebook_dir = os.getcwd()
os.chdir('../../scripts')
try:
    from export_script import *
finally:
    os.chdir(_notebook_dir)

## Import dataframe of indications matched to drugs & drug metadata
#### Database: https://clue.io/data/REP#REP
#### Input Files : repurposing_drugs_20180907.txt | repurposing_samples_20180907.txt

In [4]:
# Read only the drug-name and indication columns from the tab-separated drug
# table, then discard drugs that have no indication recorded.
df = pd.read_csv(
    'input/repurposing_drugs_20180907.txt',
    sep='\t',
    encoding='latin-1',
    usecols=['pert_iname', 'indication'],
).dropna(subset=['indication'])

In [5]:
# Preview the first rows of the drug/indication table (rows without an
# indication have already been dropped, so the index is not contiguous).
df.head()

Unnamed: 0,pert_iname,indication
24,abacavir,human immunodeficiency virus (HIV-1)
26,abamectin,gastrointestinal parasites
29,abemaciclib,breast cancer
30,abiraterone,prostate cancer
31,abiraterone-acetate,prostate cancer


Total unique drugs with an annotated indication

In [7]:
# Count distinct drug names rather than rows: the row count only equals the
# number of drugs while every pert_iname appears exactly once, which stops
# being true once `df` is merged with the sample metadata further down.
df['pert_iname'].nunique()

2086

#### Importing metadata

In [8]:
# Read the drug name and its structure identifiers (InChIKey, PubChem CID)
# from the tab-separated samples file.
df_metadata = pd.read_csv(
    'input/repurposing_samples_20180907.txt',
    sep='\t',
    encoding='latin-1',
    usecols=['pert_iname', 'InChIKey', 'pubchem_cid'],
)

In [9]:
# Preview the first three metadata rows to check the selected columns.
df_metadata.head(3)

Unnamed: 0,pert_iname,InChIKey,pubchem_cid
0,"[sar9,met(o2)11]-substance-p",OUPXSLGGCPUZJJ-SARDKLJWSA-N,163829.0
1,"1-((Z)-3-Chloroallyl)-1,3,5,7-tetraazaadamanta...",LDLCEGCJYSDJLX-UPHRSURJSA-N,5846454.0
2,"1-(1,2-Diphenylethyl)piperidine-(+/-)",JQWJJJYHVHNXJH-UHFFFAOYSA-N,206666.0


In [10]:
# Attach InChIKey and pubchem_cid to every drug by joining on pert_iname.
# The key and join type are spelled out instead of relying on pandas' default
# of joining on whatever columns the two frames happen to share.
# Note: the result has one row per matching metadata row, so a drug with
# several metadata entries appears several times.
df = df.merge(df_metadata, how='inner', on='pert_iname')

In [11]:
# Preview the merged table; a drug is repeated once per metadata row it matched.
df.head(3)

Unnamed: 0,pert_iname,indication,InChIKey,pubchem_cid
0,abacavir,human immunodeficiency virus (HIV-1),MCGSCOLBFJQGHM-SCZZXKLOSA-N,441300.0
1,abacavir,human immunodeficiency virus (HIV-1),MCGSCOLBFJQGHM-SCZZXKLOSA-N,441300.0
2,abacavir,human immunodeficiency virus (HIV-1),MCGSCOLBFJQGHM-SCZZXKLOSA-N,441300.0


#### Importing Drugbank mapping file

In [12]:
# Load the DrugBank mapping file and rename its identifier columns so they
# carry the same names as the corresponding columns in `df`.
drugbank_mapping = (
    pd.read_csv('../../metadata/mapping_files/pubchem.tsv', sep='\t')
    .rename(columns={'pubchem_id': 'pubchem_cid', 'inchi_key': 'InChIKey'})
)

In [13]:
# Preview the mapping after renaming its identifier columns.
drugbank_mapping.head(3)

Unnamed: 0,drugbank_id,pubchem_cid,InChIKey
0,DB00006,101041682,OIRCOABEOLEUMC-GEJPAHFPSA-N
1,DB00006,126480209,OIRCOABEOLEUMC-GEJPAHFPSA-N
2,DB00006,132229728,OIRCOABEOLEUMC-GEJPAHFPSA-N


In [15]:
# Inner join on BOTH pubchem_cid and InChIKey: a drug is kept only when the
# pair of identifiers matches a row of the DrugBank mapping.
df_indication = pd.merge(
    left=df,
    right=drugbank_mapping,
    on=['pubchem_cid', 'InChIKey'],
    how='inner',
)

In [16]:
# Preview the drugs that were matched to a DrugBank entry.
df_indication.head()

Unnamed: 0,pert_iname,indication,InChIKey,pubchem_cid,drugbank_id
0,abacavir,human immunodeficiency virus (HIV-1),MCGSCOLBFJQGHM-SCZZXKLOSA-N,441300.0,DB01048
1,abacavir,human immunodeficiency virus (HIV-1),MCGSCOLBFJQGHM-SCZZXKLOSA-N,441300.0,DB01048
2,abacavir,human immunodeficiency virus (HIV-1),MCGSCOLBFJQGHM-SCZZXKLOSA-N,441300.0,DB01048
3,abemaciclib,breast cancer,UZWDCWONPYILKI-UHFFFAOYSA-N,46220502.0,DB12001
4,abiraterone,prostate cancer,GZOSMCIZMLWJML-VJLLXTKPSA-N,132971.0,DB05812


Number of unique small molecules that were harmonized

In [18]:
# Number of distinct InChIKeys that survived the DrugBank join.
len(df_indication['InChIKey'].unique())

1561

In [19]:
# Split "|"-separated indication strings so that every (indication, InChIKey)
# pair gets its own row.
# str.split turns each cell into a list and explode emits one row per list
# element, so no padded wide table is built and nothing depends on filler
# cells being dropped afterwards.
df_indication = (
    df_indication[['indication', 'InChIKey']]
    .assign(indication=lambda frame: frame['indication'].str.split('|'))
    .explode('indication')
    .rename(columns={'InChIKey': 'inchi_key'})
    .reset_index(drop=True)
)

In [20]:
# Preview the long-format table: one indication per row, keyed by inchi_key.
df_indication.head()

Unnamed: 0,indication,inchi_key
0,human immunodeficiency virus (HIV-1),MCGSCOLBFJQGHM-SCZZXKLOSA-N
1,human immunodeficiency virus (HIV-1),MCGSCOLBFJQGHM-SCZZXKLOSA-N
2,human immunodeficiency virus (HIV-1),MCGSCOLBFJQGHM-SCZZXKLOSA-N
3,breast cancer,UZWDCWONPYILKI-UHFFFAOYSA-N
4,prostate cancer,GZOSMCIZMLWJML-VJLLXTKPSA-N


In [22]:
# (indication, InChIKey) pairs, one per row of df_indication. Despite its
# name this is a tuple of pairs, not a dict; the name is kept for compatibility.
id_dict = tuple(zip(df_indication['indication'], df_indication['inchi_key']))

# Smallest number of distinct drugs a term needs to be kept in the library.
MIN_DRUGS_PER_SET = 5

# Group InChIKeys by indication; collecting into sets removes the duplicate
# pairs produced by the earlier merges.
drugs_by_indication = defaultdict(set)
for indication, inchi_key in id_dict:
    drugs_by_indication[indication].add(inchi_key)

# Dropping sets smaller than the threshold. Members are sorted because set
# iteration order for strings can change between interpreter runs, which would
# make the exported library differ from run to run. key=str keeps the sort
# well-defined even if a non-string key (e.g. NaN) slipped through.
drugsetlibrary = {
    indication: sorted(inchi_keys, key=str)
    for indication, inchi_keys in drugs_by_indication.items()
    if len(inchi_keys) >= MIN_DRUGS_PER_SET
}

### Library counts

In [23]:
# Report summary statistics for the library (drug, term and association counts,
# as shown in the output below). library_counts is not defined in this
# notebook; presumably it comes from the export_script star import — confirm.
library_counts(drugsetlibrary)

1283 unique drugs
155 unique association terms
2074 unique associations
13.380645161290323 average drugs per term


### Exporting drugsetlibrary in GMT format

In [24]:
# Move to the output directory for this library (path is relative to the
# current working directory, so it assumes the notebook's own directory).
os.chdir('../../data/DrugRepurposingHub')

In [25]:
# Export the library under the given file name. gmt_formatter is not defined in
# this notebook; presumably it comes from export_script and writes relative to
# the current working directory — confirm.
gmt_formatter(drugsetlibrary, 'DrugRepurposingHub_indication_drugsetlibrary.gmt')