## Drug Repurposing Hub Target Drug-Set Library
### Drug-set labels: Protein Targets
#### ALL DATABASES ACCESSED 08/01/19
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [27]:
import pandas as pd
import numpy as np
import csv
from collections import defaultdict
import os
import json

In [2]:
# Temporarily switch into the shared scripts directory so the project-local
# helper modules can be imported, then switch back to the notebook directory.
# NOTE(review): both paths are relative, so this presumably only works when the
# kernel starts in notebooks/DrugRepurposingHub — confirm before running.
os.chdir('../../scripts')
from export_script import *  # presumably provides library_counts / gmt_formatter used later — verify
from gene_resolver import *  # presumably provides gene_resolver used later — verify
os.chdir('../notebooks/DrugRepurposingHub')

## Import dataframe of targets matched to drugs & drug metadata
#### Database: https://clue.io/data/REP#REP
#### Input Files : repurposing_drugs_20180907.txt | repurposing_samples_20180907.txt

In [3]:
# Load the drug -> target annotations (tab-separated) and keep only drugs
# that have at least one annotated target.
df_target = pd.read_csv(
    'input/repurposing_drugs_20180907.txt',
    sep='\t',
    encoding='latin-1',
    usecols=['pert_iname', 'target'],
).dropna(subset=['target'])

In [4]:
# Preview the first rows of the drug-to-target table.
df_target.head()

Unnamed: 0,pert_iname,target
0,"[sar9,met(o2)11]-substance-p",TACR1
1,A-1070722,GSK3A|GSK3B
2,A-1120,RBP4
3,A-317491,P2RX3
5,A-366,EHMT1|EHMT2


Total number of drug entries with at least one annotated target

In [5]:
# Number of rows left after dropping drugs without an annotated target.
df_target.shape[0]

4484

#### Importing metadata

In [6]:
# Load the sample metadata, keeping only the columns needed to link each
# drug name to its structure identifiers.
df_metadata = pd.read_csv(
    'input/repurposing_samples_20180907.txt',
    sep='\t',
    encoding='latin-1',
    usecols=['pert_iname', 'InChIKey', 'pubchem_cid'],
)

In [7]:
# Preview the first three rows of the sample metadata.
df_metadata.head(3)

Unnamed: 0,pert_iname,InChIKey,pubchem_cid
0,"[sar9,met(o2)11]-substance-p",OUPXSLGGCPUZJJ-SARDKLJWSA-N,163829.0
1,"1-((Z)-3-Chloroallyl)-1,3,5,7-tetraazaadamanta...",LDLCEGCJYSDJLX-UPHRSURJSA-N,5846454.0
2,"1-(1,2-Diphenylethyl)piperidine-(+/-)",JQWJJJYHVHNXJH-UHFFFAOYSA-N,206666.0


In [8]:
# Attach InChIKey / PubChem CID to every drug. The join key is spelled out:
# 'pert_iname' is the only column the two frames share, so this is the same
# inner join the default column intersection would perform.
df_target = pd.merge(df_target, df_metadata, on='pert_iname', how='inner')

In [9]:
# Preview the merged table (targets plus InChIKey / PubChem CID).
df_target.head(3)

Unnamed: 0,pert_iname,target,InChIKey,pubchem_cid
0,"[sar9,met(o2)11]-substance-p",TACR1,OUPXSLGGCPUZJJ-SARDKLJWSA-N,163829.0
1,A-1070722,GSK3A|GSK3B,VQPBIJGXSXEOCU-UHFFFAOYSA-N,49830684.0
2,A-1120,RBP4,MEAQCLPMSVEOQF-UHFFFAOYSA-N,25138295.0


#### Importing Drugbank mapping file

In [10]:
# Load the DrugBank mapping and rename its identifier columns so they match
# the column names used in df_target.
drugbank_mapping = pd.read_csv(
    '../../metadata/mapping_files/pubchem.tsv',
    sep='\t',
).rename(columns={'pubchem_id': 'pubchem_cid', 'inchi_key': 'InChIKey'})

In [11]:
# Preview the DrugBank mapping after the column rename.
drugbank_mapping.head(3)

Unnamed: 0,drugbank_id,pubchem_cid,InChIKey
0,DB00006,101041682,OIRCOABEOLEUMC-GEJPAHFPSA-N
1,DB00006,126480209,OIRCOABEOLEUMC-GEJPAHFPSA-N
2,DB00006,132229728,OIRCOABEOLEUMC-GEJPAHFPSA-N


In [12]:
# Inner join on BOTH pubchem_cid and InChIKey: a drug is kept only when the
# pair of identifiers matches a row of the DrugBank mapping.
df_target = pd.merge(
    df_target,
    drugbank_mapping,
    on=['pubchem_cid', 'InChIKey'],
    how='inner',
)

In [14]:
# Preview the table after attaching DrugBank IDs.
df_target.head()

Unnamed: 0,pert_iname,target,InChIKey,pubchem_cid,drugbank_id
0,"[sar9,met(o2)11]-substance-p",TACR1,OUPXSLGGCPUZJJ-SARDKLJWSA-N,163829.0,DB05875
1,A-1120,RBP4,MEAQCLPMSVEOQF-UHFFFAOYSA-N,25138295.0,DB06985
2,A-674563,AKT1|PKIA|PRKACA,BPNUQXPIQBZCMR-IBGZPJMESA-N,11314340.0,DB08568
3,abametapir,MMP9,PTRATZCAGVBFIQ-UHFFFAOYSA-N,15664.0,DB11932
4,ABC-294640,SPHK2,CAOTVXGYTWCKQE-UHFFFAOYSA-N,15604015.0,DB12764


Number of unique small molecules that were harmonized

In [15]:
# Count the distinct InChIKeys remaining after the DrugBank merge.
len(set(df_target['InChIKey']))

2085

In [16]:
# Expand every "|"-delimited target string into one (target, inchi_key) row
# per gene, preserving the original row order.
df_target = pd.DataFrame(
    [
        (gene, inchi_key)
        for inchi_key, targets in zip(df_target['InChIKey'], df_target['target'])
        for gene in targets.split('|')
    ],
    columns=['target', 'inchi_key'],
)

In [17]:
# Preview the long-format table: one row per (target, inchi_key) pair.
df_target.head()

Unnamed: 0,target,inchi_key
0,TACR1,OUPXSLGGCPUZJJ-SARDKLJWSA-N
1,RBP4,MEAQCLPMSVEOQF-UHFFFAOYSA-N
2,AKT1,BPNUQXPIQBZCMR-IBGZPJMESA-N
3,PKIA,BPNUQXPIQBZCMR-IBGZPJMESA-N
4,PRKACA,BPNUQXPIQBZCMR-IBGZPJMESA-N


### Validating genes using lookup table
#### Lookup table generated from ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia

In [19]:
# Validate the gene symbols in the 'target' column with the project-local resolver.
# NOTE(review): the return value is discarded, yet later cells read an
# 'Approved Symbol' column from df_target, so the helper presumably modifies the
# frame in place — confirm against gene_resolver's implementation.
gene_resolver(df_target, columnName = 'target')

In [20]:
# Number of (target, inchi_key) rows after gene validation.
df_target.shape[0]

16840

In [21]:
# Preview the table after gene validation; later cells read 'Approved Symbol'.
df_target.head()

Unnamed: 0,target,inchi_key,Approved Symbol
0,TACR1,OUPXSLGGCPUZJJ-SARDKLJWSA-N,TACR1
1,RBP4,MEAQCLPMSVEOQF-UHFFFAOYSA-N,RBP4
2,AKT1,BPNUQXPIQBZCMR-IBGZPJMESA-N,AKT1
3,PKIA,BPNUQXPIQBZCMR-IBGZPJMESA-N,PKIA
4,PRKACA,BPNUQXPIQBZCMR-IBGZPJMESA-N,PRKACA


In [22]:
# Build the drug-set library: each approved gene symbol (term) maps to the
# InChIKeys of the drugs annotated as targeting it.
MIN_DRUGS_PER_TERM = 5  # terms with fewer unique drugs are excluded

# Skip rows with a missing symbol or InChIKey so a null value can never become
# a term or a set member (a no-op when no such rows exist).
resolved_pairs = df_target[['Approved Symbol', 'inchi_key']].dropna()
id_dict = tuple(zip(resolved_pairs['Approved Symbol'], resolved_pairs['inchi_key']))

# Collect into sets so duplicate (symbol, drug) pairs collapse immediately.
drugsetlibrary = defaultdict(set)
for symbol, inchi_key in id_dict:
    drugsetlibrary[symbol].add(inchi_key)

# Keep only terms with enough unique drugs. Sorting the members makes their
# order reproducible across kernel restarts; plain set iteration order for
# strings can change between Python processes.
drugsetlibrary = {
    symbol: sorted(inchi_keys)
    for symbol, inchi_keys in drugsetlibrary.items()
    if len(inchi_keys) >= MIN_DRUGS_PER_TERM
}

### Library counts

In [24]:
# Print summary statistics for the library.
# NOTE(review): library_counts is not defined in this notebook; presumably it
# comes from the star import of export_script — verify.
library_counts(drugsetlibrary)

1720 unique drugs
375 unique association terms
5841 unique associations
15.576 average drugs per term


### Exporting drugsetlibrary in GMT format

In [25]:
# Switch to the output directory before exporting.
# NOTE(review): the path is relative to the current working directory, so this
# cell fails or lands elsewhere if it is run twice — confirm the expected cwd.
os.chdir('../../data/DrugRepurposingHub')

In [26]:
# Export the library under the given file name.
# NOTE(review): gmt_formatter is a project-local helper (presumably from
# export_script); it presumably writes relative to the current working
# directory set in the previous cell — verify.
gmt_formatter(drugsetlibrary, 'DrugRepurposingHub_target_drugsetlibrary.gmt')