## DrugRepurposingHub Mechanism of Action Drug-Set Library
### Drug-set labels: Mechanism of Action
#### ALL DATABASES ACCESSED 08/01/19
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import pandas as pd
import numpy as np
import csv
from collections import defaultdict
import os

In [2]:
# Switch the working directory to the shared scripts folder before importing;
# presumably this is what lets export_script be found by its bare module name — confirm.
os.chdir('../../scripts')
# Star import: every public name of export_script lands in the notebook namespace.
# library_counts and gmt_formatter, called further down, are not defined in this
# notebook and presumably come from here.
from export_script import *
# Move to notebooks/DrugRepurposingHub (relative to the scripts folder); the
# relative 'input/...' paths used by the loading cells are resolved from there.
os.chdir('../notebooks/DrugRepurposingHub')

### Import dataframe of mechanisms matched to drugs & drug metadata
#### Database: https://clue.io/data/REP#REP
#### Input Files : repurposing_drugs_20180907.txt | repurposing_samples_20180907.txt

#### Importing mechanisms of action dataframe

In [3]:
# Load the tab-separated, latin-1 encoded drug annotation table and preview it.
df = pd.read_csv(
    'input/repurposing_drugs_20180907.txt',
    sep='\t',
    encoding='latin-1',
)
df.head()

Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication
0,"[sar9,met(o2)11]-substance-p",Preclinical,tachykinin antagonist,TACR1,,
1,A-1070722,Preclinical,glycogen synthase kinase inhibitor,GSK3A|GSK3B,,
2,A-1120,Preclinical,retinoid receptor ligand,RBP4,,
3,A-317491,Preclinical,purinergic receptor antagonist,P2RX3,,
4,A-33903,Phase 2,,,,


In [4]:
# Keep only the rows whose moa (mechanism of action) field is populated.
df = df[df['moa'].notna()]

In [5]:
# Number of rows left once entries without a moa annotation have been removed.
len(df)

5564

In [6]:
# Discard the annotation columns that are not needed downstream,
# which leaves pert_iname and moa #
df = df.drop(columns=['clinical_phase', 'target', 'disease_area', 'indication'])

#### Importing metadata

In [7]:
# Read just the name and structure-identifier columns of the samples table.
df_metadata = pd.read_csv(
    'input/repurposing_samples_20180907.txt',
    sep='\t',
    encoding='latin-1',
    usecols=['pert_iname', 'InChIKey', 'pubchem_cid'],
)

In [8]:
# Preview the first three metadata rows (drug name, InChIKey, PubChem CID).
df_metadata.head(3)

Unnamed: 0,pert_iname,InChIKey,pubchem_cid
0,"[sar9,met(o2)11]-substance-p",OUPXSLGGCPUZJJ-SARDKLJWSA-N,163829.0
1,"1-((Z)-3-Chloroallyl)-1,3,5,7-tetraazaadamanta...",LDLCEGCJYSDJLX-UPHRSURJSA-N,5846454.0
2,"1-(1,2-Diphenylethyl)piperidine-(+/-)",JQWJJJYHVHNXJH-UHFFFAOYSA-N,206666.0


In [9]:
# Inner join of the moa table with the metadata on pert_iname,
# the only column the two frames have in common.
df = pd.merge(df, df_metadata, how='inner', on='pert_iname')

In [10]:
# Preview the merged frame: drug name, moa, InChIKey and PubChem CID.
df.head(3)

Unnamed: 0,pert_iname,moa,InChIKey,pubchem_cid
0,"[sar9,met(o2)11]-substance-p",tachykinin antagonist,OUPXSLGGCPUZJJ-SARDKLJWSA-N,163829.0
1,A-1070722,glycogen synthase kinase inhibitor,VQPBIJGXSXEOCU-UHFFFAOYSA-N,49830684.0
2,A-1120,retinoid receptor ligand,MEAQCLPMSVEOQF-UHFFFAOYSA-N,25138295.0


#### Importing Drugbank mapping file

In [12]:
# Load the tab-separated mapping file and rename its identifier columns
# (pubchem_id -> pubchem_cid, inchi_key -> InChIKey) so they match the names used in df.
drugbank_mapping = (
    pd.read_csv('../../metadata/mapping_files/pubchem.tsv', sep='\t')
    .rename(columns={'pubchem_id': 'pubchem_cid', 'inchi_key': 'InChIKey'})
)

In [13]:
# Preview the mapping table after the column rename.
drugbank_mapping.head(3)

Unnamed: 0,drugbank_id,pubchem_cid,InChIKey
0,DB00006,101041682,OIRCOABEOLEUMC-GEJPAHFPSA-N
1,DB00006,126480209,OIRCOABEOLEUMC-GEJPAHFPSA-N
2,DB00006,132229728,OIRCOABEOLEUMC-GEJPAHFPSA-N


In [14]:
# Inner join with the mapping table; a row is kept only when BOTH
# pubchem_cid and InChIKey agree between the two frames.
df_moa = pd.merge(
    df,
    drugbank_mapping,
    on=['pubchem_cid', 'InChIKey'],
    how='inner',
)

In [15]:
# Preview the joined frame, which now also carries the drugbank_id column.
df_moa.head(3)

Unnamed: 0,pert_iname,moa,InChIKey,pubchem_cid,drugbank_id
0,"[sar9,met(o2)11]-substance-p",tachykinin antagonist,OUPXSLGGCPUZJJ-SARDKLJWSA-N,163829.0,DB05875
1,A-1120,retinoid receptor ligand,MEAQCLPMSVEOQF-UHFFFAOYSA-N,25138295.0,DB06985
2,A-674563,AKT inhibitor,BPNUQXPIQBZCMR-IBGZPJMESA-N,11314340.0,DB08568


In [16]:
# Splitting "|" separated moa rows into separate rows #
# A drug's moa string may list several mechanisms separated by "|". Emit one
# (moa, inchi_key) row per mechanism, in the original row order.
# The pairs are flattened directly rather than by building a padded wide
# DataFrame and calling stack(): the result then does not depend on stack()
# discarding the padding cells, and an empty input yields an empty
# two-column frame instead of failing on the column selection.
moa_pairs = [
    (moa_term, inchi_key)
    for inchi_key, moa_terms in zip(df_moa['InChIKey'], df_moa['moa'].str.split('|'))
    for moa_term in moa_terms
]
df_moa = pd.DataFrame(moa_pairs, columns=['moa', 'inchi_key'])

In [17]:
df_moa.head()

Unnamed: 0,moa,inchi_key
0,tachykinin antagonist,OUPXSLGGCPUZJJ-SARDKLJWSA-N
1,retinoid receptor ligand,MEAQCLPMSVEOQF-UHFFFAOYSA-N
2,AKT inhibitor,BPNUQXPIQBZCMR-IBGZPJMESA-N
3,nucleoside reverse transcriptase inhibitor,MCGSCOLBFJQGHM-SCZZXKLOSA-N
4,nucleoside reverse transcriptase inhibitor,MCGSCOLBFJQGHM-SCZZXKLOSA-N


In [18]:
# Smallest number of distinct drugs a mechanism needs in order to be kept as a drug set.
MIN_DRUGS_PER_SET = 5

# (moa, inchi_key) pairs, one per row of df_moa (a tuple of pairs, despite the name).
id_dict = tuple(zip(df_moa['moa'].tolist(), df_moa['inchi_key']))

# Group the InChIKeys under their mechanism-of-action term; collecting into a
# set removes repeated (moa, inchi_key) pairs as they arrive.
drugs_by_moa = defaultdict(set)
for moa_term, inchi_key in id_dict:
    drugs_by_moa[moa_term].add(inchi_key)

# Dropping duplicates and sets <5
# Members are sorted because the iteration order of a set of strings can change
# from one interpreter run to the next (string hash randomisation), which would
# make the exported library differ between otherwise identical runs.
# key=str keeps the ordering well-defined even if a non-string value is present.
drugsetlibrary = {
    moa_term: sorted(inchi_keys, key=str)
    for moa_term, inchi_keys in drugs_by_moa.items()
    if len(inchi_keys) >= MIN_DRUGS_PER_SET
}

### Library counts

In [19]:
# Summarise the library; the output below reports unique drugs, unique terms,
# unique associations and the average number of drugs per term.
# library_counts is not defined in this notebook — presumably provided by export_script; confirm.
library_counts(drugsetlibrary)

1854 unique drugs
154 unique association terms
2060 unique associations
13.376623376623376 average drugs per term


### Exporting drugsetlibrary in GMT format

In [20]:
# Change into the data/DrugRepurposingHub directory before exporting;
# presumably the export writes relative to the working directory — confirm.
os.chdir('../../data/DrugRepurposingHub')

In [21]:
# Export the drug-set library under the given file name.
# gmt_formatter is not defined in this notebook — presumably provided by export_script,
# and presumably writes into the current working directory; confirm.
gmt_formatter(drugsetlibrary, 'DrugRepurposingHub_moa_drugsetlibrary.gmt')