## DrugCentral Target Drug-Set Library
### Drug-set labels: Genes
#### ALL DATABASES ACCESSED 03/01/20
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import os
import csv
import json
import pandas as pd
from collections import defaultdict

In [2]:
# Temporarily switch the working directory to ../../scripts so the two helper
# modules below can be imported, then switch back to this notebook's folder.
# NOTE(review): both paths are relative, so this assumes the kernel starts in
# notebooks/DrugCentral; if an import fails, the working directory stays in scripts.
os.chdir('../../scripts')
# NOTE(review): star imports hide where names come from. Later cells call
# gmt_formatter, library_counts and gene_resolver, which presumably come from
# these two modules — confirm before switching to explicit imports.
from export_script import *
from gene_resolver import *
os.chdir('../notebooks/DrugCentral')

### Import target interaction data & drug metadata
#### Source : http://drugcentral.org/download

In [3]:
# Load the drug-target interaction table (tab-separated), keeping only the
# columns that the rest of the notebook uses.
interaction_columns = ['DRUG_NAME', 'ACCESSION', 'GENE', 'ORGANISM']
df_interactions = pd.read_csv(
    'input/drug.target.interaction.tsv',
    sep='\t',
    usecols=interaction_columns,
)
df_interactions.head()

Unnamed: 0,DRUG_NAME,ACCESSION,GENE,ORGANISM
0,levobupivacaine,P35499,SCN4A,Homo sapiens
1,levobupivacaine,P10635,CYP2D6,Homo sapiens
2,levobupivacaine,Q12809,KCNH2,Homo sapiens
3,levobupivacaine,Q9UK17,KCND3,Homo sapiens
4,levobupivacaine,P34995,PTGER1,Homo sapiens


In [4]:
# Load the per-structure drug metadata table (tab-separated) and preview it.
structures_path = 'input/structures.smiles.tsv'
df_meta = pd.read_csv(structures_path, sep='\t')
df_meta.head()

Unnamed: 0,SMILES,InChI,InChIKey,ID,INN,CAS_RN
0,CCCCN1CCCC[C@H]1C(=O)NC1=C(C)C=CC=C1C,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,LEBVLXFERQHONN-INIZCTEOSA-N,4,levobupivacaine,27262-47-1
1,COC(=O)C1=C(C)NC(C)=C([C@H]1C1=CC(=CC=C1)[N+](...,InChI=1S/C26H29N3O6/c1-17-22(25(30)34-4)24(20-...,ZBBHBTPTTSWHBA-DEOSSOPVSA-N,5,(S)-nicardipine,76093-36-2
2,CCOC(=O)C1=C(C)NC(C)=C([C@@H]1C1=CC(=CC=C1)[N+...,InChI=1S/C18H20N2O6/c1-5-26-18(22)15-11(3)19-1...,PVHUJELLJLJGLN-INIZCTEOSA-N,6,(S)-nitrendipine,80873-62-7
3,C[C@@H](CCC1=CC=C(O)C=C1)NCCC1=CC=C(O)C(O)=C1,InChI=1S/C18H23NO3/c1-13(2-3-14-4-7-16(20)8-5-...,JRWZLRBJNMZMFE-ZDUSSCGKSA-N,13,levdobutamine,61661-06-1
4,NC1=NC2=NC=C(CNC3=CC=C(C=C3)C(=O)N[C@@H](CCC(O...,InChI=1S/C19H20N8O5/c20-15-14-16(27-19(21)26-1...,TVZGACDUOSZQKY-LBPRGKRZSA-N,21,aminopterin,54-62-6


In [5]:
# Renaming drug column names to be the same and merging drug metadata.
# - The rename returns a new frame instead of mutating df_meta in place.
# - The join key is spelled out so the merge cannot silently start joining on
#   additional columns if the two tables ever share more than the drug name.
# - Rows without a drug name are removed from both sides first: pandas matches
#   null keys against each other, which would attach unrelated metadata.
df_meta = df_meta.rename(columns={'INN': 'DRUG_NAME'})
df_interactions = df_interactions.dropna(subset=['DRUG_NAME']).merge(
    df_meta.dropna(subset=['DRUG_NAME']),
    on='DRUG_NAME',
    how='inner',
)

In [6]:
# Preview the interaction table now that the structure metadata columns are attached.
df_interactions.head()

Unnamed: 0,DRUG_NAME,ACCESSION,GENE,ORGANISM,SMILES,InChI,InChIKey,ID,CAS_RN
0,levobupivacaine,P35499,SCN4A,Homo sapiens,CCCCN1CCCC[C@H]1C(=O)NC1=C(C)C=CC=C1C,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,LEBVLXFERQHONN-INIZCTEOSA-N,4,27262-47-1
1,levobupivacaine,P10635,CYP2D6,Homo sapiens,CCCCN1CCCC[C@H]1C(=O)NC1=C(C)C=CC=C1C,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,LEBVLXFERQHONN-INIZCTEOSA-N,4,27262-47-1
2,levobupivacaine,Q12809,KCNH2,Homo sapiens,CCCCN1CCCC[C@H]1C(=O)NC1=C(C)C=CC=C1C,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,LEBVLXFERQHONN-INIZCTEOSA-N,4,27262-47-1
3,levobupivacaine,Q9UK17,KCND3,Homo sapiens,CCCCN1CCCC[C@H]1C(=O)NC1=C(C)C=CC=C1C,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,LEBVLXFERQHONN-INIZCTEOSA-N,4,27262-47-1
4,levobupivacaine,P34995,PTGER1,Homo sapiens,CCCCN1CCCC[C@H]1C(=O)NC1=C(C)C=CC=C1C,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,LEBVLXFERQHONN-INIZCTEOSA-N,4,27262-47-1


### Resolving drugs to InChI Keys
In this case, CAS registry numbers are available for both resources, so they are used as the mapping key; they are the most precise criterion available for harmonizing drug identifiers.

In [7]:
# Load the shared drug metadata table, restricted to the identifier columns
# needed to map CAS numbers to DrugBank IDs and standard InChI keys.
drugbank_columns = ['DrugBank ID', 'CAS', 'Standard InChI Key']
df_drugbank = pd.read_csv(
    '../../metadata/drugmonizome_metadata.tsv',
    sep='\t',
    usecols=drugbank_columns,
)

In [8]:
# Preview the DrugBank ID / CAS / InChI key mapping table.
df_drugbank.head()

Unnamed: 0,DrugBank ID,CAS,Standard InChI Key
0,DB00006,128270-60-0,OIRCOABEOLEUMC-GEJPAHFPSA-N
1,DB00007,53714-56-0,GFIJNRVAKGFPGQ-LIJARHBVSA-N
2,DB00014,65807-02-5,BLCLNMBMMGCOAS-URPVMXJPSA-N
3,DB00027,1405-97-6,NDAYQJDHGXTBJL-MWWSRJDJSA-N
4,DB00035,16679-58-6,NFLWUMRGJYTJIN-PNIOQBSNSA-N


In [9]:
# Number of distinct drug names present before mapping to DrugBank.
len({drug_name for drug_name in df_interactions['DRUG_NAME']})

2174

In [10]:
# Mapping DrugBank IDs by shared CAS number.
# - The rename returns a new frame instead of mutating df_drugbank in place.
# - The join key is spelled out rather than inferred from the shared columns.
# - Records without a CAS number are removed from both sides before joining:
#   pandas matches null keys against each other, which would pair every
#   CAS-less DrugBank entry with every CAS-less interaction row.
df_drugbank = df_drugbank.rename(columns={'CAS': 'CAS_RN'})
df_interactions = df_drugbank.dropna(subset=['CAS_RN']).merge(
    df_interactions.dropna(subset=['CAS_RN']),
    on='CAS_RN',
    how='inner',
)

1895/2174 small molecules were mapped

In [11]:
# Number of distinct drug names that remain after the CAS-based mapping.
len({drug_name for drug_name in df_interactions['DRUG_NAME']})

1895

In [12]:
# Preview the interactions with the DrugBank ID and standard InChI key columns attached.
df_interactions.head()

Unnamed: 0,DrugBank ID,CAS_RN,Standard InChI Key,DRUG_NAME,ACCESSION,GENE,ORGANISM,SMILES,InChI,InChIKey,ID
0,DB00006,128270-60-0,OIRCOABEOLEUMC-GEJPAHFPSA-N,bivalirudin,P00734,F2,Homo sapiens,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,InChI=1S/C98H138N24O33/c1-5-52(4)82(96(153)122...,OIRCOABEOLEUMC-GEJPAHFPSA-N,385
1,DB00007,53714-56-0,GFIJNRVAKGFPGQ-LIJARHBVSA-N,leuprorelin,P30968,GNRHR,Homo sapiens,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,InChI=1S/C59H84N16O12/c1-6-63-57(86)48-14-10-2...,GFIJNRVAKGFPGQ-LIJARHBVSA-N,1559
2,DB00007,53714-56-0,GFIJNRVAKGFPGQ-LIJARHBVSA-N,leuprorelin,P30969,Gnrhr,Rattus norvegicus,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,InChI=1S/C59H84N16O12/c1-6-63-57(86)48-14-10-2...,GFIJNRVAKGFPGQ-LIJARHBVSA-N,1559
3,DB00014,65807-02-5,BLCLNMBMMGCOAS-URPVMXJPSA-N,goserelin,P22888,LHCGR,Homo sapiens,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,InChI=1S/C59H84N18O14/c1-31(2)22-40(49(82)68-3...,BLCLNMBMMGCOAS-URPVMXJPSA-N,1327
4,DB00014,65807-02-5,BLCLNMBMMGCOAS-URPVMXJPSA-N,goserelin,P30968,GNRHR,Homo sapiens,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,InChI=1S/C59H84N18O14/c1-31(2)22-40(49(82)68-3...,BLCLNMBMMGCOAS-URPVMXJPSA-N,1327


### Resolve genes

In [13]:
# Keep only the associations whose target organism is human.
is_human = df_interactions['ORGANISM'].eq('Homo sapiens')
df_human = df_interactions.loc[is_human]

Some interaction records list multiple gene symbols in a single entry, separated by `|`

In [14]:
# Example of records whose GENE field holds two '|'-separated symbols.
df_human.loc[df_human['GENE'].eq('CACNA1C|CACNA1D')].head(2)

Unnamed: 0,DrugBank ID,CAS_RN,Standard InChI Key,DRUG_NAME,ACCESSION,GENE,ORGANISM,SMILES,InChI,InChIKey,ID
3608,DB00528,100427-26-7,ZDXUKAKRHYTAKV-UHFFFAOYSA-N,lercanidipine,Q01668|Q13936,CACNA1C|CACNA1D,Homo sapiens,COC(=O)C1=C(C)NC(C)=C(C1C1=CC=CC(=C1)[N+]([O-]...,InChI=1S/C36H41N3O6/c1-24-31(34(40)44-6)33(28-...,ZDXUKAKRHYTAKV-UHFFFAOYSA-N,4157
4437,DB00622,55985-32-5,ZBBHBTPTTSWHBA-UHFFFAOYSA-N,nicardipine,Q01668|Q13936,CACNA1C|CACNA1D,Homo sapiens,COC(=O)C1=C(C)NC(C)=C(C1C1=CC(=CC=C1)[N+]([O-]...,InChI=1S/C26H29N3O6/c1-17-22(25(30)34-4)24(20-...,ZBBHBTPTTSWHBA-UHFFFAOYSA-N,1909


In [15]:
# Splitting instances of multiple genes into separate rows.
# Each '|'-separated GENE entry becomes a list that explode() expands into one
# row per symbol, paired with the drug's standard InChI key. Rows without a
# gene symbol are dropped up front because they have no target to contribute.
# The result has a fresh 0..n-1 index and the columns 'Gene' and 'Drug'.
df_human_split = (
    df_human[['GENE', 'Standard InChI Key']]
    .dropna(subset=['GENE'])
    .assign(GENE=lambda frame: frame['GENE'].str.split('|'))
    .explode('GENE')
    .rename(columns={'GENE': 'Gene', 'Standard InChI Key': 'Drug'})
    .reset_index(drop=True)
)

In [16]:
# Resolve the raw symbols in the 'Gene' column with the project's gene_resolver helper.
# NOTE(review): the return value is not assigned, yet the preview in the next cell
# shows a new 'Approved Symbol' column, so the helper appears to modify
# df_human_split in place — confirm against the gene_resolver module.
gene_resolver(df_human_split, columnName = 'Gene')

In [17]:
# Preview the gene/drug pairs together with the resolved 'Approved Symbol' column.
df_human_split.head()

Unnamed: 0,Gene,Drug,Approved Symbol
0,F2,OIRCOABEOLEUMC-GEJPAHFPSA-N,F2
1,GNRHR,GFIJNRVAKGFPGQ-LIJARHBVSA-N,GNRHR
2,LHCGR,BLCLNMBMMGCOAS-URPVMXJPSA-N,LHCGR
3,GNRHR,BLCLNMBMMGCOAS-URPVMXJPSA-N,GNRHR
4,OXTR,NFLWUMRGJYTJIN-PNIOQBSNSA-N,OXTR


### Creating drugsetlibrary and exporting

In [18]:
# Pair every resolved gene symbol with the InChI key of a drug that targets it.
# Rows missing either value are skipped so that neither a NaN term nor a NaN
# drug can end up in the library.
resolved_pairs = df_human_split.dropna(subset=['Approved Symbol', 'Drug'])
target_dict = tuple(zip(resolved_pairs['Approved Symbol'].tolist(),
                        resolved_pairs['Drug'].tolist()))

# Group the drugs under their gene term; repeated drugs are kept at this stage.
drugsetlibrary = defaultdict(list)
for gene, drug in target_dict:
    drugsetlibrary[gene].append(drug)

In [19]:
# Removing duplicates and any terms not paired with at least 5 drugs.
# Each term's drugs are de-duplicated once and then sorted, so the member order
# (and therefore the exported file) is the same on every run; iterating a set
# of strings directly can yield a different order in each Python process.
# key=str keeps the sort well-defined even if a non-string value slipped in.
MIN_DRUGS_PER_TERM = 5
unique_drugs_by_term = {term: set(drugs) for term, drugs in drugsetlibrary.items()}
drugsetlibrary = {
    term: sorted(drugs, key=str)
    for term, drugs in unique_drugs_by_term.items()
    if len(drugs) >= MIN_DRUGS_PER_TERM
}

In [20]:
# Switch the working directory to the DrugCentral data folder, presumably so the
# export in the next cell lands there.
# NOTE(review): relative path — assumes the working directory is still this notebook's folder.
os.chdir('../../data/DrugCentral')

In [21]:
# Export the library under the given file name with the project's gmt_formatter helper.
# NOTE(review): presumably writes a GMT file relative to the current working
# directory — confirm in the export_script module.
gmt_formatter(drugsetlibrary, 'DrugCentral_target_drugsetlibrary.gmt')

### Library counts

In [22]:
# Print summary statistics for the library: unique drugs, unique terms,
# unique associations and the average number of drugs per term.
library_counts(drugsetlibrary)

1555 unique drugs
540 unique association terms
10350 unique associations
19.166666666666668 average drugs per term
