## Import Libraries

In [8]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests
import chardet

## Load Data

#### Load DrugBank Data

In [9]:
df = pd.read_csv('Input/drugbank_targets.csv')

In [10]:
df.head()

Unnamed: 0,DrugBank ID,Name,Type,UniProt ID,UniProt Name
0,DB00001,Lepirudin,BiotechDrug,P00734,Prothrombin
1,DB00002,Cetuximab,BiotechDrug,P00533,Epidermal growth factor receptor
2,DB00002,Cetuximab,BiotechDrug,O75015,Low affinity immunoglobulin gamma Fc region re...
3,DB00002,Cetuximab,BiotechDrug,P00736,Complement C1r subcomponent
4,DB00002,Cetuximab,BiotechDrug,P02745,Complement C1q subcomponent subunit A


In [11]:
df.shape

(18655, 5)

In [12]:
df['Type'].unique()

array(['BiotechDrug', 'SmallMoleculeDrug'], dtype=object)

In [13]:
len(df['Name'].unique())

7137

#### Load UniProt ID table

In [14]:
uniprot = pd.read_table('Input/gene_to_uniprot.txt')

In [15]:
uniprot.head()

Unnamed: 0,Approved Symbol,UniProt ID
0,A1BG,P04217
1,A1BG-AS1,
2,A1CF,Q9NQ94
3,A2M,P01023
4,A2M-AS1,


In [16]:
uniprot.shape

(41375, 2)

In [17]:
uniprot = uniprot[pd.notnull(uniprot['UniProt ID'])]

#### Load Targets Mapping File

In [18]:
target_update = pd.read_table('Input/mappingFile_2017.txt', names = ['Old Targets','Updated Targets'], engine='python')

In [19]:
target_update.head()

Unnamed: 0,Old Targets,Updated Targets
0,A1BG,A1BG
1,A1BG-AS1,A1BG-AS1
2,NCRNA00181,A1BG-AS1
3,A1BGAS,A1BG-AS1
4,A1BG-AS,A1BG-AS1


In [20]:
target_update.set_index('Old Targets',inplace = True)

#### Load LINCS Small Molecules

In [21]:
lincs = pd.read_csv('Input/LINCS_SmallMolecules.csv',encoding='ISO-8859-1')

In [22]:
lincs.head()

Unnamed: 0,SM_Name,SM_LINCS_ID,SM_Alternative_Name,SM_PubChem_CID,SM_SMILES_Parent,SM_SMILES_Batch,SM_InChi_Parent,SM_Molecular_Mass,MOLECULAR_FORMULA,SM_ChEBI_ID
0,Dichlobenil,LSM-19017,,3031.0,Clc1cccc(Cl)c1C#N,,InChI=1S/C7H3Cl2N/c8-6-2-1-3-7(9)5(6)4-10/h1-3H,172.01,C7H3Cl2N,943.0
1,AC1NWAJC,LSM-43967,Vulpinic acid,5701993.0,COC(=O)\C(=C\1/OC(=O)C(C1=O)c2ccccc2)\c3ccccc3,,InChI=1S/C19H14O5/c1-23-18(21)15(13-10-6-3-7-1...,322.31,C19H14O5,
2,Sinapic Acid Methyl Ether,LSM-44124,,735755.0,COc1cc(\C=C\C(=O)O)cc(OC)c1OC,,InChI=1S/C12H14O5/c1-15-9-6-8(4-5-11(13)14)7-1...,238.24,C12H14O5,
3,Ferulic acid,LSM-44126,Ferulic acid,445858.0,COc1cc(\C=C\C(=O)O)ccc1O,,InChI=1S/C10H10O4/c1-14-9-6-7(2-4-8(9)11)3-5-1...,194.18,C10H10O4,17620.0
4,Pinosylvin Methyl Ether,LSM-43902,,5281719.0,COc1cc(O)cc(\C=C\c2ccccc2)c1,,InChI=1S/C15H14O2/c1-17-15-10-13(9-14(16)11-15...,226.27,C15H14O2,8227.0


## Map UniProt ID to Gene

In [23]:
df.set_index('UniProt ID', inplace = True)
uniprot.set_index('UniProt ID',inplace=True)

In [24]:
uniprot.head()

Unnamed: 0_level_0,Approved Symbol
UniProt ID,Unnamed: 1_level_1
P04217,A1BG
Q9NQ94,A1CF
P01023,A2M
A8K2U0,A2ML1
U3KPV4,A3GALT2


In [25]:
df = pd.merge(df, uniprot, how= 'left', on = 'UniProt ID')

In [26]:
df = df.rename(index=str, columns = {'Approved Symbol':'Old Targets'})

In [27]:
df.reset_index(inplace=True)

In [28]:
df.head(4)
df.shape

(18702, 6)

In [29]:
df = df.dropna(subset=['Old Targets']) 

## Update Target Names

In [30]:
df.set_index('Old Targets', inplace = True)

In [31]:
df.head()

Unnamed: 0_level_0,UniProt ID,DrugBank ID,Name,Type,UniProt Name
Old Targets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F2,P00734,DB00001,Lepirudin,BiotechDrug,Prothrombin
Egfr,P00533,DB00002,Cetuximab,BiotechDrug,Epidermal growth factor receptor
FCGR3B,O75015,DB00002,Cetuximab,BiotechDrug,Low affinity immunoglobulin gamma Fc region re...
C1R,P00736,DB00002,Cetuximab,BiotechDrug,Complement C1r subcomponent
C1QA,P02745,DB00002,Cetuximab,BiotechDrug,Complement C1q subcomponent subunit A


In [32]:
df = pd.merge(df, target_update, how= 'left', on = 'Old Targets')

In [33]:
df.reset_index(inplace=True)

In [34]:
df = df.rename(index=str, columns = {'Updated Targets':'Targets'})

In [35]:
df = df.dropna(subset=['Targets']) 

In [36]:
df.shape

(14343, 7)

## Get PubChemID and Map to drug name

In [37]:
namesdf = df['Name']
namesdf = namesdf.drop_duplicates()

In [38]:
nameslist = namesdf.tolist()
failed_to_get_CID = 0
CID_dict = {}

for name in nameslist:
    name = name.replace(' ','%20')
    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/' + name + '/cids/JSON'
    response = requests.get(url)
    if 'IdentifierList' in response.json().keys():
        CID = response.json()['IdentifierList']['CID'][0]
        name = name.replace('%20', ' ')
        CID_dict[name] = CID
    else:
        failed_to_get_CID += 1
        
    if len(CID_dict) % 100 == 0:
        name = name.replace('%20',' ')
        print(nameslist.index(name))

print(failed_to_get_CID)
print(len(CID_dict))
print(len(nameslist))

    

0
1
2
3
172
278
384
490
595
717
825
927
1041
1042
1043
1159
1267
1369
1476
1585
1586
1587
1698
1806
1807
1922
2031
2032
2151
2267
2383
2490
2609
2715
2825
2992
3174
3175
3176
3177
3324
3460
3579
3693
3803
3922
4042
4043
4044
4160
4278
4400
4505
4627
4742
4850
4979
5105
5248
883
4478
5361


In [42]:
CIDs = []
for index, row in df.iterrows():
    drugname = row.loc['Name']
    if drugname not in CID_dict:
        df.drop(index, inplace = True)
    else: 
        CIDs.append(int(CID_dict[drugname]))

df.loc[:,'CIDs'] = pd.Series(np.array(CIDs), index=df.index)

In [43]:
df.head()

Unnamed: 0,Old Targets,UniProt ID,DrugBank ID,Name,Type,UniProt Name,Targets,CIDs
30,F2,P00734,DB00006,Bivalirudin,SmallMoleculeDrug,Prothrombin,F2,16129704
31,GNRHR,P30968,DB00007,Leuprolide,BiotechDrug,Gonadotropin-releasing hormone receptor,GNRHR,657181
38,GHRHR,Q02643,DB00010,Sermorelin,BiotechDrug,Growth hormone-releasing hormone receptor,GHRHR,16129620
52,LHCGR,P22888,DB00014,Goserelin,SmallMoleculeDrug,Lutropin-choriogonadotropic hormone receptor,LHCGR,5311128
53,GNRHR,P30968,DB00014,Goserelin,SmallMoleculeDrug,Gonadotropin-releasing hormone receptor,GNRHR,5311128


In [44]:
df.shape

(12266, 8)

## Filter by LINCS approved Small Molecules

In [45]:
count = 0
for index, row in df.iterrows():
    CID = row.loc['CIDs']
    if CID not in lincs['SM_PubChem_CID'].values:

        df.drop(index, inplace = True)
        count += 1

print(count)


7495


In [46]:
df.head()

Unnamed: 0,Old Targets,UniProt ID,DrugBank ID,Name,Type,UniProt Name,Targets,CIDs
52,LHCGR,P22888,DB00014,Goserelin,SmallMoleculeDrug,Lutropin-choriogonadotropic hormone receptor,LHCGR,5311128
53,GNRHR,P30968,DB00014,Goserelin,SmallMoleculeDrug,Gonadotropin-releasing hormone receptor,GNRHR,5311128
346,CAMLG,P49069,DB00091,Cyclosporine,SmallMoleculeDrug,Calcium signal-modulating cyclophilin ligand,CAMLG,5284373
347,PPP3R2,Q96LZ3,DB00091,Cyclosporine,SmallMoleculeDrug,Calcineurin subunit B type 2,PPP3R2,5284373
348,PPIA,P62937,DB00091,Cyclosporine,SmallMoleculeDrug,Peptidyl-prolyl cis-trans isomerase A,PPIA,5284373


In [47]:
df.shape

(4771, 8)

## Make Binary Matrix

In [48]:
grouped_df = df.groupby(['Targets'])['CIDs'].apply(lambda x: ','.join(x.astype(str))).reset_index()

In [49]:
grouped_df.set_index('Targets', inplace=True)
grouped_df = grouped_df.sort_index()

In [50]:
len(df['Targets'].unique())

1111

In [51]:
grouped_matrix = grouped_df.iloc[:,0].str.get_dummies(sep=',')

In [52]:
grouped_matrix.head()

Unnamed: 0_level_0,10052040,10096344,10109823,10113978,10133,10140,10152654,101616,10168,10172943,...,9913,9915743,9926791,9930049,9931954,9933475,9934347,9939609,9949641,9966051
Targets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AADACL2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABCA1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABCB1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABCB11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
grouped_matrix.shape

(1111, 1154)

## Save Binary Matrix

In [54]:
# grouped_matrix.to_csv('Output/DrugBank_Targets.csv')

In [55]:
filename = 'Output/DrugBank_Targets_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
grouped_matrix.to_csv(filename, sep='\t', compression='gzip')

## Convert Binary Matrix to GMT

In [56]:
drugset_library = []
for index, row in grouped_matrix.iterrows():
    
    drugset = []
    drugset.append(index)
    drugset.append('')
    for i in range(grouped_matrix.shape[1]):
        if row.iloc[i]==1:
            drugset.append(row.index[i])
    if len(drugset) >= 7:
        drugset_library.append(drugset)
    
dsl = np.array(drugset_library)

In [58]:
len(max(dsl, key=len))

75

In [59]:
filename = 'Output/DrugBank_Targets_DrugSetLibrary_%s.gmt'% str(datetime.date.today())[0:7].replace('-', '_')
with open(filename,'w',encoding='utf-8') as f:
    for row in dsl:
        np.savetxt(f, [row], fmt = '%s',delimiter = '\t')   