## Import Libraries

In [405]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests
import chardet

## Load Data

#### Load DrugBank Data

In [733]:
df = pd.read_csv('Input/drugbank_targets.csv')

In [734]:
df.head()

Unnamed: 0,DrugBank ID,Name,Type,UniProt ID,UniProt Name
0,DB00001,Lepirudin,BiotechDrug,P00734,Prothrombin
1,DB00002,Cetuximab,BiotechDrug,P00533,Epidermal growth factor receptor
2,DB00002,Cetuximab,BiotechDrug,O75015,Low affinity immunoglobulin gamma Fc region re...
3,DB00002,Cetuximab,BiotechDrug,P00736,Complement C1r subcomponent
4,DB00002,Cetuximab,BiotechDrug,P02745,Complement C1q subcomponent subunit A


In [735]:
df.shape

(18655, 5)

In [736]:
df['Type'].unique()

array(['BiotechDrug', 'SmallMoleculeDrug'], dtype=object)

In [737]:
len(df['Name'].unique())

7137

#### Load UniProt ID table

In [738]:
uniprot = pd.read_table('Input/gene_to_uniprot.txt')

In [739]:
uniprot.head()

Unnamed: 0,Approved Symbol,UniProt ID
0,A1BG,P04217
1,A1BG-AS1,
2,A1CF,Q9NQ94
3,A2M,P01023
4,A2M-AS1,


In [740]:
uniprot.shape

(41375, 2)

In [741]:
uniprot = uniprot[pd.notnull(uniprot['UniProt ID'])]

#### Load Targets Mapping File

In [742]:
target_update = pd.read_table('Input/mappingFile_2017.txt', names = ['Old Targets','Updated Targets'], engine='python')

In [743]:
target_update.head()

Unnamed: 0,Old Targets,Updated Targets
0,A1BG,A1BG
1,A1BG-AS1,A1BG-AS1
2,NCRNA00181,A1BG-AS1
3,A1BGAS,A1BG-AS1
4,A1BG-AS,A1BG-AS1


In [744]:
target_update.set_index('Old Targets',inplace = True)

#### Load LINCS Small Molecules

In [745]:
# with open('Input/LINCS_SmallMolecules.csv', 'rb') as f:
#     result = chardet.detect(f.read(1024**2))
#     print(result)

lincs = pd.read_csv('Input/LINCS_SmallMolecules.csv',encoding='ISO-8859-1')

In [746]:
lincs.head()

Unnamed: 0,SM_Name,SM_LINCS_ID,SM_Alternative_Name,SM_PubChem_CID,SM_SMILES_Parent,SM_SMILES_Batch,SM_InChi_Parent,SM_Molecular_Mass,MOLECULAR_FORMULA,SM_ChEBI_ID
0,Dichlobenil,LSM-19017,,3031.0,Clc1cccc(Cl)c1C#N,,InChI=1S/C7H3Cl2N/c8-6-2-1-3-7(9)5(6)4-10/h1-3H,172.01,C7H3Cl2N,943.0
1,AC1NWAJC,LSM-43967,Vulpinic acid,5701993.0,COC(=O)\C(=C\1/OC(=O)C(C1=O)c2ccccc2)\c3ccccc3,,InChI=1S/C19H14O5/c1-23-18(21)15(13-10-6-3-7-1...,322.31,C19H14O5,
2,Sinapic Acid Methyl Ether,LSM-44124,,735755.0,COc1cc(\C=C\C(=O)O)cc(OC)c1OC,,InChI=1S/C12H14O5/c1-15-9-6-8(4-5-11(13)14)7-1...,238.24,C12H14O5,
3,Ferulic acid,LSM-44126,Ferulic acid,445858.0,COc1cc(\C=C\C(=O)O)ccc1O,,InChI=1S/C10H10O4/c1-14-9-6-7(2-4-8(9)11)3-5-1...,194.18,C10H10O4,17620.0
4,Pinosylvin Methyl Ether,LSM-43902,,5281719.0,COc1cc(O)cc(\C=C\c2ccccc2)c1,,InChI=1S/C15H14O2/c1-17-15-10-13(9-14(16)11-15...,226.27,C15H14O2,8227.0


In [644]:
try:
    lincs['SM_PubChem_CID'] = lincs['SM_PubChem_CID'].astype(str)
except ValueError:
    pass

In [645]:
type(lincs['SM_PubChem_CID'][1])

str

## Map UniProt ID to Gene

In [747]:
df.set_index('UniProt ID', inplace = True)
uniprot.set_index('UniProt ID',inplace=True)

In [748]:
uniprot.head()

Unnamed: 0_level_0,Approved Symbol
UniProt ID,Unnamed: 1_level_1
P04217,A1BG
Q9NQ94,A1CF
P01023,A2M
A8K2U0,A2ML1
U3KPV4,A3GALT2


In [749]:
df = pd.merge(df, uniprot, how= 'left', on = 'UniProt ID')

In [750]:
df = df.rename(index=str, columns = {'Approved Symbol':'Old Targets'})

In [751]:
df.reset_index(inplace=True)

In [752]:
df.head(4)
df.shape

(18702, 6)

In [753]:
# count = 0
# for index, row in df.iterrows():
#     if row.isnull().values.any():
#         df.drop(index, inplace = True)
#         count += 1
# print(count)

df = df.dropna(subset=['Old Targets']) 

## Update Target Names

In [754]:
df.set_index('Old Targets', inplace = True)

In [755]:
df.head()

Unnamed: 0_level_0,UniProt ID,DrugBank ID,Name,Type,UniProt Name
Old Targets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F2,P00734,DB00001,Lepirudin,BiotechDrug,Prothrombin
Egfr,P00533,DB00002,Cetuximab,BiotechDrug,Epidermal growth factor receptor
FCGR3B,O75015,DB00002,Cetuximab,BiotechDrug,Low affinity immunoglobulin gamma Fc region re...
C1R,P00736,DB00002,Cetuximab,BiotechDrug,Complement C1r subcomponent
C1QA,P02745,DB00002,Cetuximab,BiotechDrug,Complement C1q subcomponent subunit A


In [756]:
df = pd.merge(df, target_update, how= 'left', on = 'Old Targets')

In [757]:
df.reset_index(inplace=True)

In [758]:
df = df.rename(index=str, columns = {'Updated Targets':'Targets'})

In [759]:
# count = 0
# for index, row in df.iterrows():
#     if row.isnull().values.any():
#         count += 1
# print(count)

df = df.dropna(subset=['Targets']) 

In [760]:
df.shape

(14343, 7)

## Get PubChemID and Map to drug name

In [761]:
namesdf = df['Name']
namesdf = namesdf.drop_duplicates()

In [663]:
nameslist = namesdf.tolist()
failed_to_get_CID = 0
CID_dict = {}

for name in nameslist:
    name = name.replace(' ','%20')
    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/' + name + '/cids/JSON'
    response = requests.get(url)
    if 'IdentifierList' in response.json().keys():
        CID = response.json()['IdentifierList']['CID'][0]
        name = name.replace('%20', ' ')
        CID_dict[name] = CID
    elif 'PC Compounds' in response.json().keys():
        print(response.json())
    else:
        failed_to_get_CID += 1
        
    if len(CID_dict) % 100 == 0:
        name = name.replace('%20',' ')
        print(nameslist.index(name))

print(failed_to_get_CID)
print(len(CID_dict))
print(len(nameslist))

    

0
1
2
3
183
288
404
518
633
737
849
955
1069
1187
1300
1420
1537
1653
1777
1895
2009
2136
2254
2372
2503
2621
2740
2860
2985
3110
3230
3348
3462
3585
3711
3828
3940
4072
4254
4448
4586
4708
4827
4943
5059
5060
5186
5302
5429
5549
5550
5551
5552
5553
5669
5793
5794
5795
5921
6032
6151
6276
6393
6508
6509
6510
6619
6744
6876
7015
1350
5787
7137


In [762]:
'Dasatinib' in CID_dict

True

In [763]:
CIDs = []
for index, row in df.iterrows():
    drugname = row.loc['Name']
    if drugname not in CID_dict:
        df.drop(index, inplace = True)
    else: 
        CIDs.append(float(CID_dict[drugname]))

df.loc[:,'CIDs'] = pd.Series(np.array(CIDs), index=df.index)


In [764]:
df.head()

Unnamed: 0,Old Targets,UniProt ID,DrugBank ID,Name,Type,UniProt Name,Targets,CIDs
30,F2,P00734,DB00006,Bivalirudin,SmallMoleculeDrug,Prothrombin,F2,16129704.0
31,GNRHR,P30968,DB00007,Leuprolide,BiotechDrug,Gonadotropin-releasing hormone receptor,GNRHR,657181.0
38,GHRHR,Q02643,DB00010,Sermorelin,BiotechDrug,Growth hormone-releasing hormone receptor,GHRHR,16129620.0
52,LHCGR,P22888,DB00014,Goserelin,SmallMoleculeDrug,Lutropin-choriogonadotropic hormone receptor,LHCGR,5311128.0
53,GNRHR,P30968,DB00014,Goserelin,SmallMoleculeDrug,Gonadotropin-releasing hormone receptor,GNRHR,5311128.0


In [765]:
df.shape

(12029, 8)

## Filter by LINCS approved Small Molecules

In [766]:
3062316.0 in lincs['SM_PubChem_CID'].values

True

In [767]:
count = 0
for index, row in df.iterrows():
    CID = row.loc['CIDs']
    if CID not in lincs['SM_PubChem_CID'].values:
        #print(float(CID))
        df.drop(index, inplace = True)
        count += 1
#     elif CID in lincs['SM_PubChem_CID'].values:
#         print('ok')
print(count)


7204


In [768]:
df.head(300)

Unnamed: 0,Old Targets,UniProt ID,DrugBank ID,Name,Type,UniProt Name,Targets,CIDs
52,LHCGR,P22888,DB00014,Goserelin,SmallMoleculeDrug,Lutropin-choriogonadotropic hormone receptor,LHCGR,5311128.0
53,GNRHR,P30968,DB00014,Goserelin,SmallMoleculeDrug,Gonadotropin-releasing hormone receptor,GNRHR,5311128.0
346,CAMLG,P49069,DB00091,Cyclosporine,SmallMoleculeDrug,Calcium signal-modulating cyclophilin ligand,CAMLG,5284373.0
347,PPP3R2,Q96LZ3,DB00091,Cyclosporine,SmallMoleculeDrug,Calcineurin subunit B type 2,PPP3R2,5284373.0
348,PPIA,P62937,DB00091,Cyclosporine,SmallMoleculeDrug,Peptidyl-prolyl cis-trans isomerase A,PPIA,5284373.0
349,PPIF,P30405,DB00091,Cyclosporine,SmallMoleculeDrug,"Peptidyl-prolyl cis-trans isomerase F, mitocho...",PPIF,5284373.0
555,PCCB,P05166,DB00121,Biotin,SmallMoleculeDrug,"Propionyl-CoA carboxylase beta chain, mitochon...",PCCB,171548.0
556,HLCS,P50747,DB00121,Biotin,SmallMoleculeDrug,Biotin--protein ligase,HLCS,171548.0
557,SLC5A6,Q9Y289,DB00121,Biotin,SmallMoleculeDrug,Sodium-dependent multivitamin transporter,SLC5A6,171548.0
558,MCCC2,Q9HCC0,DB00121,Biotin,SmallMoleculeDrug,"Methylcrotonoyl-CoA carboxylase beta chain, mi...",MCCC2,171548.0


In [769]:
df.shape

(4825, 8)

In [770]:
for i in df['CIDs'].values:
    if i == 1051:
        print('CID is there!')
        break

In [771]:
'Dasatinib' in df['Name'].values

True

## Make Binary Matrix

In [772]:
grouped_df = df.groupby(['Targets'])['CIDs'].apply(lambda x: ',,,,,'.join(x.astype(str))).reset_index()

In [773]:
grouped_df.set_index('Targets', inplace=True)
grouped_df = grouped_df.sort_index()

In [774]:
len(df['Targets'].unique())

1117

In [775]:
grouped_matrix = grouped_df.iloc[:,0].str.get_dummies(sep=',,,,,')

In [776]:
grouped_matrix.head()

Unnamed: 0_level_0,10071196.0,10096344.0,10109823.0,10113978.0,10140.0,10152654.0,10168.0,10172943.0,10182969.0,10184653.0,...,9924495.0,9926791.0,9930049.0,9931954.0,9933475.0,9934347.0,9939609.0,9941444.0,9949641.0,9966051.0
Targets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AADACL2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABCA1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABCB1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABCB11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [777]:
grouped_matrix.shape

(1117, 1146)

## Save Binary Matrix

In [778]:
grouped_matrix.to_csv('Output/DrugBank_Targets.csv')

In [779]:
filename = 'Output/DrugBank_Targets_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
grouped_matrix.to_csv(filename, sep='\t', compression='gzip')

## Convert Binary Matrix to GMT

In [783]:
drugset_library = []
for index, row in grouped_matrix.iterrows():
    
#     for i, index in enumerate(grouped_matrix.index):
#         progressPercent = ((i+1)/len(grouped_matrix.index))*100
#         sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(grouped_matrix.index)))
#         sys.stdout.flush()
    
    drugset = []
    drugset.append(index)
    drugset.append('')
    for i in range(grouped_matrix.shape[1]):
        if row.iloc[i]==1:
            drugset.append(row.index[i])
    if len(drugset) >= 7:
        drugset_library.append(drugset)
    
dsl = np.array(drugset_library)

In [784]:
print(dsl)

[list(['ABCC8', '', '121891.0', '3475.0', '3476.0', '3478.0', '3488.0', '5503.0', '5505.0', '91610.0'])
 list(['ABL1', '', '11167602.0', '24826799.0', '3062316.0', '5291.0', '5328940.0', '644241.0'])
 list(['ACE', '', '107807.0', '44093.0', '5280954.0', '5362119.0', '5362124.0', '5362129.0', '5388962.0', '5462501.0', '54892.0', '91270.0', '92400.0'])
 list(['ACHE', '', '187.0', '1935.0', '3001055.0', '3105.0', '3152.0', '3202.0', '4139.0', '4168.0', '4199.0', '4456.0', '4939.0', '5966.0', '5983.0', '6000.0', '77991.0', '854026.0', '9651.0'])
 list(['ADORA1', '', '1676.0', '2153.0', '2519.0', '3182.0', '3446.0', '3878.0', '4740.0', '5429.0', '60961.0'])
 list(['ADORA2A', '', '1676.0', '176407.0', '2153.0', '2519.0', '3182.0', '3878.0', '40692.0', '4740.0', '50942.0', '5429.0', '60961.0'])
 list(['ADRA1A', '', '115368.0', '1236.0', '129211.0', '146570.0', '208898.0', '2092.0', '2160.0', '22297.0', '2520.0', '2585.0', '2726.0', '2765.0', '2771.0', '2803.0', '2818.0', '2995.0', '3036780.0'

In [785]:
len(max(dsl, key=len))

77

In [787]:
filename = 'Output/DrugBank_Targets_DrugSetLibrary_%s.gmt'% str(datetime.date.today())[0:7].replace('-', '_')
with open(filename,'w',encoding='utf-8') as f:
    for row in dsl:
        np.savetxt(f, [row], fmt = '%s',delimiter = '\t')   