## Import Libraries

In [257]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests
import chardet

## Load Data

#### Load DrugBank Data

In [212]:
df = pd.read_csv('Input/drugbank_targets.csv')

In [213]:
df.head()

Unnamed: 0,DrugBank ID,Name,Type,UniProt ID,UniProt Name
0,DB00001,Lepirudin,BiotechDrug,P00734,Prothrombin
1,DB00002,Cetuximab,BiotechDrug,P00533,Epidermal growth factor receptor
2,DB00002,Cetuximab,BiotechDrug,O75015,Low affinity immunoglobulin gamma Fc region re...
3,DB00002,Cetuximab,BiotechDrug,P00736,Complement C1r subcomponent
4,DB00002,Cetuximab,BiotechDrug,P02745,Complement C1q subcomponent subunit A


In [214]:
df.shape

(18655, 5)

In [215]:
df['Type'].unique()

array(['BiotechDrug', 'SmallMoleculeDrug'], dtype=object)

In [216]:
len(df['Name'].unique())

7137

#### Load UniProt ID table

In [217]:
uniprot = pd.read_table('Input/gene_to_uniprot.txt')

In [218]:
uniprot.head()

Unnamed: 0,Approved Symbol,UniProt ID
0,A1BG,P04217
1,A1BG-AS1,
2,A1CF,Q9NQ94
3,A2M,P01023
4,A2M-AS1,


In [219]:
uniprot.shape

(41375, 2)

In [220]:
uniprot = uniprot[pd.notnull(uniprot['UniProt ID'])]

#### Load Targets Mapping File

In [221]:
target_update = pd.read_table('Input/mappingFile_2017.txt', names = ['Old Targets','Updated Targets'], engine='python')

In [222]:
target_update.head()

Unnamed: 0,Old Targets,Updated Targets
0,A1BG,A1BG
1,A1BG-AS1,A1BG-AS1
2,NCRNA00181,A1BG-AS1
3,A1BGAS,A1BG-AS1
4,A1BG-AS,A1BG-AS1


In [223]:
target_update.set_index('Old Targets',inplace = True)

#### Load LINCS Small Molecules

In [262]:
# with open('Input/LINCS_SmallMolecules.csv', 'rb') as f:
#     result = chardet.detect(f.read(1024**2))
#     print(result)

lincs = pd.read_csv('Input/LINCS_SmallMolecules.csv',encoding='ISO-8859-1')

In [263]:
lincs.head()

Unnamed: 0,SM_Name,SM_LINCS_ID,SM_Alternative_Name,SM_PubChem_CID,SM_SMILES_Parent,SM_SMILES_Batch,SM_InChi_Parent,SM_Molecular_Mass,MOLECULAR_FORMULA,SM_ChEBI_ID
0,Dichlobenil,LSM-19017,,3031.0,Clc1cccc(Cl)c1C#N,,InChI=1S/C7H3Cl2N/c8-6-2-1-3-7(9)5(6)4-10/h1-3H,172.01,C7H3Cl2N,943.0
1,AC1NWAJC,LSM-43967,Vulpinic acid,5701993.0,COC(=O)\C(=C\1/OC(=O)C(C1=O)c2ccccc2)\c3ccccc3,,InChI=1S/C19H14O5/c1-23-18(21)15(13-10-6-3-7-1...,322.31,C19H14O5,
2,Sinapic Acid Methyl Ether,LSM-44124,,735755.0,COc1cc(\C=C\C(=O)O)cc(OC)c1OC,,InChI=1S/C12H14O5/c1-15-9-6-8(4-5-11(13)14)7-1...,238.24,C12H14O5,
3,Ferulic acid,LSM-44126,Ferulic acid,445858.0,COc1cc(\C=C\C(=O)O)ccc1O,,InChI=1S/C10H10O4/c1-14-9-6-7(2-4-8(9)11)3-5-1...,194.18,C10H10O4,17620.0
4,Pinosylvin Methyl Ether,LSM-43902,,5281719.0,COc1cc(O)cc(\C=C\c2ccccc2)c1,,InChI=1S/C15H14O2/c1-17-15-10-13(9-14(16)11-15...,226.27,C15H14O2,8227.0


## Map UniProt ID to Gene

In [226]:
df.set_index('UniProt ID', inplace = True)
uniprot.set_index('UniProt ID',inplace=True)

In [227]:
uniprot.head()

Unnamed: 0_level_0,Approved Symbol
UniProt ID,Unnamed: 1_level_1
P04217,A1BG
Q9NQ94,A1CF
P01023,A2M
A8K2U0,A2ML1
U3KPV4,A3GALT2


In [228]:
df = pd.merge(df, uniprot, how= 'left', on = 'UniProt ID')

In [229]:
df = df.rename(index=str, columns = {'Approved Symbol':'Old Targets'})

In [230]:
df.reset_index(inplace=True)

In [231]:
df.head(4)

Unnamed: 0,UniProt ID,DrugBank ID,Name,Type,UniProt Name,Old Targets
0,P00734,DB00001,Lepirudin,BiotechDrug,Prothrombin,F2
1,P00533,DB00002,Cetuximab,BiotechDrug,Epidermal growth factor receptor,Egfr
2,O75015,DB00002,Cetuximab,BiotechDrug,Low affinity immunoglobulin gamma Fc region re...,FCGR3B
3,P00736,DB00002,Cetuximab,BiotechDrug,Complement C1r subcomponent,C1R


## Update Target Names

In [76]:
# df_targets = df['Targets']
# df_targets[df_targets == 'C1R'].index[0]
# #this gets the index number at every point use this to replace the values June 8 2018

In [77]:
# df_targets.head()

In [78]:
# print(type(target_update))
# few_times = 0
# for target in df_targets:
#     not_there = 0
#     not_there_lst = []
#     if target in target_update.index:
#         print(target)
#         #df.replace(to_replace = target, value = target_update.loc[target]['Updated'])
#         df.replace(to_replace = target, value = 0)

#     else:
#         not_there += 1
#         not_there_lst.append(target)

# print(not_there, not_there_lst)

# #didn't work might need to find a new way to do this replace thing....
# #doesnt look like the replace thing is going to work at all so going to try same idea as the uniprot name

In [232]:
df.set_index('Old Targets', inplace = True)

In [241]:
df.head()

Unnamed: 0,Old Targets,UniProt ID,DrugBank ID,Name,Type,UniProt Name,Targets
0,F2,P00734,DB00001,Lepirudin,BiotechDrug,Prothrombin,F2
1,Egfr,P00533,DB00002,Cetuximab,BiotechDrug,Epidermal growth factor receptor,EGFR
2,FCGR3B,O75015,DB00002,Cetuximab,BiotechDrug,Low affinity immunoglobulin gamma Fc region re...,FCGR3B
3,C1R,P00736,DB00002,Cetuximab,BiotechDrug,Complement C1r subcomponent,C1R
4,C1QA,P02745,DB00002,Cetuximab,BiotechDrug,Complement C1q subcomponent subunit A,C1QA


In [235]:
df = pd.merge(df, target_update, how= 'left', on = 'Old Targets')

In [237]:
df.reset_index(inplace=True)

In [238]:
df = df.rename(index=str, columns = {'Updated Targets':'Targets'})

In [245]:
count = 0
for item in df['Targets']:
    if item == 'NaN':
        count += 1
print(count)

F2
EGFR
FCGR3B
C1R
C1QA
C1QB
C1QC
FCGR3A
C1S
FCGR1A
FCGR2A
FCGR2B
FCGR2C
IL2RA
IL2RB
IL2RG
TNF
TNFRSF1B
FCGR1A
FCGR3A
FCGR2A
FCGR2B
FCGR2C
LTA
FCGR3B
C1S
C1R
C1QA
C1QB
C1QC
F2
GNRHR
IFNAR2
IFNAR1
PLG
FGA
PLAUR
SERPINE1
GHRHR
IFNAR2
IFNAR1
EPOR
PLG
PLAUR
PLAU
PLAT
SERPINE1
SERPINB2
SERPINA5
LRP2
ST14
NID1
LHCGR
GNRHR
PLG
FGA
PLAUR
SERPINE1
EPOR
CALCR
IFNAR1
IFNAR2
CSF3R
ELANE
CSF2RA
IL3RA
CSF2RB
SDC2
PRG2
SCTR
IFNAR1
IFNAR2
TSHR
F10
F9
VWF
PHYH
ASGR2
HSPA5
CALR
CANX
LMAN1
LRP1
MCFD2
IL1R1
FCGR1A
FCGR1B
FCGR2A
FCGR2B
FCGR2C
FCGR3A
FCGR3B
C3
C4A
C4B
C4B_2
C5
PLG
FGA
PLAUR
SERPINE1
INSR
IGF1R
RB1
CTSD
IDE
PCSK2
CPE
PCSK1
NOV
LRP2
IGFBP7
SYTL4
PLG
FGA
PLAUR
SERPINE1
SERPINB2
CLEC3B
KRT8
ANXA2
CALR
CANX
LRP1
FSHR
LHCGR
IFNGR1
IFNGR2
IFNAR1
IFNAR2
AVPR2
AVPR1A
AVPR1B
F10
HPN
TFPI
GGCX
F7
CNTN1
IL11RA
FGFR2
NRP1
FGFR1
FGFR4
FGFR3
HSPG2
GCGR
GLP2R
GLP1R
IL2RB
IL2RA
IL2RG
VAMP2
VAMP1
SYT2
FCER1A
MS4A2
LHCGR
TLR2
INSR
IGF1R
INSR
IGF1R
COL1A1
COL2A1
COL3A1
COL1A2
GNRHR
LHCGR
TNF
FCGR3B
C1R
C1QA
C1

NCF2
NCF4
RAC1
RAC2
SLC6A2
MPG
A2M
TF
ATOX1
CHRM1
CHRM2
CHRM3
XK
TUBA1A
TUBB4B
XK
ACE
XK
ADRB2
ADRB1
HPN
RARA
RXRA
RARB
RXRB
RARG
RXRG
IGFBP3
PSG5
CYP26C1
SLC12A3
XK
SCN10A
SCN5A
CALM1
CACNG1
XK
XK
EGFR
NR1I2
NR1I2
SCN5A
NR1I2
PTGS2
ELN
SLC12A1
ALDH5A1
XK
XK
MPO
XK
GAMT
RNASE1
DLG4
XK
ALDH2
XK
XK
XK
TOP2A
KCNH2
XK
XK
XK
XK
XK
XK
XK
ESR1
SHBG
SLC6A2
SLC6A4
HTR2A
HTR1A
DHX8
ADRA1A
ADRA1D
CHRM1
CHRM2
CHRM3
CHRM4
CHRM5
HTR2C
HTR6
ADRA1B
ADRA2A
ADRA2B
ADRA2C
ADRB1
ADRB2
ADRB3
DRD2
PGRMC1
SIGMAR1
XK
TUBB
TUBA4A
ACE
SLC6A2
SLC6A4
DRD2
DRD1
ADRA2A
ADRA1A
CHRM1
GABRA1
HTR2A
HTR2C
HTR6
HTR7
DRD3
DRD4
DHX8
ADRA1A
ADRA1B
ADRA1D
CHRM1
CHRM2
CHRM3
CHRM4
CHRM5
HTR2B
HTR3A
HTR1A
HTR1B
ADRA2A
ADRA2B
ADRA2C
HRH4
GABRA1
GABRA2
GABRA3
GABRA4
GABRA5
GABRA6
GABRB1
GABRB2
GABRB3
GABRD
GABRE
GABRG1
GABRG2
GABRG3
GABRP
GABRQ
SLC6A3
TYMS
ACHE
BCHE
ALB
GABRA1
GABRA2
GABRA3
GABRA5
GABRG1
GABRG2
GABRG3
GABRB1
GABRB2
GABRB3
GABRD
GABRE
GABRP
GABRR1
GABRR2
GABRR3
NR3C1
XK
AKR1D1
SRD5A2
TYR
XK
CYSLTR1
THPO
XK
MMP12
A

ADRB2
VEGFA
NPPB
GJA1
KCNH2
VCAM1
ADRA1D
ADRA1B
ADRA2C
ADRA2B
ADRA2A
SELE
HIF1A
KCNJ4
XK
XK
TOP2A
ABCC2
ABCC1
SLC22A12
NR1I2
XK
XK
XK
XK
XK
XK
DHX8
DHX15
SLC6A2
SLC6A4
HTR2A
HTR2B
HTR2C
CHRM1
CHRM2
CHRM3
CHRM4
CHRM5
ADRA1A
ADRA1B
ADRA1D
HTR1A
ADRA2A
ADRA2B
ADRA2C
DRD2
HTR6
HRH4
KCNH2
nan
ENPP1
CA1
CA2
CA4
CA7
CA3
XK
DHX8
SLC6A3
XK
XK
XK
XK
XK
XK
CHRM2
CHRM1
HTR2A
HTR2C
SLC6A4
HTR1A
SLC6A2
SLC6A3
ADRA1B
ADRA2A
ADRA1A
KCNH2
XK
XK
XK
SLC6A2
SLC6A4
HTR2A
ADRB2
ADRB1
SMPD1
DHX8
ADRA1A
ADRA1B
ADRA1D
CHRM1
CHRM2
CHRM3
CHRM4
CHRM5
HTR1A
HTR2C
DRD2
ADRA2A
ADRA2B
ADRA2C
XK
GABRA1
KCNJ8
KCNJ11
XK
XK
SLC6A3
SLC6A2
CHRNA3
DHFR
ATP1A1
KCNK3
KCNK9
KCNMA1
GRIN3A
GRIN3B
GRIN2A
GLRA1
RHO
KCNJ6
KCNJ3
MT-ND1
KCNN4
nan
ATP2C1
GABRA1
GNG2
NPSR1
GABRA1
GABRA2
GABRA3
GABRA4
GABRA5
GABRA6
GABRB1
GABRB2
GABRB3
GABRD
GABRE
GABRG1
GABRG2
GABRG3
GABRP
GABRQ
PTGFR
PTGIR
SCN10A
GRIN3A
CHRNA10
HTR3A
SLC6A3
ADRA1A
ADRA1B
ADRA1D
KCNH2
KCNH6
KCNH7
XK
XK
XK
XK
XK
XK
S100A13
XK
XK
TOP2A
PDE3A
CYP51A1
XK
MAOA
MAOB
IKBKB
TX

XK
XK
XK
XK
XK
XK
XK
XK
PYGL
SFTPD
IFNB1
XK
XK
XK
ARG1
XK
XK
AKR1B1
AKR1A1
AKR1B10
XK
LAP3
XK
MAPK10
PNP
PFKFB1
XK
NQO1
MB
PLAU
XK
NQO1
XK
LDHB
XK
DHFR2
XK
ACHE
PAEP
XK
CDK2
CCNA2
XK
XK
XK
XK
XK
XK
XK
XK
XK
UBE2D2
PPIA
PTPN1
XK
MAN1B1
HSP90AB1
HSP90AA1
HSP90B1
XK
SLC25A4
XK
DHFR
XK
CA2
XK
XK
XK
XK
XK
XK
XK
UCK2
XK
SRC
XK
XK
XK
XK
PRSS1
PTPN1
XK
XK
XK
XK
XK
XK
XK
GNPDA1
XK
XK
XK
PYGM
PLA2G2A
PLA2G1B
PLA2G2E
MMP3
XK
XK
XK
XK
XK
XK
TK2
XK
XK
TK1
XK
XK
XK
XK
LIPF
GSTM1
GSTM2
CFB
XK
XK
NANOS2
PRSS1
PRSS2
PRSS1
FA2H
RARG
XK
RAB5A
XK
XK
XK
XK
HMOX1
XK
XK
LCT
PYGM
LCTL
ADA
PLAU
XK
XK
XK
CA2
XK
ADH1B
RHO
PRKACA
PRKACB
CDK7
XK
S100A11
MAPK12
PRKCQ
AURKA
MAPK1
LDHA
BST1
XK
CMAS
GSTA1
XK
XK
CKM
FGFR2
MAN2A1
XK
XK
RPL10L
RPL13A
RPL23
RPL15
RPL19
RPL23A
RSL24D1
RPL26L1
RPL8
RPL37
RPL3
RPL11
SNU13
CPA1
XK
XK
XK
ANXA5
PARP1
ARG1
XK
XK
XK
PLA2G2A
XK
PPP1CA
PPP2R1A
PPP2CA
PPP2R5C
PPP2R2A
CYP2C9
XK
FDPS
XK
XK
MAOB
NR1H4
EPRS
XK
ALDOA
ALDOB
XK
PFKFB1
ALDOB
XK
XK
XK
XK
AKR1B1
PYGM
XK
XK
XK
XK
XK
PRSS1
PLAU

ESR2
XK
HAO1
XK
XK
XK
XK
SPR
PTS
DHFR
XK
XK
HAGH
CELA1
CTSK
NANOS1
GAPDH
MAOB
XK
PAPOLA
XK
XK
XK
HSP90AA1
RNASE1
TPI1
METAP2
XK
CALM1
XK
SRC
ACTA1
ARG1
CA2
XK
CTNNB1
XK
XK
XK
HMOX1
COMT
XK
XK
XK
XK
XK
XK
INSR
ATP2A1
PDXK
PGK1
XK
XK
XK
XK
KIF1A
NOS3
XK
XK
XK
XK
XK
XK
XK
NOS3
XK
TGFBR1
COMTD1
CSNK2A1
ELANE
CELA1
XK
CTH
GLRA1
GPRIN1
SERPINB3
XK
XK
ITGAL
XK
CYCS
FGF2
ANXA5
XK
XK
GPI
XK
XK
XK
XK
XK
LDHA
LDHB
XK
XK
XK
XK
XK
TNNC1
CRP
ELSPBP1
XK
XK
IMPDH1
IMPDH2
XK
CA2
GNPDA1
XK
XK
NANOS2
MAN2A1
PLCD1
IL2
DCPS
XK
FGF2
FGF1
HS3ST3A1
HGF
XK
ANXA5
XK
PGD
TLR2
XK
XK
NOS3
COPG1
XK
XRCC4
XK
ACO2
XK
LYZ
XK
KCTD11
XK
XK
XK
XK
XK
AMY2A
XK
NOS3
XK
CA2
XK
PRSS1
PRSS2
CTRB1
CYCS
CALM1
XK
XK
YARS
XK
XK
MAPK14
ANXA5
FGF2
HS3ST3A1
PTPN1
SUOX
CTSS
XK
XK
DHFR
ADCYAP1
XK
XK
XK
XK
XK
NEU2
XK
XK
ANXA3
XK
XK
XK
KIF11
XK
XK
XK
GRIA2
PTPN1
CA2
LCK
XK
UCK2
XK
XK
CDK2
XK
PRSS1
CYB5A
PARP1
FKBP1A
PYGM
CDK5
CDK1
GSK3B
XK
XK
CTSG
CMA1
MAOA
NOS3
ESR2
ACHE
XK
XK
XK
XK
CKM
XK
XK
XK
XK
XK
XK
XK
XK
XK
XK
XK
XK
IVD
XK
XK
XK
X

HTR2A
HTR2B
HTR2C
ADRA2A
ADRA2B
ADRA2C
PGR
CD86
CD80
HTR1A
ACPP
F12
DHX8
PRSS1
CTRB1
PLG
KLK1
HMGCR
ADRA1A
ADRA2A
ADRA2B
ADRA1B
ADRA1D
ADRA2C
F2
XK
DHX8
HRH3
GNRHR
SLC6A4
SLC6A2
SLC6A3
SLC6A2
SLC6A4
CHRM3
CHRM4
CHRM1
CHRM2
CHRM5
ADRA1A
SLC18A2
ADRA2A
ADRA2B
ADRA2C
CHRM3
AKR1B1
ESR1
ADRA1A
ADRA2A
CACNA1C
CACNA2D1
CACNB2
CACNA1D
CACNA1S
CACNA2D3
PGR
ALB
AKR1B1
SLC18A2
TAAR1
GABRB2
GABRB3
LHCGR
GNRHR
GBA
PTGS1
PTGS2
ADRB1
ADRB2
ADRB3
KLK1
ALB
XK
ADGRD1
ESR1
ESRRA
NR2F1
AHR
CYP1B1
ATP6V1A
ATP6V1A
PTGS2
PTGS1
OPRM1
OPRK1
OPRD1
GPRIN1
GRIN2A
GRIN2B
GRIN2C
GRIN2D
GRIN3A
GRIN3B
TBXA2R
GPRIN1
GRIN2A
GRIN2B
GRIN2C
GRIN2D
GRIN3A
GRIN3B
SLCO1B3
SLCO1B3
PDE4A
CACNA1C
BCL2
IDH3A
IDH3G
TAB1
ARG1
TF
XK
XK
ADRA1A
DHX8
XK
XK
XK
XK
TUBA4A
TUBB1
ACTN1
TRPV1
PHB2
CPS1
NR1H4
NR1I2
GPBAR1
AKR1C2
SERPINC1
VEGFA
TFPI
SELP
NR3C2
NR3C1
APP
GNRHR
SMO
CHRM1
CHRM2
CHRM3
CHRM4
GNRHR
PGR
SSTR2
SSTR5
CA6
OPRM1
OPRK1
PTGS1
PTGS2
CXCR4
DHFR
TYMS
ADRB2
XK
XK
XK
XK
SERPINC1
ITGA4
CXCL12
GNRHR
LTA4H
PTPN1
GGPS1
PIK3CG
PIK3

PDCD1
IL6
PDCD1
SLC5A2
UGCG
XK
XK
GLP1R
GLP1R
LEPR
XK
XK
TOP2A
TACR1
OPRM1
XK
XK
CD19
CD3D
BTK
XK
XK
XK
XK
XK
XK
XK
XK
XK
XK
XK
CNR1
CNR2
GPR12
GLRA1
GLRA1
GLRB
GLRA3
GPR18
GPR55
HTR1A
HTR2A
CHRNA7
OPRD1
OPRM1
PPARG
TRPV1
CACNA1G
CACNA1H
CACNA1I
TRPA1
TRPM8
TRPV2
TRPV3
TRPV4
VDAC1
ALK
PPARA
CRHR1
SLC6A4
HTR3A
HTR7
HTR1B
HTR1A
ADRB1
ACAA1
ESR1
MTNR1A
MTNR1B
CDK4
CDK6
PARP1
PARP2
PARP3
F10
CHRM1
CHRM2
CHRM3
CHRM4
CHRM5
FLT1
KDR
FLT4
FGFR1
FGFR2
FGFR3
FGFR4
KIT
FLT1
KDR
FLT4
FGFR1
FGFR2
FGFR3
FLT3
LCK
LYN
SRC
ADRB2
ADRB2
RYR1
RYR2
ESR1
ESR2
AKR1B1
TRPV3
OPRM1
CACNA1C
CACNA1D
CACNA1F
CACNA1S
CACNB1
CACNB2
CACNB3
CACNB4
KCNMA1
KCNMB1
KCNMB2
KCNMB3
KCNMB4
KCNN4
KCNN1
KCNN2
KCNN3
CACNA1G
CACNA1H
CACNA1I
CACNA1S
NR3C1
HDAC2
RPL3
NNT
GAPDH
IDH3A
OGDH
MDH2
XK
XK
XK
XK
XK
ANXA3
NR3C1
DRD2
DRD1
DRD5
GHR
IGF1R
XK
KIT
S1PR1
EPOR
VWF
F9
F10
F2
HBA1
HBA2
HBB
MB
ALDH3B2
GABRA1
GABRA2
GABRA3
GABRA4
GABRA5
GABRA6
GABRB1
GABRB2
GABRB3
GABRD
GABRE
GABRG1
GABRG2
GABRG3
GABRP
GABRQ
LDHA
LDHB
P2RX4
TRPV1
ADCY

## Get PubChemID and Map to drug name

In [186]:
namesdf = df['Name']
namesdf = namesdf.drop_duplicates()

In [198]:
print(namesdf)

0                                                Lepirudin
1                                                Cetuximab
13                                     Denileukin diftitox
16                                              Etanercept
30                                             Bivalirudin
31                                              Leuprolide
32                                   Peginterferon alfa-2a
34                                               Alteplase
38                                              Sermorelin
39                                      Interferon alfa-n1
41                                        Darbepoetin alfa
42                                               Urokinase
52                                               Goserelin
54                                               Reteplase
58                                          Erythropoietin
59                                       Salmon Calcitonin
60                                      Interferon alfa-

In [203]:
nameslist = namesdf.tolist()
failed_to_get_CID = 0
CID_dict = {}

for name in nameslist:
    name = name.replace(' ','%20')
    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/' + name + '/cids/JSON'
    response = requests.get(url)
    if 'IdentifierList' in response.json().keys():
        CID = response.json()['IdentifierList']['CID'][0]
        name = name.replace('%20', ' ')
        CID_dict[name] = CID
    elif 'PC Compounds' in response.json().keys():
        print(response.json())
    else:
        failed_to_get_CID += 1
        
    if len(CID_dict) % 100 == 0:
        print('this is working I promise')
        name = name.replace('%20',' ')
        print(nameslist.index(name))

print(failed_to_get_CID)
print(len(CID_dict))

    

this is working I promise
0
this is working I promise
1
this is working I promise
2
this is working I promise
3
this is working I promise
189
this is working I promise
302
this is working I promise
407
this is working I promise
519
this is working I promise
636
this is working I promise
745
this is working I promise
849
this is working I promise
952
this is working I promise
1060
this is working I promise
1061
this is working I promise
1179
this is working I promise
1296
this is working I promise
1400
this is working I promise
1522
this is working I promise
1638
this is working I promise
1750
this is working I promise
1874
this is working I promise
1995
this is working I promise
2112
this is working I promise
2227
this is working I promise
2339
this is working I promise
2460
this is working I promise
2575
this is working I promise
2700
this is working I promise
2811
this is working I promise
2933
this is working I promise
3054
this is working I promise
3166
this is working I promise
32

TypeError: unhashable type: 'slice'

In [204]:
print(CID_dict)

{'Bivalirudin': 16129704, 'Leuprolide': 657181, 'Sermorelin': 16129620, 'Goserelin': 5311128, 'Erythropoietin': 11751549, 'Salmon Calcitonin': 16129616, 'Glucagon recombinant': 44278361, 'Insulin Glargine': 118984454, 'Cetrorelix': 25074887, 'Human Serum Albumin': 72941834, 'Eptifibatide': 448812, 'Insulin Pork': 118984380, 'Ibritumomab tiuxetan': 74890578, 'Streptokinase': 9815560, 'Cyclosporine': 5284373, 'Octreotide': 448601, 'Abarelix': 16131215, 'Oxytocin': 439302, 'Pyridoxal Phosphate': 1051, 'Cyanocobalamin': 5311498, 'Tetrahydrofolic acid': 91443, 'Histidine': 6274, 'Ademetionine': 34755, 'Pyruvic acid': 1060, 'L-Phenylalanine': 6140, 'Choline': 305, 'L-Lysine': 5962, 'L-Arginine': 6322, 'Vitamin C': 54670067, 'Spermine': 1103, 'L-Aspartic Acid': 5960, 'Ornithine': 6262, 'Adenosine monophosphate': 6083, 'Alpha-Linolenic Acid': 5280934, 'Serine': 5951, 'L-Tyrosine': 6057, 'Calcitriol': 5280453, 'Cystine': 595, 'Succinic acid': 1110, 'Riboflavin': 493570, 'N-Acetylglucosamine': 2

In [249]:
CIDs = []
for index, row in df.iterrows():
    drugname = row.loc['Name']
    if drugname not in CID_dict:
        df.drop(index, inplace = True)
    else: 
        CIDs.append(CID_dict[drugname])

df.loc[:,'CIDs'] = pd.Series(np.array(CIDs), index=df.index)

In [254]:
df.head(20)

Unnamed: 0,Old Targets,UniProt ID,DrugBank ID,Name,Type,UniProt Name,Targets,CIDs
30,F2,P00734,DB00006,Bivalirudin,SmallMoleculeDrug,Prothrombin,F2,16129704
31,GNRHR,P30968,DB00007,Leuprolide,BiotechDrug,Gonadotropin-releasing hormone receptor,GNRHR,657181
38,GHRHR,Q02643,DB00010,Sermorelin,BiotechDrug,Growth hormone-releasing hormone receptor,GHRHR,16129620
52,LHCGR,P22888,DB00014,Goserelin,SmallMoleculeDrug,Lutropin-choriogonadotropic hormone receptor,LHCGR,5311128
53,GNRHR,P30968,DB00014,Goserelin,SmallMoleculeDrug,Gonadotropin-releasing hormone receptor,GNRHR,5311128
58,EPOR,P19235,DB00016,Erythropoietin,BiotechDrug,Erythropoietin receptor,EPOR,11751549
59,CALCR,P30988,DB00017,Salmon Calcitonin,BiotechDrug,Calcitonin receptor,CALCR,16129616
146,GCGR,P47871,DB00040,Glucagon recombinant,BiotechDrug,Glucagon receptor,GCGR,44278361
147,GLP2R,O95838,DB00040,Glucagon recombinant,BiotechDrug,Glucagon-like peptide 2 receptor,GLP2R,44278361
148,GLP1R,P43220,DB00040,Glucagon recombinant,BiotechDrug,Glucagon-like peptide 1 receptor,GLP1R,44278361


## Filter by LINCS approved Small Molecules

## Make Binary Matrix

In [118]:
grouped_df = df.groupby(['Targets'])['Name'].apply(lambda x: ',,,,,'.join(x.astype(str))).reset_index()

In [119]:
grouped_df.set_index('Targets', inplace=True)
grouped_df = grouped_df.sort_index()

In [120]:
len(df['Targets'].unique())

2598

In [121]:
grouped_matrix = grouped_df.iloc[:,0].str.get_dummies(sep=',,,,,')

In [122]:
grouped_matrix.head()

Unnamed: 0_level_0,'5'-O-(N-(L-Alanyl)-Sulfamoyl)Adenosine,'5'-O-(N-(L-Prolyl)-Sulfamoyl)Adenosine,"((2r,3s,5r)-3-Hydroxy-5-(4-Hydroxy-2-Oxo-3,4-Dihydropyrimidin-1(2h)-Yl)-Tetrahydrofuran-2-Yl)Methyldihydrogen Phosphate","(1'r,2's)-9-(2-Hydroxy-3'-Keto-Cyclopenten-1-Yl)Adenine","(1,10 Phenanthroline)-(Tri-Carbon Monoxide) Rhenium (I)",(1-Benzyl-5-methoxy-2-methyl-1H-indol-3-yl)acetic acid,"(1-HYDROXY-1-PHOSPHONO-2-[1,1';3',1'']TERPHENYL-3-YL-ETHYL)-PHOSPHONIC ACID","(1-HYDROXY-1-PHOSPHONO-2-[1,1';4',1'']TERPHENYL-3-YL-ETHYL)-PHOSPHONIC ACID","(1-HYDROXYDODECANE-1,1-DIYL)BIS(PHOSPHONIC ACID)","(1-HYDROXYHEPTANE-1,1-DIYL)BIS(PHOSPHONIC ACID)",...,"{4-[2,2-BIS(5-METHYL-1,2,4-OXADIAZOL-3-YL)-3-PHENYLPROPYL]PHENYL}SULFAMIC ACID","{4-[2-Acetylamino-2-(3-Carbamoyl-2-Cyclohexylmethoxy-6,7,8,9-Tetrahydro-5h-Benzocyclohepten-5ylcarbamoyl)-Ethyl]-2-Phosphono-Phenyl}-Phosphonic Acid",{4-[2-BENZYL-3-METHOXY-2-(METHOXYCARBONYL)-3-OXOPROPYL]PHENYL}SULFAMIC ACID,{4-[3-(4-acetyl-3-hydroxy-2-propylphenoxy)propoxy]phenoxy}acetic acid,"{4-[3-(6,7-Diethoxy-Quinazolin-4-Ylamino)-Phenyl]-Thiazol-2-Yl}-Methanol","{[(2,2-Dihydroxy-Ethyl)-(2,3,4,5-Tetrahydroxy-6-Phosphonooxy-Hexyl)-Amino]-Methyl}-Phosphonic Acid","{[(2,6-difluorophenyl)carbonyl]amino}-N-(4-fluorophenyl)-1H-pyrazole-3-carboxamide","{[2-(1h-1,2,3-Benzotriazol-1-Yl)-2-(3,4-Difluorophenyl)Propane-1,3-Diyl]Bis[4,1-Phenylene(Difluoromethylene)]}Bis(Phosphonic Acid)","{[5-(5-nitro-2-furyl)-1,3,4-oxadiazol-2-yl]thio}acetic acid",{[7-(Difluoro-Phosphono-Methyl)-Naphthalen-2-Yl]-Difluoro-Methyl}-Phosphonic Acid
Targets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2M,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AADACL2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AADAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AANAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
grouped_matrix.shape

(2610, 5365)

## Save Binary Matrix

In [70]:
grouped_matrix.to_csv('Output/DrugBank_Targets.csv')

In [71]:
filename = 'Output/DrugBank_Targets_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
grouped_matrix.to_csv(filename, sep='\t', compression='gzip')

## Convert Binary Matrix to GMT

In [24]:
drugset_library = []
for index, row in grouped_matrix.iterrows():
    
#     for i, index in enumerate(grouped_matrix.index):
#         progressPercent = ((i+1)/len(grouped_matrix.index))*100
#         sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(grouped_matrix.index)))
#         sys.stdout.flush()
    
    drugset = []
    drugset.append(index)
    drugset.append('')
    for i in range(grouped_matrix.shape[1]):
        if row.iloc[i]==1:
            drugset.append(row.index[i])
    drugset_library.append(drugset)
    
dsl = np.array(drugset_library)

In [25]:
print(dsl)

[list(['A1BG', '', 'Copper', 'Zinc'])
 list(['A2M', '', 'Bacitracin', 'Becaplermin', 'Cisplatin', 'Ocriplasmin', 'Zinc'])
 list(['AADACL2', '', 'GIBBERELLIN A3', 'GIBBERELLIN A4']) ...
 list(['ZAP70', '', 'Staurosporine'])
 list(['ZFY', '', 'Beta-Cyclohexyl-Alanine', 'PCL-016'])
 list(['ZYX', '', 'Artenimol'])]


In [28]:
len(max(dsl, key=len))

139

In [26]:
filename = 'Output/DrugBank_Targets_DrugSetLibrary_%s.gmt'% str(datetime.date.today())[0:7].replace('-', '_')
with open(filename,'w',encoding='utf-8') as f:
    for row in dsl:
        np.savetxt(f, [row], fmt = '%s',delimiter = '\t')   