## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os

## Load Data

#### Load DrugBank Data

In [2]:
df = pd.read_csv('Input/drugbank_targets.csv')

In [3]:
df.head()

Unnamed: 0,DrugBank ID,Name,Type,UniProt ID,UniProt Name
0,DB00001,Lepirudin,BiotechDrug,P00734,Prothrombin
1,DB00002,Cetuximab,BiotechDrug,P00533,Epidermal growth factor receptor
2,DB00002,Cetuximab,BiotechDrug,O75015,Low affinity immunoglobulin gamma Fc region re...
3,DB00002,Cetuximab,BiotechDrug,P00736,Complement C1r subcomponent
4,DB00002,Cetuximab,BiotechDrug,P02745,Complement C1q subcomponent subunit A


In [4]:
df.shape

(18655, 5)

In [5]:
df['Type'].unique()

array(['BiotechDrug', 'SmallMoleculeDrug'], dtype=object)

In [6]:
len(df['Name'].unique())

7137

#### Load UniProt ID table

In [22]:
uniprot = pd.read_table('Input/gene_to_uniprot.txt')

In [23]:
uniprot.head()

Unnamed: 0,Approved Symbol,UniProt ID
0,A1BG,P04217
1,A1BG-AS1,
2,A1CF,Q9NQ94
3,A2M,P01023
4,A2M-AS1,


In [24]:
uniprot.shape

(41375, 2)

In [25]:
uniprot = uniprot[pd.notnull(uniprot['UniProt ID'])]

#### Load Targets Mapping File

In [48]:
target_update = pd.read_table('Input/mappingFile_2017.txt', names = ['Old Targets','Updated Targets'], engine='python')

In [52]:
target_update.head()

Unnamed: 0_level_0,Updated
Targets,Unnamed: 1_level_1
A1BG,A1BG
A1BG-AS1,A1BG-AS1
NCRNA00181,A1BG-AS1
A1BGAS,A1BG-AS1
A1BG-AS,A1BG-AS1


In [51]:
target_update.set_index('Old Targets',inplace = True)

## Map UniProt ID to Gene

In [33]:
df.set_index('UniProt ID', inplace = True)
uniprot.set_index('UniProt ID',inplace=True)

In [34]:
uniprot.head()

Unnamed: 0_level_0,Approved Symbol
UniProt ID,Unnamed: 1_level_1
P04217,A1BG
Q9NQ94,A1CF
P01023,A2M
A8K2U0,A2ML1
U3KPV4,A3GALT2


In [35]:
df = pd.merge(df, uniprot, how= 'left', on = 'UniProt ID')

In [36]:
df = df.rename(index=str, columns = {'Approved Symbol':'Old Targets'})

In [37]:
df.reset_index(inplace=True)

In [43]:
df.head(4)

Unnamed: 0,UniProt ID,DrugBank ID,Name,Type,UniProt Name,Targets
0,P00734,DB00001,Lepirudin,BiotechDrug,Prothrombin,F2
1,P00533,DB00002,Cetuximab,BiotechDrug,Epidermal growth factor receptor,Egfr
2,O75015,DB00002,Cetuximab,BiotechDrug,Low affinity immunoglobulin gamma Fc region re...,FCGR3B
3,P00736,DB00002,Cetuximab,BiotechDrug,Complement C1r subcomponent,C1R


## Update Target Names

In [40]:
df_targets = df['Targets']
df_targets[df_targets == 'C1R'].index[0]
#this gets the index number at every point use this to replace the values June 8 2018

3

In [156]:
df_targets.head()

0        F2
1      Egfr
2    FCGR3B
3       C1R
4      C1QA
Name: Targets, dtype: object

In [149]:
print(type(target_update))
few_times = 0
for target in df_targets:
    not_there = 0
    not_there_lst = []
    if target in target_update.index:
        print(target)
        #df.replace(to_replace = target, value = target_update.loc[target]['Updated'])
        df.replace(to_replace = target, value = 0)

    else:
        not_there += 1
        not_there_lst.append(target)

print(not_there, not_there_lst)

#didn't work might need to find a new way to do this replace thing....
#doesnt look like the replace thing is going to work at all so going to try same idea as the uniprot name

<class 'pandas.core.frame.DataFrame'>
F2
Egfr
FCGR3B
C1R
C1QA
C1QB
C1QC
FCGR3A
C1S
FCGR1A
FCGR2A
FCGR2B
FCGR2C
IL2RA
IL2RB
IL2RG
TNF
TNFRSF1B
FCGR1A
FCGR3A
FCGR2A
FCGR2B
FCGR2C
LTA
FCGR3B
C1S
C1R
C1QA
C1QB
C1QC
F2
GNRHR
IFNAR2
IFNAR1
PLG
FGA
PLAUR
SERPINE1
GHRHR
IFNAR2
IFNAR1
EPOR
PLG
PLAUR
PLAU
PLAT
SERPINE1
SERPINB2
SERPINA5
LRP2
ST14
NID1
LHCGR
GNRHR
PLG
FGA
PLAUR
SERPINE1
EPOR
CALCR
IFNAR1
IFNAR2
CSF3R
ELANE
CSF2RA
IL3RA
CSF2RB
SDC2
PRG2
SCTR
IFNAR1
IFNAR2
TSHR
F10
F9
VWF
PHYH
ASGR2
HSPA5
CALR
CANX
LMAN1
LRP1
MCFD2
IL1R1
FCGR1A
FCGR1B
FCGR2A
FCGR2B
FCGR2C
FCGR3A
FCGR3B
C3
C4A
C4B
C4B_2
C5
PLG
FGA
PLAUR
SERPINE1
INSR
IGF1R
RB1
CTSD
IDE
PCSK2
CPE
PCSK1
NOV
LRP2
IGFBP7
SYTL4
PLG
FGA
PLAUR
SERPINE1
SERPINB2
CLEC3B
KRT8
ANXA2
CALR
CANX
LRP1
FSHR
LHCGR
IFNGR1
IFNGR2
IFNAR1
IFNAR2
AVPR2
AVPR1A
AVPR1B
F10
HPN
TFPI
GGCX
F7
F3
IL11RA
FGFR2
NRP1
FGFR1
FGFR4
FGFR3
HSPG2
GCGR
GLP2R
GLP1R
IL2RB
IL2RA
IL2RG
VAMP2
VAMP1
SYT2
FCER1A
MS4A2
LHCGR
TLR2
INSR
IGF1R
INSR
IGF1R
COL1A1
COL2A1
COL3A1
COL1A2

TCN1
AMN
CUBN
MMAB
MMACHC
ADORA1
ADORA2A
PDE4B
RYR1
PDE1A
PDE1B
PDE1C
PDE10A
PDE4A
PDE4B
PDE4C
PDE4D
PDE7B
PDE2A
PDE3A
PDE3B
PDE5A
PDE6C
PDE11A
PDE7A
PDE8A
PDE8B
PDE9A
PDE6A
PDE6B
PRKDC
PIK3CD
PIK3CA
PIK3CB
ITPR1
ITPR2
ITPR3
ATM
CHRNA10
nan
CHRM1
CHRM2
CHRM3
PDE5A
PDE6G
PDE6H
KCNH2
KCNK2
KCNJ12
DHFR
nan
HEXB
SLC18A2
SLC18A1
BIRC5
nan
nan
PADI4
P2RY12
CHRM1
RARG
RARB
RXRG
RXRB
RARA
RXRA
ADRA1A
ADRA1B
ADRA1D
REN
ATP4A
SLC12A1
SLC6A4
HRH1
ADRA1A
CHRM1
HTR2C
HTR1D
HTR1B
HTR1F
HTR1A
HTR1E
HTR2B
HTR7
ADRA2A
ADRA2B
ADRA2C
KCNJ1
ADRB1
ADRB2
ADRB3
nan
nan
TOP2A
CHRM1
nan
ADRB1
ADRB2
KCNJ11
KCNJ1
ABCC8
NR3C1
nan
SLC6A2
HMGCR
ITGAL
HDAC2
GABRA1
GABRA2
GABRA3
GABRA4
GABRA5
GABRA6
GABRB1
GABRB2
GABRB3
GABRD
GABRE
GABRG1
GABRG2
GABRG3
GABRP
GABRQ
GLRA1
GLRB
KCNN4
KCNA1
ATP2C1
GRIA1
nan
CACNA1A
GABRA1
GABRA2
GABRA3
GABRA4
GABRA5
GABRA6
GABRB1
GABRB2
GABRB3
GABRG1
GABRG2
GABRG3
GABRD
GABRE
GABRP
GABRR1
GABRR2
GABRR3
GABRQ
nan
GABRA1
GABRA2
GABRA3
GABRA4
GABRA5
GABRA6
GABRB1
GABRB2
GABRB3
GABRD
GABRE
G

KeyboardInterrupt: 

In [46]:
df.set_index('Targets', inplace = True)

In [56]:
df.head()

Unnamed: 0,Targets,UniProt ID,DrugBank ID,Name,Type,UniProt Name,Updated
0,F2,P00734,DB00001,Lepirudin,BiotechDrug,Prothrombin,F2
1,Egfr,P00533,DB00002,Cetuximab,BiotechDrug,Epidermal growth factor receptor,EGFR
2,FCGR3B,O75015,DB00002,Cetuximab,BiotechDrug,Low affinity immunoglobulin gamma Fc region re...,FCGR3B
3,C1R,P00736,DB00002,Cetuximab,BiotechDrug,Complement C1r subcomponent,C1R
4,C1QA,P02745,DB00002,Cetuximab,BiotechDrug,Complement C1q subcomponent subunit A,C1QA


In [53]:
df = pd.merge(df, target_update, how= 'left', on = 'Old Targets')

In [55]:
df.reset_index(inplace=True)

## Make Binary Matrix

In [18]:
grouped_df = df.groupby(['Targets'])['Name'].apply(lambda x: ',,,,,'.join(x.astype(str))).reset_index()

In [19]:
grouped_df.set_index('Targets', inplace=True)
grouped_df = grouped_df.sort_index()

In [20]:
len(df['Targets'].unique())

2611

In [21]:
grouped_matrix = grouped_df.iloc[:,0].str.get_dummies(sep=',,,,,')

In [22]:
grouped_matrix.head()

Unnamed: 0_level_0,'5'-O-(N-(L-Alanyl)-Sulfamoyl)Adenosine,'5'-O-(N-(L-Prolyl)-Sulfamoyl)Adenosine,"(1'r,2's)-9-(2-Hydroxy-3'-Keto-Cyclopenten-1-Yl)Adenine",(1-Benzyl-5-methoxy-2-methyl-1H-indol-3-yl)acetic acid,"(1-HYDROXYDODECANE-1,1-DIYL)BIS(PHOSPHONIC ACID)","(1-HYDROXYHEPTANE-1,1-DIYL)BIS(PHOSPHONIC ACID)","(1-HYDROXYNONANE-1,1-DIYL)BIS(PHOSPHONIC ACID)",(1-Methyl-1h-Imidazol-2-Yl)-(3-Methyl-4-{3-[(Pyridin-3-Ylmethyl)-Amino]-Propoxy}-Benzofuran-2-Yl)-Methanone,(1-Tert-Butyl-5-Hydroxy-1h-Pyrazol-4-Yl)-(6-Methanesulfonyl-4'-Methoxy-2-Methyl-Biphenyl-3-Yl)-Methanone,"(10ALPHA,13ALPHA,14BETA,17ALPHA)-17-HYDROXYANDROST-4-EN-3-ONE",...,"{4-[(CARBOXYMETHOXY)CARBONYL]-3,3-DIOXIDO-1-OXONAPHTHO[1,2-D]ISOTHIAZOL-2(1H)-YL}ACETIC ACID","{4-[2,2-BIS(5-METHYL-1,2,4-OXADIAZOL-3-YL)-3-PHENYLPROPYL]PHENYL}SULFAMIC ACID","{4-[2-Acetylamino-2-(3-Carbamoyl-2-Cyclohexylmethoxy-6,7,8,9-Tetrahydro-5h-Benzocyclohepten-5ylcarbamoyl)-Ethyl]-2-Phosphono-Phenyl}-Phosphonic Acid",{4-[2-BENZYL-3-METHOXY-2-(METHOXYCARBONYL)-3-OXOPROPYL]PHENYL}SULFAMIC ACID,{4-[3-(4-acetyl-3-hydroxy-2-propylphenoxy)propoxy]phenoxy}acetic acid,"{4-[3-(6,7-Diethoxy-Quinazolin-4-Ylamino)-Phenyl]-Thiazol-2-Yl}-Methanol","{[(2,6-difluorophenyl)carbonyl]amino}-N-(4-fluorophenyl)-1H-pyrazole-3-carboxamide","{[2-(1h-1,2,3-Benzotriazol-1-Yl)-2-(3,4-Difluorophenyl)Propane-1,3-Diyl]Bis[4,1-Phenylene(Difluoromethylene)]}Bis(Phosphonic Acid)","{[5-(5-nitro-2-furyl)-1,3,4-oxadiazol-2-yl]thio}acetic acid",{[7-(Difluoro-Phosphono-Methyl)-Naphthalen-2-Yl]-Difluoro-Methyl}-Phosphonic Acid
Targets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2M,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AADACL2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AADAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AANAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
grouped_matrix.shape

(2610, 5365)

## Save Binary Matrix

In [70]:
grouped_matrix.to_csv('Output/DrugBank_Targets.csv')

In [71]:
filename = 'Output/DrugBank_Targets_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
grouped_matrix.to_csv(filename, sep='\t', compression='gzip')

## Convert Binary Matrix to GMT

In [24]:
drugset_library = []
for index, row in grouped_matrix.iterrows():
    
#     for i, index in enumerate(grouped_matrix.index):
#         progressPercent = ((i+1)/len(grouped_matrix.index))*100
#         sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(grouped_matrix.index)))
#         sys.stdout.flush()
    
    drugset = []
    drugset.append(index)
    drugset.append('')
    for i in range(grouped_matrix.shape[1]):
        if row.iloc[i]==1:
            drugset.append(row.index[i])
    drugset_library.append(drugset)
    
dsl = np.array(drugset_library)

In [25]:
print(dsl)

[list(['A1BG', '', 'Copper', 'Zinc'])
 list(['A2M', '', 'Bacitracin', 'Becaplermin', 'Cisplatin', 'Ocriplasmin', 'Zinc'])
 list(['AADACL2', '', 'GIBBERELLIN A3', 'GIBBERELLIN A4']) ...
 list(['ZAP70', '', 'Staurosporine'])
 list(['ZFY', '', 'Beta-Cyclohexyl-Alanine', 'PCL-016'])
 list(['ZYX', '', 'Artenimol'])]


In [28]:
len(max(dsl, key=len))

139

In [26]:
filename = 'Output/DrugBank_Targets_DrugSetLibrary_%s.gmt'% str(datetime.date.today())[0:7].replace('-', '_')
with open(filename,'w',encoding='utf-8') as f:
    for row in dsl:
        np.savetxt(f, [row], fmt = '%s',delimiter = '\t')   