## Import Libraries

In [2]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os

## Load Data

#### Load DrugBank Data

In [3]:
# Read the DrugBank drug/target table from the Input directory.
df = pd.read_csv('Input/drugbank_targets.csv')

In [4]:
df.head()

Unnamed: 0,DrugBank ID,Name,Type,UniProt ID,UniProt Name
0,DB00001,Lepirudin,BiotechDrug,P00734,Prothrombin
1,DB00002,Cetuximab,BiotechDrug,P00533,Epidermal growth factor receptor
2,DB00002,Cetuximab,BiotechDrug,O75015,Low affinity immunoglobulin gamma Fc region re...
3,DB00002,Cetuximab,BiotechDrug,P00736,Complement C1r subcomponent
4,DB00002,Cetuximab,BiotechDrug,P02745,Complement C1q subcomponent subunit A


In [5]:
df.shape

(18655, 5)

In [6]:
df['Type'].unique()

array(['BiotechDrug', 'SmallMoleculeDrug'], dtype=object)

In [7]:
# Number of distinct drug names (a missing name, if any, counts as one value).
df['Name'].nunique(dropna=False)

7137

#### Load UniProt ID table

In [8]:
# Read the gene symbol -> UniProt ID lookup; read_table expects tab-delimited text by default.
uniprot = pd.read_table('Input/gene_to_uniprot.txt')

In [9]:
uniprot.head()

Unnamed: 0,Approved Symbol,UniProt ID
0,A1BG,P04217
1,A1BG-AS1,
2,A1CF,Q9NQ94
3,A2M,P01023
4,A2M-AS1,


In [10]:
uniprot.shape

(41375, 2)

In [11]:
# Discard gene symbols that have no UniProt ID; they cannot be matched to a drug target.
uniprot = uniprot.dropna(subset=['UniProt ID'])

## Map UniProt ID to Gene

In [12]:
# Key both tables by UniProt ID so the join below can match on it.
df = df.set_index('UniProt ID')
uniprot = uniprot.set_index('UniProt ID')

In [13]:
uniprot.head()

Unnamed: 0_level_0,Approved Symbol
UniProt ID,Unnamed: 1_level_1
P04217,A1BG
Q9NQ94,A1CF
P01023,A2M
A8K2U0,A2ML1
U3KPV4,A3GALT2


In [14]:
# Left join: every drug/target row is kept, gaining a gene symbol where one is known.
df = df.merge(uniprot, on='UniProt ID', how='left')

## Make Binary Matrix

In [15]:
# Rename the gene-symbol column to 'Targets'; index=str passes every index label through str().
df = df.rename(index=str, columns = {'Approved Symbol':'Targets'})

In [16]:
# Move the index back into an ordinary column and restore a default integer index.
df = df.reset_index()

In [17]:
df.head()

Unnamed: 0,UniProt ID,DrugBank ID,Name,Type,UniProt Name,Targets
0,P00734,DB00001,Lepirudin,BiotechDrug,Prothrombin,F2
1,P00533,DB00002,Cetuximab,BiotechDrug,Epidermal growth factor receptor,EGFR
2,O75015,DB00002,Cetuximab,BiotechDrug,Low affinity immunoglobulin gamma Fc region re...,FCGR3B
3,P00736,DB00002,Cetuximab,BiotechDrug,Complement C1r subcomponent,C1R
4,P02745,DB00002,Cetuximab,BiotechDrug,Complement C1q subcomponent subunit A,C1QA


In [18]:
# One row per target, holding all of its drug names joined into a single string.
# Drug names themselves contain commas, hence the unusual ',,,,,' separator.
grouped_df = (
    df.groupby(['Targets'])['Name']
      .apply(lambda names: ',,,,,'.join(map(str, names)))
      .reset_index()
)

In [19]:
# Index by target symbol and order the rows alphabetically by that symbol.
grouped_df = grouped_df.set_index('Targets').sort_index()

In [20]:
# Number of distinct target symbols; unmapped targets (NaN) are counted as one value.
df['Targets'].nunique(dropna=False)

2611

In [21]:
# Split each joined string back into drug names and one-hot encode them:
# rows are targets, columns are drugs, 1 marks a drug that hits the target.
grouped_matrix = grouped_df['Name'].str.get_dummies(sep=',,,,,')

In [22]:
grouped_matrix.head()

Unnamed: 0_level_0,'5'-O-(N-(L-Alanyl)-Sulfamoyl)Adenosine,'5'-O-(N-(L-Prolyl)-Sulfamoyl)Adenosine,"(1'r,2's)-9-(2-Hydroxy-3'-Keto-Cyclopenten-1-Yl)Adenine",(1-Benzyl-5-methoxy-2-methyl-1H-indol-3-yl)acetic acid,"(1-HYDROXYDODECANE-1,1-DIYL)BIS(PHOSPHONIC ACID)","(1-HYDROXYHEPTANE-1,1-DIYL)BIS(PHOSPHONIC ACID)","(1-HYDROXYNONANE-1,1-DIYL)BIS(PHOSPHONIC ACID)",(1-Methyl-1h-Imidazol-2-Yl)-(3-Methyl-4-{3-[(Pyridin-3-Ylmethyl)-Amino]-Propoxy}-Benzofuran-2-Yl)-Methanone,(1-Tert-Butyl-5-Hydroxy-1h-Pyrazol-4-Yl)-(6-Methanesulfonyl-4'-Methoxy-2-Methyl-Biphenyl-3-Yl)-Methanone,"(10ALPHA,13ALPHA,14BETA,17ALPHA)-17-HYDROXYANDROST-4-EN-3-ONE",...,"{4-[(CARBOXYMETHOXY)CARBONYL]-3,3-DIOXIDO-1-OXONAPHTHO[1,2-D]ISOTHIAZOL-2(1H)-YL}ACETIC ACID","{4-[2,2-BIS(5-METHYL-1,2,4-OXADIAZOL-3-YL)-3-PHENYLPROPYL]PHENYL}SULFAMIC ACID","{4-[2-Acetylamino-2-(3-Carbamoyl-2-Cyclohexylmethoxy-6,7,8,9-Tetrahydro-5h-Benzocyclohepten-5ylcarbamoyl)-Ethyl]-2-Phosphono-Phenyl}-Phosphonic Acid",{4-[2-BENZYL-3-METHOXY-2-(METHOXYCARBONYL)-3-OXOPROPYL]PHENYL}SULFAMIC ACID,{4-[3-(4-acetyl-3-hydroxy-2-propylphenoxy)propoxy]phenoxy}acetic acid,"{4-[3-(6,7-Diethoxy-Quinazolin-4-Ylamino)-Phenyl]-Thiazol-2-Yl}-Methanol","{[(2,6-difluorophenyl)carbonyl]amino}-N-(4-fluorophenyl)-1H-pyrazole-3-carboxamide","{[2-(1h-1,2,3-Benzotriazol-1-Yl)-2-(3,4-Difluorophenyl)Propane-1,3-Diyl]Bis[4,1-Phenylene(Difluoromethylene)]}Bis(Phosphonic Acid)","{[5-(5-nitro-2-furyl)-1,3,4-oxadiazol-2-yl]thio}acetic acid",{[7-(Difluoro-Phosphono-Methyl)-Naphthalen-2-Yl]-Difluoro-Methyl}-Phosphonic Acid
Targets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2M,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AADACL2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AADAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AANAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
grouped_matrix.shape

(2610, 5365)

## Save Binary Matrix

In [70]:
# to_csv does not create missing folders, so make sure Output/ exists on a fresh checkout.
os.makedirs('Output', exist_ok=True)
grouped_matrix.to_csv('Output/DrugBank_Targets.csv')

In [71]:
# Stamp the archive name with the current year and month (YYYY_MM).
filename = 'Output/DrugBank_Targets_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')

# to_csv does not create missing folders.
os.makedirs('Output', exist_ok=True)

# The compression method has to agree with the .zip extension: writing gzip
# bytes under a .zip name yields a file that zip tools refuse to open.
grouped_matrix.to_csv(filename, sep='\t', compression='zip')

## Convert Binary Matrix to GMT

In [24]:
# Build one GMT record per target: [target, '', drug, drug, ...].
# The second field (the GMT description column) is intentionally left blank.
drug_names = grouped_matrix.columns.values
is_member = grouped_matrix.values == 1  # boolean mask, rows = targets, columns = drugs

drugset_library = [
    [target, ''] + drug_names[mask].tolist()
    for target, mask in zip(grouped_matrix.index, is_member)
]

# Records have different lengths, so request an object array explicitly:
# recent NumPy releases raise ValueError when asked to infer a dtype for
# ragged nested lists.
dsl = np.array(drugset_library, dtype=object)

In [25]:
print(dsl)

[list(['A1BG', '', 'Copper', 'Zinc'])
 list(['A2M', '', 'Bacitracin', 'Becaplermin', 'Cisplatin', 'Ocriplasmin', 'Zinc'])
 list(['AADACL2', '', 'GIBBERELLIN A3', 'GIBBERELLIN A4']) ...
 list(['ZAP70', '', 'Staurosporine'])
 list(['ZFY', '', 'Beta-Cyclohexyl-Alanine', 'PCL-016'])
 list(['ZYX', '', 'Artenimol'])]


In [28]:
# Length of the longest record (the target name and the blank field are included).
max(len(drugset) for drugset in dsl)

139

In [26]:
# Stamp the library name with the current year and month (YYYY_MM).
filename = 'Output/DrugBank_Targets_DrugSetLibrary_%s.gmt' % datetime.date.today().strftime('%Y_%m')

# open() does not create missing folders.
os.makedirs('Output', exist_ok=True)

# GMT is plain tab-separated text with one record per line, so each record is
# joined and written directly rather than routed through np.savetxt row by row.
with open(filename, 'w', encoding='utf-8') as f:
    for row in dsl:
        f.write('\t'.join(map(str, row)) + '\n')