# Drugbank

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: http://www.drugbank.ca/

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import my_functions as mf
%matplotlib inline

In [2]:
importlib.reload(mf)

<module 'my_functions' from '/Users/moshesilverstein/Documents/Harmonizome/Drugbank/my_functions.py'>

# Load Data

In [74]:
df = pd.read_csv('Input/all.csv')

In [75]:
df.head()

Unnamed: 0,ID,Name,Gene Name,GenBank Protein ID,GenBank Gene ID,UniProt ID,Uniprot Title,PDB ID,GeneCard ID,GenAtlas ID,HGNC ID,Species,Drug IDs
0,P45059,Peptidoglycan synthase FtsI,ftsI,1574687.0,L42023,P45059,FTSI_HAEIN,,,,,Haemophilus influenzae (strain ATCC 51907 / DS...,DB00303
1,P19113,Histidine decarboxylase,HDC,32109.0,X54297,P19113,DCHS_HUMAN,4E1O,,HDC,HGNC:4855,Human,DB00114; DB00117
2,Q9UI32,"Glutaminase liver isoform, mitochondrial",GLS2,6650606.0,AF110330,Q9UI32,GLSL_HUMAN,4BQM,,GLS2,HGNC:29570,Human,DB00142
3,P00488,Coagulation factor XIII A chain,F13A1,182309.0,M22001,P00488,F13A_HUMAN,1EVU; 1EX0; 1F13; 1FIE; 1GGT; 1GGU; 1GGY; 1QRK...,,F13A1,HGNC:3531,Human,DB01839; DB02340; DB11311; DB13151
4,P35228,"Nitric oxide synthase, inducible",NOS2,292242.0,L09210,P35228,NOS2_HUMAN,1NSI; 2LL6; 2NSI; 3E7G; 3EJ8; 3HR4; 4CX7; 4NOS,,NOS2A,HGNC:7873,Human,DB00125; DB00155; DB01110; DB01234; DB01686; D...


In [77]:
df.shape

(4335, 13)

# Load Drug Meta Data

In [81]:
drug_meta = pd.read_csv('Input/drug links.csv', index_col=0)

In [82]:
drug_meta.head()

Unnamed: 0_level_0,Name,CAS Number,Drug Type,KEGG Compound ID,KEGG Drug ID,PubChem Compound ID,PubChem Substance ID,ChEBI ID,PharmGKB ID,HET ID,...,GenBank ID,DPD ID,RxList Link,Pdrhealth Link,Wikipedia ID,Drugs.com Link,NDC ID,ChemSpider ID,BindingDB ID,TTD ID
DrugBank ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DB00001,Lepirudin,138068-37-8,BiotechDrug,,D06880,,,,PA450195,,...,,11916.0,http://www.rxlist.com/cgi/generic/lepirudin.htm,,Lepirudin,http://www.drugs.com/cdi/lepirudin.html,,,,DAP000541
DB00002,Cetuximab,205923-56-4,BiotechDrug,,D03455,,,,PA10040,,...,J00228,13175.0,http://www.rxlist.com/cgi/generic3/erbitux.htm,,Cetuximab,http://www.drugs.com/cdi/cetuximab.html,,,,DNC000788
DB00003,Dornase alfa,143831-71-4,BiotechDrug,,,,,,PA10318,,...,M55983,650.0,http://www.rxlist.com/cgi/generic/pulmozyme.htm,,Dornase_alfa,http://www.drugs.com/cdi/dornase-alfa.html,,,,DAP000981
DB00004,Denileukin diftitox,173146-27-5,BiotechDrug,,,,,,PA164750594,,...,V01536,,http://www.rxlist.com/cgi/generic2/denileukin.htm,,Denileukin_diftitox,http://www.drugs.com/cdi/denileukin-diftitox.html,,,,DAP001098
DB00005,Etanercept,185243-69-0,BiotechDrug,C07897,D00742,,,,PA449515,,...,M32315,12032.0,http://www.rxlist.com/cgi/generic/etanercept.htm,,Etanercept,http://www.drugs.com/cdi/etanercept.html,,,,DNC000605


In [83]:
drug_meta.shape

(9588, 22)

# Get Relevant Data

In [89]:
# get only relevetn spcies

human = df[df['Species'] == 'Human'].copy()
mouse = df[df['Species'] == 'Mouse'].copy()
rat = df[df['Species'] == 'Rat'].copy()

df = pd.concat([human, mouse])
df = pd.concat([df, rat])

In [92]:
df = df[['Gene Name', 'Drug IDs']]

In [93]:
df.shape

(2510, 2)

In [94]:
df.head()

Unnamed: 0,Gene Name,Drug IDs
1,HDC,DB00114; DB00117
2,GLS2,DB00142
3,F13A1,DB01839; DB02340; DB11311; DB13151
4,NOS2,DB00125; DB00155; DB01110; DB01234; DB01686; D...
5,HSD17B2,DB00157


In [96]:
df_interactions = pd.DataFrame()

for i, index in enumerate(df.index):
    
    progressPercent = ((i+1)/len(df.index))*100

    sys.stdout.write("Progeres: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(df.index)))
    sys.stdout.flush()
    
    if type(df.ix[index, 'Gene Name']) != float:
        lst2 = df.ix[index, 'Drug IDs'].split(';')
        lst1 = [df.ix[index, 'Gene Name'].split('(')[0]]*(len(lst2))
        temp = pd.DataFrame()
        temp['Drug'] = lst2
        temp['Gene Name'] = lst1
        df_interactions = pd.concat([df_interactions, temp]) 

Progeres: 100%  2510 Out of 2510   

In [97]:
df_interactions.head()

Unnamed: 0,Drug,Gene Name
0,DB00114,HDC
1,DB00117,HDC
0,DB00142,GLS2
0,DB01839,F13A1
1,DB02340,F13A1


In [98]:
df_interactions.shape

(12292, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols¶

In [99]:
df_interactions.set_index('Gene Name', inplace=True)

In [100]:
mf.mapgenesymbols(df_interactions)

Progeres: 100%  12292 Out of 12292   

# Drop Duplicates

In [101]:
df_interactions.reset_index(inplace=True)

In [102]:
df_interactions.drop_duplicates(inplace=True)

In [103]:
df_interactions.shape

(12104, 2)

# Create Binary Matrix

In [104]:
binary_matrix = mf.createBinaryMatix(df_interactions)

Progeres: 100%  2265 Out of 2265   

In [105]:
binary_matrix.head()

Unnamed: 0,DB06995,DB04275,DB07231,DB05119,DB02336,DB07720,DB03286,DB00599,DB02378,DB04444,...,DB07001,DB05469,DB08687,DB08258,DB04209,DB08499,DB00013,DB08870,DB05227,DB03711
GSTM1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SELP,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GHSR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
WARS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ASIC3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [106]:
binary_matrix.shape

(2265, 5421)

# Save Binary Matrix

In [107]:
filename = '~/./Documents/Harmonizome/Drugbank/Output/drugbank_binary_matrix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [108]:
path = '/Users/moshesilverstein/Documents/Harmonizome/Drugbank/Output/'

In [109]:
name = 'drugbank_gene_set'

In [110]:
mf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  5421 Out of 5421   

# Create Attribute Library

In [111]:
path = '/Users/moshesilverstein/Documents/Harmonizome/Drugbank/Output/'

In [112]:
name = 'drugbank_attribute_set'

In [113]:
mf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  2265 Out of 2265   

# Create Gene Similarity Matrix

In [114]:
gene_similarity_matix = mf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [115]:
gene_similarity_matix.head()

Unnamed: 0,GSTM1,SELP,GHSR,WARS,ASIC3,SMPD1,ADA,LAYN,SLC5A6,SLC25A15,...,SCNN1G,ASL,GABRB3,LDHAL6B,OPLAH,NCOA2,RDH8,FLT4,PHGDH,MT-CYB
GSTM1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SELP,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GHSR,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
WARS,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ASIC3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Gene Similarity Matrix

In [116]:
filename = '~/./Documents/Harmonizome/Drugbank/Output/drugbank_gene_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity matrix

In [117]:
attribute_similarity_matix = mf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [118]:
attribute_similarity_matix.head()

Unnamed: 0,DB06995,DB04275,DB07231,DB05119,DB02336,DB07720,DB03286,DB00599,DB02378,DB04444,...,DB07001,DB05469,DB08687,DB08258,DB04209,DB08499,DB00013,DB08870,DB05227,DB03711
DB06995,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB04275,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB07231,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB05119,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB02336,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Attribute Similarity Matrix

In [119]:
filename = '~/./Documents/Harmonizome/Drugbank/Output/drugbank_attribute_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [120]:
gene_list = mf.createGeneList(binary_matrix)

Progeres: 100%  2265 Out of 2265   

In [121]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,GSTM1,2944
1,SELP,6403
2,GHSR,2693
3,WARS,7453
4,ASIC3,9311


In [122]:
gene_list.shape

(2265, 2)

# Save Gene List

In [123]:
filename = '~/./Documents/Harmonizome/Drugbank/Output/drugbank_gene_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [124]:
attribute_list = mf.createAttributeList(binary_matrix)

In [125]:
attribute_list.head()

Unnamed: 0,Attributes
0,DB06995
1,DB04275
2,DB07231
3,DB05119
4,DB02336


In [126]:
attribute_list.shape

(5421, 1)

# Save Attribute List

In [127]:
filename = '~/./Documents/Harmonizome/Drugbank/Output/drugbank_attribute_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [128]:
path = '/Users/moshesilverstein/Documents/Harmonizome/Drugbank/Output/'

In [129]:
name = 'drugbank_gene_attribute_edge_list'

In [130]:
mf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  5421 Out of 5421   

 The number of statisticaly relevent gene-attribute associations is: 12104
