# DSigDB Computational Drug Signatures

Author: Moshe Silverstein   
Date: 05-07-2018  
Data Source Home: http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/  
Data Source Download: http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/download.html  

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import utility_functions as uf
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

In [2]:
import seaborn as sns
# Switch on seaborn's styling with color_codes enabled.
sns.set(color_codes=True)
# Seed NumPy's global RNG with a fixed integer: the sum of the character codes of "distributions".
np.random.seed(sum(map(ord, "distributions")))

In [3]:
# Import only the two names this cell needs rather than star-importing the
# whole package, so it is clear where Network and clustergrammer_widget come from.
from clustergrammer_widget import Network, clustergrammer_widget

# Network object used by the clustergrammer heat-map cell further down.
net = Network(clustergrammer_widget)

In [4]:
# Re-import utility_functions so that edits to the module take effect without restarting the kernel.
importlib.reload(uf)

<module 'utility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/DSigDB/utility_functions.py'>

# Path to Output Files

In [5]:
# Output directory, relative to the notebook's working directory -- the same
# convention the input file ('Input/DSigDB_All_detailed.txt') already relies on --
# so the notebook is not tied to one user's home directory.
# The trailing slash matters: file names are built below with plain string concatenation.
path = 'Output/'
# Make sure the directory exists before any cell tries to write into it.
os.makedirs(path, exist_ok=True)

# Load Data

In [6]:
# Read the tab-separated DSigDB detailed table from the local Input/ directory.
data = pd.read_csv('Input/DSigDB_All_detailed.txt', sep='\t')

In [7]:
data.head()

Unnamed: 0,Drug,Gene,Type,Source
0,citric acid,ABHD5,IC50=3.545(uM),D1 PubChem
1,citric acid,PLIN5,IC50=3.545(uM),D1 PubChem
2,citric acid,PLIN1,IC50=3.708(uM),D1 PubChem
3,citric acid,ABHD5,IC50=5.632(uM),D1 PubChem
4,citric acid,PLIN5,IC50=5.632(uM),D1 PubChem


In [8]:
data.shape

(688782, 4)

# Get Perturbagen Signatures

In [9]:
# Keep only the rows whose Source mentions one of the selected DSigDB collections.
# A single vectorised substring match replaces the original row-by-row loop,
# which performed several scalar .loc lookups for every row of the table.
source_pattern = '|'.join(['BOSS', 'CTD', 'TTD', 'D4 PubChem', 'D4 ChEMBL'])

# na=False rejects rows whose Source is missing or not a string, which is what
# the original explicit str-type check did. The match is case-sensitive.
keep = data['Source'].str.contains(source_pattern, regex=True, na=False)

# Index labels of the retained rows (name kept from the original cell).
lst = data.index[keep].tolist()

data = data.loc[keep, :]

In [10]:
data.head()

Unnamed: 0,Drug,Gene,Type,Source
236173,Gatifloxacin,SLC9A6,"Text_Mining(PMID21764484,PMID21481984,PMID2380...",BOSS
236174,Gatifloxacin,KSR1,"Text_Mining(PMID21722249,PMID20499531,PMID2197...",BOSS
236175,Gatifloxacin,INS,"Text_Mining(PMID20068268,PMID20674567,PMID2320...",BOSS
236176,Gatifloxacin,ESR1,Potency-Replicate_1=26.6032(uM),D4 PubChem
236177,Gatifloxacin,HSD17B10,Potency=31622.8(nM),D4 ChEMBL


In [11]:
data.shape

(452605, 4)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [12]:
# Move the gene symbols into the index (rebinding instead of mutating in place).
data = data.set_index('Gene')

In [13]:
# NOTE(review): the return value is discarded and data has fewer rows afterwards
# (452605 -> 449549 in the recorded run), so uf.mapgenesymbols presumably edits
# `data` in place using its gene-symbol index -- confirm against utility_functions.
uf.mapgenesymbols(data)

Progeres: 99%  452554 Out of 452605   

In [14]:
data.shape

(449549, 3)

# Drop Duplicates

In [15]:
# Turn the gene-symbol index back into an ordinary 'Gene' column.
data = data.reset_index()

In [16]:
# Keep a single row (the first encountered) for every (Gene, Drug) pair.
pair_columns = ['Gene', 'Drug']
data = data.drop_duplicates(subset=pair_columns)

In [17]:
data.shape

(293199, 4)

# Create Binary Matrix

In [18]:
# Build the gene-by-drug matrix from the (Gene, Drug) pairs.
# Judging by the displayed result, rows are genes, columns are drugs and cells are 0/1.
binary_matrix = uf.createBinaryMatrix(data[['Gene', 'Drug']])

Progeres: 100%  18215 Out of 18215   

In [19]:
# Upper-case every column label so drug names that differ only by case end up identical.
lst = list(map(str.upper, binary_matrix.columns))
binary_matrix.columns = lst

In [20]:
# Merge columns that share a label: transpose, group the rows by label,
# take the element-wise maximum within each group, then transpose back.
binary_matrix = binary_matrix.T.groupby(level=0).max().T

In [21]:
binary_matrix.head()

Unnamed: 0,( -)-CANDOXATRILAT,"(+)-(5ALPHA,7ALPHA,8BETA)-N-METHYL-N-[7-(1-PYRROLIDINYL)-1-OXASPIRO[4.5]DEC-8-YL]-BENZENEACETAMIDE",(+)-5-DEOXYADEENOPHORINE,(+)-BUTACLAMOL,(+)-EHNA,(+)-MK 801 MALEATE,(+)-MK-801 HYDROGEN MALEATE,(+)-NIGULDIPINE,(+)-QUISQUALIC ACID,(+/-)-CGP-12177A,...,[7-METHOXY-8-[2-METHYL-3-(3-METHYLBUT-2-ENYL)OXIRAN-2-YL]-2-OXASPIRO[2.5]OCTAN-6-YL] N-(2-CHLOROACETYL)CARBAMATE,[HYDROXY(3-PHENYLPROPYL)AMINO]METHANOL,[LYS8(ALEXA 488) ]PVA,[LYS8(ALEXA 546) ]PVA,"[N40,PRO1,TYR4,NLE 14]BB","[N40,PRO1,TYR4]BB",[R]-MESOPRAM,[TYR4]BOMBESIN,"[[4-(AMINOMETHYL)PHENYL]AMINO]OXO-ACETIC ACID,",{5-FLUORO-2-METHYL-1-[4-(METHYLSULFINYL)BENZYLIDENE]-1H-INDEN-3-YL}ACETIC ACID
BAIAP2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR51A7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DES,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MIR124-2HG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OPN4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
binary_matrix.shape

(18215, 15819)

# Save Binary Matrix

In [23]:
# Tag the file with the current year and month, e.g. '2018_05'.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = path + 'dsigdb_computational_drug_signatures_binary_matrix_%s.tsv.zip' % date_tag
# The name ends in .zip, so write an actual zip archive. The previous
# compression='gzip' stored gzip bytes under a .zip name, which readers that
# infer the codec from the extension (pandas' default) cannot open.
binary_matrix.to_csv(filename, sep='\t', compression='zip')

# Create Gene Set Library

In [24]:
# Name handed to uf.createUpGeneSetLib in the next cell.
name = 'dsigdb_computational_drug_signatures_gene_set'

In [25]:
# Called for its side effects only (the return value is discarded).
# NOTE(review): presumably writes a gene-set library under `path` using `name` -- confirm in utility_functions.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  15819 Out of 15819   

# Create Attribute Library

In [26]:
# Name handed to uf.createUpAttributeSetLib in the next cell.
name = 'dsigdb_computational_drug_signatures_attribute_set'

In [27]:
# Called for its side effects only (the return value is discarded).
# NOTE(review): presumably writes an attribute-set library under `path` using `name` -- confirm in utility_functions.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  18215 Out of 18215   

# Create Gene Similarity Matrix

In [28]:
# Similarity matrix computed with the 'jaccard' option on the gene-by-drug matrix.
# Judging by the displayed result it is gene-by-gene, i.e. rows are compared pairwise.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [29]:
gene_similarity_matix.head()

Unnamed: 0,BAIAP2,OR51A7,DES,MIR124-2HG,OPN4,MCOLN2,UBE3C,TRIQK,CLC,MTFR2,...,CATSPERD,POM121L9P,C1D,GPR182,UCK2,IFI27,RELA,FAM98A,SPRR2E,MIR616
,,,,,,,,,,,,,,,,,,,,,
BAIAP2,1.0,0.1,0.034483,0.0,0.0,0.0,0.055556,0.0,0.0,0.142857,...,0.0,0.1,0.0,0.014337,0.115385,0.074074,0.019608,0.0,0.0,0.1
OR51A7,0.1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.002463,0.0,0.0,0.0
DES,0.034483,0.0,1.0,0.0,0.0,0.0,0.0,0.017241,0.0,0.0,...,0.019231,0.0,0.0,0.106164,0.014706,0.045455,0.031674,0.0,0.0,0.02
MIR124-2HG,0.0,0.0,0.0,1.0,0.0,0.125,0.111111,0.111111,0.0,0.071429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.002463,0.1,0.0,0.0
OPN4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Gene Similarity Matrix

In [30]:
# Tag the file with the current year and month, e.g. '2018_05'.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = path + 'dsigdb_computational_drug_signatures_gene_similarity_matix_%s.tsv.zip' % date_tag
# The name ends in .zip, so write an actual zip archive. The previous
# compression='gzip' stored gzip bytes under a .zip name, which readers that
# infer the codec from the extension (pandas' default) cannot open.
gene_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# Create Attribute Similarity Matrix

In [31]:
# Same 'jaccard' computation on the transposed matrix, so drugs (attributes) are compared instead of genes.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [32]:
attribute_similarity_matix.head()

Unnamed: 0,( -)-CANDOXATRILAT,"(+)-(5ALPHA,7ALPHA,8BETA)-N-METHYL-N-[7-(1-PYRROLIDINYL)-1-OXASPIRO[4.5]DEC-8-YL]-BENZENEACETAMIDE",(+)-5-DEOXYADEENOPHORINE,(+)-BUTACLAMOL,(+)-EHNA,(+)-MK 801 MALEATE,(+)-MK-801 HYDROGEN MALEATE,(+)-NIGULDIPINE,(+)-QUISQUALIC ACID,(+/-)-CGP-12177A,...,[7-METHOXY-8-[2-METHYL-3-(3-METHYLBUT-2-ENYL)OXIRAN-2-YL]-2-OXASPIRO[2.5]OCTAN-6-YL] N-(2-CHLOROACETYL)CARBAMATE,[HYDROXY(3-PHENYLPROPYL)AMINO]METHANOL,[LYS8(ALEXA 488) ]PVA,[LYS8(ALEXA 546) ]PVA,"[N40,PRO1,TYR4,NLE 14]BB","[N40,PRO1,TYR4]BB",[R]-MESOPRAM,[TYR4]BOMBESIN,"[[4-(AMINOMETHYL)PHENYL]AMINO]OXO-ACETIC ACID,",{5-FLUORO-2-METHYL-1-[4-(METHYLSULFINYL)BENZYLIDENE]-1H-INDEN-3-YL}ACETIC ACID
,,,,,,,,,,,,,,,,,,,,,
( -)-CANDOXATRILAT,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(+)-(5ALPHA,7ALPHA,8BETA)-N-METHYL-N-[7-(1-PYRROLIDINYL)-1-OXASPIRO[4.5]DEC-8-YL]-BENZENEACETAMIDE",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(+)-5-DEOXYADEENOPHORINE,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(+)-BUTACLAMOL,0.0,0.0,0.0,1.0,0.0,0.0,0.017241,0.066667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(+)-EHNA,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Attribute Similarity Matrix

In [33]:
# Tag the file with the current year and month, e.g. '2018_05'.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = path + 'dsigdb_computational_drug_signatures_attribute_similarity_matix_%s.tsv.zip' % date_tag
# The name ends in .zip, so write an actual zip archive. The previous
# compression='gzip' stored gzip bytes under a .zip name, which readers that
# infer the codec from the extension (pandas' default) cannot open.
attribute_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# HeatMap (clustergrammer) of Similarity Matrix

In [34]:
# net.load_df(attribute_similarity_matix.iloc[0:100,0:100].copy())
# # net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()

# Create Gene List

In [35]:
# Build the gene table for the matrix rows; the displayed result has the columns GeneSym and GeneID.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  18215 Out of 18215   

In [36]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,BAIAP2,10458.0
1,OR51A7,119687.0
2,DES,1674.0
3,MIR124-2HG,100130000.0
4,OPN4,94233.0


In [37]:
gene_list.shape

(18215, 2)

# Save Gene List

In [38]:
# Tag the file with the current year and month, e.g. '2018_05'.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = path + 'dsigdb_computational_drug_signatures_gene_list_%s.tsv.zip' % date_tag
# The name ends in .zip, so write an actual zip archive. The previous
# compression='gzip' stored gzip bytes under a .zip name, which readers that
# infer the codec from the extension (pandas' default) cannot open.
# index=False is right here: the index is a plain row counter and the gene
# symbols live in the GeneSym column.
gene_list.to_csv(filename, sep='\t', index=False, compression='zip')

# Create Attribute List

### Create Metadata Table

In [39]:
data.head()

Unnamed: 0,Gene,Drug,Type,Source
0,SLC9A6,Gatifloxacin,"Text_Mining(PMID21764484,PMID21481984,PMID2380...",BOSS
1,KSR1,Gatifloxacin,"Text_Mining(PMID21722249,PMID20499531,PMID2197...",BOSS
2,INS,Gatifloxacin,"Text_Mining(PMID20068268,PMID20674567,PMID2320...",BOSS
3,ESR1,Gatifloxacin,Potency-Replicate_1=26.6032(uM),D4 PubChem
4,HSD17B10,Gatifloxacin,Potency=31622.8(nM),D4 ChEMBL


In [40]:
# Independent copy of the three metadata columns, so the edits below leave `data` untouched.
metadata_columns = ['Drug', 'Type', 'Source']
temp = data.loc[:, metadata_columns].copy()

In [41]:
# Upper-case the drug names so they line up with the upper-cased binary_matrix column labels.
lst = list(map(str.upper, temp['Drug']))
temp['Drug'] = lst

In [42]:
# Index the metadata rows by drug name (rebinding instead of mutating in place).
temp = temp.set_index('Drug')

In [43]:
# One ':'-joined source label per drug.
# The distinct sources are sorted before joining so the label is identical on
# every run; joining a bare set made the order depend on string hashing, which
# changes between interpreter sessions.
# A single groupby also replaces two .loc lookups per drug on a non-unique index.
sources_by_drug = temp.groupby(level=0)['Source'].agg(
    lambda values: ':'.join(sorted(set(values)))
)

metaData = pd.DataFrame(columns=['Source'], index=binary_matrix.columns)

# Align the labels with the column order of binary_matrix.
source = sources_by_drug.reindex(metaData.index).tolist()

metaData['Source'] = source

metaData.head()

Unnamed: 0,Source
,
( -)-CANDOXATRILAT,CTD
"(+)-(5ALPHA,7ALPHA,8BETA)-N-METHYL-N-[7-(1-PYRROLIDINYL)-1-OXASPIRO[4.5]DEC-8-YL]-BENZENEACETAMIDE",D4 PubChem
(+)-5-DEOXYADEENOPHORINE,TTD
(+)-BUTACLAMOL,D4 PubChem:TTD
(+)-EHNA,TTD


In [44]:
# Build the attribute (drug) table from the matrix columns and the per-drug metadata.
# The displayed result is indexed by 'Attributes' and has a single 'Source' column.
attribute_list = uf.createAttributeList(binary_matrix, metaData)

Progeres: 100%  15819 Out of 15819   

In [45]:
attribute_list.head()

Unnamed: 0_level_0,Source
Attributes,Unnamed: 1_level_1
( -)-CANDOXATRILAT,CTD
"(+)-(5ALPHA,7ALPHA,8BETA)-N-METHYL-N-[7-(1-PYRROLIDINYL)-1-OXASPIRO[4.5]DEC-8-YL]-BENZENEACETAMIDE",D4 PubChem
(+)-5-DEOXYADEENOPHORINE,TTD
(+)-BUTACLAMOL,D4 PubChem:TTD
(+)-EHNA,TTD


In [46]:
attribute_list.shape

(15819, 1)

# Save Attribute List

In [47]:
# Tag the file with the current year and month, e.g. '2018_05'.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = path + 'dsigdb_computational_drug_signatures_attribute_list_%s.tsv.zip' % date_tag
# The drug names are the index of attribute_list ('Attributes'); 'Source' is its
# only column. The index must therefore be written -- the previous index=False
# saved a file of sources with no drug names attached.
# The name ends in .zip, so write an actual zip archive rather than gzip bytes
# under a .zip name.
attribute_list.to_csv(filename, sep='\t', index=True, compression='zip')

# Create Gene-Attribute Edge List

In [48]:
# Name handed to uf.createGeneAttributeEdgeList in the next cell.
name = 'dsigdb_computational_drug_signatures_gene_attribute_edge_list'

In [49]:
# Called for its side effects only (the return value is discarded); it reports the number of associations found.
# NOTE(review): presumably writes the gene-attribute edge list under `path` using `name` -- confirm in utility_functions.
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, path, name)

Progeres: 100%  15819 Out of 15819   

 The number of statisticaly relevent gene-attribute associations is: 293199
