# DSigDB Perturbagen Signatures

Author: Moshe Silverstein   
Date: 05-07-2018  
Data Source Home: http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/  
Data Source Download: http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/download.html  

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import utility_functions as uf
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

In [2]:
import seaborn as sns

# Apply seaborn's default theme and remap matplotlib's single-letter colour codes to its palette.
sns.set(color_codes=True)

# Seed NumPy's global RNG with a fixed value (the sum of the character codes of the
# word below) so that anything drawing from np.random is repeatable across runs.
np.random.seed(sum(ord(ch) for ch in "distributions"))

In [3]:
# Import only the two names this notebook uses instead of `import *`, so the
# origin of `Network` and `clustergrammer_widget` is explicit and nothing else
# from the package can silently shadow notebook variables.
from clustergrammer_widget import Network, clustergrammer_widget

# Network object bound to the widget class; only the (currently disabled)
# heatmap cell further down uses it.
net = Network(clustergrammer_widget)

In [4]:
# Re-import utility_functions so that edits made to the module on disk are
# picked up without restarting the kernel.
importlib.reload(uf)

<module 'utility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/DSigDB/utility_functions.py'>

# Path to Output Files

In [5]:
# Output folder for every file this notebook writes. It is resolved against the
# current working directory (the same directory the relative 'Input/...' read
# below depends on) instead of a user-specific absolute path, so the notebook
# runs unchanged on another machine. The trailing separator is kept because
# later cells build file names with `path + '<name>'`.
path = os.path.join(os.getcwd(), 'Output', '')

# Make sure the folder exists before any cell tries to write into it.
os.makedirs(path, exist_ok=True)

# Load Data

In [6]:
# Load the tab-separated DSigDB "detailed" table; the path is relative to the
# notebook's working directory.
data = pd.read_csv(
    filepath_or_buffer='Input/DSigDB_All_detailed.txt',
    delimiter='\t',
)

In [7]:
# Preview the first rows of the raw table (columns: Drug, Gene, Type, Source).
data.head()

Unnamed: 0,Drug,Gene,Type,Source
0,citric acid,ABHD5,IC50=3.545(uM),D1 PubChem
1,citric acid,PLIN5,IC50=3.545(uM),D1 PubChem
2,citric acid,PLIN1,IC50=3.708(uM),D1 PubChem
3,citric acid,ABHD5,IC50=5.632(uM),D1 PubChem
4,citric acid,PLIN5,IC50=5.632(uM),D1 PubChem


In [8]:
# (rows, columns) of the raw table before any filtering.
data.shape

(688782, 4)

# Get Perturbagen Signatures

In [11]:
# Keep only the rows whose 'Source' text mentions CMAP.
# The mask is computed in one vectorised pass rather than a Python loop with
# per-row `.loc` lookups over the whole table.
#   regex=False -> plain substring match, same as the `in` operator
#   na=False    -> missing / non-string Source values count as "no match"
is_cmap = data['Source'].str.contains('CMAP', regex=False, na=False)

# .copy() gives an independent frame so later in-place edits never touch the raw table.
data = data.loc[is_cmap, :].copy()

In [12]:
# Preview the table after restricting it to CMAP-sourced rows.
data.head()

Unnamed: 0,Drug,Gene,Type,Source
45966,(+)-chelidonine,ABHD2,0.0000108=UP(HL60),CMAP
45967,(+)-chelidonine,ACSL1,0.0000108=UP(HL60),CMAP
45968,(+)-chelidonine,ADD3,0.0000108=UP(HL60),CMAP
45969,(+)-chelidonine,ADORA3,0.0000108=UP(HL60),CMAP
45970,(+)-chelidonine,AFF1,0.0000108=UP(HL60),CMAP


In [13]:
# Row/column count after the CMAP filter.
data.shape

(190206, 4)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [14]:
# Move the gene symbols into the index; the symbol-mapping helper in the next
# cell is handed this Gene-indexed frame.
data = data.set_index('Gene')

In [15]:
# Called for its side effect: `data` is not reassigned here, yet the following
# cell reports fewer rows than before the call.
# NOTE(review): presumably the helper rewrites the Gene index to approved
# symbols in place and drops rows it cannot map - confirm in utility_functions.
uf.mapgenesymbols(data)

Progeres: 100%  190206 Out of 190206   

In [16]:
# Shape after symbol mapping; 'Gene' is the index at this point, so it is not
# counted among the columns.
data.shape

(187604, 3)

# Drop Duplicates

In [17]:
# Turn the Gene index back into an ordinary column so it can be used as a
# de-duplication key.
data = data.reset_index()

In [18]:
# Keep a single row per (Gene, Drug) pair; the first occurrence wins.
data = data.drop_duplicates(subset=['Gene', 'Drug'])

In [19]:
# Shape after removing duplicate (Gene, Drug) pairs.
data.shape

(158860, 4)

# Create Binary Matrix

In [20]:
# Build the gene x drug matrix from the (Gene, Drug) pairs only.
# NOTE(review): judging by the preview further down, rows are genes, columns
# are drugs and cells are 0/1 membership flags - confirm in utility_functions.
binary_matrix = uf.createBinaryMatrix(data[['Gene', 'Drug']])

Progeres: 100%  10933 Out of 10933   

In [21]:
# Normalise the drug names used as column labels to upper case.
binary_matrix.columns = list(map(str.upper, binary_matrix.columns))

In [22]:
# Upper-casing can make two column labels identical. Transpose so the labels
# become the row index, collapse rows that share a label by taking the
# element-wise maximum, then transpose back to genes x drugs. groupby also
# sorts the labels.
binary_matrix = binary_matrix.T.groupby(level=0).max().T

In [23]:
# Preview the merged matrix (genes as rows, upper-cased drug names as columns).
binary_matrix.head()

Unnamed: 0,(+)-CHELIDONINE,(+)-ISOPRENALINE,(-)-ATENOLOL,(-)-ISOPRENALINE,(-)-MK-801,0173570-0000,0175029-0000,0179445-0000,0198306-0000,0225151-0000,...,YOHIMBIC ACID,YOHIMBINE,ZALCITABINE,ZAPRINAST,ZARDAVERINE,ZIDOVUDINE,ZIMELDINE,ZOMEPIRAC,ZOXAZOLAMINE,ZUCLOPENTHIXOL
ZNF407,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
SV2B,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZNF16,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LRCH3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PALMD,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# (number of genes, number of distinct drug names) in the merged matrix.
binary_matrix.shape

(10933, 1163)

# Save Binary Matrix

In [25]:
# File name is stamped with the current year and month (YYYY_MM).
# NOTE(review): the name ends in ".zip" while the payload is gzip-compressed;
# confirm that downstream readers expect this before changing either.
filename = path + 'dsigdb_pertubagen_signatures_binary_matrix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
binary_matrix.to_csv(path_or_buf=filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [26]:
# Base name handed to the gene-set-library helper in the next cell.
name = 'dsigdb_pertubagen_signatures_gene_set'

In [27]:
# Called for its side effect only (the return value is discarded).
# NOTE(review): presumably writes the gene-set library under `path` using
# `name` as the file stem - confirm in utility_functions.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  1163 Out of 1163   

# Create Attribute Library

In [28]:
# Base name handed to the attribute-set-library helper in the next cell.
name = 'dsigdb_pertubagen_signatures_attribute_set'

In [29]:
# Called for its side effect only (the return value is discarded).
# NOTE(review): presumably writes the attribute-set library under `path` using
# `name` as the file stem - confirm in utility_functions.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  10933 Out of 10933   

# Create Gene Similarity Matrix

In [30]:
# Gene-by-gene similarity computed from the rows of the binary matrix;
# 'jaccard' is the metric name passed through to the helper.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [31]:
# Preview: gene symbols label both axes and the diagonal is 1.0.
gene_similarity_matix.head()

Unnamed: 0,ZNF407,SV2B,ZNF16,LRCH3,PALMD,ATP6V1G2-DDX39B,ETHE1,NINL,NBPF7,ZNF266,...,TREX1,MXD3,ELOVL6,KCNB2,IL2,AGBL3,MAGEA1,GRK6,SCAMP3,TSFM
,,,,,,,,,,,,,,,,,,,,,
ZNF407,1.0,0.0,0.0,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.173077,0.346154,0.384615,0.2,0.0,0.125,0.0625,0.133333
SV2B,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030303,0.0,...,0.0,0.0,0.0,0.0,0.047619,0.0,1.0,0.0,0.0,0.052632
ZNF16,0.0,0.0,1.0,0.333333,0.208333,0.108108,0.083333,0.0,0.194444,0.318182,...,0.05,0.0,0.018182,0.0,0.0,0.0,0.0,0.0,0.0,0.074074
LRCH3,0.0,0.0,0.333333,1.0,0.178571,0.125,0.0,0.047619,0.236842,0.32,...,0.136364,0.0625,0.016949,0.0,0.0,0.0,0.0,0.0,0.0,0.03125
PALMD,0.030303,0.0,0.208333,0.178571,1.0,0.086957,0.1,0.0,0.106383,0.1875,...,0.034483,0.0,0.031746,0.054054,0.052632,0.047619,0.0,0.025641,0.05,0.027027


# Save Gene Similarity Matrix

In [32]:
# File name is stamped with the current year and month (YYYY_MM).
# NOTE(review): the name ends in ".zip" while the payload is gzip-compressed;
# confirm that downstream readers expect this before changing either.
filename = path + 'dsigdb_pertubagen_signatures_gene_similarity_matix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(path_or_buf=filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [33]:
# Same helper as for genes, but fed the transposed matrix so that the drugs
# (attributes) are the items being compared.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [34]:
# Preview: upper-cased drug names label both axes and the diagonal is 1.0.
attribute_similarity_matix.head()

Unnamed: 0,(+)-CHELIDONINE,(+)-ISOPRENALINE,(-)-ATENOLOL,(-)-ISOPRENALINE,(-)-MK-801,0173570-0000,0175029-0000,0179445-0000,0198306-0000,0225151-0000,...,YOHIMBIC ACID,YOHIMBINE,ZALCITABINE,ZAPRINAST,ZARDAVERINE,ZIDOVUDINE,ZIMELDINE,ZOMEPIRAC,ZOXAZOLAMINE,ZUCLOPENTHIXOL
,,,,,,,,,,,,,,,,,,,,,
(+)-CHELIDONINE,1.0,0.100917,0.0,0.127883,0.006173,0.027027,0.017338,0.021327,0.011236,0.005917,...,0.003717,0.0,0.018311,0.0,0.137931,0.008403,0.010526,0.0,0.0,0.021521
(+)-ISOPRENALINE,0.100917,1.0,0.0,0.161616,0.011905,0.102941,0.008912,0.005795,0.0,0.010989,...,0.005236,0.0,0.007344,0.0,0.2,0.006211,0.00885,0.0,0.0,0.0144
(-)-ATENOLOL,0.0,0.0,1.0,0.002525,0.0,0.0,0.00273,0.0,0.026316,0.0,...,0.057377,0.0,0.003805,0.0,0.005319,0.178571,0.02,0.055556,0.0,0.016014
(-)-ISOPRENALINE,0.127883,0.161616,0.002525,1.0,0.0,0.044289,0.037551,0.037723,0.007557,0.0,...,0.00616,0.0,0.034043,0.0,0.244344,0.004367,0.004878,0.0,0.002591,0.026432
(-)-MK-801,0.006173,0.011905,0.0,0.0,1.0,0.0,0.000273,0.0,0.0,0.153846,...,0.0,0.0,0.0,0.0,0.0,0.024096,0.0,0.0,0.0,0.0


# Save Attribute Similarity Matrix

In [35]:
# File name is stamped with the current year and month (YYYY_MM).
# NOTE(review): the name ends in ".zip" while the payload is gzip-compressed;
# confirm that downstream readers expect this before changing either.
filename = path + 'dsigdb_pertubagen_signatures_attribute_similarity_matix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(path_or_buf=filename, sep='\t', compression='gzip')

# Heatmap (clustergrammer) of Similarity Matrix

In [36]:
# Disabled cell: when uncommented it loads the first 100 x 100 corner of the
# attribute similarity matrix into the clustergrammer network, clusters it and
# renders the interactive widget.
# net.load_df(attribute_similarity_matix.iloc[0:100,0:100].copy())
# # net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()

# Create Gene List

In [37]:
# One row per gene of the binary matrix; the preview below shows the columns
# GeneSym and GeneID.
# NOTE(review): how GeneID is looked up is decided inside utility_functions.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  10933 Out of 10933   

In [38]:
# Preview the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,ZNF407,55628
1,SV2B,9899
2,ZNF16,7564
3,LRCH3,84859
4,PALMD,54873


In [39]:
# The row count should equal the number of rows (genes) in binary_matrix.
gene_list.shape

(10933, 2)

# Save Gene List

In [40]:
# File name is stamped with the current year and month (YYYY_MM).
# The positional index is omitted: the gene symbols are an ordinary column.
# NOTE(review): the name ends in ".zip" while the payload is gzip-compressed;
# confirm that downstream readers expect this before changing either.
filename = path + 'dsigdb_pertubagen_signatures_gene_list_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
gene_list.to_csv(path_or_buf=filename, sep='\t', compression='gzip', index=False)

# Create Attribute List

### Create Metadata Table

In [41]:
# Re-inspect the de-duplicated long table that the metadata is derived from.
data.head()

Unnamed: 0,Gene,Drug,Type,Source
0,ABHD2,(+)-chelidonine,0.0000108=UP(HL60),CMAP
1,ACSL1,(+)-chelidonine,0.0000108=UP(HL60),CMAP
2,ADD3,(+)-chelidonine,0.0000108=UP(HL60),CMAP
3,ADORA3,(+)-chelidonine,0.0000108=UP(HL60),CMAP
4,AFF1,(+)-chelidonine,0.0000108=UP(HL60),CMAP


In [42]:
# Independent copy of the drug-level columns, so the edits in the following
# cells leave `data` untouched.
temp = data.loc[:, ['Drug', 'Type', 'Source']].copy()

In [43]:
# Upper-case the drug names so they match the column labels of binary_matrix.
temp['Drug'] = list(map(str.upper, temp['Drug']))

In [44]:
# Index by drug name; a drug appears once per gene, so the index has repeated labels.
temp = temp.set_index('Drug')

In [45]:
# Build the attribute metadata table: one row per column of binary_matrix,
# holding the source(s) recorded for that drug.
#
# `temp` is indexed by upper-cased drug name with one row per gene, so each
# drug's Source values are collapsed to their distinct values and joined with
# ':'. Sorting the distinct values first keeps the joined string identical
# from run to run; iterating a bare set of strings gives no such guarantee.
source_by_drug = (
    temp.groupby(level=0)['Source']
        .agg(lambda values: ':'.join(sorted(set(values))))
)

# Selecting with .loc puts the result in binary_matrix's column order and
# fails on any column label that has no row in `temp`.
metaData = pd.DataFrame(
    {'Source': source_by_drug.loc[binary_matrix.columns].tolist()},
    index=binary_matrix.columns,
)

metaData.head()

Unnamed: 0,Source
,
(+)-CHELIDONINE,CMAP
(+)-ISOPRENALINE,CMAP
(-)-ATENOLOL,CMAP
(-)-ISOPRENALINE,CMAP
(-)-MK-801,CMAP


In [46]:
# Combine the matrix's attributes with their metadata; the preview below shows
# a frame indexed by 'Attributes' with a single 'Source' column.
attribute_list = uf.createAttributeList(binary_matrix, metaData)

Progeres: 100%  1163 Out of 1163   

In [47]:
# Preview the attribute list; note that the attribute names are the index, not a column.
attribute_list.head()

Unnamed: 0_level_0,Source
Attributes,Unnamed: 1_level_1
(+)-CHELIDONINE,CMAP
(+)-ISOPRENALINE,CMAP
(-)-ATENOLOL,CMAP
(-)-ISOPRENALINE,CMAP
(-)-MK-801,CMAP


In [48]:
# The row count should equal the number of columns (drugs) in binary_matrix.
attribute_list.shape

(1163, 1)

# Save Attribute List

In [49]:
# File name is stamped with the current year and month (YYYY_MM).
# NOTE(review): the name ends in ".zip" while the payload is gzip-compressed;
# confirm that downstream readers expect this before changing either.
filename = path + 'dsigdb_pertubagen_signatures_attribute_list_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')

# The attribute names are held in the frame's index ('Attributes'), not in a
# column, so the index has to be written. With index=False the file would hold
# only the 'Source' column and no attribute names.
attribute_list.to_csv(filename, sep='\t', index=True, compression='gzip')

# Create Gene-Attribute Edge List

In [50]:
# Base name handed to the gene-attribute edge-list helper in the next cell.
name = 'dsigdb_pertubagen_signatures_attribute_edge_list'

In [51]:
# Called for its side effect only (the return value is discarded); the helper
# prints the number of gene-attribute associations when it finishes.
# NOTE(review): presumably writes the edge list under `path` using `name` as
# the file stem - confirm in utility_functions.
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, path, name)

Progeres: 100%  1163 Out of 1163   

 The number of statisticaly relevent gene-attribute associations is: 158860
