# DSigDB Kinase Inhibitors

Author: Moshe Silverstein   
Date: 05-07-2018  
Data Source Home: http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/  
Data Source Download: http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/download.html  

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import utility_functions as uf
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

In [2]:
import seaborn as sns
# Apply seaborn's plot styling; color_codes=True is forwarded to sns.set.
sns.set(color_codes=True)
# Seed NumPy's global random generator with a fixed integer (the sum of the
# character codes of "distributions") so random draws repeat between runs.
np.random.seed(sum(map(ord, "distributions")))

In [3]:
# Import only the two names this notebook uses instead of a wildcard import,
# so it is clear where `Network` and `clustergrammer_widget` come from and no
# other notebook variable can be silently shadowed.
from clustergrammer_widget import Network, clustergrammer_widget

# Network object used by the clustergrammer heat map section further down.
net = Network(clustergrammer_widget)

In [4]:
importlib.reload(uf)

<module 'utility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/DSigDB/utility_functions.py'>

# Path to Output Files

In [5]:
# Directory that receives every output file of this notebook.
# Kept relative to the working directory (the same convention the input file
# 'Input/DSigDB_All_detailed.txt' uses) so the notebook is not tied to one
# user's home directory. The trailing separator is required because file
# names are appended with plain string concatenation (path + '...').
path = os.path.join('Output', '')

# Create the directory up front so the later writes do not fail on a fresh checkout.
os.makedirs(path, exist_ok=True)

# Load Data

In [6]:
# Read the tab-separated DSigDB detail table into a DataFrame.
# The path is relative, so the notebook has to be run from the directory that
# contains the Input/ folder.
data = pd.read_csv('Input/DSigDB_All_detailed.txt', sep='\t')

In [7]:
data.head()

Unnamed: 0,Drug,Gene,Type,Source
0,citric acid,ABHD5,IC50=3.545(uM),D1 PubChem
1,citric acid,PLIN5,IC50=3.545(uM),D1 PubChem
2,citric acid,PLIN1,IC50=3.708(uM),D1 PubChem
3,citric acid,ABHD5,IC50=5.632(uM),D1 PubChem
4,citric acid,PLIN5,IC50=5.632(uM),D1 PubChem


In [8]:
data.shape

(688782, 4)

# Get Kinase Inhibitor Data

In [9]:
# Keep only the records whose 'Source' names one of the kinase-inhibitor
# collections. The match is a case-sensitive substring test, so a label such
# as 'FDA' also matches longer source strings that contain it.
kinase_sources = ['FDA', 'Kinome Scan', 'LINCS', 'MRC', 'GSK', 'Roche', 'RBC']

# One vectorised pass over the column rather than per-row .loc lookups in a
# Python loop. The labels contain no regex metacharacters, so they can be
# joined directly into an alternation pattern. na=False excludes rows whose
# Source is missing or not a string.
is_kinase_source = data['Source'].str.contains('|'.join(kinase_sources), na=False)

data = data.loc[is_kinase_source, :]

In [10]:
data.head()

Unnamed: 0,Drug,Gene,Type,Source
23883,"2-(2-Quinolin-3-Ylpyridin-4-Yl)-1,5,6,7-Tetrah...",DYRK1A,Ki=3.98107(nM),Roche
23884,GSK978744A,PLK1,POC=7.85(0.1uM),GSK
23885,GSK978744A,PLK1,POC=6.47(1uM),GSK
23886,GSK978744A,STK10,POC=14.26(1uM),GSK
23887,GSK978744A,NEK9,POC=8.59(1uM),GSK


In [11]:
data.shape

(22082, 4)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [12]:
# Move the gene symbols into the index; rebinding the name instead of
# mutating in place keeps the cell free of inplace side effects.
data = data.set_index('Gene')

In [13]:
# Map the gene symbols held in the index of `data` to up-to-date approved symbols.
# NOTE(review): the return value is discarded, so the helper presumably edits
# `data` in place — confirm in utility_functions.mapgenesymbols.
uf.mapgenesymbols(data)

Progeres: 98%  21818 Out of 22082   

In [14]:
data.shape

# Drop Duplicates

In [15]:
# Turn the 'Gene' index back into an ordinary column so it can be used as a
# de-duplication key.
data = data.reset_index()

In [16]:
# Keep a single row per (Gene, Drug) pair; the first occurrence is the one retained.
data = data.drop_duplicates(subset=['Gene', 'Drug'])

In [17]:
data.shape

# Create Binary Matrix

In [18]:
# Build the gene-drug matrix from the (Gene, Drug) pairs; only those two
# columns are handed to the helper.
# NOTE(review): presumably rows are genes, columns are drugs and a 1 marks a
# reported pair — confirm in utility_functions.createBinaryMatrix.
binary_matrix = uf.createBinaryMatrix(data[['Gene', 'Drug']])

Progeres: 100%  404 Out of 404   

In [19]:
# Normalise the column labels to upper case so that labels differing only in
# letter case become identical.
binary_matrix.columns = [label.upper() for label in binary_matrix.columns]

In [20]:
# Merge columns that share the same label: transpose so the column labels
# become the index, group on that index (level 0), take the maximum of each
# group and transpose back. For 0/1 entries the merged column is 1 wherever
# any of the merged columns was 1.
binary_matrix = binary_matrix.T.groupby(level=0).max().T

In [21]:
binary_matrix.head()

Unnamed: 0,(5Z)-7-OXOZEAENOL,(E)-N-METHYL-2-((3-(2-(PYRIDIN-2-YL)VINYL)-1H-INDAZOL-6-YL)THIO)BENZAMIDE,(R)-ROSCOVITINE,1 NA-PP1 (PP1 ANALOG),1 NM-PP1 (PP1 ANALOG II),"1,2,4-TRIAZOLO[4,3-B]PYRIDAZINE, 3-(3-NITROPHENYL)-6-(1-PIPERIDINYL)-","1,9-PYRAZOLOANTHRONE","1-(3-METHYLPHENYL)-3-[4-(1-OXO-2,3-DIHYDROISOINDOL-4-YL)PHENYL]UREA","1-(4-AMINO-1,2,5-OXADIAZOL-3-YL)-N-[(2-ETHOXYPHENYL)METHYLIDENEAMINO]-5-PYRROLIDIN-1-YLTRIAZOLE-4-CARBOXAMIDE",1-AZAKENPAULLONE,...,XMD8-92,Y-27632,ZEARALENONE,ZINC00027781,ZINC00121334,ZINC04457925,ZINC04494446,ZINC04547409,ZM 336372,ZM-447439
EPHA2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
BRSK1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
WEE2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CDK7,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
MKNK1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
binary_matrix.shape

(404, 1150)

# Save Binary Matrix

In [23]:
# Tag the output file with the current year and month (YYYY_MM).
date_tag = datetime.date.today().strftime('%Y_%m')
filename = path + 'dsigdb_kinase_inhibitors_binary_matrix_%s.tsv.zip' % date_tag

# NOTE(review): the file name ends in .zip but the content is written with
# gzip compression, so readers have to request gzip explicitly.
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [24]:
name = 'dsigdb_kinase_inhibitors_gene_set'

In [25]:
# Build the gene set library from the binary matrix.
# NOTE(review): nothing is returned to the notebook, so the helper presumably
# writes its result under `path` using `name` — confirm in utility_functions.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  1150 Out of 1150   

# Create Attribute Library

In [26]:
name = 'dsigdb_kinase_inhibitors_attribute_set'

In [27]:
# Build the attribute set library from the binary matrix.
# NOTE(review): nothing is returned to the notebook, so the helper presumably
# writes its result under `path` using `name` — confirm in utility_functions.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  404 Out of 404   

# Create Gene Similarity Matrix

In [28]:
# Similarity between the rows of the binary matrix, requested with the
# 'jaccard' metric name.
# The variable name keeps its 'matix' spelling because later cells refer to it.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [29]:
gene_similarity_matix.head()

Unnamed: 0,EPHA2,BRSK1,WEE2,CDK7,MKNK1,MAP3K10,PRKAA1,NEK6,STK35,MAP2K3,...,CIT,SRPK2,MAP2K4,TNK1,PRKACG,NEK11,CDC42BPB,PIK3CA,MELK,SIK2
,,,,,,,,,,,,,,,,,,,,,
EPHA2,1.0,0.126984,0.122449,0.080808,0.109375,0.114754,0.133333,0.033333,0.25,0.166667,...,0.164384,0.067797,0.144928,0.219512,0.020833,0.083333,0.12963,0.074627,0.12069,0.257143
BRSK1,0.126984,1.0,0.0,0.102564,0.162791,0.27027,0.163636,0.108108,0.125,0.166667,...,0.103448,0.135135,0.183673,0.128571,0.076923,0.033333,0.111111,0.108696,0.184783,0.218182
WEE2,0.122449,0.0,1.0,0.014493,0.0625,0.107143,0.043478,0.136364,0.151515,0.064516,...,0.090909,0.083333,0.076923,0.086207,0.0,0.363636,0.0,0.0,0.021978,0.133333
CDK7,0.080808,0.102564,0.014493,1.0,0.088608,0.106667,0.234568,0.115942,0.232877,0.164384,...,0.265823,0.181818,0.205128,0.210526,0.0,0.015152,0.101449,0.175676,0.208333,0.211765
MKNK1,0.109375,0.162791,0.0625,0.088608,1.0,0.175,0.103448,0.051282,0.148936,0.113636,...,0.142857,0.105263,0.115385,0.128571,0.037037,0.033333,0.081081,0.0625,0.135417,0.155172


# Save Gene Similarity Matrix

In [30]:
# Tag the output file with the current year and month (YYYY_MM).
date_tag = datetime.date.today().strftime('%Y_%m')
filename = path + 'dsigdb_kinase_inhibitors_gene_similarity_matix_%s.tsv.zip' % date_tag

# NOTE(review): the file name ends in .zip but the content is written with
# gzip compression, so readers have to request gzip explicitly.
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [31]:
# Same similarity helper and 'jaccard' metric name as for the genes, but fed
# the transposed matrix so the comparison runs over the matrix columns.
# The variable name keeps its 'matix' spelling because later cells refer to it.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [32]:
attribute_similarity_matix.head()

Unnamed: 0,(5Z)-7-OXOZEAENOL,(E)-N-METHYL-2-((3-(2-(PYRIDIN-2-YL)VINYL)-1H-INDAZOL-6-YL)THIO)BENZAMIDE,(R)-ROSCOVITINE,1 NA-PP1 (PP1 ANALOG),1 NM-PP1 (PP1 ANALOG II),"1,2,4-TRIAZOLO[4,3-B]PYRIDAZINE, 3-(3-NITROPHENYL)-6-(1-PIPERIDINYL)-","1,9-PYRAZOLOANTHRONE","1-(3-METHYLPHENYL)-3-[4-(1-OXO-2,3-DIHYDROISOINDOL-4-YL)PHENYL]UREA","1-(4-AMINO-1,2,5-OXADIAZOL-3-YL)-N-[(2-ETHOXYPHENYL)METHYLIDENEAMINO]-5-PYRROLIDIN-1-YLTRIAZOLE-4-CARBOXAMIDE",1-AZAKENPAULLONE,...,XMD8-92,Y-27632,ZEARALENONE,ZINC00027781,ZINC00121334,ZINC04457925,ZINC04494446,ZINC04547409,ZM 336372,ZM-447439
,,,,,,,,,,,,,,,,,,,,,
(5Z)-7-OXOZEAENOL,1.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04
(E)-N-METHYL-2-((3-(2-(PYRIDIN-2-YL)VINYL)-1H-INDAZOL-6-YL)THIO)BENZAMIDE,0.071429,1.0,0.0,0.0,0.0,0.0,0.111111,0.125,0.0,0.0,...,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.108696
(R)-ROSCOVITINE,0.0,0.0,1.0,0.055556,0.047619,0.066667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.076923,0.071429,0.066667,0.076923,0.0,0.055556
1 NA-PP1 (PP1 ANALOG),0.0,0.0,0.055556,1.0,0.666667,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.041667
1 NM-PP1 (PP1 ANALOG II),0.0,0.0,0.047619,0.666667,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.272727,0.039216


# Save Attribute Similarity Matrix

In [33]:
# Tag the output file with the current year and month (YYYY_MM).
date_tag = datetime.date.today().strftime('%Y_%m')
filename = path + 'dsigdb_kinase_inhibitors_attribute_similarity_matix_%s.tsv.zip' % date_tag

# NOTE(review): the file name ends in .zip but the content is written with
# gzip compression, so readers have to request gzip explicitly.
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# HeatMap (clustergrammer) of Similarity Matrix

In [35]:
# The heat map is currently disabled: every statement below is commented out,
# so running this cell does nothing. When enabled it would load the top-left
# 100 x 100 slice of the attribute similarity matrix into `net`.
# net.load_df(attribute_similarity_matix.iloc[0:100,0:100].copy())
# # net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()

# Create Gene List

In [36]:
# Build the gene list from the binary matrix.
# NOTE(review): the displayed result has 'GeneSym' and 'GeneID' columns; how
# the IDs are looked up is inside utility_functions.createGeneList — confirm there.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  404 Out of 404   

In [37]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,EPHA2,1969
1,BRSK1,84446
2,WEE2,494551
3,CDK7,1022
4,MKNK1,8569


In [38]:
gene_list.shape

(404, 2)

# Save Gene List

In [39]:
# Tag the output file with the current year and month (YYYY_MM).
date_tag = datetime.date.today().strftime('%Y_%m')
filename = path + 'dsigdb_kinase_inhibitors_gene_list_%s.tsv.zip' % date_tag

# The row index is not written (index=False).
# NOTE(review): the file name ends in .zip but the content is written with
# gzip compression, so readers have to request gzip explicitly.
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

### Create Metadata Table

In [40]:
data.head()

Unnamed: 0,Gene,Drug,Type,Source
0,DYRK1A,"2-(2-Quinolin-3-Ylpyridin-4-Yl)-1,5,6,7-Tetrah...",Ki=3.98107(nM),Roche
1,PLK1,GSK978744A,POC=7.85(0.1uM),GSK
3,STK10,GSK978744A,POC=14.26(1uM),GSK
4,NEK9,GSK978744A,POC=8.59(1uM),GSK
5,EGFR,Erlotinib,Kd=140.0(nM),FDA


In [41]:
# Work on an independent copy of the drug-level columns so the edits made to
# `temp` afterwards do not touch `data`.
temp = data[['Drug', 'Type', 'Source']].copy()

In [42]:
# Upper-case every drug name in the metadata copy.
temp['Drug'] = temp['Drug'].map(str.upper)

In [43]:
# Index the metadata copy by drug name so rows can be looked up by label.
temp = temp.set_index('Drug')

In [47]:
# Metadata table: one row per column label of the binary matrix, recording
# which source collection(s) reported that drug.
metaData = pd.DataFrame(columns=['Source'], index=binary_matrix.columns)

# `temp` is indexed by drug name and can hold several rows per drug. Collapse
# it in one pass to the distinct sources of each drug, joined with ':'.
# The sources are sorted so the label is reproducible; iterating a bare set of
# strings can yield a different order from one interpreter session to the next.
sources_by_drug = temp.groupby(level=0)['Source'].agg(
    lambda values: ':'.join(sorted(set(values)))
)

# Look the drugs up in matrix column order; the list is then assigned by position.
source = sources_by_drug.loc[metaData.index].tolist()
metaData['Source'] = source

metaData.head()

Unnamed: 0,Source
,
(5Z)-7-OXOZEAENOL,MRC
(E)-N-METHYL-2-((3-(2-(PYRIDIN-2-YL)VINYL)-1H-INDAZOL-6-YL)THIO)BENZAMIDE,Roche
(R)-ROSCOVITINE,LINCS
1 NA-PP1 (PP1 ANALOG),MRC
1 NM-PP1 (PP1 ANALOG II),MRC


In [49]:
# Build the attribute list from the binary matrix and the per-drug metadata table.
# NOTE(review): how the two inputs are combined is inside
# utility_functions.createAttributeList — confirm there.
attribute_list = uf.createAttributeList(binary_matrix, metaData)

Progeres: 100%  1150 Out of 1150   

In [50]:
attribute_list.head()

Unnamed: 0_level_0,Source
Attributes,Unnamed: 1_level_1
(5Z)-7-OXOZEAENOL,MRC
(E)-N-METHYL-2-((3-(2-(PYRIDIN-2-YL)VINYL)-1H-INDAZOL-6-YL)THIO)BENZAMIDE,Roche
(R)-ROSCOVITINE,LINCS
1 NA-PP1 (PP1 ANALOG),MRC
1 NM-PP1 (PP1 ANALOG II),MRC


In [51]:
attribute_list.shape

(1150, 1)

# Save Attribute List

In [52]:
# Tag the output file with the current year and month (YYYY_MM).
date_tag = datetime.date.today().strftime('%Y_%m')
filename = path + 'dsigdb_kinase_inhibitors_attribute_list_%s.tsv.zip' % date_tag

# The attribute names are the index of attribute_list (its only column is
# 'Source'), so the index must be written; with index=False the saved file
# would contain the sources but not the attribute names they belong to.
# NOTE(review): the file name ends in .zip but the content is written with
# gzip compression, so readers have to request gzip explicitly.
attribute_list.to_csv(filename, sep='\t', index=True, compression='gzip')

# Create Gene-Attribute Edge List

In [53]:
name = 'dsigdb_kinase_inhibitors_attribute_edge_list'

In [54]:
# Build the gene-attribute edge list from the binary matrix and the two lists.
# NOTE(review): nothing is returned to the notebook, so the helper presumably
# writes its result under `path` using `name` — confirm in utility_functions.
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, path, name)

Progeres: 100%  1150 Out of 1150   

 The number of statisticaly relevent gene-attribute associations is: 16240
