# ESCAPE (gene target)

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: http://www.maayanlab.net/ESCAPE/download.php

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
# Reload the helper module so that edits made to its source since the first
# import take effect in this running kernel.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/ESCAPE/untility_functions.py'>

# Load Data

In [26]:
# Read the tab-separated ESCAPE ChIP-X table. The path is relative, so it is
# resolved against the notebook's current working directory.
df = pd.read_csv('Input/chip_x.txt', sep='\t')

In [28]:
# Preview the first 10 rows to check which columns were parsed.
df.head(10)

Unnamed: 0,sourceName,sourceID,targetName,targetID,pmid,chipType,cellType,modification_date
0,CHD7,320790,APOA1,11806,19251738,ChIP-chip,MESC,11/5/2010
1,CHD7,320790,ARHGAP26,71302,19251738,ChIP-chip,MESC,11/5/2010
2,CHD7,320790,ATP11A,50770,19251738,ChIP-chip,MESC,11/5/2010
3,CHD7,320790,ATP5O,28080,19251738,ChIP-chip,MESC,11/5/2010
4,CHD7,320790,AXIN1,12005,19251738,ChIP-chip,MESC,11/5/2010
5,CHD7,320790,BCL11B,58208,19251738,ChIP-chip,MESC,11/5/2010
6,CHD7,320790,BMP4,12159,19251738,ChIP-chip,MESC,11/5/2010
7,CHD7,320790,BRWD1,93871,19251738,ChIP-chip,MESC,11/5/2010
8,CHD7,320790,BUD13,215051,19251738,ChIP-chip,MESC,11/5/2010
9,CHD7,320790,C21ORF59,0,19251738,ChIP-chip,MESC,11/5/2010


In [14]:
# Keep only the sourceName and targetName columns.
# Take an explicit copy: the cells below modify this frame in place, and doing
# that on a column selection of another frame can trigger pandas'
# SettingWithCopyWarning.
df = df[['sourceName', 'targetName']].copy()

In [15]:
# Confirm that only the two selected columns remain.
df.head()

Unnamed: 0,sourceName,targetName
0,CHD7,APOA1
1,CHD7,ARHGAP26
2,CHD7,ATP11A
3,CHD7,ATP5O
4,CHD7,AXIN1


# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [16]:
# Move sourceName into the index ahead of the uf.mapgenesymbols call in the
# next cell (presumably the helper maps the index labels — confirm in
# untility_functions.py).
df = df.set_index('sourceName')

In [17]:
# Map gene symbols via the project helper. The return value is not captured, so
# the helper presumably updates df in place (index labels, and possibly dropping
# rows it cannot map) — TODO confirm against untility_functions.py.
uf.mapgenesymbols(df)

Progeres: 100%  133975 Out of 133975   

In [18]:
# Turn the sourceName index back into an ordinary column.
df = df.reset_index()

In [19]:
# Move targetName into the index ahead of the second uf.mapgenesymbols call in
# the next cell (presumably the helper maps the index labels — confirm in
# untility_functions.py).
df = df.set_index('targetName')

In [20]:
# Second mapping pass, this time with targetName as the index. As above, the
# result is not captured, so the helper presumably works in place — TODO confirm
# against untility_functions.py.
uf.mapgenesymbols(df)

Progeres: 100%  129239 Out of 129239   

In [21]:
# Turn the targetName index back into an ordinary column.
df = df.reset_index()

# Drop Duplicates

In [22]:
# Drop rows whose values repeat an earlier row across all columns, keeping the
# first occurrence.
df = df.drop_duplicates()

In [23]:
# Report (rows, columns) remaining after de-duplication.
df.shape

(81444, 2)

# Create Binary Matrix

In [24]:
# Build the gene-by-attribute matrix with the project helper. Judging by the
# preview in the next cell, rows are gene symbols, columns are sourceName values
# and cells are 0/1 — confirm the exact rules in untility_functions.py.
binary_matrix = uf.createBinaryMatix(df)

Progeres: 100%  13514 Out of 13514   

In [25]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,NR0B1,CDX2,E2F4,KAT5,KLF2,NACC1,TRIM28,SMAD1,E2F1,SETDB1,...,MYC,CNOT3,EED,ZFX,ZIC3,PRDM14,EZH2,CHD7,SALL1,PHC1
COMMD10,0,0,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
MORC3,0,0,1,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
FAM122C,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MARS2,0,0,1,0,0,0,0,0,0,1,...,1,1,0,1,0,0,0,0,0,0
SAT2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Report the matrix dimensions as (rows, columns).
binary_matrix.shape

(13514, 44)

# Save Binary Matrix

In [30]:
# Stamp the output name with the current year and month (YYYY_MM).
filename = '~/./Documents/Harmonizome/ESCAPE/Output/escape_binary_matrix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
# The name ends in ".zip", so write a real zip archive. The previous
# compression='gzip' stored a gzip stream under a .zip name, which
# extension-based readers (e.g. pandas' compression='infer') cannot open.
# NOTE(review): if downstream consumers expect gzip, rename to ".tsv.gz" instead.
binary_matrix.to_csv(filename, sep='\t', compression='zip')

# Create Gene Set Library

In [32]:
# Output directory passed to the helper below. Built from "~" instead of one
# user's hard-coded home directory so the notebook runs on other machines. The
# expansion happens here because the helper receives a plain string and may not
# expand "~" itself.
path = os.path.expanduser('~/Documents/Harmonizome/ESCAPE/Output/')

In [33]:
# Name passed to uf.createUpGeneSetLib together with the output path
# (presumably used as the output file stem — confirm in untility_functions.py).
name = 'escape_gene_set'

In [34]:
# Build the gene set library from the binary matrix via the project helper.
# Nothing is returned to the notebook; presumably the result is written under
# `path` using `name` — TODO confirm against untility_functions.py.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  44 Out of 44   

# Create Attribute Library

In [35]:
# Output directory passed to the helper below. Built from "~" instead of one
# user's hard-coded home directory so the notebook runs on other machines. The
# expansion happens here because the helper receives a plain string and may not
# expand "~" itself.
path = os.path.expanduser('~/Documents/Harmonizome/ESCAPE/Output/')

In [36]:
# Name passed to uf.createUpAttributeSetLib together with the output path
# (presumably used as the output file stem — confirm in untility_functions.py).
name = 'escape_attribute_set'

In [37]:
# Build the attribute set library from the binary matrix via the project helper.
# Nothing is returned to the notebook; presumably the result is written under
# `path` using `name` — TODO confirm against untility_functions.py.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  13514 Out of 13514   

# Create Gene Similarity Matrix

In [38]:
# Compute a similarity matrix from binary_matrix with the 'jaccard' option of
# the project helper. The preview in the next cell is labelled by gene symbol on
# both axes, i.e. similarity is taken between the rows of the input.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [39]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,COMMD10,MORC3,FAM122C,MARS2,SAT2,PWP1,KCTD11,KCNJ9,WAC,PCDHB5,...,DZIP1,EXT1,NPRL2,PRSS33,CPSF4L,IRAK2,CERCAM,TIMP3,TAF1D,NDUFB2
COMMD10,1.0,0.416667,0.0,0.272727,0.0,0.3,0.363636,0.076923,0.272727,0.0,...,0.272727,0.266667,0.5,0.0,0.0,0.0,0.142857,0.090909,0.333333,0.5
MORC3,0.416667,1.0,0.090909,0.357143,0.083333,0.285714,0.25,0.461538,0.266667,0.076923,...,0.357143,0.411765,0.416667,0.090909,0.083333,0.230769,0.083333,0.133333,0.214286,0.545455
FAM122C,0.0,0.090909,1.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,...,0.0,0.0,0.0,0.0,0.5,0.2,0.0,0.166667,0.0,0.166667
MARS2,0.272727,0.357143,0.0,1.0,0.0,0.363636,0.214286,0.066667,0.066667,0.0,...,0.142857,0.166667,0.4,0.0,0.0,0.3,0.111111,0.076923,0.4,0.166667
SAT2,0.0,0.083333,0.0,0.0,1.0,0.125,0.1,0.111111,0.0,0.0,...,0.111111,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857


# Save Gene Similarity Matrix

In [40]:
# Stamp the output name with the current year and month (YYYY_MM).
filename = '~/./Documents/Harmonizome/ESCAPE/Output/escape_gene_similarity_matix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
# The name ends in ".zip", so write a real zip archive. The previous
# compression='gzip' stored a gzip stream under a .zip name, which
# extension-based readers (e.g. pandas' compression='infer') cannot open.
# NOTE(review): if downstream consumers expect gzip, rename to ".tsv.gz" instead.
gene_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# Create Attribute Similarity Matrix

In [41]:
# Same helper and 'jaccard' option as for the gene similarity matrix, but on the
# transposed binary matrix, so the result is labelled by the columns of
# binary_matrix (see the preview in the next cell).
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [42]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,NR0B1,CDX2,E2F4,KAT5,KLF2,NACC1,TRIM28,SMAD1,E2F1,SETDB1,...,MYC,CNOT3,EED,ZFX,ZIC3,PRDM14,EZH2,CHD7,SALL1,PHC1
NR0B1,1.0,0.015423,0.137339,0.050456,0.021588,0.207903,0.121923,0.081081,0.11762,0.09878,...,0.137433,0.070902,0.034257,0.090051,0.083495,0.111275,0.061853,0.010027,0.009669,0.047878
CDX2,0.015423,1.0,0.01374,0.007117,0.005602,0.017361,0.016538,0.00915,0.008724,0.020427,...,0.008819,0.013468,0.036184,0.012069,0.008913,0.015974,0.041174,0.028061,0.008547,0.033605
E2F4,0.137339,0.01374,1.0,0.095559,0.009107,0.065222,0.245743,0.049778,0.306379,0.180657,...,0.440606,0.143518,0.03868,0.226568,0.029252,0.096275,0.077871,0.008797,0.009117,0.052855
KAT5,0.050456,0.007117,0.095559,1.0,0.004498,0.031196,0.062133,0.032412,0.067039,0.038946,...,0.097556,0.060128,0.012087,0.069522,0.018583,0.027315,0.011667,0.012766,0.010638,0.019216
KLF2,0.021588,0.005602,0.009107,0.004498,1.0,0.029412,0.011732,0.02418,0.008594,0.010678,...,0.009357,0.009901,0.018617,0.007678,0.013089,0.017657,0.011971,0.004484,0.0,0.023256


# Save Attribute Similarity Matrix

In [43]:
# Stamp the output name with the current year and month (YYYY_MM).
filename = '~/./Documents/Harmonizome/ESCAPE/Output/escape_attribute_similarity_matix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
# The name ends in ".zip", so write a real zip archive. The previous
# compression='gzip' stored a gzip stream under a .zip name, which
# extension-based readers (e.g. pandas' compression='infer') cannot open.
# NOTE(review): if downstream consumers expect gzip, rename to ".tsv.gz" instead.
attribute_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# Create Gene List

In [44]:
# Build the gene list from the binary matrix via the project helper. The preview
# in the next cell shows one row per gene with GeneSym and GeneID columns; how
# the IDs are looked up is defined in untility_functions.py.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  13514 Out of 13514   

In [45]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,COMMD10,51397
1,MORC3,23515
2,FAM122C,159091
3,MARS2,92935
4,SAT2,112483


In [46]:
# Report the gene list dimensions as (rows, columns).
gene_list.shape

(13514, 2)

# Save Gene List

In [47]:
# Stamp the output name with the current year and month (YYYY_MM).
filename = '~/./Documents/Harmonizome/ESCAPE/Output/escape_gene_list_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
# The name ends in ".zip", so write a real zip archive. The previous
# compression='gzip' stored a gzip stream under a .zip name, which
# extension-based readers (e.g. pandas' compression='infer') cannot open.
# NOTE(review): if downstream consumers expect gzip, rename to ".tsv.gz" instead.
# index=False: the positional row index is not written to the file.
gene_list.to_csv(filename, sep='\t', index=False, compression='zip')

# Create Attribute List

In [48]:
# Build the attribute list from the binary matrix via the project helper. The
# preview in the next cell shows a single Attributes column whose first values
# match the first columns of binary_matrix.
attribute_list = uf.createAttributeList(binary_matrix)

In [49]:
# Preview the first rows of the attribute list.
attribute_list.head()

Unnamed: 0,Attributes
0,NR0B1
1,CDX2
2,E2F4
3,KAT5
4,KLF2


In [50]:
# Report the attribute list dimensions as (rows, columns).
attribute_list.shape

(44, 1)

# Save Attribute List

In [51]:
# Stamp the output name with the current year and month (YYYY_MM).
filename = '~/./Documents/Harmonizome/ESCAPE/Output/escape_attribute_list_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
# The name ends in ".zip", so write a real zip archive. The previous
# compression='gzip' stored a gzip stream under a .zip name, which
# extension-based readers (e.g. pandas' compression='infer') cannot open.
# NOTE(review): if downstream consumers expect gzip, rename to ".tsv.gz" instead.
# index=False: the positional row index is not written to the file.
attribute_list.to_csv(filename, sep='\t', index=False, compression='zip')

# Create Gene-Attribute Edge List

In [52]:
# Output directory passed to the helper below. Built from "~" instead of one
# user's hard-coded home directory so the notebook runs on other machines. The
# expansion happens here because the helper receives a plain string and may not
# expand "~" itself.
path = os.path.expanduser('~/Documents/Harmonizome/ESCAPE/Output/')

In [53]:
# Name passed to uf.createGeneAttributeEdgeList together with the output path
# (presumably used as the output file stem — confirm in untility_functions.py).
name = 'escape_gene_attribute_edge_list'

In [54]:
# Build the gene-attribute edge list from the binary matrix and the gene list
# via the project helper. Nothing is returned to the notebook; presumably the
# result is written under `path` using `name`, and the helper prints the number
# of associations it found — TODO confirm against untility_functions.py.
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  44 Out of 44   

 The number of statisticaly relevent gene-attribute associations is: 81427
