# ChIP-Atlas

Author: Moshe Silverstein  
Date: 05-18  
Data Source Home: http://chip-atlas.org/  
Data Source Download: http://chip-atlas.org/target_genes  

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import utility_functions as uf
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

In [2]:
import seaborn as sns
# Apply seaborn's default theme. color_codes=True is passed so that matplotlib's
# single-letter color shorthands follow the seaborn palette (per the seaborn docs).
sns.set(color_codes=True)
# Seed NumPy's global RNG with a fixed integer: the sum of the character codes
# of the string "distributions".
np.random.seed(sum(map(ord, "distributions")))

In [3]:
# %load_ext version_information
# %version_information numpy, pandas, clustergrammer_widget, seaborn 

# Path to Output Files

In [29]:
# Directory that receives every output file written by this notebook.
# Set the CHIP_ATLAS_OUTPUT_DIR environment variable to redirect the output;
# when it is unset or empty, the original author's local folder is used.
# os.path.join(..., '') guarantees a trailing separator, which is required
# because later cells build file names by plain concatenation (path + 'name').
path = os.path.join(
    os.environ.get('CHIP_ATLAS_OUTPUT_DIR')
    or '/Users/moshesilverstein/Documents/Harmonizome/Chip-Atlas/Output/',
    '',
)

In [4]:
import urllib.request, json

# Download the JSON index of the target-gene analysis. Later cells look it up
# by genome name (data[species]) and iterate the result to get the TF names
# used in the per-TF file URLs.
url = "http://chip-atlas.org/data/target_genes_analysis.json"
# The context manager closes the HTTP response even if reading it fails;
# previously the response object was left open.
with urllib.request.urlopen(url) as x:
    raw_data = x.read()
    encoding = x.info().get_content_charset('utf8')  # JSON default
data = json.loads(raw_data.decode(encoding))

In [5]:
# Keys of `data` that are processed; each one is also used as a path segment
# of the per-TF download URL.
# (hg19 / mm9 look like human / mouse genome assembly names — TODO confirm.)
specieslist = ["hg19", "mm9"]

In [6]:
# Build a long-format TF -> target-gene table from the per-TF "target genes"
# files.
#
# For every TF listed under each genome in `data`, the files <tf>.1.tsv,
# <tf>.5.tsv and <tf>.10.tsv are tried in that order and the first one that
# can be read is used. (The number presumably is the +/- kb window around the
# TSS — TODO confirm against the ChIP-Atlas documentation.)
# Only the first `max_targets` rows of the 'Target_genes' column are kept.
base_url = "http://dbarchive.biosciencedbc.jp/kyushu-u/"
distances = ("1", "5", "10")
max_targets = 2000

tfs_list = []
targets = []
failed_tfs = []  # (species, tf) pairs for which none of the files could be read

for species in specieslist:

    tfs = data[species]
    n_tfs = len(tfs)

    for i, tf in enumerate(tfs):

        progressPercent = ((i+1)/n_tfs)*100

        sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), n_tfs))
        sys.stdout.flush()

        for distance in distances:
            url = base_url+species+"/target/"+tf+"."+distance+".tsv"
            try:
                temp_df = pd.read_csv(url, sep='\t')
                target_list = list(temp_df['Target_genes'][0:max_targets].values)
            except Exception:
                # Missing file, network error or unexpected columns: fall back
                # to the next distance. `Exception` (rather than a bare
                # except) lets a kernel interrupt / Ctrl-C stop the loop.
                continue
            targets.extend(target_list)
            tfs_list.extend([tf]*len(target_list))
            break
        else:
            # No distance produced a readable file; remember the TF instead of
            # dropping it silently.
            failed_tfs.append((species, tf))

if failed_tfs:
    sys.stdout.write("\nNo target file could be read for %d TF(s); see failed_tfs\n" % len(failed_tfs))

df = pd.DataFrame({'TF': tfs_list, 'Target': targets}, columns=['TF', 'Target'])

Progress: 100%  602 Out of 602   

In [7]:
df.head()

Unnamed: 0,TF,Target
0,ADNP2,MAMDC2
1,ADNP2,TRIM48
2,ADNP2,ISG15
3,ADNP2,COL23A1
4,ADNP2,AGRN


In [8]:
df.shape

(1580513, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [9]:
# Make the TF symbols the row index for the symbol-mapping call that follows.
df = df.set_index('TF')

In [10]:
# Run the project's symbol-mapping helper on df, whose index currently holds
# the TF symbols. The return value is not used, yet df.shape goes from
# 1580513 rows before this call to 1559153 after it, so the helper evidently
# edits df in place and removes some rows (presumably symbols it cannot map
# to an approved symbol — TODO confirm in utility_functions).
uf.mapgenesymbols(df)

Progeres: 100%  1580513 Out of 1580513   

In [11]:
# Turn the index back into an ordinary column and restore a 0..n-1 row index.
df = df.reset_index()

In [12]:
df.shape

(1559153, 2)

In [13]:
# Make the target-gene symbols the row index for the second mapping pass.
df = df.set_index('Target')

In [14]:
# Second pass of the symbol-mapping helper, now with the target-gene symbols
# in the index. As in the first pass the result is not assigned; the row count
# recorded around this call drops from 1559153 to 1488907, so df is modified
# in place (presumably unmappable symbols are removed — TODO confirm in
# utility_functions).
uf.mapgenesymbols(df)

Progeres: 100%  1559153 Out of 1559153   

In [15]:
df.shape

(1488907, 1)

# Drop Duplicates

In [16]:
# Move the target symbols from the index back into a regular column so that
# both columns take part in the duplicate check.
df = df.reset_index()

In [18]:
# Keep a single row for each distinct (Target, TF) pair.
df = df.drop_duplicates()

In [19]:
df.shape

(1361487, 2)

In [20]:
# NOTE(review): repeats the shape check of the previous cell; the recorded
# output is identical, so this cell can be removed.
df.shape

(1361487, 2)

In [17]:
# NOTE(review): this cell's execution count (17) is lower than that of the
# drop_duplicates cell (18) even though it appears after it, so the preview
# shown was produced before de-duplication. Re-run top to bottom to refresh.
df.head()

Unnamed: 0,Target,TF
0,MAMDC2,ADNP2
1,TRIM48,ADNP2
2,ISG15,ADNP2
3,COL23A1,ADNP2
4,AGRN,ADNP2


# Create Binary Matrix

In [26]:
# Convert the de-duplicated (Target, TF) table into a 0/1 matrix with the
# project helper. The recorded preview shows target genes as row labels and
# TFs as column labels, with shape (18539, 963).
binary_matrix = uf.createBinaryMatrix(df)

Progeres: 100%  18539 Out of 18539   

In [27]:
binary_matrix.head()

Unnamed: 0,CDK6,CTCFL,RXRA,SVIL,WRNIP1,LHX5,LCOR,SHOX2,ZNF266,PCGF2,...,NR2C2,ZBTB2,TSHZ1,TBL1Y,NKX3-1,IRF5,ATF4,HOXA5,TFCP2L1,ARID5B
OCEL1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CACNB3,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
SVIL,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GPR173,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
SCFD1,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [28]:
binary_matrix.shape

(18539, 963)

# Save Binary Matrix

In [30]:
# Tag the output file with the current year and month (YYYY_MM).
month_stamp = datetime.date.today().strftime('%Y_%m')
filename = path + 'chip_atlas_binary_matrix_%s.tsv.zip' % month_stamp
# NOTE(review): the file name ends in .zip but the content is gzip-compressed;
# readers must pass compression='gzip' explicitly.
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [31]:
# Base name handed to uf.createUpGeneSetLib in the next cell.
name = 'chip_atlas_gene_set'

In [32]:
# Build the gene set library with the project helper, passing the output
# directory and base name (so it presumably writes the file itself — TODO
# confirm in utility_functions). The progress output counts 963 items, which
# matches the number of matrix columns (TFs).
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  963 Out of 963   

# Create Attribute Library

In [33]:
# Base name handed to uf.createUpAttributeSetLib in the next cell.
name = 'chip_atlas_attribute_set'

In [34]:
# Build the attribute library with the project helper, passing the output
# directory and base name (so it presumably writes the file itself — TODO
# confirm in utility_functions). The progress output counts 18539 items, which
# matches the number of matrix rows (genes).
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  18539 Out of 18539   

# Create Gene Similarity Matrix

In [36]:
# Similarity between the rows (genes) of the binary matrix, computed by the
# project helper with its 'jaccard' option. The recorded preview is labelled
# by gene symbol on both axes with 1.0 on the diagonal.
# NOTE: the variable name is spelled "matix"; it is kept because later cells
# refer to it by this name.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [37]:
gene_similarity_matix.head()

Unnamed: 0,OCEL1,CACNB3,SVIL,GPR173,SCFD1,ZNF320,GJA4,STMN4,SDC2,TPM4,...,SCRN1,UROC1,LAG3,DNHD1,TRIM62,ZBTB14,PRMT7,TRMT61B,GAB4,GTF2A1
,,,,,,,,,,,,,,,,,,,,,
OCEL1,1.0,0.153333,0.130081,0.020202,0.105058,0.03,0.010989,0.021739,0.08871,0.094488,...,0.079646,0.027778,0.053763,0.0,0.104478,0.067114,0.120536,0.126582,0.0,0.097973
CACNB3,0.153333,1.0,0.100775,0.019608,0.091255,0.019231,0.010638,0.021053,0.086614,0.126984,...,0.059322,0.00885,0.020202,0.0,0.126866,0.109589,0.08547,0.125,0.011364,0.111864
SVIL,0.130081,0.100775,1.0,0.044776,0.045455,0.014085,0.033898,0.016129,0.083333,0.102041,...,0.034091,0.111111,0.046875,0.018182,0.054054,0.04065,0.047619,0.063063,0.0,0.065217
GPR173,0.020202,0.019608,0.044776,1.0,0.028708,0.030303,0.045455,0.136364,0.015385,0.044776,...,0.06,0.02439,0.0,0.0,0.053333,0.034483,0.011111,0.020619,0.0,0.028112
SCFD1,0.105058,0.091255,0.045455,0.028708,1.0,0.018779,0.0,0.014634,0.06867,0.1,...,0.039648,0.004464,0.034146,0.0,0.035573,0.087649,0.241497,0.24918,0.0,0.346626


# Save Gene Similarity Matrix

In [38]:
# Tag the output file with the current year and month (YYYY_MM).
month_stamp = datetime.date.today().strftime('%Y_%m')
filename = path + 'chip_atlas_gene_similarity_matix_%s.tsv.zip' % month_stamp
# NOTE(review): the file name ends in .zip but the content is gzip-compressed;
# readers must pass compression='gzip' explicitly.
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [39]:
# Same helper and 'jaccard' option as for the gene similarity matrix, applied
# to the transposed binary matrix so that the TFs (attributes) are compared.
# The recorded preview is labelled by TF symbol on both axes.
# NOTE: the variable name is spelled "matix"; it is kept because later cells
# refer to it by this name.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [40]:
attribute_similarity_matix.head()

Unnamed: 0,CDK6,CTCFL,RXRA,SVIL,WRNIP1,LHX5,LCOR,SHOX2,ZNF266,PCGF2,...,NR2C2,ZBTB2,TSHZ1,TBL1Y,NKX3-1,IRF5,ATF4,HOXA5,TFCP2L1,ARID5B
,,,,,,,,,,,,,,,,,,,,,
CDK6,1.0,0.120583,0.168552,0.00504,0.127215,0.00252,0.0,0.0,0.002515,0.059726,...,0.162011,0.108714,0.0,0.191425,0.139286,0.004916,0.098794,0.010427,0.002479,0.007996
CTCFL,0.120583,1.0,0.160008,0.000367,0.051537,0.0,0.0,0.000369,0.001103,0.074105,...,0.084067,0.12629,0.000368,0.118421,0.118038,0.001806,0.107801,0.008023,0.004748,0.002556
RXRA,0.168552,0.160008,1.0,0.003455,0.133017,0.00157,0.0,0.000314,0.002511,0.070354,...,0.195869,0.151708,0.000314,0.169773,0.213821,0.002781,0.162458,0.007472,0.005303,0.005625
SVIL,0.00504,0.000367,0.003455,1.0,0.010193,0.266667,0.0,0.0,0.15,0.002388,...,0.004562,0.0,0.0,0.002028,0.002385,0.0,0.002535,0.065574,0.019608,0.175
WRNIP1,0.127215,0.051537,0.133017,0.010193,1.0,0.005669,0.0,0.001136,0.005643,0.060094,...,0.138179,0.05722,0.0,0.123223,0.130377,0.005336,0.076377,0.00974,0.002179,0.013289


# Save Attribute Similarity Matrix

In [41]:
# Tag the output file with the current year and month (YYYY_MM).
month_stamp = datetime.date.today().strftime('%Y_%m')
filename = path + 'chip_atlas_attribute_similarity_matix_%s.tsv.zip' % month_stamp
# NOTE(review): the file name ends in .zip but the content is gzip-compressed;
# readers must pass compression='gzip' explicitly.
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [42]:
# Build the gene table for the rows of the binary matrix with the project
# helper. The recorded preview shows two columns, GeneSym and GeneID, and
# 18539 rows (one per matrix row).
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  18539 Out of 18539   

In [43]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,OCEL1,79629
1,CACNB3,784
2,SVIL,6840
3,GPR173,54328
4,SCFD1,23256


In [44]:
gene_list.shape

(18539, 2)

# Save Gene List

In [45]:
# Tag the output file with the current year and month (YYYY_MM).
month_stamp = datetime.date.today().strftime('%Y_%m')
filename = path + 'chip_atlas_gene_list_%s.tsv.zip' % month_stamp
# The row index is not written; only the table's columns are saved.
# NOTE(review): the file name ends in .zip but the content is gzip-compressed;
# readers must pass compression='gzip' explicitly.
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [46]:
# Build the attribute (TF) table for the columns of the binary matrix with the
# project helper. Per the recorded shape (963, 0) the result has no data
# columns: the TF names are carried only by its index.
attribute_list = uf.createAttributeList(binary_matrix)

In [47]:
attribute_list.head()

CDK6
CTCFL
RXRA
SVIL
WRNIP1


In [48]:
attribute_list.shape

(963, 0)

# Save Attribute List

In [49]:
# Save the attribute (TF) list.
# attribute_list has no data columns (its recorded shape is (963, 0)); the TF
# names exist only in its index. Writing with index=False therefore produced
# a file without any attribute names, so the index is written here.
# NOTE(review): the file name ends in .zip but the content is gzip-compressed
# (kept consistent with the other saved files); read it back with
# compression='gzip'.
filename = path+'chip_atlas_attribute_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_list.to_csv(filename, sep='\t', index=True, compression='gzip')

# Create Gene-Attribute Edge List

In [50]:
# Base name handed to uf.createGeneAttributeEdgeList in the next cell.
name = 'chip_atlas_gene_attribute_edge_list'

In [51]:
# Build the gene-attribute edge list with the project helper from the binary
# matrix plus the attribute and gene tables; the output directory and base
# name are passed in (so it presumably writes the file itself — TODO confirm
# in utility_functions). The recorded run reports 1361218 associations.
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, path, name)

Progeres: 100%  963 Out of 963   

 The number of statisticaly relevent gene-attribute associations is: 1361218
