# ENCODE (Transcription Factors - Binding Sites)

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: https://www.encodeproject.org/matrix/?type=Experiment 

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
import re
%matplotlib inline

In [38]:
# Re-import untility_functions so edits to the module take effect without a kernel restart.
# NOTE(review): this cell's execution count (38) is out of sequence with its neighbours,
# i.e. it was re-run late in the session; it is a development aid, not a pipeline step.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/Encode_me_from_sctratch/untility_functions.py'>

# Load Data 

In [6]:
# Directory and relative file name of the combined ENCODE transcription-factor GMT file.
path = '/Users/moshesilverstein/Documents/Harmonizome/Encode_me_from_sctratch/Output/Transcription_factor/'
file = 'COMBINED/encode_transcrip_2017_08.gmt'
# sep='~' with header=None keeps every line whole in a single column (0), as the
# preview below shows; the tab-separated fields are split in a later cell.
# NOTE(review): this relies on '~' never occurring in the file -- confirm.
df = pd.read_csv(path+file, sep='~', header=None)

In [10]:
# Preview the raw lines: label, a second field ("NA" in these rows), then gene symbols, all tab-separated.
df.head()

Unnamed: 0,0
0,TAL1_K562_human\tNA\tRNF145\tRFXANK\tLSMEM1\tM...
1,CEBPB_K562_human\tNA\tFAM111A\tSERBP1\tSMARCAD...
2,STAT3_MCF 10A_human\tNA\tDIRC1\tMIR2117\tADAR\...
3,ZNF217_MCF-7_human\tNA\tSAAL1\tTMEM167B\tTMEM4...
4,MYC_K562_human\tNA\tHARBI1\tZBTB6\tDHRS4\tRASS...


In [23]:
# Expand every GMT line into one (Transcription-Cell, Gene) row per gene.
#
# Each line in column 0 is tab-separated:
#   field 0   -> label such as "TAL1_K562_human"; only its first two
#                '_'-separated parts ("TAL1_K562") are kept
#   field 1   -> skipped ("NA" in the rows previewed above)
#   field 2.. -> gene symbols
#
# The per-line frames are collected in a list and concatenated once at the end;
# concatenating inside the loop re-copies all accumulated rows on every pass.
interaction_columns = ['Transcription-Cell', 'Gene']
total_lines = len(df.index)
frames = []

for i, index in enumerate(df.index):

    progressPercent = ((i+1)/total_lines)*100

    sys.stdout.write("Progeres: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), total_lines))
    sys.stdout.flush()

    # .loc replaces DataFrame.ix, which was removed in pandas 1.0.
    fields = df.loc[index, 0].split('\t')
    label = ('_').join(fields[0].split('_')[0:2])
    # Slicing (instead of pop) yields an empty list for a line without genes
    # rather than raising IndexError.
    genes = fields[2:]

    frames.append(pd.DataFrame(
        {'Transcription-Cell': [label]*len(genes), 'Gene': genes},
        columns=interaction_columns))

# Each per-line frame keeps its own 0..n-1 index, as before.
# pd.concat raises on an empty list, so fall back to an empty, correctly
# labelled frame when there were no input lines.
if frames:
    df_interactions = pd.concat(frames)
else:
    df_interactions = pd.DataFrame(columns=interaction_columns)

Progeres: 100%  1320 Out of 1320   

In [24]:
# Preview the long-format (Transcription-Cell, Gene) pairs.
df_interactions.head()

Unnamed: 0,Transcription-Cell,Gene
0,TAL1_K562,RNF145
1,TAL1_K562,RFXANK
2,TAL1_K562,LSMEM1
3,TAL1_K562,MDN1
4,TAL1_K562,FGFR1OP


In [25]:
# Row count before gene-symbol mapping and de-duplication.
df_interactions.shape

(2576867, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [27]:
# Move the gene symbols into the index; the mapping helper in the next cell is
# given the frame in this form.
df_interactions = df_interactions.set_index('Gene')

In [28]:
# Map the gene symbols in the index to approved symbols (per the section heading).
# NOTE(review): the return value is discarded, yet the shape printed afterwards has
# fewer rows, so the helper appears to modify df_interactions in place -- confirm
# in untility_functions.
uf.mapgenesymbols(df_interactions)

Progeres: 99%  2576770 Out of 2576867   

In [30]:
# Row count after symbol mapping; a single column because 'Gene' is currently the index.
df_interactions.shape

(2576010, 1)

# Drop Duplicates

In [34]:
# Bring 'Gene' back as an ordinary column so duplicates are judged on the
# (Gene, Transcription-Cell) pair rather than on Transcription-Cell alone.
df_interactions = df_interactions.reset_index()

In [35]:
# Keep a single row per distinct (Gene, Transcription-Cell) pair.
df_interactions = df_interactions.drop_duplicates()

In [36]:
# Number of unique gene / Transcription-Cell pairs remaining.
df_interactions.shape

(2220608, 2)

# Create Binary Matrix

In [39]:
# Build the gene-by-attribute matrix from the de-duplicated pairs. Per the preview
# below, rows are genes, columns are Transcription-Cell labels and entries are 0/1.
# The construction itself lives in untility_functions (not visible here).
binary_matrix = uf.createBinaryMatix(df_interactions)

Progeres: 100%  24656 Out of 24656   

In [40]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,KDM5B_H1-hESC,RXRA_HepG2,GABPA_HepG2,STAT5A_K562,eGFP-ZNF654_HEK293,MTA2_GM12878,CTCF_forebrain,FLAG-HMG20B_isoform2,SIRT6_K562,NR2C1_K562,...,GATA3_MCF-7,MBD2_MCF-7,TCF7_HepG2,MYC_K562,TAF1_MCF-7,CTCF_fibroblast of upper leg skin,POLR2A_MEL cell line,MYOD1_C2C12,SIN3A_A549,eGFP-ZNF394_HEK293
OR6C2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZNF286A,0,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
FAM133DP,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
NKAP,0,1,1,1,1,1,0,0,0,1,...,1,1,0,1,0,0,0,0,1,1
AGXT,0,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [41]:
# (number of genes, number of Transcription-Cell attributes)
binary_matrix.shape

(24656, 1129)

# Save Binary Matrix

In [42]:
# Save the binary matrix as a gzip-compressed TSV tagged with the current
# year and month (YYYY_MM).
# The file used to be named *.tsv.zip although it is written with gzip, so zip
# tools and extension-based compression inference could not open it; the
# extension now matches the actual format.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/Encode_me_from_sctratch/Output/Transcription_factor/encode_trans_fac_bind_sit_binary_matrix_%s.tsv.gz' % date_tag
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [43]:
# Directory passed to uf.createUpGeneSetLib below (same value as the `path` set when loading the data).
path = '/Users/moshesilverstein/Documents/Harmonizome/Encode_me_from_sctratch/Output/Transcription_factor/'

In [44]:
# Name passed to uf.createUpGeneSetLib below; how it is combined with `path` is decided inside that helper.
name = 'encode_trans_fac_bind_sit_gene_set'

In [45]:
# Create the gene set library from the binary matrix.
# NOTE(review): nothing is returned to the notebook, so the helper presumably
# writes its result under `path` using `name` -- confirm in untility_functions.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  1129 Out of 1129   

# Create Attribute Library

In [46]:
# Directory passed to uf.createUpAttributeSetLib below (same value as the `path` set when loading the data).
path = '/Users/moshesilverstein/Documents/Harmonizome/Encode_me_from_sctratch/Output/Transcription_factor/'

In [47]:
# Name passed to uf.createUpAttributeSetLib below; how it is combined with `path` is decided inside that helper.
name = 'encode_trans_fac_bind_sit_attribute_set'

In [48]:
# Create the attribute set library from the binary matrix.
# NOTE(review): nothing is returned to the notebook, so the helper presumably
# writes its result under `path` using `name` -- confirm in untility_functions.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  24656 Out of 24656   

# Create Gene Similarity Matrix

In [49]:
# Gene-by-gene similarity, computed by the helper with the 'jaccard' option.
# binary_matrix has genes as rows, and the preview below shows the result labelled
# by gene on both axes with 1.0 on the diagonal.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [50]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,OR6C2,ZNF286A,FAM133DP,NKAP,AGXT,NOXRED1,TNPO1,ADRM1,BYSL,TMPPE,...,CXorf21,SMAD1-AS1,MROH1,TUB,RASGEF1C,SNPH,CADM2-AS2,ELAVL1,PRKCA-AS1,CIDEA
OR6C2,1.0,0.0,0.012903,0.0,0.0,0.027778,0.004673,0.0125,0.006098,0.0,...,0.019608,0.033333,0.034483,0.042553,0.0,0.021277,0.0,0.0,0.0,0.03125
ZNF286A,0.0,1.0,0.073469,0.106101,0.041379,0.021429,0.078471,0.016304,0.084,0.041379,...,0.053333,0.0,0.066038,0.019737,0.006623,0.033557,0.007634,0.0,0.041096,0.017751
FAM133DP,0.012903,0.073469,1.0,0.14653,0.040462,0.017751,0.112205,0.023697,0.079137,0.011236,...,0.016304,0.024691,0.045082,0.033708,0.016854,0.02809,0.00625,0.017241,0.052326,0.005
NKAP,0.0,0.106101,0.14653,1.0,0.021407,0.009288,0.214527,0.039326,0.143577,0.030864,...,0.033333,0.019108,0.070681,0.024242,0.018237,0.021212,0.00639,0.031153,0.037152,0.028986
AGXT,0.0,0.041379,0.040462,0.021407,1.0,0.034483,0.036613,0.0,0.062147,0.014925,...,0.013514,0.0,0.014184,0.014085,0.029851,0.0,0.020833,0.0,0.014706,0.011364


# Save Gene Similarity Matrix 

In [51]:
# Save the gene similarity matrix as a gzip-compressed TSV tagged with the
# current year and month (YYYY_MM).
# The file used to be named *.tsv.zip although it is written with gzip, so zip
# tools and extension-based compression inference could not open it; the
# extension now matches the actual format.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/Encode_me_from_sctratch/Output/Transcription_factor/encode_trans_fac_bind_sit_gene_similarity_matix_%s.tsv.gz' % date_tag
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [52]:
# Same helper as for genes, applied to the transposed matrix so that the
# Transcription-Cell attributes are the rows being compared (see the preview below).
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [53]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,KDM5B_H1-hESC,RXRA_HepG2,GABPA_HepG2,STAT5A_K562,eGFP-ZNF654_HEK293,MTA2_GM12878,CTCF_forebrain,FLAG-HMG20B_isoform2,SIRT6_K562,NR2C1_K562,...,GATA3_MCF-7,MBD2_MCF-7,TCF7_HepG2,MYC_K562,TAF1_MCF-7,CTCF_fibroblast of upper leg skin,POLR2A_MEL cell line,MYOD1_C2C12,SIN3A_A549,eGFP-ZNF394_HEK293
KDM5B_H1-hESC,1.0,0.055409,0.054018,0.075847,0.075558,0.058216,0.057362,0.066951,0.070818,0.055966,...,0.066667,0.060727,0.068966,0.059042,0.069805,0.06553,0.056524,0.061024,0.06869,0.06868
RXRA_HepG2,0.055409,1.0,0.090215,0.114827,0.091107,0.079644,0.069519,0.112038,0.080054,0.108033,...,0.087548,0.092299,0.105307,0.100715,0.091107,0.089621,0.068662,0.075289,0.095524,0.092922
GABPA_HepG2,0.054018,0.090215,1.0,0.10742,0.076716,0.101046,0.077877,0.090215,0.109051,0.153403,...,0.096491,0.116695,0.09143,0.116383,0.107726,0.084011,0.075269,0.068109,0.100933,0.08521
STAT5A_K562,0.075847,0.114827,0.10742,1.0,0.115449,0.098325,0.068376,0.11142,0.120839,0.136364,...,0.113896,0.112656,0.112997,0.127396,0.114827,0.092299,0.070664,0.075868,0.112219,0.109293
eGFP-ZNF654_HEK293,0.075558,0.091107,0.076716,0.115449,1.0,0.117663,0.066382,0.087252,0.092123,0.112347,...,0.106807,0.081373,0.090835,0.082837,0.081373,0.09529,0.055966,0.07011,0.099425,0.120168


# Save Attribute Similarity Matrix

In [54]:
# Save the attribute similarity matrix as a gzip-compressed TSV tagged with the
# current year and month (YYYY_MM).
# The file used to be named *.tsv.zip although it is written with gzip, so zip
# tools and extension-based compression inference could not open it; the
# extension now matches the actual format.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/Encode_me_from_sctratch/Output/Transcription_factor/encode_trans_fac_bind_sit_attribute_similarity_matix_%s.tsv.gz' % date_tag
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [55]:
# Build the gene table for the matrix's genes; per the preview below it has a
# GeneSym and a GeneID column. How the IDs are looked up is internal to the helper.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  24656 Out of 24656   

In [56]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,OR6C2,341416
1,ZNF286A,57335
2,FAM133DP,728066
3,NKAP,79576
4,AGXT,189


In [57]:
# The recorded row count equals the number of rows (genes) in binary_matrix.
gene_list.shape

(24656, 2)

# Save Gene List

In [58]:
# Save the gene list (without the row index) as a gzip-compressed TSV tagged
# with the current year and month (YYYY_MM).
# The file used to be named *.tsv.zip although it is written with gzip, so zip
# tools and extension-based compression inference could not open it; the
# extension now matches the actual format.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/Encode_me_from_sctratch/Output/Transcription_factor/encode_trans_fac_bind_sit_gene_list_%s.tsv.gz' % date_tag
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [59]:
# Build the attribute table; per the preview below it is a single 'Attributes'
# column holding the Transcription-Cell labels (the columns of binary_matrix).
attribute_list = uf.createAttributeList(binary_matrix)

In [60]:
# Preview the first rows of the attribute list.
attribute_list.head()

Unnamed: 0,Attributes
0,KDM5B_H1-hESC
1,RXRA_HepG2
2,GABPA_HepG2
3,STAT5A_K562
4,eGFP-ZNF654_HEK293


In [61]:
# The recorded row count equals the number of columns (attributes) in binary_matrix.
attribute_list.shape

(1129, 1)

# Save Attribute List

In [62]:
# Save the attribute list (without the row index) as a gzip-compressed TSV
# tagged with the current year and month (YYYY_MM).
# The file used to be named *.tsv.zip although it is written with gzip, so zip
# tools and extension-based compression inference could not open it; the
# extension now matches the actual format.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/Encode_me_from_sctratch/Output/Transcription_factor/encode_trans_fac_bind_sit_attribute_list_%s.tsv.gz' % date_tag
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [63]:
# Directory passed to uf.createGeneAttributeEdgeList below (same value as the `path` set when loading the data).
path = '/Users/moshesilverstein/Documents/Harmonizome/Encode_me_from_sctratch/Output/Transcription_factor/'

In [64]:
# Name passed to uf.createGeneAttributeEdgeList below; how it is combined with `path` is decided inside that helper.
name = 'encode_trans_fac_bind_sit_gene_attribute_edge_list'

In [65]:
# Create the gene-attribute edge list from the binary matrix and the gene list.
# The association count the helper reports below equals the number of
# de-duplicated interaction rows recorded earlier.
# NOTE(review): nothing is returned to the notebook, so the helper presumably
# writes its result under `path` using `name` -- confirm in untility_functions.
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  1129 Out of 1129   

 The number of statisticaly relevent gene-attribute associations is: 2220608
