# ENCODE (Transcription Factors)

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: https://www.encodeproject.org/matrix/?type=Experiment

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
import re
%matplotlib inline

In [2]:
# Re-import the local helper module so that edits made to untility_functions.py
# are picked up without restarting the kernel.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/Encode_me_from_sctratch/untility_functions.py'>

# Load Data 

In [3]:
# Directory and relative name of the combined ENCODE transcription-factor GMT file.
path = '/Users/moshesilverstein/Documents/Harmonizome/Encode_me_from_sctratch/Output/Transcription_factor/'
file = 'COMBINED/encode_transcrip_2017_08.gmt'

# '~' is used as the separator so that every tab-delimited GMT line ends up,
# unsplit, in the single column 0 (see the preview below).
# NOTE(review): this relies on '~' never occurring in the file -- confirm.
df = pd.read_csv(os.path.join(path, file), sep='~', header=None)

In [4]:
# Preview the raw table: one row per GMT line, held as a single string in column 0.
df.head()

Unnamed: 0,0
0,TAL1_K562_human\tNA\tRNF145\tRFXANK\tLSMEM1\tM...
1,CEBPB_K562_human\tNA\tFAM111A\tSERBP1\tSMARCAD...
2,STAT3_MCF 10A_human\tNA\tDIRC1\tMIR2117\tADAR\...
3,ZNF217_MCF-7_human\tNA\tSAAL1\tTMEM167B\tTMEM4...
4,MYC_K562_human\tNA\tHARBI1\tZBTB6\tDHRS4\tRASS...


In [5]:
# Expand every GMT line into one (transcription factor, gene) row per target gene.
#
# Each line has the tab-separated layout
#     <TF>_<cell line>_<species>  <description>  <gene>  <gene>  ...
# Only the transcription-factor part of the first field (the text before the
# first '_') is kept, and the description field is skipped.
#
# The per-line frames are collected in a list and concatenated once at the end:
# concatenating inside the loop re-copies all accumulated rows on every
# iteration. Row labels are left as produced by each per-line frame (0..n-1,
# repeating), exactly as the incremental concat produced them.
interaction_columns = ['Transcription-Cell', 'Gene']
frames = []
total = len(df.index)

# Iterate over the values of column 0 directly instead of the removed
# DataFrame.ix indexer.
for i, line in enumerate(df[0]):

    progressPercent = ((i+1)/total)*100

    sys.stdout.write("Progeres: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), total))
    sys.stdout.flush()

    fields = line.split('\t')
    transcription_factor = fields[0].split('_')[0]
    # fields[1] is the description column; the genes start at position 2.
    # A line without any genes simply contributes an empty frame.
    genes = fields[2:]

    frames.append(pd.DataFrame(
        {
            'Transcription-Cell': [transcription_factor]*len(genes),
            'Gene': genes,
        },
        columns=interaction_columns,
    ))

# pd.concat raises on an empty list, so fall back to an empty, correctly
# labelled frame when the input had no lines.
df_interactions = pd.concat(frames) if frames else pd.DataFrame(columns=interaction_columns)

Progeres: 100%  1320 Out of 1320   

In [6]:
# Preview the long-format table of transcription factor / target gene pairs.
df_interactions.head()

Unnamed: 0,Transcription-Cell,Gene
0,TAL1,RNF145
1,TAL1,RFXANK
2,TAL1,LSMEM1
3,TAL1,MDN1
4,TAL1,FGFR1OP


In [7]:
# Number of (transcription factor, gene) rows before symbol mapping and de-duplication.
df_interactions.shape

(2576867, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [8]:
# Move the gene symbols into the index ahead of the symbol-mapping step.
df_interactions = df_interactions.set_index('Gene')

In [9]:
# Map the gene symbols (now the index) with the project helper.
# NOTE(review): the return value is discarded, so the helper presumably updates
# df_interactions in place -- confirm against untility_functions.mapgenesymbols.
uf.mapgenesymbols(df_interactions)

Progeres: 100%  2576867 Out of 2576867   

# Drop Duplicates

In [10]:
# Turn the gene index back into an ordinary column so that duplicates can be
# detected across both columns.
df_interactions = df_interactions.reset_index()

In [11]:
# Keep a single row for each distinct combination of column values.
df_interactions = df_interactions.drop_duplicates()

In [12]:
# Number of rows remaining after duplicate rows were dropped.
df_interactions.shape

(1530578, 2)

# Create Binary Matrix

In [13]:
# Build the binary matrix from the de-duplicated interaction pairs with the
# project helper. In the preview below the rows are genes and the columns are
# transcription factors.
binary_matrix = uf.createBinaryMatix(df_interactions)

Progeres: 100%  24656 Out of 24656   

In [14]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,SREBF2,ARID3A,C11orf30,HNRNPL,POLR2AphosphoS2,GATAD2B,DDX20,POU5F1,NR3C1,GABPA,...,E2F6,RUNX3,YY1,FLAG-NFIA,FLAG-ZNF652,KDM1A,NCOR1,PHF8,WHSC1,NELFE
MC1R,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
MIRLET7C,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
PHF24,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
MRPS21,0,1,1,0,0,0,0,1,0,1,...,1,0,1,0,0,0,0,0,0,0
SSH1,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [15]:
# Dimensions of the binary matrix as (rows, columns).
binary_matrix.shape

(24656, 456)

# Save Binary Matrix

In [16]:
# Save the binary matrix as a gzip-compressed, tab-separated file whose name is
# stamped with the current year and month (YYYY_MM).
# The extension is '.tsv.gz' so that it matches the gzip compression actually
# applied; the earlier '.tsv.zip' name labelled a gzip stream as a zip archive.
filename = '~/./Documents/Harmonizome/Encode_me_from_sctratch/Output/Transcription_factor/encode_transcription_factors_binary_matrix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [17]:
# Output directory passed to the gene set library helper below.
path = '/Users/moshesilverstein/Documents/Harmonizome/Encode_me_from_sctratch/Output/Transcription_factor/'

In [18]:
# Name passed to the gene set library helper below.
name = 'encode_transcription_factors_gene_set'

In [19]:
# Create the gene set library from the binary matrix with the project helper.
# NOTE(review): presumably writes its result under `path` using `name` as the
# file stem -- confirm against untility_functions.createUpGeneSetLib.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  456 Out of 456   

# Create Attribute Library

In [20]:
# Output directory passed to the attribute library helper below.
path = '/Users/moshesilverstein/Documents/Harmonizome/Encode_me_from_sctratch/Output/Transcription_factor/'

In [21]:
# Name passed to the attribute library helper below.
name = 'encode_transcription_factors_attribute_set'

In [22]:
# Create the attribute set library from the binary matrix with the project helper.
# NOTE(review): presumably writes its result under `path` using `name` as the
# file stem -- confirm against untility_functions.createUpAttributeSetLib.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  24656 Out of 24656   

# Create Gene Similarity Matrix

In [23]:
# Compute the gene-by-gene similarity matrix from the binary matrix, asking the
# project helper for the 'jaccard' metric.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [24]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,MC1R,MIRLET7C,PHF24,MRPS21,SSH1,FABP9,ALG5,PLXNB2,VNN2,MKL2,...,CDAN1,TUBA1A,TMEM141,CCDC154,USP10,SLC16A6,UBAP2,ALPPL2,EXO5,LINC00412
MC1R,1.0,0.035714,0.173469,0.155,0.081818,0.086957,0.217617,0.135593,0.150794,0.113821,...,0.133929,0.207207,0.131579,0.098901,0.176471,0.142857,0.246835,0.05814,0.162162,0.038462
MIRLET7C,0.035714,1.0,0.041667,0.0375,0.058824,0.029412,0.036585,0.061538,0.052632,0.028571,...,0.068966,0.045455,0.04918,0.060606,0.037736,0.078431,0.03937,0.04,0.041667,0.066667
PHF24,0.173469,0.041667,1.0,0.077778,0.078947,0.105263,0.1,0.054348,0.113402,0.086957,...,0.097561,0.154762,0.045455,0.086207,0.185714,0.169014,0.118881,0.058824,0.092025,0.1
MRPS21,0.155,0.0375,0.077778,1.0,0.11236,0.059172,0.281633,0.157609,0.185185,0.16129,...,0.163842,0.157609,0.188571,0.078313,0.150289,0.124294,0.20524,0.042945,0.306667,0.019108
SSH1,0.081818,0.058824,0.078947,0.11236,1.0,0.063492,0.134831,0.134831,0.098039,0.142857,...,0.13253,0.122222,0.103448,0.080645,0.101266,0.16,0.146853,0.074074,0.08982,0.021277


# Save Gene Similarity Matrix

In [25]:
# Save the gene similarity matrix as a gzip-compressed, tab-separated file whose
# name is stamped with the current year and month (YYYY_MM).
# The extension is '.tsv.gz' so that it matches the gzip compression actually
# applied; the earlier '.tsv.zip' name labelled a gzip stream as a zip archive.
filename = '~/./Documents/Harmonizome/Encode_me_from_sctratch/Output/Transcription_factor/encode_transcription_factors_gene_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [26]:
# Compute the attribute-by-attribute similarity matrix: the binary matrix is
# transposed so that the transcription factors become the rows handed to the
# helper, again with the 'jaccard' metric.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [27]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,SREBF2,ARID3A,C11orf30,HNRNPL,POLR2AphosphoS2,GATAD2B,DDX20,POU5F1,NR3C1,GABPA,...,E2F6,RUNX3,YY1,FLAG-NFIA,FLAG-ZNF652,KDM1A,NCOR1,PHF8,WHSC1,NELFE
SREBF2,1.0,0.175824,0.101908,0.101692,0.17679,0.145521,0.143788,0.129533,0.140881,0.176415,...,0.130414,0.098451,0.175024,0.097805,0.102993,0.168094,0.099764,0.165312,0.128122,0.114588
ARID3A,0.175824,1.0,0.129709,0.115405,0.278798,0.194139,0.193897,0.169771,0.182275,0.292962,...,0.16491,0.126999,0.279798,0.129105,0.125799,0.22975,0.124917,0.205134,0.168773,0.157826
C11orf30,0.101908,0.129709,1.0,0.088732,0.146327,0.122838,0.131366,0.10703,0.124478,0.155548,...,0.118387,0.121076,0.153441,0.119821,0.112347,0.111303,0.110525,0.116802,0.11597,0.101856
HNRNPL,0.101692,0.115405,0.088732,1.0,0.124085,0.12687,0.096143,0.088257,0.108824,0.12383,...,0.121297,0.084011,0.135926,0.092598,0.081081,0.104839,0.077607,0.116802,0.087519,0.094185
POLR2AphosphoS2,0.17679,0.278798,0.146327,0.124085,1.0,0.199958,0.19291,0.191587,0.197801,0.381148,...,0.200575,0.137165,0.33429,0.131447,0.139844,0.243531,0.13292,0.237333,0.184835,0.268009


# Save Attribute Similarity Matrix

In [28]:
# Save the attribute similarity matrix as a gzip-compressed, tab-separated file
# whose name is stamped with the current year and month (YYYY_MM).
# The extension is '.tsv.gz' so that it matches the gzip compression actually
# applied; the earlier '.tsv.zip' name labelled a gzip stream as a zip archive.
filename = '~/./Documents/Harmonizome/Encode_me_from_sctratch/Output/Transcription_factor/encode_transcription_factors_attribute_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [29]:
# Build the gene list from the binary matrix with the project helper; the
# preview below shows one row per gene with GeneSym and GeneID columns.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  24656 Out of 24656   

In [30]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,MC1R,4157
1,MIRLET7C,406885
2,PHF24,23349
3,MRPS21,54460
4,SSH1,54434


In [31]:
# Dimensions of the gene list as (rows, columns).
gene_list.shape

(24656, 2)

# Save Gene List

In [32]:
# Save the gene list (without the row index) as a gzip-compressed, tab-separated
# file whose name is stamped with the current year and month (YYYY_MM).
# The extension is '.tsv.gz' so that it matches the gzip compression actually
# applied; the earlier '.tsv.zip' name labelled a gzip stream as a zip archive.
filename = '~/./Documents/Harmonizome/Encode_me_from_sctratch/Output/Transcription_factor/encode_transcription_factors_gene_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [33]:
# Build the attribute list from the binary matrix with the project helper; the
# preview below shows a single 'Attributes' column of transcription factor names.
attribute_list = uf.createAttributeList(binary_matrix)

In [34]:
# Preview the first rows of the attribute list.
attribute_list.head()

Unnamed: 0,Attributes
0,SREBF2
1,ARID3A
2,C11orf30
3,HNRNPL
4,POLR2AphosphoS2


In [35]:
# Dimensions of the attribute list as (rows, columns).
attribute_list.shape

(456, 1)

# Save Attribute List

In [36]:
# Save the attribute list (without the row index) as a gzip-compressed,
# tab-separated file whose name is stamped with the current year and month (YYYY_MM).
# The extension is '.tsv.gz' so that it matches the gzip compression actually
# applied; the earlier '.tsv.zip' name labelled a gzip stream as a zip archive.
filename = '~/./Documents/Harmonizome/Encode_me_from_sctratch/Output/Transcription_factor/encode_transcription_factors_attribute_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [37]:
# Output directory passed to the edge list helper below.
path = '/Users/moshesilverstein/Documents/Harmonizome/Encode_me_from_sctratch/Output/Transcription_factor/'

In [38]:
# Name passed to the edge list helper below.
name = 'encode_transcription_factors_gene_attribute_edge_list'

In [39]:
# Create the gene-attribute edge list from the binary matrix and the gene list
# with the project helper; it reports the number of associations when it finishes.
# NOTE(review): presumably writes its result under `path` using `name` as the
# file stem -- confirm against untility_functions.createGeneAttributeEdgeList.
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  456 Out of 456   

 The number of statisticaly relevent gene-attribute associations is: 1530489
