# ARCHS4 (Transcription Factors)

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
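Data Source: ARCHS4 human transcription factor co-expression gene sets (`Input/Transcription Factors/ARCHS4_human_tf_Coexp.gmt`)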

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
from collections import Counter
import json
import re
import scipy
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics.pairwise import cosine_similarity
import scipy.stats as stats
%matplotlib inline

In [2]:
# Re-import the local helper module so edits made to untility_functions.py
# since the kernel started are picked up without restarting.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/ARCHS4/untility_functions.py'>

# Load Data (GMT file generated from co-expression data)

In [4]:
# Read the GMT file as a single-column frame: one row per line, no header.
# NOTE(review): sep='$' looks like a way to keep each tab-delimited line unsplit;
# this assumes '$' never occurs in the file -- confirm.
df = pd.read_csv('Input/Transcription Factors/ARCHS4_human_tf_Coexp.gmt', sep='$', header=None)

In [5]:
# Preview the first raw GMT lines; each row holds one unsplit, tab-delimited string in column 0.
df.head()

Unnamed: 0,0
0,NFYA_human_tf_ARCHS4_coexpression\tSTK32C\tPLE...
1,ARX_human_tf_ARCHS4_coexpression\tGCG\tTM4SF4\...
2,HOXA11_human_tf_ARCHS4_coexpression\tISL2\tHOX...
3,SOX8_human_tf_ARCHS4_coexpression\tOLIG1\tOLIG...
4,ZFX_human_tf_ARCHS4_coexpression\tCTD-2336H13....


In [6]:
# Expand every GMT line (one transcription-factor gene set per line) into
# long-format rows with the columns 'Trasncription Factor' and 'Gene'.
#
# The per-line frames are collected in a list and concatenated once at the end;
# calling pd.concat inside the loop re-copies the accumulated frame on every
# iteration, which is quadratic in the number of lines.
interaction_columns = ['Trasncription Factor', 'Gene']
per_tf_frames = []
n_lines = len(df.index)

for i, index in enumerate(df.index):

    progressPercent = ((i+1)/n_lines)*100

    sys.stdout.write("Progeres: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), n_lines))
    sys.stdout.flush()

    # DataFrame.ix has been removed from pandas; `index` is a label taken from
    # df.index and 0 is the column label, so .loc is the direct replacement.
    fields = df.loc[index, 0].split('\t')

    # The set name looks like 'NFYA_human_tf_ARCHS4_coexpression';
    # the transcription factor symbol is the part before the first '_'.
    tf_symbol = fields[0].split('_')[0]

    # Genes start at the third field, exactly as in the previous implementation
    # (which sliced off the name and then popped the next element).
    # NOTE(review): in standard GMT the second field is a description, but the
    # preview of this file shows what looks like a gene symbol there (e.g. STK32C
    # for NFYA), so the first gene of every set may be dropped -- confirm.
    genes = fields[2:]

    per_tf_frames.append(
        pd.DataFrame(
            {'Trasncription Factor': [tf_symbol]*len(genes), 'Gene': genes},
            columns=interaction_columns,
        )
    )

# A single concat keeps each per-line frame's own 0..k-1 index, as before.
# pd.concat raises on an empty list, so fall back to an empty, correctly-shaped frame.
if per_tf_frames:
    df_interactions = pd.concat(per_tf_frames)
else:
    df_interactions = pd.DataFrame(columns=interaction_columns)

Progeres: 100%  1724 Out of 1724   

In [7]:
# Preview the long-format (transcription factor, gene) pairs.
df_interactions.head()

Unnamed: 0,Trasncription Factor,Gene
0,NFYA,PLEKHA3P1
1,NFYA,CTD-2200P10.1
2,NFYA,RP11-19N8.2
3,NFYA,FAM201B
4,NFYA,ZKSCAN7


In [8]:
# Number of (transcription factor, gene) rows before symbol mapping and de-duplication.
df_interactions.shape

(515476, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [9]:
# Make the 'Gene' column the index of the interaction table, which is the form
# passed to uf.mapgenesymbols in the following cell.
df_interactions = df_interactions.set_index('Gene')

In [10]:
# Map the gene symbols in the index to current approved symbols.
# NOTE(review): the return value is not captured, so the helper presumably
# modifies df_interactions in place -- confirm against untility_functions.py.
uf.mapgenesymbols(df_interactions)

Progeres: 100%  515476 Out of 515476   

# Drop Duplicates

In [11]:
# Turn the index back into an ordinary column so that de-duplication
# compares complete rows rather than ignoring the index.
df_interactions = df_interactions.reset_index()

In [12]:
# Keep only the first occurrence of each identical row.
df_interactions = df_interactions.drop_duplicates()

In [13]:
# Row count after de-duplication.
df_interactions.shape

(470861, 2)

# Create Binary Matrix

In [15]:
# Build the binary matrix from the de-duplicated interaction table.
# Per the recorded preview below, rows are genes, columns are transcription
# factors, and the cells are 0/1.
binary_matrix = uf.createBinaryMatix(df_interactions)

Progeres: 100%  21917 Out of 21917   

In [16]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,ZNF19,ARNTL,HEY1,ZNF260,MAFG,POU6F2,NPAS1,WT1,PDS5B,ZNF257,...,FOXL2,BDP1,LARP4,NR2F6,CD36,DACH2,ID3,ZNF266,GATA6,ZNF90
ENDOD1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
TACC1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HNRNPA1P20,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
KITLG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PROC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# (number of rows, number of columns) of the binary matrix.
binary_matrix.shape

(21917, 1724)

# Save Binary Matrix

In [18]:
# Write the binary matrix as a gzip-compressed, tab-separated file whose name
# carries the current year and month (YYYY_MM).
# The suffix is '.tsv.gz' so that it matches the gzip payload; the earlier
# '.tsv.zip' name advertised a zip archive, which misleads unzip tools and
# extension-based compression inference when the file is read back.
year_month = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_transcription_factor_binary_matrix_%s.tsv.gz' % year_month
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [19]:
# Output directory for the gene set library.
# Resolved from the current user's home directory instead of a hard-coded
# '/Users/<name>' prefix, so the notebook is not tied to one account; the result
# is an expanded absolute path because it is handed to helper functions as a
# plain string.
path = os.path.expanduser('~/Documents/Harmonizome/ARCHS4/Output/')

In [20]:
# Name passed to uf.createUpGeneSetLib together with the output path.
name = 'archs4_transcription_factor_gene_set'

In [21]:
# Build the gene set library from the binary matrix.
# NOTE(review): the return value is not captured; the helper presumably writes
# its result under `path` using `name` -- confirm in untility_functions.py.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  1724 Out of 1724   

# Create Attribute Library

In [22]:
# Output directory for the attribute set library.
# Resolved from the current user's home directory instead of a hard-coded
# '/Users/<name>' prefix, so the notebook is not tied to one account; the result
# is an expanded absolute path because it is handed to helper functions as a
# plain string.
path = os.path.expanduser('~/Documents/Harmonizome/ARCHS4/Output/')

In [23]:
# Name passed to uf.createUpAttributeSetLib together with the output path.
name = 'archs4_transcription_factor_attribute_set'

In [24]:
# Build the attribute set library from the binary matrix.
# NOTE(review): the return value is not captured; the helper presumably writes
# its result under `path` using `name` -- confirm in untility_functions.py.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  21917 Out of 21917   

# Create Gene Similarity Matrix

In [25]:
# Pairwise similarity between the rows (genes) of the binary matrix, requesting
# the 'jaccard' metric. The recorded preview below is gene x gene with 1.0 on
# the diagonal.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [26]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,ENDOD1,TACC1,HNRNPA1P20,KITLG,PROC,MAFG,GTF2IP1,CLDN11,MYRF,DUSP13,...,DRD2,IDH3B,GMPPA,RIPK1,LARP4,ALYREF,FAM13A,ZNF266,DERL3,MYBPC3
ENDOD1,1.0,0.046512,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.020408,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TACC1,0.046512,1.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.034483,0.0,0.0,0.0,0.0,0.0,0.023256
HNRNPA1P20,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
KITLG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PROC,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.029412,0.016393,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Gene Similarity Matrix

In [27]:
# Write the gene similarity matrix as a gzip-compressed, tab-separated file
# whose name carries the current year and month (YYYY_MM).
# The suffix is '.tsv.gz' so that it matches the gzip payload; the earlier
# '.tsv.zip' name advertised a zip archive, which misleads unzip tools and
# extension-based compression inference when the file is read back.
year_month = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_transcription_factor_gene_similarity_matix_%s.tsv.gz' % year_month
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [28]:
# Same helper as for the gene similarity, applied to the transposed binary
# matrix so that the attributes (transcription factors) are the rows being compared.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [29]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,ZNF19,ARNTL,HEY1,ZNF260,MAFG,POU6F2,NPAS1,WT1,PDS5B,ZNF257,...,FOXL2,BDP1,LARP4,NR2F6,CD36,DACH2,ID3,ZNF266,GATA6,ZNF90
ZNF19,1.0,0.0,0.0,0.005693,0.0,0.202899,0.0,0.001992,0.018036,0.100239,...,0.001957,0.033597,0.0,0.0,0.0,0.009709,0.0,0.020325,0.0,0.068584
ARNTL,0.0,1.0,0.0,0.0,0.005172,0.0,0.0,0.001818,0.0,0.0,...,0.0,0.010619,0.0,0.0,0.010508,0.0,0.0,0.047619,0.0,0.0
HEY1,0.0,0.0,1.0,0.030035,0.005128,0.001818,0.015873,0.001802,0.007181,0.0,...,0.003552,0.001739,0.006826,0.0,0.005181,0.026882,0.02993,0.003617,0.007067,0.0
ZNF260,0.005693,0.0,0.030035,1.0,0.0,0.04,0.0,0.007313,0.085938,0.022088,...,0.001789,0.025135,0.003431,0.0,0.0,0.098646,0.012216,0.00365,0.001773,0.023121
MAFG,0.0,0.005172,0.005128,0.0,1.0,0.0,0.062731,0.0,0.0,0.0,...,0.029144,0.001739,0.005111,0.100935,0.005181,0.0,0.003431,0.001805,0.001757,0.0


# Save Attribute Similarity Matrix

In [30]:
# Write the attribute similarity matrix as a gzip-compressed, tab-separated
# file whose name carries the current year and month (YYYY_MM).
# The suffix is '.tsv.gz' so that it matches the gzip payload; the earlier
# '.tsv.zip' name advertised a zip archive, which misleads unzip tools and
# extension-based compression inference when the file is read back.
year_month = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_transcription_factor_attribute_similarity_matix_%s.tsv.gz' % year_month
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [35]:
# Build the gene list from the binary matrix. Per the recorded preview below,
# the result has one row per gene with the columns 'GeneSym' and 'GeneID'.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  21917 Out of 21917   

In [36]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,ENDOD1,23052
1,TACC1,6867
2,HNRNPA1P20,344741
3,KITLG,4254
4,PROC,5624


In [37]:
# (number of rows, number of columns) of the gene list.
gene_list.shape

(21917, 2)

# Save Gene List

In [38]:
# Write the gene list (without its row index) as a gzip-compressed,
# tab-separated file whose name carries the current year and month (YYYY_MM).
# The suffix is '.tsv.gz' so that it matches the gzip payload; the earlier
# '.tsv.zip' name advertised a zip archive, which misleads unzip tools and
# extension-based compression inference when the file is read back.
year_month = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_transcription_factor_gene_list_%s.tsv.gz' % year_month
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [39]:
# Build the attribute list from the binary matrix. Per the recorded preview
# below, the result has a single 'Attributes' column listing the transcription factors.
attribute_list = uf.createAttributeList(binary_matrix)

In [40]:
# Preview the first rows of the attribute list.
attribute_list.head()

Unnamed: 0,Attributes
0,ZNF19
1,ARNTL
2,HEY1
3,ZNF260
4,MAFG


In [41]:
# (number of rows, number of columns) of the attribute list.
attribute_list.shape

(1724, 1)

# Save Attribute List

In [42]:
# Write the attribute list (without its row index) as a gzip-compressed,
# tab-separated file whose name carries the current year and month (YYYY_MM).
# The suffix is '.tsv.gz' so that it matches the gzip payload; the earlier
# '.tsv.zip' name advertised a zip archive, which misleads unzip tools and
# extension-based compression inference when the file is read back.
year_month = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_transcription_factor_attribute_list_%s.tsv.gz' % year_month
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [43]:
# Output directory for the gene-attribute edge list.
# Resolved from the current user's home directory instead of a hard-coded
# '/Users/<name>' prefix, so the notebook is not tied to one account; the result
# is an expanded absolute path because it is handed to helper functions as a
# plain string.
path = os.path.expanduser('~/Documents/Harmonizome/ARCHS4/Output/')

In [44]:
# Name passed to uf.createGeneAttributeEdgeList together with the output path.
name = 'archs4_transcription_factor_gene_attribute_edge_list'

In [45]:
# Build the gene-attribute edge list from the binary matrix and the gene list.
# The helper reports the number of gene-attribute associations when it finishes.
# NOTE(review): the return value is not captured; the helper presumably writes
# its result under `path` using `name` -- confirm in untility_functions.py.
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  1724 Out of 1724   

 The number of statisticaly relevent gene-attribute associations is: 470861
