# Roadmap Epigenomics (Cell and Tissue Expression)

Author: Moshe Silverstein <br/>
Date: 11-17 <br/>
Data Source: http://www.roadmapepigenomics.org/

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/Roadmap Epigenomics/Cell and Tissue Gene Expression Profiles/untility_functions.py'>

# Load Data

In [3]:
# Read the tab-separated RPKM matrix. index_col=False tells pandas not to use
# the first column as the index, so gene_id stays an ordinary column here and
# is promoted to the index in the following cell.
# NOTE(review): presumably needed because data rows end with a trailing tab
# that would otherwise shift the columns — confirm against the raw file.
df = pd.read_csv('Input/57epigenomes.RPKM.pc.gz', sep='\t', index_col=False)

In [4]:
df.set_index('gene_id', inplace=True)

In [5]:
df.head()

Unnamed: 0_level_0,E000,E003,E004,E005,E006,E007,E011,E012,E013,E016,...,E114,E116,E117,E118,E119,E120,E122,E123,E127,E128
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003,23.265,43.985,37.413,29.459,21.864,55.649,52.94,71.629,61.292,44.28,...,37.989,0.038,42.639,49.983,11.554,11.847,43.723,0.267,13.758,15.818
ENSG00000000005,0.872,1.642,6.498,0.0,0.157,0.003,0.115,0.087,0.055,1.577,...,0.0,0.0,0.0,0.0,0.0,0.018,0.0,0.006,0.0,0.0
ENSG00000000419,55.208,35.259,58.308,48.208,37.477,45.923,44.959,40.438,41.97,51.515,...,52.215,79.197,107.098,62.811,42.386,54.869,16.652,73.719,56.578,56.371
ENSG00000000457,3.237,2.596,2.345,8.775,2.723,3.7,3.912,5.011,4.158,3.292,...,4.829,11.082,8.814,2.646,2.483,2.527,2.549,7.651,4.967,3.714
ENSG00000000460,7.299,6.649,7.838,7.324,0.83,5.354,5.94,5.704,6.213,7.551,...,8.001,13.743,25.369,3.373,4.646,2.179,4.099,22.103,3.29,2.491


In [6]:
df.shape

(19795, 57)

# Load Sample Meta Data

In [7]:
sample_meta = pd.read_csv('Input/EG.name.txt', sep='\t', header=None, index_col=0)

In [8]:
sample_meta.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
E000,Universal_Human_Reference
E003,H1_Cell_Line
E004,H1_BMP4_Derived_Mesendoderm_Cultured_Cells
E005,H1_BMP4_Derived_Trophoblast_Cultured_Cells
E006,H1_Derived_Mesenchymal_Stem_Cells


In [9]:
sample_meta.shape

(58, 1)

# Load Gene Meta Data

In [10]:
# Load the gene annotation table, which has no header row. Only two columns
# are kept: column 0 becomes the index and column 6 is the single data column.
# In the preview below column 0 holds Ensembl gene ids and column 6 holds
# gene symbols.
gene_meta = pd.read_csv('Input/Ensembl_v65.Gencode_v10.ENSG.gene_info.txt', 
                        sep='\t', 
                        header=None, 
                        usecols=[0,6], 
                        index_col=0)

In [11]:
gene_meta.head()

Unnamed: 0_level_0,6
0,Unnamed: 1_level_1
ENSG00000000003,TSPAN6
ENSG00000000005,TNMD
ENSG00000000419,DPM1
ENSG00000000457,SCYL3
ENSG00000000460,C1orf112


In [12]:
gene_meta.shape

(52475, 1)

In [13]:
gene_meta.dropna(inplace=True)

In [14]:
gene_meta.shape

(26266, 1)

# Map Sample Ids to Sample Names

In [15]:
# Relabel every column: look up each epigenome id in sample_meta (column 1)
# and use the descriptive sample name found there as the new column label.
df.columns = [sample_meta.loc[sample_id, 1] for sample_id in df.columns]

In [16]:
df.head()

Unnamed: 0_level_0,Universal_Human_Reference,H1_Cell_Line,H1_BMP4_Derived_Mesendoderm_Cultured_Cells,H1_BMP4_Derived_Trophoblast_Cultured_Cells,H1_Derived_Mesenchymal_Stem_Cells,H1_Derived_Neuronal_Progenitor_Cultured_Cells,hESC_Derived_CD184+_Endoderm_Cultured_Cells,hESC_Derived_CD56+_Ectoderm_Cultured_Cells,hESC_Derived_CD56+_Mesoderm_Cultured_Cells,HUES64_Cell_Line,...,A549,GM12878,HELA,HEPG2,HMEC,HSMM,HUVEC,K562,NHEK,NHLF
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003,23.265,43.985,37.413,29.459,21.864,55.649,52.94,71.629,61.292,44.28,...,37.989,0.038,42.639,49.983,11.554,11.847,43.723,0.267,13.758,15.818
ENSG00000000005,0.872,1.642,6.498,0.0,0.157,0.003,0.115,0.087,0.055,1.577,...,0.0,0.0,0.0,0.0,0.0,0.018,0.0,0.006,0.0,0.0
ENSG00000000419,55.208,35.259,58.308,48.208,37.477,45.923,44.959,40.438,41.97,51.515,...,52.215,79.197,107.098,62.811,42.386,54.869,16.652,73.719,56.578,56.371
ENSG00000000457,3.237,2.596,2.345,8.775,2.723,3.7,3.912,5.011,4.158,3.292,...,4.829,11.082,8.814,2.646,2.483,2.527,2.549,7.651,4.967,3.714
ENSG00000000460,7.299,6.649,7.838,7.324,0.83,5.354,5.94,5.704,6.213,7.551,...,8.001,13.743,25.369,3.373,4.646,2.179,4.099,22.103,3.29,2.491


# Map Ensembl Gene Ids to Gene Symbols

In [17]:
# Replace the Ensembl gene ids in the row index with gene symbols taken from
# column 6 of gene_meta; genes without a usable symbol are removed.
symbol_by_id = gene_meta[6]
# An id listed more than once in gene_meta has no single symbol (a scalar
# .loc lookup would return a Series), so such ids count as unmapped.
symbol_by_id = symbol_by_id[~symbol_by_id.index.duplicated(keep=False)]

# Look up every row label at once instead of dropping rows one by one inside a
# loop over df.index, which was quadratic and mutated df while iterating.
mapped = df.index.to_series().map(symbol_by_id)
has_symbol = mapped.map(lambda value: isinstance(value, str)).astype(bool).values

# One in-place drop of all unmapped genes, then install the symbols as labels.
df.drop(df.index[~has_symbol], axis=0, inplace=True)
df.index = mapped[has_symbol].tolist()

In [18]:
df.head()

Unnamed: 0,Universal_Human_Reference,H1_Cell_Line,H1_BMP4_Derived_Mesendoderm_Cultured_Cells,H1_BMP4_Derived_Trophoblast_Cultured_Cells,H1_Derived_Mesenchymal_Stem_Cells,H1_Derived_Neuronal_Progenitor_Cultured_Cells,hESC_Derived_CD184+_Endoderm_Cultured_Cells,hESC_Derived_CD56+_Ectoderm_Cultured_Cells,hESC_Derived_CD56+_Mesoderm_Cultured_Cells,HUES64_Cell_Line,...,A549,GM12878,HELA,HEPG2,HMEC,HSMM,HUVEC,K562,NHEK,NHLF
TSPAN6,23.265,43.985,37.413,29.459,21.864,55.649,52.94,71.629,61.292,44.28,...,37.989,0.038,42.639,49.983,11.554,11.847,43.723,0.267,13.758,15.818
TNMD,0.872,1.642,6.498,0.0,0.157,0.003,0.115,0.087,0.055,1.577,...,0.0,0.0,0.0,0.0,0.0,0.018,0.0,0.006,0.0,0.0
DPM1,55.208,35.259,58.308,48.208,37.477,45.923,44.959,40.438,41.97,51.515,...,52.215,79.197,107.098,62.811,42.386,54.869,16.652,73.719,56.578,56.371
SCYL3,3.237,2.596,2.345,8.775,2.723,3.7,3.912,5.011,4.158,3.292,...,4.829,11.082,8.814,2.646,2.483,2.527,2.549,7.651,4.967,3.714
C1orf112,7.299,6.649,7.838,7.324,0.83,5.354,5.94,5.704,6.213,7.551,...,8.001,13.743,25.369,3.373,4.646,2.179,4.099,22.103,3.29,2.491


In [19]:
df.shape

(18792, 57)

# Save Unfiltered Matrix To File

In [20]:
# Save the symbol-indexed, not yet filtered matrix as a gzip-compressed TSV,
# stamped with the current year and month (YYYY_MM).
# The suffix is .gz so that it matches compression='gzip'; the previous .zip
# suffix mislabelled a gzip stream. The path is relative to the notebook
# directory, like the Input/ reads above, instead of a user home directory.
filename = 'Output/roadmap_cell_tissue_exprs_matrix_unfilltered_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
df.to_csv(filename, sep='\t', compression='gzip')

# Drop Any Genes That Have Zero Expression In More Than 95% Of The Samples

In [21]:
df.replace(0, np.nan, inplace=True)

In [22]:
# Zeros were turned into NaN in the previous cell, so thresh counts non-zero
# values: a gene (row) is kept only if at least 5% of the columns are non-NaN.
# With the 57 samples loaded here the threshold is 2.85, i.e. a gene needs
# three or more non-zero samples to survive.
df.dropna(thresh=(0.05*df.shape[1]), axis=0, inplace=True)

In [23]:
df.replace(np.nan, 0, inplace=True)

In [24]:
df.shape

(18611, 57)

In [25]:
df.head()

Unnamed: 0,Universal_Human_Reference,H1_Cell_Line,H1_BMP4_Derived_Mesendoderm_Cultured_Cells,H1_BMP4_Derived_Trophoblast_Cultured_Cells,H1_Derived_Mesenchymal_Stem_Cells,H1_Derived_Neuronal_Progenitor_Cultured_Cells,hESC_Derived_CD184+_Endoderm_Cultured_Cells,hESC_Derived_CD56+_Ectoderm_Cultured_Cells,hESC_Derived_CD56+_Mesoderm_Cultured_Cells,HUES64_Cell_Line,...,A549,GM12878,HELA,HEPG2,HMEC,HSMM,HUVEC,K562,NHEK,NHLF
TSPAN6,23.265,43.985,37.413,29.459,21.864,55.649,52.94,71.629,61.292,44.28,...,37.989,0.038,42.639,49.983,11.554,11.847,43.723,0.267,13.758,15.818
TNMD,0.872,1.642,6.498,0.0,0.157,0.003,0.115,0.087,0.055,1.577,...,0.0,0.0,0.0,0.0,0.0,0.018,0.0,0.006,0.0,0.0
DPM1,55.208,35.259,58.308,48.208,37.477,45.923,44.959,40.438,41.97,51.515,...,52.215,79.197,107.098,62.811,42.386,54.869,16.652,73.719,56.578,56.371
SCYL3,3.237,2.596,2.345,8.775,2.723,3.7,3.912,5.011,4.158,3.292,...,4.829,11.082,8.814,2.646,2.483,2.527,2.549,7.651,4.967,3.714
C1orf112,7.299,6.649,7.838,7.324,0.83,5.354,5.94,5.704,6.213,7.551,...,8.001,13.743,25.369,3.373,4.646,2.179,4.099,22.103,3.29,2.491


# Normalize Matrix (Quantile Normalize the matrix for the columns)

In [26]:
normalized_matrix = uf.quantileNormalize(df)

Step 2/2 progress: 100%  57 Out of 57   

# Normalize Matrix (z-score the rows)

In [27]:
genes = normalized_matrix.index.tolist()

In [28]:
normalized_matrix.reset_index(inplace=True)

In [29]:
normalized_matrix.drop('index', axis=1, inplace=True)

In [30]:
# The return value is discarded and normalized_matrix shows z-scored values
# when displayed afterwards, so uf.zscore presumably standardises each row in
# place — confirm in untility_functions.
uf.zscore(normalized_matrix, 'row')

Progress: 100%  18611 Out of 18611   

In [31]:
normalized_matrix.index = genes

In [32]:
normalized_matrix.head()

Unnamed: 0,Universal_Human_Reference,H1_Cell_Line,H1_BMP4_Derived_Mesendoderm_Cultured_Cells,H1_BMP4_Derived_Trophoblast_Cultured_Cells,H1_Derived_Mesenchymal_Stem_Cells,H1_Derived_Neuronal_Progenitor_Cultured_Cells,hESC_Derived_CD184+_Endoderm_Cultured_Cells,hESC_Derived_CD56+_Ectoderm_Cultured_Cells,hESC_Derived_CD56+_Mesoderm_Cultured_Cells,HUES64_Cell_Line,...,A549,GM12878,HELA,HEPG2,HMEC,HSMM,HUVEC,K562,NHEK,NHLF
TSPAN6,-0.079233,0.892314,0.492388,0.258057,-0.032271,1.398961,1.64851,2.897362,1.86081,0.916236,...,0.654322,-1.037386,0.765512,1.048494,-0.504552,-0.463583,0.762505,-0.989664,-0.414287,-0.258611
TNMD,-0.038999,0.654039,4.12981,-0.269031,0.1142,-0.262394,-0.194249,-0.196706,-0.228757,0.612396,...,-0.269031,-0.269031,-0.269031,-0.269031,-0.269031,-0.194872,-0.269031,-0.185143,-0.269031,-0.269031
DPM1,0.888519,-0.355449,0.644345,0.342505,-0.248997,0.210393,0.398038,0.145749,0.156504,0.56815,...,0.715471,2.133007,3.642808,1.009386,-0.018603,0.893172,-1.356975,1.37555,0.804159,1.046974
SCYL3,-1.055861,-0.88157,-0.737592,1.578008,-0.497616,-0.579914,-0.698269,-0.291101,-0.482602,-0.704939,...,0.108874,2.121345,1.55956,-0.53086,-0.591767,-0.528514,-0.479193,1.134597,0.271785,-0.019679
C1orf112,0.291057,0.520531,0.883388,0.810647,-0.627582,0.103519,0.117435,0.10418,0.294996,0.623741,...,0.941973,2.075246,4.584405,-0.021695,0.296176,-0.262316,0.229118,3.68731,-0.054294,-0.160254


# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [33]:
uf.mapgenesymbols(normalized_matrix)

Progeres: 97%  18174 Out of 18611   

In [34]:
normalized_matrix.shape

# Merge Duplicate Genes By Rows

In [35]:
normalized_matrix = uf.merge(normalized_matrix, 'row', 'mean')

In [36]:
normalized_matrix.shape

# Save Filtered Matrix

In [37]:
# Save the normalized, symbol-merged matrix as a gzip-compressed TSV, stamped
# with the current year and month (YYYY_MM).
# The suffix is .gz so that it matches compression='gzip'; the previous .zip
# suffix mislabelled a gzip stream. The path is relative to the notebook
# directory, like the Input/ reads above, instead of a user home directory.
filename = 'Output/roadmap_cell_tissue_exprs_matrix_filltered_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
normalized_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Tertiary Matrix

In [38]:
tertiary_matrix = uf.createTertiaryMarix(normalized_matrix)

Progeres: 100%  57 Out of 57   

In [39]:
tertiary_matrix.head()

Unnamed: 0_level_0,Universal_Human_Reference,H1_Cell_Line,H1_BMP4_Derived_Mesendoderm_Cultured_Cells,H1_BMP4_Derived_Trophoblast_Cultured_Cells,H1_Derived_Mesenchymal_Stem_Cells,H1_Derived_Neuronal_Progenitor_Cultured_Cells,hESC_Derived_CD184+_Endoderm_Cultured_Cells,hESC_Derived_CD56+_Ectoderm_Cultured_Cells,hESC_Derived_CD56+_Mesoderm_Cultured_Cells,HUES64_Cell_Line,...,A549,GM12878,HELA,HEPG2,HMEC,HSMM,HUVEC,K562,NHEK,NHLF
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
A2ML1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
A4GALT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Tertiary Matrix

In [40]:
# Save the tertiary matrix as a gzip-compressed TSV, stamped with the current
# year and month (YYYY_MM).
# The suffix is .gz so that it matches compression='gzip'; the previous .zip
# suffix mislabelled a gzip stream. The path is relative to the notebook
# directory, like the Input/ reads above, instead of a user home directory.
filename = 'Output/roadmap_cell_tissue_exprs_tertiary_matrix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
tertiary_matrix.to_csv(filename, sep='\t', compression='gzip')

#### Path to output files

In [41]:
# Absolute path of the Output directory next to this notebook, derived from
# the working directory instead of one user's home directory so the notebook
# runs on any machine. The empty last component keeps the trailing separator
# that the hard-coded path had.
# NOTE(review): the uf.create* helpers presumably build file names as
# path + name — confirm in untility_functions.
path = os.path.join(os.getcwd(), 'Output', '')

# Create Up Gene Set Library

In [42]:
name = 'roadmap_cell_tissue_exprs_gene_up_set'

In [43]:
uf.createUpGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  57 Out of 57   

# Create Down Gene Set Library

In [45]:
name = 'roadmap_cell_tissue_exprs_gene_down_set'

In [46]:
uf.createDownGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  57 Out of 57   

# Create Up Attribute Library

In [47]:
name = 'roadmap_cell_tissue_exprs_attribute_up_set'

In [48]:
uf.createUpAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  18375 Out of 18375   

# Create Down Attribute Library

In [49]:
name = 'roadmap_cell_tissue_exprs_attribute_down_set'

In [50]:
uf.createDownAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  18375 Out of 18375   

# Create Gene Similarity Matrix

In [51]:
gene_similarity_matix = uf.createSimilarityMatrix(normalized_matrix, 'cosine')

In [52]:
gene_similarity_matix.head()

index,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAC,AADACL2,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.0,0.691241,0.125994,-0.048675,-0.068206,0.047948,-0.111441,-0.204221,0.875715,-0.044119,...,-0.112887,-0.128883,-0.086341,-0.034003,-0.02549,-0.037946,0.009488,-0.074179,-0.129952,-0.088853
A1CF,0.691241,1.0,0.540097,-0.086075,-0.115326,0.129561,-0.172694,-0.181805,0.789591,-0.048863,...,-0.195371,-0.16076,-0.152881,-0.064731,0.006462,-0.108506,-0.098075,-0.203065,-0.008821,-0.043356
A2M,0.125994,0.540097,1.0,-0.003579,-0.000341,-0.057068,0.016745,0.06656,0.035922,0.011503,...,-0.124086,-0.089054,-0.098924,-0.016534,0.002837,-0.084878,-0.062511,-0.123158,-0.118979,-0.115795
A2ML1,-0.048675,-0.086075,-0.003579,1.0,0.116325,-0.004005,-0.090414,0.099451,-0.06435,0.298194,...,-0.083903,-0.136064,-0.079984,-0.046131,-0.061148,-0.049803,-0.085121,0.006165,-0.076976,-0.066183
A4GALT,-0.068206,-0.115326,-0.000341,0.116325,1.0,-0.045116,-0.129729,-0.171552,-0.07915,0.000999,...,-0.222939,-0.2522,-0.225277,-0.150381,-0.201541,-0.130433,0.178767,-0.078619,-0.083887,-0.388136


# Save Gene Similarity Matrix

In [53]:
# Save the gene-by-gene similarity matrix as a gzip-compressed TSV, stamped
# with the current year and month (YYYY_MM).
# The suffix is .gz so that it matches compression='gzip'; the previous .zip
# suffix mislabelled a gzip stream.
filename = 'Output/roadmap_cell_tissue_exprs_gene_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [54]:
attribute_similarity_matix = uf.createSimilarityMatrix(normalized_matrix.T, 'cosine')

In [55]:
attribute_similarity_matix.head()

Unnamed: 0,Universal_Human_Reference,H1_Cell_Line,H1_BMP4_Derived_Mesendoderm_Cultured_Cells,H1_BMP4_Derived_Trophoblast_Cultured_Cells,H1_Derived_Mesenchymal_Stem_Cells,H1_Derived_Neuronal_Progenitor_Cultured_Cells,hESC_Derived_CD184+_Endoderm_Cultured_Cells,hESC_Derived_CD56+_Ectoderm_Cultured_Cells,hESC_Derived_CD56+_Mesoderm_Cultured_Cells,HUES64_Cell_Line,...,A549,GM12878,HELA,HEPG2,HMEC,HSMM,HUVEC,K562,NHEK,NHLF
Universal_Human_Reference,1.0,0.176817,0.157081,0.058331,-0.004466,0.08934,0.170116,0.089462,0.160063,0.211038,...,0.032699,-0.070509,0.048443,0.052032,0.023326,-0.045787,-0.046472,0.106502,-0.104257,-0.064457
H1_Cell_Line,0.176817,1.0,0.652925,0.160686,0.039984,0.372756,0.269771,0.313231,0.181871,0.565842,...,0.035841,-0.00655,0.022964,-0.034466,-0.028115,-0.070525,-0.107187,0.050012,-0.057837,-0.077274
H1_BMP4_Derived_Mesendoderm_Cultured_Cells,0.157081,0.652925,1.0,0.159328,-0.001405,0.355112,0.298468,0.319936,0.16244,0.62133,...,0.10935,0.074946,0.118134,-0.078856,-0.007596,-0.012206,-0.159887,0.110149,-0.010004,-0.031689
H1_BMP4_Derived_Trophoblast_Cultured_Cells,0.058331,0.160686,0.159328,1.0,0.179558,0.112058,0.137281,0.077146,0.275479,0.091925,...,-0.00673,-0.066133,-0.04838,-0.034563,0.004861,0.047519,-0.047224,-0.070104,0.028346,0.040142
H1_Derived_Mesenchymal_Stem_Cells,-0.004466,0.039984,-0.001405,0.179558,1.0,0.027257,0.009819,-0.062578,0.279893,-0.071104,...,0.041281,-0.088884,-0.063631,-0.060294,0.005184,0.211069,0.05647,-0.120232,0.051175,0.389305


# Save Attribute Similarity Matrix

In [56]:
# Save the attribute-by-attribute similarity matrix as a gzip-compressed TSV,
# stamped with the current year and month (YYYY_MM).
# The suffix is .gz so that it matches compression='gzip'; the previous .zip
# suffix mislabelled a gzip stream.
filename = 'Output/roadmap_cell_tissue_exprs_attribute_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [57]:
gene_list = uf.createGeneList(normalized_matrix)

Progeres: 100%  18375 Out of 18375   

In [58]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,A1BG,1
1,A1CF,29974
2,A2M,2
3,A2ML1,144568
4,A4GALT,53947


In [59]:
gene_list.shape

(18375, 2)

# Save Gene List

In [60]:
# Save the gene list (without the positional index) as a gzip-compressed TSV,
# stamped with the current year and month (YYYY_MM).
# The suffix is .gz so that it matches compression='gzip'; the previous .zip
# suffix mislabelled a gzip stream.
filename = 'Output/roadmap_cell_tissue_exprs_gene_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [61]:
attribute_list = uf.createAttributeList(normalized_matrix)

In [62]:
attribute_list.head()

Unnamed: 0,Attributes
0,Universal_Human_Reference
1,H1_Cell_Line
2,H1_BMP4_Derived_Mesendoderm_Cultured_Cells
3,H1_BMP4_Derived_Trophoblast_Cultured_Cells
4,H1_Derived_Mesenchymal_Stem_Cells


In [63]:
attribute_list.shape

(57, 1)

# Save Attribute List

In [64]:
# Save the attribute list (without the positional index) as a gzip-compressed
# TSV, stamped with the current year and month (YYYY_MM).
# The suffix is .gz so that it matches compression='gzip'; the previous .zip
# suffix mislabelled a gzip stream.
filename = 'Output/roadmap_cell_tissue_exprs_attribute_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [65]:
# Absolute path of the Output directory next to this notebook, derived from
# the working directory instead of one user's home directory so the notebook
# runs on any machine. The empty last component keeps the trailing separator
# that the hard-coded path had.
# NOTE(review): uf.createGeneAttributeEdgeList presumably builds its file
# name as path + name — confirm in untility_functions.
path = os.path.join(os.getcwd(), 'Output', '')

In [66]:
name = 'roadmap_cell_tissue_exprs_gene_attribute_edge_list'

In [67]:
uf.createGeneAttributeEdgeList(tertiary_matrix, gene_list, path, name)

Progeres: 100%  57 Out of 57   

 The number of statisticaly relevent gene-attribute associations is: 209475
