# ARCHS4 (Human Tissue and Celline)

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: 

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import my_functions as mf
from collections import Counter
import json
import re
%matplotlib inline

In [2]:
importlib.reload(mf)

<module 'my_functions' from '/Users/moshesilverstein/Documents/Harmonizome/ARCHS4/my_functions.py'>

# Download the data using r scripts provided by ARCHS4

In [3]:
import h5py
#filename = '/Users/moshesilverstein/Documents/Harmonizome/ARCHS4/Input/Human/Tissue/human_matrix_download.h5'
filename = '/Users/moshesilverstein/Documents/Harmonizome/ARCHS4/Input/Human/Tissue/human_matrix.h5'
f = h5py.File(filename, 'r')

# # List all groups
# print("Keys: %s" % list(f.keys()))
# a_group_key = f.attrs.keys()

In [4]:
#list(f['data'].keys())

In [5]:
#list(f['meta'].keys())

In [6]:
#test = f['meta/Sample_title'][:].tolist()

# Load Data 

In [7]:
matrix = pd.DataFrame(data = f['data/expression'][:,:])

In [8]:
matrix = matrix.T

In [9]:
matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51969,51970,51971,51972,51973,51974,51975,51976,51977,51978
0,124,0,0,0,108,0,0,0,0,0,...,160,11,3,28,31,33,70,27,21,85
1,77,0,0,0,56,0,0,0,0,0,...,7,0,0,1,0,0,4,4,2,0
2,10743,189,1518,1873,26007,835,1077,12724,545,1100,...,4,0,0,0,0,0,4,0,0,0
3,28,6,0,0,20,0,0,0,0,0,...,334,3,3,8,24,17,23,10,25,0
4,119,0,0,0,69,0,0,0,0,0,...,11,0,0,0,0,0,0,0,0,0


In [10]:
matrix.shape

(35238, 51979)

# Load Sample Meta Data

In [11]:
sample_meta = f['meta/tissue'][:].tolist()

In [12]:
lst = []
for sample in sample_meta:
    lst.append(sample.decode("utf-8"))

sample_meta = lst

In [13]:
sample_meta[0:10]

['HeLa ELAVL1/HuR siRNA1 5d',
 'brain',
 'heart',
 'lymph node',
 'HeLa mock knockdown 5d',
 'ovary',
 'kidney',
 'liver',
 'thyroid',
 'breast']

# Get index of Tissue Data

In [14]:
tissues = pd.read_excel('/Users/moshesilverstein/Documents/Harmonizome/ARCHS4/Input/Human/Tissue/tissues_expanded.xlsx')

In [15]:
tissues.head(20)

Unnamed: 0,cell_type,synonyms,not,exact
0,CARDIAC MUSCLE FIBER,CARDIOMYOCYTE|CARDIAC MUSCLE CELL|CARDIAC MYOCYTE,,
1,PERICARDIUM,PERICARDIAL SAC,,
2,VENTRICLE,,,
3,ATRIUM,ATRIAL TISSUE|CARDIAC ATRIUM,,
4,VALVE,,,
5,HEART (BULK TISSUE),HEART,,
6,CHONDROCYTE,CARTILAGE CELL,,
7,PREADIPOCYTE,,,
8,ADIPOSE (BULK TISSUE),ADIPOSE TISSUE|ADIPOSE CONTROL,,
9,BONE MARROW (BULK TISSUE),BONE MARROW,PLASMA|MONONUCLEAR|MESENCHYMAL,


In [16]:
tissues.shape

(112, 4)

# Map Sample Meta Data to Samples

In [17]:
matrix.columns = sample_meta

In [18]:
matrix.head()

Unnamed: 0,HeLa ELAVL1/HuR siRNA1 5d,brain,heart,lymph node,HeLa mock knockdown 5d,ovary,kidney,liver,thyroid,breast,...,undifferentiated hESC (H9 line) feeder free,BC05_Single-cell RNA-seq,BC05_Single-cell RNA-seq.1,BC05_Single-cell RNA-seq.2,BC05_Single-cell RNA-seq.3,BC05_Single-cell RNA-seq.4,BC05_Single-cell RNA-seq.5,BC05_Single-cell RNA-seq.6,BC05_Single-cell RNA-seq.7,BC05_Single-cell RNA-seq.8
0,124,0,0,0,108,0,0,0,0,0,...,160,11,3,28,31,33,70,27,21,85
1,77,0,0,0,56,0,0,0,0,0,...,7,0,0,1,0,0,4,4,2,0
2,10743,189,1518,1873,26007,835,1077,12724,545,1100,...,4,0,0,0,0,0,4,0,0,0
3,28,6,0,0,20,0,0,0,0,0,...,334,3,3,8,24,17,23,10,25,0
4,119,0,0,0,69,0,0,0,0,0,...,11,0,0,0,0,0,0,0,0,0


### Create list of tissue sample locations within matrix

In [19]:
tissue_loc_list = []
tissue_list = []


for i,sample in enumerate(matrix.columns):
    
    progressPercent = ((i+1)/len(matrix.columns))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(matrix.columns)))
    sys.stdout.flush()
    
    
    
    for index in tissues.index:
        
        if type(tissues.ix[index,'exact']) != float and sample.lower() == tissues.ix[index,'exact'].lower():
                tissue_loc_list.append(i)
                tissue_list.append(tissues.ix[index,'cell_type'])
        
        elif re.search(tissues.ix[index,'cell_type'].lower(), sample.lower()):
            if type(tissues.ix[index,'not']) != float:
                excludes = tissues.ix[index,'not'].split('|')
                for bad_term in excludes:
                    if re.search(bad_term.lower(), sample.lower()):
                        break
                    elif bad_term == excludes[-1]:
                        tissue_loc_list.append(i)
                        tissue_list.append(tissues.ix[index,'cell_type'])
            else:
                tissue_loc_list.append(i)
                tissue_list.append(tissues.ix[index,'cell_type'])

        else:
            if type(tissues.ix[index,'synonyms']) != float:
                for synonym in tissues.ix[index,'synonyms'].split('|'):
                    if re.search(synonym.lower(), sample.lower()):
                        if type(tissues.ix[index,'not']) != float:
                            excludes = tissues.ix[index,'not'].split('|')
                            for bad_term in excludes:
                                if re.search(bad_term.lower(), sample.lower()):
                                    break
                                elif bad_term == excludes[-1]:
                                    tissue_loc_list.append(i)
                                    tissue_list.append(tissues.ix[index,'cell_type'])
                        else:
                            tissue_loc_list.append(i)
                            tissue_list.append(tissues.ix[index,'cell_type'])

Progress: 100%  51979 Out of 51979   

In [20]:
len(tissue_loc_list)

19020

In [21]:
len(tissue_list)

19020

In [22]:
len(set(tissue_list))

108

In [23]:
tissue_loc_list[0:10]

[1, 2, 5, 6, 7, 8, 18, 20, 21, 22]

In [24]:
tissue_list[0:10]

['BRAIN (BULK)',
 'HEART (BULK TISSUE)',
 'OVARY (BULK TISSUE)',
 'KIDNEY (BULK TISSUE)',
 'LIVER (BULK TISSUE)',
 'THYROID (BULK TISSUE)',
 'SKELETAL MUSCLE (BULK TISSUE)',
 'ADRENAL GLAND',
 'LUNG (BULK TISSUE)',
 'BLASTOCYST']

In [25]:
for tissue in tissues['cell_type']:
    if tissue not in tissue_list:
        print(tissue)

BCELL PRECURSOR
BLOOD PRECDCS
CD8+ T CELL
PLACENTA VILLOUS TISSUE


In [26]:
matrix_tissue = matrix[tissue_loc_list].copy()

In [27]:
matrix_tissue.columns = tissue_list

In [28]:
matrix_tissue.shape

(35238, 19020)

In [29]:
matrix_tissue = mf.merge(matrix_tissue, 'column', 'mean')

In [30]:
matrix_tissue.head()

Unnamed: 0,ADIPOSE (BULK TISSUE),ADRENAL GLAND,ALPHA CELL,ALVEOLAR CELL TYPE II,ALVEOLAR MACROPHAGE,AMNIOTIC FLUID,ASTROCYTE,ATRIUM,BASAL CELL,BETA CELL,...,SUPERIOR FRONTAL GYRUS,TESTIS (BULK TISSUE),THYMUS (BULK TISSUE),THYROID (BULK TISSUE),TLYMPHOCYTE,TRACHEA (BULK TISSUE),VALVE,VASCULAR SMOOTH MUSCLE,VENTRICLE,WHARTONS JELLY
0,95.740458,199.3,294.846154,542.5,328.5,35.761905,492.454545,408.857143,110.793103,208.157895,...,72.529412,702.5,1241.222222,216.865079,110.78017,621.333333,272.844444,391.75,45.971429,214.0
1,10.885496,68.1,2148.538462,108.0,49.5,187.619048,6.363636,13.0,14.551724,683.421053,...,1.029412,140.5,58.333333,14.738095,4.64647,31.333333,0.222222,2.75,4.238095,10.5
2,32161.129771,17544.95,171.538462,71.0,8844.5,171.095238,1600.363636,14982.714286,150.586207,229.052632,...,1493.294118,7402.0,13944.333333,7449.547619,150.744116,4345.5,8936.711111,1695.375,11068.904762,2.3
3,12.587786,69.05,50.615385,276.5,22.0,9683.666667,17.181818,36.571429,1921.068966,32.368421,...,34.676471,2479.0,2875.111111,36.34127,12.500751,5205.0,11.066667,24.625,10.104762,21.55
4,84.725191,72.0,4.384615,56.5,54.5,3.904762,2.090909,34.142857,1.896552,2.789474,...,11.588235,138.5,203.111111,23.595238,29.155734,136.333333,10.555556,2.625,26.304762,1.3


In [31]:
matrix_tissue.shape

(35238, 108)

# Get Celline Data

In [32]:
human_celline = pd.read_csv('/Users/moshesilverstein/Documents/Harmonizome/ARCHS4/Input/Human/Cell_Lines/human_cell_lines.txt', sep='\t')

In [33]:
human_celline.head()

Unnamed: 0,cell_line,species,tissue,num_hits
0,K562,Human,Bone marrow,729.0
1,HELA,Human,Cervix,588.0
2,HEPG2,Human,Liver,552.0
3,HEK293,Human,Kidney,533.0
4,MCF7,Human,Breast/Mammary,377.0


In [34]:
human_celline.shape

(228, 4)

In [35]:
ccle_celline = pd.read_csv('/Users/moshesilverstein/Documents/Harmonizome/ARCHS4/Input/Human/Cell_Lines/CCLE_cell_lines.txt', sep='\t')

In [36]:
ccle_celline.head()

Unnamed: 0,cell_line,tissue,num_hits
0,K562,haematopoietic_and_lymphoid_tissue,729
1,MCF7,breast,377
2,HN,upper_aerodigestive_tract,225
3,HT29,large_intestine,101
4,MDAMB231,breast,101


In [37]:
ccle_celline.shape

(109, 3)

In [38]:
list_of_cellines = list(set(human_celline['cell_line'].values.tolist()+ccle_celline['cell_line'].values.tolist()))

In [39]:
list_of_cellines[0:10]

['COLO205',
 'REH',
 'TF1 CELL',
 'U698',
 'KYSE510',
 'HUH7',
 'EBC1',
 '293S',
 'SUPT1',
 'HEP3B']

In [40]:
len(list_of_cellines)

296

### Create list of Cell Line sample locations within matrix

In [41]:
celline_loc_list = []
celline_list = []


for i,sample in enumerate(matrix.columns):
    
    progressPercent = ((i+1)/len(matrix.columns))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(matrix.columns)))
    sys.stdout.flush()
    
    for celline in list_of_cellines:
        if re.search(celline.lower(), sample.lower()):
            celline_loc_list.append(i)
            celline_list.append(celline)

Progress: 100%  51979 Out of 51979   

In [42]:
len(celline_loc_list)

5128

In [43]:
len(celline_list)

5128

In [44]:
celline_loc_list[0:10]

[0, 4, 14, 32, 40, 47, 52, 56, 65, 67]

In [45]:
celline_list[0:10]

['HELA',
 'HELA',
 'HELA',
 'HELA',
 'HELA',
 'HCT116',
 'HEK293',
 'LNCAP',
 'HCT116',
 'NTERA2']

In [46]:
len(set(celline_list))

117

In [47]:
matrix_celline = matrix[celline_loc_list].copy()

In [48]:
matrix_celline.columns = celline_list

In [49]:
matrix_celline.shape

(35238, 5128)

In [50]:
matrix_celline = mf.merge(matrix_celline, 'column', 'mean')

In [51]:
matrix_celline.head()

Unnamed: 0,22RV1,293F,A172,A375,A431,A549,A673,ASPC1,BEAS2B,BEWO,...,SUPT1,T24,T47D,T84,T98G,THP1,U87,U937,VCAP,WSUDLCL2
0,320.181818,143.314286,689.333333,174.672131,131.909091,169.885714,312.322581,90.0,193.75,391.5,...,68.8,41.528736,230.205882,91.0,703.5,454.133333,129.5,41.647059,117.907407,662.0
1,749.181818,9.457143,7.666667,8.081967,7.181818,267.014286,7.483871,2022.5,1.75,3.833333,...,15.5,2.183908,2.5,556.0,1.5,5.0,10.5,3.058824,127.537037,1.8
2,37.909091,6.371429,3031.333333,1146.540984,827.818182,7.185714,1231.129032,0.0,0.25,0.833333,...,10.3,59.011494,1.088235,9.5,5108.5,15473.6,20.0,49.588235,18.111111,6.4
3,37.545455,10.228571,101.333333,10.934426,22.0,25.657143,133.064516,58.5,4.5,190.166667,...,14.4,7.08046,12.617647,19.5,4.5,38.0,24.5,6.705882,17.777778,15.4
4,14.909091,8.228571,0.0,5.311475,0.0,0.971429,0.709677,0.0,0.0,0.166667,...,1.5,0.068966,0.058824,0.5,0.5,1.2,14.5,1.117647,0.740741,8.0


In [52]:
matrix_celline.shape

(35238, 117)

# Get Gene Symbols

In [53]:
gene_list = f['meta/genes'][:].tolist() 

In [54]:
lst = []
for gene in gene_list:
    lst.append(gene.decode("utf-8"))

gene_list = lst

In [55]:
gene_list[0:10]

['A1BG',
 'A1CF',
 'A2M',
 'A2ML1',
 'A2MP1',
 'A3GALT2',
 'A4GALT',
 'A4GNT',
 'AAAS',
 'AACS']

In [56]:
len(gene_list)

35238

# Map Gene Symbols to ID's

In [57]:
matrix_tissue.index = gene_list

In [58]:
matrix_tissue.head()

Unnamed: 0,ADIPOSE (BULK TISSUE),ADRENAL GLAND,ALPHA CELL,ALVEOLAR CELL TYPE II,ALVEOLAR MACROPHAGE,AMNIOTIC FLUID,ASTROCYTE,ATRIUM,BASAL CELL,BETA CELL,...,SUPERIOR FRONTAL GYRUS,TESTIS (BULK TISSUE),THYMUS (BULK TISSUE),THYROID (BULK TISSUE),TLYMPHOCYTE,TRACHEA (BULK TISSUE),VALVE,VASCULAR SMOOTH MUSCLE,VENTRICLE,WHARTONS JELLY
A1BG,95.740458,199.3,294.846154,542.5,328.5,35.761905,492.454545,408.857143,110.793103,208.157895,...,72.529412,702.5,1241.222222,216.865079,110.78017,621.333333,272.844444,391.75,45.971429,214.0
A1CF,10.885496,68.1,2148.538462,108.0,49.5,187.619048,6.363636,13.0,14.551724,683.421053,...,1.029412,140.5,58.333333,14.738095,4.64647,31.333333,0.222222,2.75,4.238095,10.5
A2M,32161.129771,17544.95,171.538462,71.0,8844.5,171.095238,1600.363636,14982.714286,150.586207,229.052632,...,1493.294118,7402.0,13944.333333,7449.547619,150.744116,4345.5,8936.711111,1695.375,11068.904762,2.3
A2ML1,12.587786,69.05,50.615385,276.5,22.0,9683.666667,17.181818,36.571429,1921.068966,32.368421,...,34.676471,2479.0,2875.111111,36.34127,12.500751,5205.0,11.066667,24.625,10.104762,21.55
A2MP1,84.725191,72.0,4.384615,56.5,54.5,3.904762,2.090909,34.142857,1.896552,2.789474,...,11.588235,138.5,203.111111,23.595238,29.155734,136.333333,10.555556,2.625,26.304762,1.3


In [59]:
matrix_tissue.shape

(35238, 108)

In [60]:
matrix_celline.index = gene_list

In [61]:
matrix_celline.head()

Unnamed: 0,22RV1,293F,A172,A375,A431,A549,A673,ASPC1,BEAS2B,BEWO,...,SUPT1,T24,T47D,T84,T98G,THP1,U87,U937,VCAP,WSUDLCL2
A1BG,320.181818,143.314286,689.333333,174.672131,131.909091,169.885714,312.322581,90.0,193.75,391.5,...,68.8,41.528736,230.205882,91.0,703.5,454.133333,129.5,41.647059,117.907407,662.0
A1CF,749.181818,9.457143,7.666667,8.081967,7.181818,267.014286,7.483871,2022.5,1.75,3.833333,...,15.5,2.183908,2.5,556.0,1.5,5.0,10.5,3.058824,127.537037,1.8
A2M,37.909091,6.371429,3031.333333,1146.540984,827.818182,7.185714,1231.129032,0.0,0.25,0.833333,...,10.3,59.011494,1.088235,9.5,5108.5,15473.6,20.0,49.588235,18.111111,6.4
A2ML1,37.545455,10.228571,101.333333,10.934426,22.0,25.657143,133.064516,58.5,4.5,190.166667,...,14.4,7.08046,12.617647,19.5,4.5,38.0,24.5,6.705882,17.777778,15.4
A2MP1,14.909091,8.228571,0.0,5.311475,0.0,0.971429,0.709677,0.0,0.0,0.166667,...,1.5,0.068966,0.058824,0.5,0.5,1.2,14.5,1.117647,0.740741,8.0


In [62]:
matrix_celline.shape

(35238, 117)

# Combine Tissue and Celline Data

In [63]:
matrix = pd.concat([matrix_tissue, matrix_celline], axis=1)

In [64]:
matrix.head()

Unnamed: 0,ADIPOSE (BULK TISSUE),ADRENAL GLAND,ALPHA CELL,ALVEOLAR CELL TYPE II,ALVEOLAR MACROPHAGE,AMNIOTIC FLUID,ASTROCYTE,ATRIUM,BASAL CELL,BETA CELL,...,SUPT1,T24,T47D,T84,T98G,THP1,U87,U937,VCAP,WSUDLCL2
A1BG,95.740458,199.3,294.846154,542.5,328.5,35.761905,492.454545,408.857143,110.793103,208.157895,...,68.8,41.528736,230.205882,91.0,703.5,454.133333,129.5,41.647059,117.907407,662.0
A1CF,10.885496,68.1,2148.538462,108.0,49.5,187.619048,6.363636,13.0,14.551724,683.421053,...,15.5,2.183908,2.5,556.0,1.5,5.0,10.5,3.058824,127.537037,1.8
A2M,32161.129771,17544.95,171.538462,71.0,8844.5,171.095238,1600.363636,14982.714286,150.586207,229.052632,...,10.3,59.011494,1.088235,9.5,5108.5,15473.6,20.0,49.588235,18.111111,6.4
A2ML1,12.587786,69.05,50.615385,276.5,22.0,9683.666667,17.181818,36.571429,1921.068966,32.368421,...,14.4,7.08046,12.617647,19.5,4.5,38.0,24.5,6.705882,17.777778,15.4
A2MP1,84.725191,72.0,4.384615,56.5,54.5,3.904762,2.090909,34.142857,1.896552,2.789474,...,1.5,0.068966,0.058824,0.5,0.5,1.2,14.5,1.117647,0.740741,8.0


In [65]:
matrix.shape

(35238, 225)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [66]:
mf.mapgenesymbols(matrix)

Progeres: 100%  35238 Out of 35238   

In [67]:
matrix.shape

(26989, 225)

# Merge Duplicate Genes By Rows

In [68]:
matrix = mf.merge(matrix, 'row', 'mean')

In [69]:
matrix.shape

(26883, 225)

# Save Unfiltered Matrix To File

In [70]:
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_matrix_unfilltered_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
matrix.to_csv(filename, sep='\t', compression='gzip')

# Drop Any Genes That Have Zero Expression Across 95% Of The Samples

In [71]:
matrix.replace(0, np.nan, inplace=True)

In [72]:
matrix.shape

(26883, 225)

In [73]:
matrix.dropna(thresh=(0.05*matrix.shape[1]), axis=0, inplace=True)

In [74]:
matrix.replace(np.nan, 0, inplace=True)

In [75]:
matrix.shape

(26845, 225)

# Normalize Matrix (Quantile Normalize the matrix for the columns)

In [76]:
normalized_matrix = mf.quantileNormalize(matrix)

Step 2/2 progress: 100%  225 Out of 225   

In [77]:
normalized_matrix.head()

Unnamed: 0_level_0,ADIPOSE (BULK TISSUE),ADRENAL GLAND,ALPHA CELL,ALVEOLAR CELL TYPE II,ALVEOLAR MACROPHAGE,AMNIOTIC FLUID,ASTROCYTE,ATRIUM,BASAL CELL,BETA CELL,...,SUPT1,T24,T47D,T84,T98G,THP1,U87,U937,VCAP,WSUDLCL2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,77.639452,114.235621,800.132994,955.286048,238.406309,53.516273,325.612017,237.298777,74.47672,381.848632,...,498.935955,843.761224,336.050291,208.05062,422.11519,403.362228,1120.536579,240.213623,154.713879,549.742871
A1CF,10.632456,31.834606,7119.267513,55.739348,36.396718,603.393438,7.531154,8.45934,13.034768,1918.543828,...,108.887595,29.256328,8.18546,1246.120016,4.584466,7.523892,46.577809,22.040011,166.233771,4.617449
A2M,35382.113857,26385.461129,394.017068,26.184292,3989.424678,541.15633,1190.490916,12383.461054,99.098079,447.32601,...,63.932298,1177.819667,3.892942,20.556971,2874.092503,13897.8092,108.631229,267.874961,25.789261,16.457042
A2ML1,12.08404,32.279124,57.979952,362.038656,16.264354,25645.115726,16.583468,20.282336,1311.834669,21.484428,...,99.21832,114.442516,33.339891,44.6788,12.027306,47.350926,144.080518,48.902087,25.284724,40.618978
A2MP1,68.617861,33.831503,3.011133,17.372489,39.958297,4.949708,3.271094,19.210936,2.123901,2.173348,...,6.078565,2.272334,0.253718,0.88493,1.462491,1.99835,69.118073,8.627351,0.959753,20.945156


# Nomalize Matrix (z-score the rows)

In [78]:
mf.zscore(normalized_matrix, 'row')

Progress: 100%  26845 Out of 26845   

In [79]:
normalized_matrix.head()

Unnamed: 0_level_0,ADIPOSE (BULK TISSUE),ADRENAL GLAND,ALPHA CELL,ALVEOLAR CELL TYPE II,ALVEOLAR MACROPHAGE,AMNIOTIC FLUID,ASTROCYTE,ATRIUM,BASAL CELL,BETA CELL,...,SUPT1,T24,T47D,T84,T98G,THP1,U87,U937,VCAP,WSUDLCL2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.200403,-0.185935,0.085238,0.146578,-0.136843,-0.20994,-0.102366,-0.137281,-0.201653,-0.080133,...,-0.033842,0.102486,-0.098239,-0.148844,-0.064213,-0.071627,0.21191,-0.136129,-0.169931,-0.013755
A1CF,-0.236927,-0.218805,5.838843,-0.198374,-0.214906,0.269708,-0.239577,-0.238784,-0.234873,1.39377,...,-0.152948,-0.221009,-0.239018,0.819048,-0.242096,-0.239584,-0.206204,-0.227177,-0.103934,-0.242068
A2M,1.738118,1.192424,-0.384093,-0.406404,-0.166013,-0.375169,-0.335783,0.343129,-0.401982,-0.38086,...,-0.404115,-0.336552,-0.407756,-0.406746,-0.233664,0.434982,-0.401404,-0.391745,-0.406428,-0.406994
A2ML1,-0.155404,-0.150269,-0.143734,-0.066424,-0.154341,6.362096,-0.15426,-0.153319,0.175073,-0.153013,...,-0.133249,-0.129378,-0.149999,-0.147116,-0.155418,-0.146437,-0.121842,-0.146042,-0.152047,-0.148148
A2MP1,0.267999,-0.010336,-0.256938,-0.142029,0.038686,-0.241427,-0.254858,-0.127319,-0.264037,-0.263641,...,-0.232394,-0.262849,-0.279001,-0.27395,-0.269329,-0.265041,0.272002,-0.212001,-0.273351,-0.113443


# Save Filtered Matrix

In [80]:
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_matrix_filltered_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
normalized_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Tertiary Matrix

In [81]:
tertiary_matrix = mf.createTertiaryMarix(normalized_matrix)

Progeres: 100%  225 Out of 225   

In [82]:
tertiary_matrix.head()

Unnamed: 0_level_0,ADIPOSE (BULK TISSUE),ADRENAL GLAND,ALPHA CELL,ALVEOLAR CELL TYPE II,ALVEOLAR MACROPHAGE,AMNIOTIC FLUID,ASTROCYTE,ATRIUM,BASAL CELL,BETA CELL,...,SUPT1,T24,T47D,T84,T98G,THP1,U87,U937,VCAP,WSUDLCL2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1CF,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2ML1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2MP1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Teriary Matrix

In [83]:
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_tertiary_matrix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
tertiary_matrix.to_csv(filename, sep='\t', compression='gzip')

#### Path to output files

In [84]:
path = '/Users/moshesilverstein/Documents/Harmonizome/ARCHS4/Output/'

# Create Up Gene Set Library

In [85]:
name = 'archs4_gene_up_set'

In [86]:
mf.createUpGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  225 Out of 225   

# Create Down Gene Set Library

In [87]:
name = 'archs4_gene_down_set'

In [88]:
mf.createDownGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  225 Out of 225   

# Create Up Attribute Library

In [89]:
name = 'archs4_attribute_up_set'

In [90]:
mf.createUpAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  26845 Out of 26845   

# Create Down Attribute Library

In [91]:
name = 'archs4_attribute_down_set'

In [92]:
mf.createDownAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  26845 Out of 26845   

# Create Gene Similarity Matrix

In [93]:
gene_similarity_matix = mf.createSimilarityMatrix(normalized_matrix, 'cosine')

In [94]:
gene_similarity_matix.head()

index,A1BG,A1CF,A2M,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11AP1,ZYG11B,ZYX,ZYXP1,ZZEF1,ZZZ3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.0,0.673029,0.267123,-0.028901,0.022421,-0.019108,-0.052057,0.065376,-0.01319,-0.057218,...,-0.065518,0.014156,0.107607,-0.033926,-0.021491,-0.010341,0.003938,0.106816,-0.033943,-0.103793
A1CF,0.673029,1.0,0.355726,-0.018799,-0.014312,-0.047752,-0.115841,0.053403,-0.101031,0.151921,...,-0.092944,0.032962,0.122776,-0.008717,0.038415,-0.047353,-0.079557,0.259536,0.09403,-0.038789
A2M,0.267123,0.355726,1.0,-0.027291,0.027744,-0.059195,0.020771,0.094126,-0.110816,-0.049047,...,-0.134941,-0.06073,-0.029731,-0.181712,-0.049343,0.061969,0.007164,0.009681,0.01985,-0.127373
A2ML1,-0.028901,-0.018799,-0.027291,1.0,-0.010264,-0.033137,0.038865,-0.05569,-0.090654,0.046746,...,-0.021412,0.004219,-0.013889,-0.05063,-0.00222,-0.008675,-0.058092,-0.007483,0.006554,0.029288
A2MP1,0.022421,-0.014312,0.027744,-0.010264,1.0,0.036623,0.198878,-0.057125,-0.004515,0.090587,...,-0.012984,-0.024966,0.077975,0.058259,-0.039648,-0.05551,-0.057325,-0.028254,0.009574,-0.118611


# Save Gene Similarity Matrix 

In [95]:
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_gene_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity matrix

In [96]:
attribute_similarity_matix = mf.createSimilarityMatrix(normalized_matrix.T, 'cosine')

In [97]:
attribute_similarity_matix.head()

Unnamed: 0,ADIPOSE (BULK TISSUE),ADRENAL GLAND,ALPHA CELL,ALVEOLAR CELL TYPE II,ALVEOLAR MACROPHAGE,AMNIOTIC FLUID,ASTROCYTE,ATRIUM,BASAL CELL,BETA CELL,...,SUPT1,T24,T47D,T84,T98G,THP1,U87,U937,VCAP,WSUDLCL2
ADIPOSE (BULK TISSUE),1.0,0.181116,-0.012809,-0.049156,0.016841,0.020332,0.131782,0.270084,-0.001029,0.032353,...,-0.18298,-0.014943,0.004004,-0.098564,0.000532,0.007966,0.003161,-0.080062,-0.033579,-0.124675
ADRENAL GLAND,0.181116,1.0,0.079787,0.005419,0.040103,0.02335,0.070108,0.18067,0.001888,0.137787,...,-0.090161,-0.017692,-0.004019,-0.0102,-0.046753,-0.018846,0.017805,-0.062047,0.034267,-0.116561
ALPHA CELL,-0.012809,0.079787,1.0,0.148023,0.070121,0.144207,-0.050368,0.015756,-0.002505,0.593797,...,0.040008,0.02322,-0.066736,0.045348,-0.141121,-0.074331,0.017564,-0.06533,0.083968,-0.103058
ALVEOLAR CELL TYPE II,-0.049156,0.005419,0.148023,1.0,0.063669,0.103728,-0.090488,-0.017467,0.115423,0.08331,...,0.083155,0.046907,-0.079958,0.085437,-0.122482,-0.060791,0.040186,-0.011677,-0.017983,-0.126353
ALVEOLAR MACROPHAGE,0.016841,0.040103,0.070121,0.063669,1.0,0.027223,-0.074105,-0.029941,0.027833,0.027072,...,0.026119,-0.004203,-0.085371,0.102548,-0.109503,0.134105,-0.024324,0.164849,-0.061785,-0.038395


# Save Attribute Similarity Matrix

In [98]:
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_attribute_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [99]:
gene_list = mf.createGeneList(normalized_matrix)

Progeres: 100%  26845 Out of 26845   

In [100]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,A1BG,1
1,A1CF,29974
2,A2M,2
3,A2ML1,144568
4,A2MP1,3


In [101]:
gene_list.shape

(26845, 2)

# Save Gene List

In [102]:
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_gene_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [103]:
attribute_list = mf.createAttributeList(normalized_matrix)

In [104]:
attribute_list.head()

Unnamed: 0,Attributes
0,ADIPOSE (BULK TISSUE)
1,ADRENAL GLAND
2,ALPHA CELL
3,ALVEOLAR CELL TYPE II
4,ALVEOLAR MACROPHAGE


In [105]:
attribute_list.shape

(225, 1)

# Save Attribute List

In [106]:
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_attribute_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [107]:
path = '/Users/moshesilverstein/Documents/Harmonizome/ARCHS4/Output/'

In [108]:
name = 'archs4_gene_attribute_edge_list'

In [109]:
mf.createGeneAttributeEdgeList(tertiary_matrix, gene_list, path, name)

Progeres: 100%  225 Out of 225   

 The number of statisticaly relevent gene-attribute associations is: 1208025
