# ARCHS4 (Human Cell Lines)

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: 

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
from collections import Counter
import json
import re
import scipy
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics.pairwise import cosine_similarity
import scipy.stats as stats
%matplotlib inline

In [2]:
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/ARCHS4/untility_functions.py'>

# Download the data using r scripts provided by ARCHS4

In [3]:
import h5py

filename = '/Users/moshesilverstein/Documents/Harmonizome/ARCHS4/Input/Human/Tissue/human_matrix.h5'
f = h5py.File(filename, 'r')

# Load Data 

In [4]:
matrix = pd.DataFrame(data = f['data/expression'][:,:])

In [5]:
matrix = matrix.T

In [6]:
matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65419,65420,65421,65422,65423,65424,65425,65426,65427,65428
0,124,0,0,0,108,0,0,0,0,0,...,82,81,86,133,597,808,458,435,766,525
1,77,0,0,0,56,0,0,0,0,0,...,7,6,5,4,3,11,6,7,14,8
2,10743,189,1518,1873,26007,835,1077,12724,545,1100,...,234,8514,320,13332,41,42,45,30,73,25
3,28,6,0,0,20,0,0,0,0,0,...,13,12,24,15,10,10,5,7,15,9
4,119,0,0,0,69,0,0,0,0,0,...,13,15,23,14,2,1,1,0,0,1


In [7]:
matrix.shape

(35238, 65429)

# Load Sample Meta Data

In [8]:
sample_meta = f['meta/Sample_source_name_ch1'][:].tolist()

In [9]:
lst = []
for sample in sample_meta:
    lst.append(sample.decode("utf-8"))

sample_meta = lst

In [10]:
sample_meta[0:10]

['HeLa ELAVL1/HuR siRNA1 5d',
 'brain',
 'heart',
 'lymph node',
 'HeLa mock knockdown 5d',
 'ovary',
 'kidney',
 'liver',
 'thyroid',
 'breast']

# Get Cell Line Data

In [11]:
human_celline = pd.read_csv('/Users/moshesilverstein/Documents/Harmonizome/ARCHS4/Input/Human/Cell_Lines/human_cell_lines.txt', sep='\t')

In [12]:
human_celline.head()

Unnamed: 0,cell_line,species,tissue,num_hits
0,K562,Human,Bone marrow,729.0
1,HELA,Human,Cervix,588.0
2,HEPG2,Human,Liver,552.0
3,HEK293,Human,Kidney,533.0
4,MCF7,Human,Breast/Mammary,377.0


In [13]:
human_celline.shape

(228, 4)

In [14]:
ccle_celline = pd.read_csv('/Users/moshesilverstein/Documents/Harmonizome/ARCHS4/Input/Human/Cell_Lines/CCLE_cell_lines.txt', sep='\t')

In [15]:
ccle_celline.head()

Unnamed: 0,cell_line,tissue,num_hits
0,K562,haematopoietic_and_lymphoid_tissue,729
1,MCF7,breast,377
2,HN,upper_aerodigestive_tract,225
3,HT29,large_intestine,101
4,MDAMB231,breast,101


In [16]:
ccle_celline.shape

(109, 3)

In [17]:
list_of_cellines = list(set(human_celline['cell_line'].values.tolist()+ccle_celline['cell_line'].values.tolist()))

In [18]:
list_of_cellines[0:10]

['NHFF',
 'HT4',
 'VCAP',
 '9HTE',
 'HEK293',
 'RPMI8226',
 'KC18240 CELLS',
 'MONOMAC6 ',
 'NCIH1975',
 'RKO']

In [19]:
len(list_of_cellines)

296

# Create list of Cell Line sample locations within matrix

In [20]:
matrix.columns = sample_meta

In [21]:
celline_loc_list = []
celline_list = []


for i,sample in enumerate(matrix.columns):
    
    progressPercent = ((i+1)/len(matrix.columns))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(matrix.columns)))
    sys.stdout.flush()
    
    for celline in list_of_cellines:
        if re.search(celline.lower(), sample.lower()):
            celline_loc_list.append(i)
            celline_list.append(celline)

Progress: 99%  65398 Out of 65429   

In [22]:
matrix = matrix[celline_loc_list].copy()

In [23]:
matrix.columns = celline_list

In [24]:
matrix.head()

Unnamed: 0,HELA,HELA.1,HELA.2,HELA.3,HELA.4,HCT116,HEK293,LNCAP,HCT116.1,NTERA2,...,HEK293.1,HEK293.2,HEK293.3,HEK293.4,BT549,BT549.1,BT549.2,BT549.3,BT549.4,BT549.5
0,124,108,86,18,60,0,33,126,22,132,...,969,914,973,1206,158,178,174,158,162,167
1,77,56,57,4,7,0,1,6,16,6,...,62,53,61,76,1,2,3,1,4,3
2,10743,26007,12424,18219,670,0,0,4,329,96,...,58,24,25,45,0,0,0,0,0,0
3,28,20,19,2,19,2,7,14,7,53,...,103,66,99,91,1,2,4,1,2,6
4,119,69,64,87,8,0,0,0,1,3,...,24,23,39,32,0,0,0,0,0,0


In [25]:
matrix.shape

(35238, 5937)

# Get Gene Symbols

In [26]:
gene_list = f['meta/genes'][:].tolist() 

In [27]:
lst = []
for gene in gene_list:
    lst.append(gene.decode("utf-8"))

gene_list = lst

In [28]:
gene_list[0:10]

['A1BG',
 'A1CF',
 'A2M',
 'A2ML1',
 'A2MP1',
 'A3GALT2',
 'A4GALT',
 'A4GNT',
 'AAAS',
 'AACS']

# Map Gene Symbols to ID's

In [29]:
matrix.index = gene_list

In [30]:
matrix.head()

Unnamed: 0,HELA,HELA.1,HELA.2,HELA.3,HELA.4,HCT116,HEK293,LNCAP,HCT116.1,NTERA2,...,HEK293.1,HEK293.2,HEK293.3,HEK293.4,BT549,BT549.1,BT549.2,BT549.3,BT549.4,BT549.5
A1BG,124,108,86,18,60,0,33,126,22,132,...,969,914,973,1206,158,178,174,158,162,167
A1CF,77,56,57,4,7,0,1,6,16,6,...,62,53,61,76,1,2,3,1,4,3
A2M,10743,26007,12424,18219,670,0,0,4,329,96,...,58,24,25,45,0,0,0,0,0,0
A2ML1,28,20,19,2,19,2,7,14,7,53,...,103,66,99,91,1,2,4,1,2,6
A2MP1,119,69,64,87,8,0,0,0,1,3,...,24,23,39,32,0,0,0,0,0,0


# Save Unfiltered Matrix To File

In [31]:
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_celline_matrix_unfilltered_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
matrix.to_csv(filename, sep='\t', compression='gzip')

In [32]:
normalized_matrix = matrix.copy()

# Drop Any Genes That Have Zero Expression Across 95% Of The Samples

In [33]:
normalized_matrix.replace(0, np.nan, inplace=True)

In [34]:
normalized_matrix.dropna(thresh=(0.05*normalized_matrix.shape[1]), axis=0, inplace=True)

In [35]:
normalized_matrix.replace(np.nan, 0, inplace=True)

In [36]:
normalized_matrix.shape

(29839, 5937)

In [37]:
normalized_matrix.head()

Unnamed: 0,HELA,HELA.1,HELA.2,HELA.3,HELA.4,HCT116,HEK293,LNCAP,HCT116.1,NTERA2,...,HEK293.1,HEK293.2,HEK293.3,HEK293.4,BT549,BT549.1,BT549.2,BT549.3,BT549.4,BT549.5
A1BG,124.0,108.0,86.0,18.0,60.0,0.0,33.0,126.0,22.0,132.0,...,969.0,914.0,973.0,1206.0,158.0,178.0,174.0,158.0,162.0,167.0
A1CF,77.0,56.0,57.0,4.0,7.0,0.0,1.0,6.0,16.0,6.0,...,62.0,53.0,61.0,76.0,1.0,2.0,3.0,1.0,4.0,3.0
A2M,10743.0,26007.0,12424.0,18219.0,670.0,0.0,0.0,4.0,329.0,96.0,...,58.0,24.0,25.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0
A2ML1,28.0,20.0,19.0,2.0,19.0,2.0,7.0,14.0,7.0,53.0,...,103.0,66.0,99.0,91.0,1.0,2.0,4.0,1.0,2.0,6.0
A2MP1,119.0,69.0,64.0,87.0,8.0,0.0,0.0,0.0,1.0,3.0,...,24.0,23.0,39.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0


# Normalize Matrix (Quantile Normalize the matrix for the columns)`

In [38]:
normalized_matrix.columns = range(0, len(normalized_matrix.columns))

In [39]:
normalized_matrix = uf.quantileNormalize(normalized_matrix)

Step 2/2 progress: 100%  5937 Out of 5937   

In [40]:
normalized_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5927,5928,5929,5930,5931,5932,5933,5934,5935,5936
A1BG,46.470608,39.073269,36.20667,32.370558,207.050867,0.0,85.870136,2089.556342,64.902981,573.035203,...,204.582786,242.651507,173.026276,205.15075,445.229746,438.849419,399.059458,441.74701,406.826343,416.266633
A1CF,29.685363,21.602661,25.405424,8.423109,6.575038,0.0,4.112515,10.040256,45.025434,2.118578,...,8.37241,9.357251,7.165067,10.427152,3.347313,6.986862,9.336534,3.272865,13.381843,9.467576
A2M,4050.306889,9518.532761,4494.616641,22473.62321,2584.112515,0.0,0.0,6.374937,571.894728,335.630453,...,7.860872,4.233957,2.917972,6.40694,0.0,0.0,0.0,0.0,0.0,0.0
A2ML1,11.184437,7.918983,8.727135,3.961934,25.666667,406.826343,21.578575,35.136096,16.614115,90.839144,...,14.313963,11.702375,11.876874,12.205828,3.347313,6.986862,12.484757,3.272865,6.314637,19.346808
A2MP1,44.692774,26.488967,27.929763,110.613273,7.690079,0.0,0.0,0.0,2.183426,0.821459,...,3.190837,4.061142,4.566785,4.594408,0.0,0.0,0.0,0.0,0.0,0.0


# Nomalize Matrix (z-score the rows)

In [41]:
uf.zscore(normalized_matrix, 'row')

Progress: 100%  29839 Out of 29839   

In [42]:
normalized_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5927,5928,5929,5930,5931,5932,5933,5934,5935,5936
A1BG,-0.527902,-0.542683,-0.54841,-0.556075,-0.207047,-0.620755,-0.449178,3.55438,-0.491072,0.524225,...,-0.211979,-0.135914,-0.275032,-0.210844,0.268857,0.256109,0.176605,0.261898,0.192124,0.210986
A1CF,-0.314229,-0.319815,-0.317187,-0.328923,-0.3302,-0.334744,-0.331902,-0.327805,-0.303627,-0.33328,...,-0.328958,-0.328277,-0.329792,-0.327538,-0.332431,-0.329915,-0.328291,-0.332482,-0.325496,-0.328201
A2M,-0.21514,-0.077106,-0.203924,0.249918,-0.252151,-0.317381,-0.317381,-0.31722,-0.302945,-0.308909,...,-0.317183,-0.317274,-0.317307,-0.317219,-0.317381,-0.317381,-0.317381,-0.317381,-0.317381,-0.317381
A2ML1,-0.111086,-0.113976,-0.113261,-0.117478,-0.098271,0.239031,-0.101888,-0.089891,-0.106282,-0.040597,...,-0.108317,-0.110628,-0.110474,-0.110183,-0.118022,-0.114801,-0.109936,-0.118088,-0.115396,-0.103863
A2MP1,0.190662,0.058723,0.069166,0.668445,-0.077529,-0.133265,-0.133265,-0.133265,-0.11744,-0.127312,...,-0.110139,-0.103831,-0.100166,-0.099966,-0.133265,-0.133265,-0.133265,-0.133265,-0.133265,-0.133265


# Merge Duplicate Samples By Columns (by taking the mean)

In [43]:
normalized_matrix.columns = celline_list

In [44]:
normalized_matrix = uf.merge(normalized_matrix, 'column', 'mean')

In [45]:
normalized_matrix.shape

(29839, 127)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [46]:
uf.mapgenesymbols(normalized_matrix)

Progeres: 99%  29784 Out of 29839   

In [47]:
normalized_matrix.shape

# Merge Duplicate Genes By Rows

In [48]:
normalized_matrix = uf.merge(normalized_matrix, 'row', 'mean')

In [49]:
matrix.shape

(35238, 5937)

# Save Filtered Matrix

In [50]:
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_celline_matrix_filltered_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
normalized_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Tertiary Matrix

In [51]:
tertiary_matrix = uf.createTertiaryMarix(normalized_matrix)

Progeres: 100%  127 Out of 127   

In [52]:
tertiary_matrix.head()

Unnamed: 0_level_0,22RV1,293F,A172,A375,A431,A549,A673,ASPC1,BEAS2B,BEWO,...,T24,T47D,T84,T98G,THP1,U266,U87,U937,VCAP,WSUDLCL2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2ML1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2MP1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Teriary Matrix

In [53]:
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_celline_tertiary_matrix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
tertiary_matrix.to_csv(filename, sep='\t', compression='gzip')

#### Path to output files

In [54]:
path = '/Users/moshesilverstein/Documents/Harmonizome/ARCHS4/Output/'

# Create Up Gene Set Library

In [55]:
name = 'archs4_celline_gene_up_set'

In [56]:
uf.createUpGeneSetLib(normalized_matrix, path, name)

Progeres: 100%  127 Out of 127   

# Create Down Gene Set Library

In [57]:
name = 'archs4_celline_gene_down_set'

In [58]:
uf.createDownGeneSetLib(normalized_matrix, path, name)

Progeres: 100%  127 Out of 127   

# Create Up Attribute Library

In [59]:
name = 'archs4_celline_attribute_up_set'

In [60]:
uf.createUpAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  23956 Out of 23956   

# Create Down Attribute Library

In [61]:
name = 'archs4_celline_attribute_down_set'

In [62]:
uf.createDownAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  23956 Out of 23956   

# Create Gene Similarity Matrix

In [63]:
gene_similarity_matix = uf.createSimilarityMatrix(normalized_matrix, 'cosine')

In [64]:
gene_similarity_matix.head()

index,A1BG,A1CF,A2M,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.0,0.273559,0.236431,-0.047602,0.005709,0.034153,-0.128975,0.33217,0.063119,0.023362,...,-0.205567,-0.101939,-0.087121,-0.071282,0.000553,0.169553,0.01468,-0.022291,0.080974,0.043841
A1CF,0.273559,1.0,0.6366,-0.015143,0.138744,0.061235,-0.16328,0.125625,0.126898,0.68173,...,0.106948,-0.00611,-0.101558,0.014581,-0.012382,0.267105,0.050072,-0.042522,-0.10948,0.08105
A2M,0.236431,0.6366,1.0,0.02129,0.302984,0.099805,-0.174337,-0.014501,0.087469,0.117354,...,0.008128,-0.061724,-0.101202,0.01949,-0.022213,0.240423,0.092147,-0.050519,-0.04778,0.090788
A2ML1,-0.047602,-0.015143,0.02129,1.0,-0.006756,0.013614,0.24233,-0.009463,-0.079584,-0.006383,...,-0.086327,-0.085475,-0.042977,-0.028367,-0.027883,-0.057363,-0.040754,-0.011052,-0.053611,-0.065382
A2MP1,0.005709,0.138744,0.302984,-0.006756,1.0,0.292675,0.034807,-0.121881,0.010475,-0.006228,...,0.072381,0.034564,-0.038095,0.028527,-0.088422,0.35737,0.073605,-0.042305,0.018205,0.039404


# Save Gene Similarity Matrix 

In [65]:
filename = 'Output/archs4_celline_gene_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity matrix

In [66]:
attribute_similarity_matix = uf.createSimilarityMatrix(normalized_matrix.T, 'cosine')

In [67]:
attribute_similarity_matix.head()

Unnamed: 0,22RV1,293F,A172,A375,A431,A549,A673,ASPC1,BEAS2B,BEWO,...,T24,T47D,T84,T98G,THP1,U266,U87,U937,VCAP,WSUDLCL2
22RV1,1.0,0.00032,-0.031884,0.009296,0.002175,-0.023022,-0.003752,0.060502,-0.052642,0.006221,...,-0.061014,0.086913,0.04751,-0.04083,0.007836,0.032968,0.016501,-0.03683,0.184509,0.00357
293F,0.00032,1.0,-0.093339,-0.02426,-0.112915,-0.081628,0.026968,-0.139459,0.013534,-0.029567,...,-0.068633,-0.109193,-0.076865,-0.079225,-0.082707,-0.005714,-0.089967,-0.07812,-0.022988,0.00223
A172,-0.031884,-0.093339,1.0,0.154721,0.10393,0.149249,0.061423,0.026405,0.237593,-0.010797,...,-0.03639,0.076992,-0.011683,0.291151,0.034949,0.004343,0.171542,0.038247,-0.06447,-0.035048
A375,0.009296,-0.02426,0.154721,1.0,0.052179,0.078715,0.025044,-0.019986,0.036244,-0.026496,...,0.018819,-0.027937,-0.001032,0.101217,-0.004197,0.023465,0.066339,0.030358,-0.043078,-0.035601
A431,0.002175,-0.112915,0.10393,0.052179,1.0,0.049042,-0.019924,0.087041,0.020629,0.030706,...,0.016657,0.048821,0.095836,0.043158,0.013695,-0.029874,-0.007743,0.026622,-0.074221,-0.040731


# Save Attribute Similarity Matrix

In [68]:
filename = 'Output/archs4_celline_attribute_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [69]:
gene_list = uf.createGeneList(normalized_matrix)

Progeres: 100%  23956 Out of 23956   

In [70]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,A1BG,1
1,A1CF,29974
2,A2M,2
3,A2ML1,144568
4,A2MP1,3


In [71]:
gene_list.shape

(23956, 2)

# Save Gene List

In [72]:
filename = 'Output/archs4_celline_gene_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [73]:
attribute_list = uf.createAttributeList(normalized_matrix)

In [74]:
attribute_list.head()

Unnamed: 0,Attributes
0,22RV1
1,293F
2,A172
3,A375
4,A431


In [75]:
attribute_list.shape

(127, 1)

# Save Attribute List

In [76]:
filename = 'Output/archs4_celline_attribute_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [77]:
path = '/Users/moshesilverstein/Documents/Harmonizome/ARCHS4/Output/'

In [78]:
name = 'archs4_celline_gene_attribute_edge_list'

In [79]:
uf.createGeneAttributeEdgeList(tertiary_matrix, gene_list, path, name)

Progeres: 100%  127 Out of 127   

 The number of statisticaly relevent gene-attribute associations is: 608457
