# The Human Protein Atlas (THPA) Normal Tissue (RNA-seq)

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Downloaded: 04-2017 <br/>
Data Source: http://www.proteinatlas.org/about/download

In [1]:
import datetime
import importlib
import os
import sys

import numpy as np
import pandas as pd

import untility_functions as uf
%matplotlib inline

In [2]:
# Re-import the local helper module so edits made to untility_functions.py
# are picked up without restarting the kernel.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/The_Human_Protaein_Atlas/untility_functions.py'>

# Load Data

In [3]:
# Load the long-format THPA RNA-seq table; compression is inferred from the
# .zip extension. The preview below shows the columns
# Gene, Gene name, Sample, Value and Unit.
df = pd.read_csv('Data/rna_tissue.csv.zip', sep=',')

In [4]:
# Preview the raw table: one row per (gene, tissue sample) with a TPM value.
df.head()

Unnamed: 0,Gene,Gene name,Sample,Value,Unit
0,ENSG00000000003,TSPAN6,adipose tissue,31.5,TPM
1,ENSG00000000003,TSPAN6,adrenal gland,26.5,TPM
2,ENSG00000000003,TSPAN6,appendix,9.5,TPM
3,ENSG00000000003,TSPAN6,bone marrow,0.7,TPM
4,ENSG00000000003,TSPAN6,breast,53.0,TPM


In [5]:
# Row/column count of the raw table, kept as a baseline for the later filtering steps.
df.shape

(726236, 5)

# Select Only Relevant Data

In [6]:
# Keep only the gene symbol, tissue name and expression value;
# the Ensembl id ('Gene') and 'Unit' columns are not used downstream.
df = df.loc[:, ['Gene name', 'Sample', 'Value']]

In [7]:
# Confirm that only 'Gene name', 'Sample' and 'Value' remain.
df.head()

Unnamed: 0,Gene name,Sample,Value
0,TSPAN6,adipose tissue,31.5
1,TSPAN6,adrenal gland,26.5
2,TSPAN6,appendix,9.5
3,TSPAN6,bone marrow,0.7
4,TSPAN6,breast,53.0


# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [8]:
# Index the table by gene symbol ahead of the symbol-mapping step below.
df = df.set_index('Gene name')

In [9]:
# Map the gene symbols in the index to approved symbols using the project helper.
# NOTE(review): the helper is not visible here; it appears to modify `df` in place
# (no return value is captured, and the row count drops in the next cell) — confirm.
uf.mapgenesymbols(df)

Progeres: 100%  726236 Out of 726236   

In [10]:
# Row count after symbol mapping; the recorded output shows fewer rows than the
# 726236 loaded, i.e. some rows were dropped by the mapping step.
df.shape

(706626, 2)

# Merge Duplicates

In [12]:
# Move 'Gene name' back from the index to a regular column so it can be used as a group key.
df = df.reset_index()

In [13]:
# Collapse rows that share the same (tissue, gene symbol) pair by averaging their values.
# The result is indexed by the (Sample, Gene name) pair with a single 'Value' column.
df = df.groupby(by=['Sample', 'Gene name']).agg('mean')

In [14]:
# Preview the averaged table, indexed by (Sample, Gene name).
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Value
Sample,Gene name,Unnamed: 2_level_1
adipose tissue,A1BG,0.1
adipose tissue,A1CF,0.0
adipose tissue,A2M,496.6
adipose tissue,A2ML1,0.2
adipose tissue,A3GALT2,0.1


In [15]:
# Row count after merging duplicates: one row per unique (Sample, Gene name) pair.
df.shape

(702556, 1)

# Create Matrix

In [17]:
# Flatten the (Sample, Gene name) index back into ordinary columns before building the matrix.
df = df.reset_index()

In [68]:
# Preview the flattened long-format table.
# NOTE(review): the saved output shows 'Sample' as the index, which does not match the
# reset_index() cell directly above; it looks like it was captured after a later cell
# had already run (the execution counts jump here) — re-run top to bottom to confirm.
df.head()

Unnamed: 0_level_0,Gene name,Value
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1
adipose tissue,A1BG,0.1
adipose tissue,A1CF,0.0
adipose tissue,A2M,496.6
adipose tissue,A2ML1,0.2
adipose tissue,A3GALT2,0.1


In [72]:
# Build the dense gene x tissue expression matrix from the long-format table.
#
# Expects `df` to hold 'Sample', 'Gene name' and 'Value' as ordinary columns with at
# most one row per (Sample, Gene name) pair, which is what the groupby/mean step above
# produces. `pivot` raises a ValueError if that uniqueness assumption is violated.
#
# This replaces the previous per-tissue loop, which relied on `.ix` (removed in
# pandas 1.0) and re-indexed `df` in place, so the cell could not be run twice.
# `df` is now left untouched.

# Row/column order follows first appearance in `df`, as before.
genes = df['Gene name'].unique().tolist()
tissues = df['Sample'].unique().tolist()

matrix = (
    df.pivot(index='Gene name', columns='Sample', values='Value')
      .reindex(index=genes, columns=tissues)
      .fillna(0)  # gene/tissue pairs that are absent from the table count as zero expression
)

# The original matrix had unnamed axes; keep the saved files' headers unchanged.
matrix.index.name = None
matrix.columns.name = None

Progress: 100%   

In [73]:
# Preview the matrix: rows are gene symbols, columns are tissues.
matrix.head()

Unnamed: 0,adipose tissue,adrenal gland,appendix,bone marrow,breast,cerebral cortex,"cervix, uterine",colon,duodenum,endometrium,...,skeletal muscle,skin,small intestine,smooth muscle,spleen,stomach,testis,thyroid gland,tonsil,urinary bladder
A1BG,0.1,0.1,0.1,0.0,0.1,0.5,0.3,0.0,0.1,0.5,...,0.0,0.0,0.0,0.1,0.4,0.1,0.0,0.2,0.0,0.1
A1CF,0.0,0.0,0.8,0.0,0.0,0.0,0.0,11.4,24.2,0.0,...,0.0,0.0,30.4,0.4,0.0,1.1,0.0,0.0,0.0,0.0
A2M,496.6,228.3,193.0,3.1,190.4,172.5,289.7,246.0,138.6,483.0,...,49.4,51.0,192.5,718.9,321.1,101.8,110.5,202.9,53.8,686.0
A2ML1,0.2,0.1,0.1,0.1,5.8,3.6,2.1,0.1,0.0,0.8,...,0.1,85.7,0.1,0.3,0.2,0.2,9.6,0.1,47.0,0.4
A3GALT2,0.1,0.0,0.2,0.1,0.0,0.2,0.0,0.0,0.0,0.1,...,0.0,0.0,0.0,0.0,0.1,0.0,0.8,0.1,0.0,0.0


In [74]:
# (genes, tissues) before the low-expression filter.
matrix.shape

(18988, 37)

## Drop Any Genes That Have Zero Expression In More Than 95% Of The Samples

In [75]:
# Temporarily encode zero expression as NaN so that dropna's non-NA count
# can be used to count the samples in which a gene is expressed.
matrix = matrix.replace(0, np.nan)

In [76]:
# Keep a gene only if it has a non-NaN (i.e. non-zero, given the replace above) value
# in at least 5% of the samples. `thresh` is documented as an integer count, so the
# fractional cut-off is rounded up; for integer counts, count >= x and count >= ceil(x)
# select the same rows, so the filter is unchanged.
matrix.dropna(thresh=int(np.ceil(0.05 * matrix.shape[1])), axis=0, inplace=True)

In [77]:
# Undo the temporary NaN encoding: remaining missing entries go back to zero expression.
matrix = matrix.replace(np.nan, 0)

In [78]:
# (genes, tissues) after dropping the rarely expressed genes.
matrix.shape

(18200, 37)

## Save Unfiltered Sample Matrix To File

In [80]:
# Write the un-normalized matrix (the low-expression gene filter above has already been
# applied) as a tab-separated file stamped with the current year and month (YYYY_MM).
# NOTE(review): the content is gzip-compressed but the file name ends in .zip; readers
# that infer compression from the extension must pass compression='gzip' explicitly —
# confirm which of the two is intended.
filename = (
    '~/./Documents/Harmonizome/The_Human_Protaein_Atlas/Output/thpa_rns-seq_tissue_matrix_unfilltered_%s.tsv.zip'
    % datetime.date.today().strftime('%Y_%m')
)
matrix.to_csv(filename, sep='\t', compression='gzip')

# Filtered Matrix

## Normalize Matrix (Quantile Normalize the matrix for the columns)

In [81]:
# Work on a copy so the un-normalized `matrix` stays available unchanged.
normalized_matrix = matrix.copy()

In [82]:
# Quantile-normalize the tissue columns (per the section heading) with the project helper.
# The helper's implementation is not visible here; its return value replaces the copy.
normalized_matrix = uf.quantileNormalize(normalized_matrix)

Step 2/2 progress: 100%  37 Out of 37   

In [83]:
# Preview the quantile-normalized values.
normalized_matrix.head()

Unnamed: 0,adipose tissue,adrenal gland,appendix,bone marrow,breast,cerebral cortex,"cervix, uterine",colon,duodenum,endometrium,...,skeletal muscle,skin,small intestine,smooth muscle,spleen,stomach,testis,thyroid gland,tonsil,urinary bladder
A1BG,0.089189,0.083784,0.035135,0.0,0.032432,0.239189,0.208108,0.0,0.07973,0.313514,...,0.0,0.0,0.0,0.056757,0.391892,0.051351,0.0,0.167568,0.0,0.1
A1CF,0.0,0.0,0.6,0.0,0.0,0.0,0.0,12.995946,26.389189,0.0,...,0.0,0.0,30.367568,0.332432,0.0,1.078378,0.0,0.0,0.0,0.0
A2M,412.502703,203.264865,183.456757,6.883784,195.289189,131.872973,243.975676,256.07027,140.302703,425.940541,...,87.616216,55.413514,181.394595,604.827027,334.175676,141.105405,85.3,159.748649,65.162162,718.87027
A2ML1,0.213514,0.083784,0.035135,0.377027,5.435135,2.137838,1.597297,0.059459,0.0,0.512162,...,0.613514,95.937838,0.048649,0.245946,0.178378,0.151351,5.348649,0.075676,56.897297,0.404054
A3GALT2,0.089189,0.0,0.121622,0.377027,0.0,0.07027,0.0,0.0,0.0,0.021622,...,0.0,0.0,0.0,0.0,0.067568,0.0,0.097297,0.075676,0.0,0.0


## Normalize Matrix (z-score the rows)

In [84]:
# Z-score each gene (row) across tissues with the project helper.
# NOTE(review): no return value is captured, yet the preview below shows changed values,
# so the helper appears to modify `normalized_matrix` in place — confirm against the helper.
uf.zscore(normalized_matrix, 'row')

Progress: 100%  18200 Out of 18200   

In [85]:
# Preview the row-wise z-scored values.
normalized_matrix.head()

Unnamed: 0,adipose tissue,adrenal gland,appendix,bone marrow,breast,cerebral cortex,"cervix, uterine",colon,duodenum,endometrium,...,skeletal muscle,skin,small intestine,smooth muscle,spleen,stomach,testis,thyroid gland,tonsil,urinary bladder
A1BG,-0.165053,-0.165117,-0.165691,-0.166105,-0.165723,-0.163283,-0.16365,-0.166105,-0.165165,-0.162406,...,-0.166105,-0.166105,-0.166105,-0.165436,-0.161482,-0.165499,-0.166105,-0.164128,-0.166105,-0.164925
A1CF,-0.334082,-0.334082,-0.29721,-0.334082,-0.334082,-0.334082,-0.334082,0.464564,1.287625,-0.334082,...,-0.334082,-0.334082,1.53211,-0.313653,-0.334082,-0.267812,-0.334082,-0.334082,-0.334082,-0.334082
A2M,0.268824,-0.283665,-0.335968,-0.802206,-0.304725,-0.472174,-0.176169,-0.144233,-0.449916,0.304306,...,-0.589034,-0.674064,-0.341413,0.776653,0.062003,-0.447796,-0.595149,-0.398569,-0.648323,1.077782
A2ML1,-0.231959,-0.233966,-0.234719,-0.22943,-0.151189,-0.202193,-0.210554,-0.234342,-0.235262,-0.22734,...,-0.225772,1.248754,-0.23451,-0.231458,-0.232503,-0.232921,-0.152527,-0.234092,0.644855,-0.229012
A3GALT2,0.822703,-0.432163,1.279018,4.8725,-0.432163,0.556519,-0.432163,-0.432163,-0.432163,-0.127953,...,-0.432163,-0.432163,-0.432163,-0.432163,0.518493,-0.432163,0.936782,0.632572,-0.432163,-0.432163


## Save Filtered Sample Matrix To File

In [86]:
# Write the normalized (quantile + row z-score) matrix as a tab-separated file
# stamped with the current year and month (YYYY_MM).
# NOTE(review): the content is gzip-compressed but the file name ends in .zip —
# confirm which of the two is intended.
filename = (
    '~/./Documents/Harmonizome/The_Human_Protaein_Atlas/Output/thpa_rns-seq_tissue_matrix_filltered_%s.tsv.zip'
    % datetime.date.today().strftime('%Y_%m')
)
normalized_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Tertiary Matrix

In [87]:
# Discretize the normalized matrix with the project helper. The preview below shows
# 0.0 and 1.0 entries; presumably down-regulated associations are encoded as -1
# (the helper and its cut-offs are not visible here — verify).
tertiary_matrix = uf.createTertiaryMarix(normalized_matrix)

Progeres: 100%  37 Out of 37   

In [88]:
# Preview the discretized gene x tissue matrix.
tertiary_matrix.head()

Unnamed: 0,adipose tissue,adrenal gland,appendix,bone marrow,breast,cerebral cortex,"cervix, uterine",colon,duodenum,endometrium,...,skeletal muscle,skin,small intestine,smooth muscle,spleen,stomach,testis,thyroid gland,tonsil,urinary bladder
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
A2ML1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A3GALT2,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Save Tertiary Matrix

In [89]:
# Write the discretized matrix as a tab-separated file stamped with the current
# year and month (YYYY_MM).
# NOTE(review): the content is gzip-compressed but the file name ends in .zip —
# confirm which of the two is intended.
filename = (
    '~/./Documents/Harmonizome/The_Human_Protaein_Atlas/Output/thpa_rns-seq_tissue_tertiary_matrix_%s.tsv.zip'
    % datetime.date.today().strftime('%Y_%m')
)
tertiary_matrix.to_csv(filename, sep='\t', compression='gzip')

#### Path to output files

In [90]:
# Output directory handed to the library-writing helpers below.
# Resolved from the current user's home directory, matching the '~'-based locations
# used by the to_csv cells, instead of one specific user's absolute path.
path = os.path.expanduser('~/Documents/Harmonizome/The_Human_Protaein_Atlas/Output/')

# Create Up Gene Set Library

In [91]:
# Base file name passed to the up gene set library writer in the next cell.
name = 'thpa_rns-seq_tissue_gene_up_set'

In [92]:
# Write the up gene set library from the discretized matrix using `path` and `name`.
# The progress output counts 37 items, i.e. one per tissue; the file format is
# determined by the helper (not visible here).
uf.createUpGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  37 Out of 37   

# Create Down Gene Set Library

In [93]:
# Base file name passed to the down gene set library writer in the next cell.
name = 'thpa_rns-seq_tissue_gene_down_set'

In [94]:
# Write the down gene set library from the discretized matrix using `path` and `name`.
# The progress output counts 37 items, i.e. one per tissue.
uf.createDownGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  37 Out of 37   

# Create Up Attribute Library

In [95]:
# Base file name passed to the up attribute set library writer in the next cell.
name = 'thpa_rns-seq_tissue_attribute_up_set'

In [96]:
# Write the up attribute set library from the discretized matrix using `path` and `name`.
# The progress output counts 18200 items, i.e. one per gene.
uf.createUpAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  18200 Out of 18200   

# Create Down Attribute Library

In [97]:
# Base file name passed to the down attribute set library writer in the next cell.
name = 'thpa_rns-seq_tissue_attribute_down_set'

In [98]:
# Write the down attribute set library from the discretized matrix using `path` and `name`.
# The progress output counts 18200 items, i.e. one per gene.
uf.createDownAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  18200 Out of 18200   

# Create Gene Similarity Matrix

In [99]:
# Pairwise similarity between genes (rows of the normalized matrix), requested with the
# 'cosine' metric. The preview below shows a gene x gene frame with 1.0 on the diagonal.
gene_similarity_matix = uf.createSimilarityMatrix(normalized_matrix, 'cosine')

In [100]:
# Preview the gene x gene similarity values.
gene_similarity_matix.head()

Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
A1BG,1.0,0.895837,0.128103,-0.039286,-0.073318,-0.186501,-0.038718,-0.036239,-0.144314,0.750724,...,-0.026739,-0.065444,-0.02329,0.722037,-0.032261,0.180666,0.14242,-0.140579,-0.221973,0.137181
A1CF,0.895837,1.0,0.076569,-0.079756,-0.129572,-0.256985,-0.025464,-0.143835,-0.141573,0.888519,...,-0.05342,0.028718,-0.177553,0.556671,-0.117152,0.167727,0.041663,-0.239193,0.070904,0.021901
A2M,0.128103,0.076569,1.0,-0.092244,-0.125049,0.017595,-0.082486,-0.201471,-0.210375,0.037844,...,-0.126354,-0.257908,-0.20157,-0.014744,-0.180507,-0.104677,-0.056038,0.294454,-0.198491,-0.117039
A2ML1,-0.039286,-0.079756,-0.092244,1.0,-0.098952,-0.023964,-0.048936,-0.035131,0.187581,-0.073947,...,0.017872,0.097059,-0.16253,-0.129297,-0.045117,-0.032205,-0.031156,-0.038961,0.072403,-0.07085
A3GALT2,-0.073318,-0.129572,-0.125049,-0.098952,1.0,-0.065558,-0.062595,-0.114968,-0.136781,-0.141043,...,0.207131,0.335893,0.193749,0.178502,0.670389,0.091652,-0.066,0.166555,0.156692,-0.145789


## Save Gene Similarity Matrix 

In [101]:
# Write the gene similarity matrix as a tab-separated file stamped with the current
# year and month (YYYY_MM).
# NOTE(review): the content is gzip-compressed but the file name ends in .zip —
# confirm which of the two is intended.
filename = (
    '~/./Documents/Harmonizome/The_Human_Protaein_Atlas/Output/thpa_rns-seq_tissue_gene_similarity_matix_%s.tsv.zip'
    % datetime.date.today().strftime('%Y_%m')
)
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [104]:
# Same similarity helper applied to the transposed matrix, so tissues are compared
# with each other; the preview below shows a tissue x tissue frame.
attribute_similarity_matix = uf.createSimilarityMatrix(normalized_matrix.T, 'cosine')

In [105]:
# Preview the tissue x tissue similarity values.
attribute_similarity_matix.head()

Unnamed: 0,adipose tissue,adrenal gland,appendix,bone marrow,breast,cerebral cortex,"cervix, uterine",colon,duodenum,endometrium,...,skeletal muscle,skin,small intestine,smooth muscle,spleen,stomach,testis,thyroid gland,tonsil,urinary bladder
adipose tissue,1.0,0.053329,0.106147,-0.071419,-0.032032,-0.011583,0.086261,-0.051826,-0.079051,0.028008,...,0.042843,-0.099904,-0.050111,0.197637,0.068123,-0.08202,-0.125928,-0.153312,-0.215423,0.100442
adrenal gland,0.053329,1.0,-0.112061,-0.123585,-0.039166,0.143037,-0.029611,-0.100708,-0.051016,-0.039364,...,0.034312,-0.110716,-0.078715,-0.059918,-0.07667,-0.069541,-0.053366,0.009893,-0.191405,-0.080774
appendix,0.106147,-0.112061,1.0,0.292532,-0.165149,-0.134048,-0.152861,0.066702,0.04864,-0.104098,...,-0.149433,-0.056091,0.096078,0.034581,0.503168,0.044819,-0.106224,-0.262302,0.465319,0.254808
bone marrow,-0.071419,-0.123585,0.292532,1.0,-0.111303,-0.095795,-0.232554,-0.103533,-0.040356,-0.116809,...,-0.007626,0.040091,-0.052518,-0.15442,0.24805,-0.060917,0.046776,-0.096263,0.256166,-0.106588
breast,-0.032032,-0.039166,-0.165149,-0.111303,1.0,-0.140857,0.166933,-0.012535,-0.177966,0.099306,...,-0.136044,0.027134,-0.17759,0.023249,-0.166451,-0.079952,-0.062901,0.138586,0.084547,0.022727


## Save Attribute Similarity Matrix

In [106]:
# Write the tissue similarity matrix as a tab-separated file stamped with the current
# year and month (YYYY_MM).
# NOTE(review): the content is gzip-compressed but the file name ends in .zip —
# confirm which of the two is intended.
filename = (
    '~/./Documents/Harmonizome/The_Human_Protaein_Atlas/Output/thpa_rns-seq_tissue_attribute_similarity_matix_%s.tsv.zip'
    % datetime.date.today().strftime('%Y_%m')
)
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [107]:
# Build the gene table for the matrix rows with the project helper; the preview below
# shows two columns, GeneSym and GeneID. How the ids are looked up is defined in the
# helper (not visible here).
gene_list = uf.createGeneList(normalized_matrix)

Progeres: 100%  18200 Out of 18200   

In [108]:
# Preview the gene symbol / gene id pairs.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,A1BG,1
1,A1CF,29974
2,A2M,2
3,A2ML1,144568
4,A3GALT2,127550


In [109]:
# The row count should equal the number of genes in the normalized matrix.
gene_list.shape

(18200, 2)

### Save Gene List

In [111]:
# Write the gene list (without the positional index) as a tab-separated file stamped
# with the current year and month (YYYY_MM).
# NOTE(review): the content is gzip-compressed but the file name ends in .zip —
# confirm which of the two is intended.
filename = (
    '~/./Documents/Harmonizome/The_Human_Protaein_Atlas/Output/thpa_rns-seq_tissue_gene_list_%s.tsv.zip'
    % datetime.date.today().strftime('%Y_%m')
)
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [113]:
# Build the attribute (tissue) table with the project helper; the preview below shows
# a single 'Attributes' column holding the tissue names.
attribute_list = uf.createAttributeList(normalized_matrix)

In [114]:
# Preview the tissue names.
attribute_list.head()

Unnamed: 0,Attributes
0,adipose tissue
1,adrenal gland
2,appendix
3,bone marrow
4,breast


In [115]:
# The row count should equal the number of tissue columns in the normalized matrix.
attribute_list.shape

(37, 1)

### Save Attribute List

In [116]:
# Write the attribute list (without the positional index) as a tab-separated file
# stamped with the current year and month (YYYY_MM).
# NOTE(review): the content is gzip-compressed but the file name ends in .zip —
# confirm which of the two is intended.
filename = (
    '~/./Documents/Harmonizome/The_Human_Protaein_Atlas/Output/thpa_rns-seq_tissue_attribute_list_%s.tsv.zip'
    % datetime.date.today().strftime('%Y_%m')
)
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [117]:
# Output directory handed to the edge-list helper below.
# Resolved from the current user's home directory, matching the '~'-based locations
# used by the to_csv cells, instead of one specific user's absolute path.
path = os.path.expanduser('~/Documents/Harmonizome/The_Human_Protaein_Atlas/Output/')

In [118]:
# Base file name passed to the gene-attribute edge list writer in the next cell.
name = 'thpa_rns-seq_tissue_gene_attribute_edge_list'

In [119]:
# Write the gene-attribute edge list from the discretized matrix and the gene list,
# using `path` and `name`. The helper prints the number of associations it found;
# its selection rule and file format are defined in the helper (not visible here).
uf.createGeneAttributeEdgeList(tertiary_matrix, gene_list, path, name)

Progeres: 100%  37 Out of 37   

 The number of statisticaly relevent gene-attribute associations is: 134717
