# The Human Protein Atlas (THPA) Normal Tissue (immunohistochemistry)

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Downloaded: 04-2017 <br/>
Data Source: http://www.proteinatlas.org/about/download

In [1]:
import sys, datetime
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
# Re-import the local helper module so that edits made to untility_functions.py
# are picked up without restarting the kernel.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/The_Human_Protaein_Atlas/untility_functions.py'>

# Load Data

In [3]:
# Load the THPA normal-tissue table from the zipped, comma-separated download.
raw_csv_path = 'Data/normal_tissue.csv.zip'
df = pd.read_csv(raw_csv_path, sep=',')

In [4]:
df.head()

Unnamed: 0,Gene,Gene name,Tissue,Cell type,Level,Reliability
0,ENSG00000000003,TSPAN6,adrenal gland,glandular cells,Not detected,Uncertain
1,ENSG00000000003,TSPAN6,appendix,glandular cells,Medium,Uncertain
2,ENSG00000000003,TSPAN6,appendix,lymphoid tissue,Not detected,Uncertain
3,ENSG00000000003,TSPAN6,bone marrow,hematopoietic cells,Not detected,Uncertain
4,ENSG00000000003,TSPAN6,breast,adipocytes,Not detected,Uncertain


In [5]:
df.shape

(1031835, 6)

# Select Only Relevant Data

In [6]:
# Keep only the rows whose expression Level is 'High'.
# .copy() detaches the filtered frame from the raw table so that later cells can
# add or overwrite columns on it without pandas raising SettingWithCopyWarning.
df = df[df['Level'] == 'High'].copy()

In [7]:
# Relabel every row with a combined "<tissue>-<cell type>" attribute name,
# e.g. "breast-glandular cells".
#
# One vectorized string concatenation replaces the previous per-row loop, which
# relied on DataFrame.ix (deprecated in pandas 0.20 and removed in 1.0) and did
# a Python-level lookup for every one of the ~118k rows. assign() returns a new
# frame, so nothing is written into a slice of the original table.
df = df.assign(Tissue=df['Tissue'] + '-' + df['Cell type'])

Progress: 100%   

In [8]:
# Narrow the table to the two columns used downstream: the gene symbol and the
# combined tissue/cell-type label.
df = df.loc[:, ['Gene name', 'Tissue']]

In [9]:
df.head()

Unnamed: 0,Gene name,Tissue
5,TSPAN6,breast-glandular cells
7,TSPAN6,bronchus-respiratory epithelial cells
17,TSPAN6,"cervix, uterine-glandular cells"
18,TSPAN6,"cervix, uterine-squamous epithelial cells"
24,TSPAN6,endometrium 1-glandular cells


# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [10]:
# Move the gene symbols into the index (mutates df in place).
# NOTE(review): presumably uf.mapgenesymbols expects symbols in the index — confirm.
df.set_index('Gene name', inplace=True)

In [11]:
# Map the gene symbols in df to current approved symbols.
# NOTE(review): the return value is discarded, yet the recorded output shows the
# row count dropping from 118588 to 117858 afterwards, so the helper presumably
# modifies df in place and drops unmapped symbols — confirm in untility_functions.
uf.mapgenesymbols(df)

Progeres: 100%  118588 Out of 118588   

In [12]:
df.shape

(117858, 1)

# Create Binary Matrix

In [13]:
# Turn the gene-symbol index back into an ordinary column (mutates df in place)
# before the frame is handed to the binary-matrix builder.
df.reset_index(inplace=True)

In [14]:
# Build the gene x attribute matrix from the (gene, tissue-cell type) pairs.
# In the recorded run the result has 9490 gene rows and 108 tissue/cell-type
# columns, with 0/1 entries in the rows displayed.
binary_matrix = uf.createBinaryMatix(df)

Progeres: 100%  9490 Out of 9490   

In [15]:
binary_matrix.head()

Unnamed: 0,thyroid gland-glandular cells,duodenum-glandular cells,skeletal muscle-myocytes,spleen-cells in white pulp,retina-ganglion cells,adrenal gland-cells in zona fasciculata,stomach 1-glandular cells,skin-sweat ducts,lung-pneumocytes,salivary gland-glandular cells,...,soft tissue 1-chondrocytes,soft tissue 1-peripheral nerve,cerebral cortex-endothelial cells,tonsil-non-germinal center cells,pituitary gland-cells in anterior,placenta-trophoblastic cells,fallopian tube-glandular cells,soft tissue 2-peripheral nerve,skin-sebaceous cells,spleen-cells in red pulp
CYP2A7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NPIPA8,1,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,1,0,0,0,0
RORA,0,1,0,0,0,0,1,0,1,1,...,0,1,0,1,0,1,1,1,0,0
SHMT1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
SPECC1L,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
binary_matrix.shape

(9490, 108)

# Save Binary Matrix

In [17]:
# Save the binary matrix as a tab-separated file whose name carries the current
# year and month (YYYY_MM).
# The file name ends in .zip, so it is written as a zip archive; previously
# compression='gzip' was passed, which produced a gzip stream with a misleading
# .zip extension.
filename = '~/./Documents/Harmonizome/The_Human_Protaein_Atlas/Output/thpa_normal_tissue_binary_matrix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
binary_matrix.to_csv(filename, sep='\t', compression='zip')

# Create Gene Set Library

In [18]:
path = '/Users/moshesilverstein/Documents/Harmonizome/The_Human_Protaein_Atlas/Output/'

In [19]:
name = 'thpa_normal_tissue_gene_set'

In [20]:
# Build the gene set library from the binary matrix.
# NOTE(review): nothing is returned here; presumably the helper writes a file
# named after `name` into `path` — confirm in untility_functions.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  108 Out of 108   

# Create Attribute Library

In [21]:
path = '/Users/moshesilverstein/Documents/Harmonizome/The_Human_Protaein_Atlas/Output/'

In [22]:
name = 'thpa_normal_tissue_attribute_set'

In [23]:
# Build the attribute set library from the binary matrix.
# NOTE(review): nothing is returned here; presumably the helper writes a file
# named after `name` into `path` — confirm in untility_functions.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  9490 Out of 9490   

# Create Gene Similarity Matrix

In [24]:
# Pairwise similarity between the rows (genes) of the binary matrix, with
# 'jaccard' passed as the metric name. The recorded result is gene x gene with
# 1.0 on the diagonal in the rows displayed.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [25]:
gene_similarity_matix.head()

Unnamed: 0,CYP2A7,NPIPA8,RORA,SHMT1,SPECC1L,DCUN1D3,MAGI2,C19orf60,GTF2I,CRABP1,...,KLF3,ANKDD1A,FRYL,CUL5,IRAK4,EPSTI1,MGAT5,ST18,FDXR,KLK13
CYP2A7,1.0,0.0,0.021739,0.125,0.0,0.0,0.0,0.0,0.0,0.090909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NPIPA8,0.0,1.0,0.313725,0.16,0.0,0.090909,0.047619,0.090909,0.386364,0.032258,...,0.307692,0.043478,0.090909,0.0,0.172414,0.225806,0.0,0.130435,0.038462,0.047619
RORA,0.021739,0.313725,1.0,0.125,0.0,0.020833,0.0,0.020833,0.387097,0.075472,...,0.453125,0.042553,0.065217,0.102041,0.156863,0.26,0.021739,0.02,0.083333,0.021739
SHMT1,0.125,0.16,0.125,1.0,0.0,0.1,0.0,0.1,0.170732,0.1875,...,0.145833,0.0,0.0,0.0,0.05,0.086957,0.0,0.083333,0.0,0.0
SPECC1L,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.025,0.0,...,0.021277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0


## Save Gene Similarity Matrix 

In [26]:
# Save the gene similarity matrix as a tab-separated file whose name carries the
# current year and month (YYYY_MM).
# Two fixes relative to the previous version:
#   * the name read 'thpa_normal_tisue _gene_...' (misspelt, with an embedded
#     space); it now uses the 'thpa_normal_tissue_' prefix shared by the other
#     output files;
#   * the file name ends in .zip, so it is written as a zip archive instead of a
#     gzip stream with a misleading .zip extension.
filename = '~/./Documents/Harmonizome/The_Human_Protaein_Atlas/Output/thpa_normal_tissue_gene_similarity_matix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# Create Attribute Similarity Matrix

In [27]:
# Same similarity helper applied to the transposed matrix, so the attributes
# (tissue/cell-type labels) are the rows being compared. The recorded result is
# attribute x attribute with 1.0 on the diagonal in the rows displayed.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [28]:
attribute_similarity_matix.head()

Unnamed: 0,thyroid gland-glandular cells,duodenum-glandular cells,skeletal muscle-myocytes,spleen-cells in white pulp,retina-ganglion cells,adrenal gland-cells in zona fasciculata,stomach 1-glandular cells,skin-sweat ducts,lung-pneumocytes,salivary gland-glandular cells,...,soft tissue 1-chondrocytes,soft tissue 1-peripheral nerve,cerebral cortex-endothelial cells,tonsil-non-germinal center cells,pituitary gland-cells in anterior,placenta-trophoblastic cells,fallopian tube-glandular cells,soft tissue 2-peripheral nerve,skin-sebaceous cells,spleen-cells in red pulp
thyroid gland-glandular cells,1.0,0.351171,0.214698,0.210151,0.000468,0.0,0.34806,0.0,0.27027,0.352709,...,0.091302,0.08927,0.107414,0.273082,0.0,0.386678,0.379225,0.101391,0.0,0.202283
duodenum-glandular cells,0.351171,1.0,0.160724,0.177919,0.0,0.0,0.503026,0.0,0.173538,0.304747,...,0.063338,0.057844,0.078481,0.243833,0.0,0.365211,0.370757,0.064882,0.0,0.148248
skeletal muscle-myocytes,0.214698,0.160724,1.0,0.160072,0.0,0.0,0.170431,0.0,0.209853,0.21031,...,0.093385,0.112158,0.131737,0.174679,0.0,0.165692,0.174956,0.139693,0.0,0.181707
spleen-cells in white pulp,0.210151,0.177919,0.160072,1.0,0.0,0.0,0.185208,0.0,0.225806,0.222062,...,0.084237,0.102493,0.120223,0.443494,0.0,0.173913,0.214608,0.11786,0.0,0.379233
retina-ganglion cells,0.000468,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000691,0.0,0.0,0.0,0.0,0.0,0.0


## Save Attribute Similarity Matrix

In [29]:
# Save the attribute similarity matrix as a tab-separated file whose name
# carries the current year and month (YYYY_MM).
# The file name ends in .zip, so it is written as a zip archive; previously
# compression='gzip' was passed, which produced a gzip stream with a misleading
# .zip extension.
filename = '~/./Documents/Harmonizome/The_Human_Protaein_Atlas/Output/thpa_normal_tissue_attribute_similarity_matix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# Create Gene List

In [30]:
# Build the gene list for the genes in the binary matrix; the recorded result
# has one row per gene (9490) with columns GeneSym and GeneID.
# NOTE(review): GeneID is displayed as a float in the recorded output
# (e.g. 1549.0, 101060000.0). Large IDs may be losing precision, or missing IDs
# may be forcing a float dtype — confirm and consider an integer dtype.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  9490 Out of 9490   

In [31]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,CYP2A7,1549.0
1,NPIPA8,101060000.0
2,RORA,6095.0
3,SHMT1,6470.0
4,SPECC1L,23384.0


In [32]:
gene_list.shape

(9490, 2)

### Save Gene List

In [33]:
# Save the gene list (without the row index) as a tab-separated file whose name
# carries the current year and month (YYYY_MM).
# The file name ends in .zip, so it is written as a zip archive; previously
# compression='gzip' was passed, which produced a gzip stream with a misleading
# .zip extension.
filename = '~/./Documents/Harmonizome/The_Human_Protaein_Atlas/Output/thpa_normal_tissue_gene_list_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
gene_list.to_csv(filename, sep='\t', index=False, compression='zip')

# Create Attribute List 

In [34]:
# Build the attribute list from the binary matrix; the recorded result is a
# single 'Attributes' column with 108 rows of tissue/cell-type labels.
attribute_list = uf.createAttributeList(binary_matrix)

In [35]:
attribute_list.head()

Unnamed: 0,Attributes
0,thyroid gland-glandular cells
1,duodenum-glandular cells
2,skeletal muscle-myocytes
3,spleen-cells in white pulp
4,retina-ganglion cells


In [36]:
attribute_list.shape

(108, 1)

### Save Attribute List

In [37]:
# Save the attribute list (without the row index) as a tab-separated file whose
# name carries the current year and month (YYYY_MM).
# The file name ends in .zip, so it is written as a zip archive; previously
# compression='gzip' was passed, which produced a gzip stream with a misleading
# .zip extension.
filename = '~/./Documents/Harmonizome/The_Human_Protaein_Atlas/Output/thpa_normal_tissue_attribute_list_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
attribute_list.to_csv(filename, sep='\t', index=False, compression='zip')

# Create Gene-Attribute Edge List

In [38]:
path = '/Users/moshesilverstein/Documents/Harmonizome/The_Human_Protaein_Atlas/Output/'

In [39]:
name = 'thpa_normal_tissue_gene_attribute_edge_list'

In [40]:
# Build the gene-attribute edge list from the binary matrix and the gene list.
# The recorded run reports 117708 gene-attribute associations.
# NOTE(review): nothing is returned here; presumably the helper writes a file
# named after `name` into `path` — confirm in untility_functions.
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  108 Out of 108   

 The number of statisticaly relevent gene-attribute associations is: 117708
