# JENSEN LAB (TISSUE)

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Source: http://tissues.jensenlab.org/Search

In [1]:
import sys, datetime
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
# Re-execute the helper module so that edits made to it since the first import
# take effect without restarting the kernel.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/Jensen/Tissues/untility_functions.py'>

# LOAD DATA

In [3]:
# Column labels applied to each input TSV when it is read below (the files are
# read with `names=col`, i.e. without using a header row from the file).
col = ['Ensemble Acc', 'GeneSym', 'BTO', 'Tissue', 'Source', 'SampleInfo', 'Value']

In [4]:
# Experiments channel: tab-separated, read as headerless with the labels in `col`.
dfE = pd.read_csv(
    'Input/human_tissue_experiments_filtered.tsv',
    delimiter='\t',
    header=None,
    names=col,
)

In [5]:
# Knowledge channel: tab-separated, read as headerless with the labels in `col`.
dfK = pd.read_csv(
    'Input/human_tissue_knowledge_filtered.tsv',
    delimiter='\t',
    header=None,
    names=col,
)

In [6]:
# Text-mining channel: tab-separated, read as headerless with the labels in `col`.
# NOTE(review): the same labels are reused for all three files; confirm that the
# text-mining file really has the same column layout (in particular that its
# last column is a numeric 'Value'), since rows are later filtered on Value != 0.
dfT = pd.read_csv(
    'Input/human_tissue_textmining_filtered.tsv',
    delimiter='\t',
    header=None,
    names=col,
)

In [7]:
# Stack the three evidence tables row-wise into a single frame; each table's
# original row labels are carried over unchanged.
frames = [dfE, dfK, dfT]
df = pd.concat(frames, axis=0)

In [8]:
# Preview the first rows of the combined table.
df.head()

Unnamed: 0,Ensemble Acc,GeneSym,BTO,Tissue,Source,SampleInfo,Value
0,ENSP00000000233,ARF5,BTO:0000041,Medulla oblongata,GNF,103 Intensity units,0
1,ENSP00000000233,ARF5,BTO:0000045,Adrenal cortex,GNF,50 Intensity units,0
2,ENSP00000000233,ARF5,BTO:0000047,Adrenal gland,HPA-RNA,61.1 FPKM,1
3,ENSP00000000233,ARF5,BTO:0000047,Adrenal gland,HPM,6 peptides,0
4,ENSP00000000233,ARF5,BTO:0000084,Vermiform appendix,Exon array,151 intensity units,0


In [9]:
# (rows, columns) of the combined table before any filtering.
df.shape

(1732815, 7)

# Get Only Gene-Tissue Data

In [10]:
# Discard every row whose Value equals 0; only non-zero evidence is kept.
has_value = df['Value'].ne(0)
df = df[has_value]

In [11]:
# Narrow the table to the gene symbol and tissue name columns.
df = df.loc[:, ['GeneSym', 'Tissue']]

In [12]:
# Collapse repeated (GeneSym, Tissue) rows so each pair appears once.
df = df.drop_duplicates()

In [13]:
# Use the gene symbol as the row index, leaving 'Tissue' as the only column.
df = df.set_index('GeneSym')

In [14]:
# Preview the gene-indexed table of gene/tissue pairs.
df.head()

Unnamed: 0_level_0,Tissue
GeneSym,Unnamed: 1_level_1
ARF5,Adrenal gland
ARF5,Vermiform appendix
ARF5,Blood
ARF5,Bone marrow
ARF5,Brain


In [15]:
# Number of distinct gene/tissue pairs left after filtering and de-duplication.
df.shape

(440455, 1)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [16]:
# Map the gene symbols to up-to-date approved symbols using the project helper.
# The return value is not captured, so the helper presumably modifies `df` in
# place -- TODO confirm against untility_functions.mapgenesymbols.
uf.mapgenesymbols(df)

Progeres: 99%  440390 Out of 440455   

In [17]:
# Re-check the shape after the symbol-mapping step, for comparison with the
# shape recorded before it.
df.shape

# Create Binary Matrix

In [18]:
# Move the gene symbols out of the index and back into an ordinary column,
# replacing the index with a fresh 0..n-1 range.
df = df.reset_index()

In [19]:
# Build the gene-by-tissue matrix from the gene/tissue pairs using the project
# helper. Per the preview below, rows are genes and columns are tissues;
# presumably a cell is 1 when the pair is present and 0 otherwise -- TODO confirm
# against untility_functions.createBinaryMatix.
binary_matrix = uf.createBinaryMatix(df)

Progeres: 100%  18565 Out of 18565   

In [20]:
# Preview the first rows (genes) of the matrix.
binary_matrix.head()

Unnamed: 0,Oc cell,MC3T3-E1 cell,colo684,MSTO-211H cell,Inner medullary collecting duct,Middle ear,Photophore,Bursa copulatrix,Parahippocampal gyrus,A-1235 cell,...,Meynert's basal nucleus,wehi279,sv3t3,Parietal ganglion,Embryogenic cell line,dh14,Respiratory epithelium cell,gpe86,c6bu1,Abductor
LRRN1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GJD4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GPX8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DNAL1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NKAP,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# (number of genes, number of tissue attributes).
binary_matrix.shape

(18565, 4098)

# Save Binary Matrix

In [22]:
# Tag the output with the current year and month, e.g. '2017_07'.
date_tag = datetime.date.today().strftime('%Y_%m')
# The file is written as a gzip stream, so it is named '.tsv.gz'. It used to be
# named '.tsv.zip', which zip tools cannot open and which makes extension-based
# compression inference pick the wrong codec when the file is read back.
filename = '~/./Documents/Harmonizome/Jensen/Tissues/Output/jensen_tissue_binary_matrix_%s.tsv.gz' % date_tag
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [23]:
# Output directory passed to the gene set library helper below.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
path = '/Users/moshesilverstein/Documents/Harmonizome/Jensen/Tissues/Output/'

In [24]:
# Name passed to the gene set library helper below; presumably used as the
# output file stem -- TODO confirm against untility_functions.
name = 'jensen_tissue_gene_set'

In [25]:
# Generate the gene set library from the binary matrix via the project helper,
# handing it the output directory and name set above. Nothing is returned to
# the notebook; the output format is defined in untility_functions.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  4098 Out of 4098   

# Create Attribute Library

In [26]:
# Output directory passed to the attribute library helper below (re-assigned
# here with the same value used for the gene set library).
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
path = '/Users/moshesilverstein/Documents/Harmonizome/Jensen/Tissues/Output/'

In [27]:
# Name passed to the attribute library helper below; presumably used as the
# output file stem -- TODO confirm against untility_functions.
name = 'jensen_tissue_attribute_set'

In [28]:
# Generate the attribute set library from the binary matrix via the project
# helper, handing it the output directory and name set above. Nothing is
# returned to the notebook; the output format is defined in untility_functions.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  18565 Out of 18565   

# Create Gene Similarity Matrix

In [29]:
# Pairwise similarity between the rows (genes) of the binary matrix, with
# 'jaccard' passed to the helper as the metric. Per the preview below the
# result is a gene-by-gene table with 1.0 on the diagonal.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [30]:
# Preview the first rows of the gene-by-gene similarity table.
gene_similarity_matix.head()

Unnamed: 0,LRRN1,GJD4,GPX8,DNAL1,NKAP,SMAP1,DNAJC7,HSPA1L,RAB40B,KIF21B,...,GGTLC1,P2RX1,CFAP206,VWA1,PDIA2,HMX1,ADGRL4,AMT,ZNF618,RNF5
LRRN1,1.0,0.037037,0.090909,0.157895,0.081081,0.119048,0.116667,0.068966,0.133333,0.136364,...,0.058824,0.060606,0.117647,0.088235,0.066667,0.0,0.033333,0.115385,0.083333,0.061224
GJD4,0.037037,1.0,0.0,0.090909,0.166667,0.225,0.131148,0.03125,0.090909,0.130435,...,0.052632,0.088235,0.0,0.054054,0.030303,0.0,0.0,0.22449,0.2,0.08
GPX8,0.090909,0.0,1.0,0.25,0.085714,0.285714,0.181818,0.16,0.066667,0.066667,...,0.142857,0.1,0.214286,0.296296,0.153846,0.0,0.26087,0.217391,0.190476,0.219512
DNAL1,0.157895,0.090909,0.25,1.0,0.2,0.228571,0.145455,0.125,0.153846,0.095238,...,0.4,0.033333,0.363636,0.178571,0.166667,0.0,0.125,0.173913,0.170732,0.116279
NKAP,0.081081,0.166667,0.085714,0.2,1.0,0.297872,0.35,0.153846,0.090909,0.230769,...,0.103448,0.042553,0.1,0.159091,0.121951,0.0,0.153846,0.333333,0.32,0.294118


## Save Gene Similarity Matrix 

In [31]:
# Tag the output with the current year and month, e.g. '2017_07'.
date_tag = datetime.date.today().strftime('%Y_%m')
# The file is written as a gzip stream, so it is named '.tsv.gz'. It used to be
# named '.tsv.zip', which zip tools cannot open and which makes extension-based
# compression inference pick the wrong codec when the file is read back.
# The 'matix' spelling in the stem is kept so the base name is unchanged.
filename = '~/./Documents/Harmonizome/Jensen/Tissues/Output/jensen_tissue_gene_similarity_matix_%s.tsv.gz' % date_tag
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [32]:
# Same similarity helper applied to the transposed matrix, so the rows being
# compared are the tissue attributes instead of the genes; 'jaccard' is passed
# as the metric.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [33]:
# Preview the first rows of the attribute-by-attribute similarity table.
attribute_similarity_matix.head()

Unnamed: 0,Oc cell,MC3T3-E1 cell,colo684,MSTO-211H cell,Inner medullary collecting duct,Middle ear,Photophore,Bursa copulatrix,Parahippocampal gyrus,A-1235 cell,...,Meynert's basal nucleus,wehi279,sv3t3,Parietal ganglion,Embryogenic cell line,dh14,Respiratory epithelium cell,gpe86,c6bu1,Abductor
Oc cell,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MC3T3-E1 cell,0.0,1.0,0.0,0.0,0.0,0.011364,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
colo684,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MSTO-211H cell,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Inner medullary collecting duct,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Save Attribute Similarity Matrix

In [34]:
# Tag the output with the current year and month, e.g. '2017_07'.
date_tag = datetime.date.today().strftime('%Y_%m')
# The file is written as a gzip stream, so it is named '.tsv.gz'. It used to be
# named '.tsv.zip', which zip tools cannot open and which makes extension-based
# compression inference pick the wrong codec when the file is read back.
# The 'matix' spelling in the stem is kept so the base name is unchanged.
filename = '~/./Documents/Harmonizome/Jensen/Tissues/Output/jensen_tissue_attribute_similarity_matix_%s.tsv.gz' % date_tag
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [35]:
# Build the gene table from the binary matrix via the project helper. Per the
# preview below it has one row per gene with 'GeneSym' and 'GeneID' columns;
# how the IDs are looked up is defined in untility_functions.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  18565 Out of 18565   

In [36]:
# Preview the first rows of the gene table.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,LRRN1,57633
1,GJD4,219770
2,GPX8,493869
3,DNAL1,83544
4,NKAP,79576


In [37]:
# (number of genes, number of columns) in the gene table.
gene_list.shape

(18565, 2)

### Save Gene List

In [38]:
# Tag the output with the current year and month, e.g. '2017_07'.
date_tag = datetime.date.today().strftime('%Y_%m')
# The file is written as a gzip stream, so it is named '.tsv.gz'. It used to be
# named '.tsv.zip', which zip tools cannot open and which makes extension-based
# compression inference pick the wrong codec when the file is read back.
filename = '~/./Documents/Harmonizome/Jensen/Tissues/Output/jensen_tissue_gene_list_%s.tsv.gz' % date_tag
# index=False: the default integer row labels are not written to the file.
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [39]:
# Build the attribute table from the binary matrix via the project helper. Per
# the preview below it has a single 'Attributes' column holding the tissue names.
attribute_list = uf.createAttributeList(binary_matrix)

In [40]:
# Preview the first rows of the attribute table.
attribute_list.head()

Unnamed: 0,Attributes
0,Oc cell
1,MC3T3-E1 cell
2,colo684
3,MSTO-211H cell
4,Inner medullary collecting duct


In [41]:
# (number of attributes, number of columns) in the attribute table.
attribute_list.shape

(4098, 1)

### Save Attribute List

In [42]:
# Tag the output with the current year and month, e.g. '2017_07'.
date_tag = datetime.date.today().strftime('%Y_%m')
# The file is written as a gzip stream, so it is named '.tsv.gz'. It used to be
# named '.tsv.zip', which zip tools cannot open and which makes extension-based
# compression inference pick the wrong codec when the file is read back.
filename = '~/./Documents/Harmonizome/Jensen/Tissues/Output/jensen_tissue_attribute_list_%s.tsv.gz' % date_tag
# index=False: the default integer row labels are not written to the file.
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [43]:
# Build the long-format edge table via the project helper. Per the preview and
# shape below it has 'Attribute', 'Gene', 'GeneID' and 'Weight' columns and one
# row for every gene/attribute combination, including those with Weight 0.
gene_attribute_edge_list = uf.createGeneAttributeEdgeList(binary_matrix, gene_list)

Progeres: 100%  4098 Out of 4098   

In [44]:
# Preview the first rows of the edge table.
gene_attribute_edge_list.head()

Unnamed: 0,Attribute,Gene,GeneID,Weight
0,Oc cell,LRRN1,57633,0.0
1,Oc cell,GJD4,219770,0.0
2,Oc cell,GPX8,493869,0.0
3,Oc cell,DNAL1,83544,0.0
4,Oc cell,NKAP,79576,0.0


In [45]:
# (number of gene/attribute rows, number of columns) in the edge table.
gene_attribute_edge_list.shape

(76079370, 4)

### Get Number of (Statistically Relevant) Gene-Attribute Associations

In [46]:
# Shape of the sub-table holding only edges with a non-zero Weight; its first
# entry is the number of actual gene-attribute associations.
gene_attribute_edge_list[gene_attribute_edge_list['Weight'].ne(0)].shape

(434311, 4)

### Save Gene-Attribute Edge List

In [47]:
# Tag the output with the current year and month, e.g. '2017_07'.
date_tag = datetime.date.today().strftime('%Y_%m')
# The file is written as a gzip stream, so it is named '.tsv.gz'. It used to be
# named '.tsv.zip', which zip tools cannot open and which makes extension-based
# compression inference pick the wrong codec when the file is read back.
filename = '~/./Documents/Harmonizome/Jensen/Tissues/Output/jensen_tissue_gene_attribute_edge_list_%s.tsv.gz' % date_tag
# index=False: the default integer row labels are not written to the file.
gene_attribute_edge_list.to_csv(filename, sep='\t', index=False, compression='gzip')