# JENSEN LAB (TISSUE)

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Source: http://tissues.jensenlab.org/Search

In [1]:
import sys, datetime
import numpy as np
import pandas as pd
import importlib
import my_functions as mf
%matplotlib inline

In [2]:
# Re-import the local helper module so edits to my_functions.py take effect
# without restarting the kernel.
importlib.reload(mf)

<module 'my_functions' from '/Users/moshesilverstein/Documents/Harmonizome/Jensen/Tissues/my_functions.py'>

# LOAD DATA

In [3]:
# Column labels applied to every input TSV through names=col in the
# read_csv calls.
col = ['Ensemble Acc', 'GeneSym', 'BTO', 'Tissue', 'Source', 'SampleInfo', 'Value']

In [4]:
# Experiments file: tab-separated, no header row, columns labelled from `col`.
dfE = pd.read_csv(
    'Input/human_tissue_experiments_filtered.tsv',
    sep='\t',
    header=None,
    names=col,
)

In [5]:
# Knowledge file: tab-separated, no header row, columns labelled from `col`.
dfK = pd.read_csv(
    'Input/human_tissue_knowledge_filtered.tsv',
    sep='\t',
    header=None,
    names=col,
)

In [6]:
# Text-mining file: tab-separated, no header row, columns labelled from `col`.
dfT = pd.read_csv(
    'Input/human_tissue_textmining_filtered.tsv',
    sep='\t',
    header=None,
    names=col,
)

In [7]:
# Stack the three source tables row-wise; each keeps its own row labels.
df = pd.concat((dfE, dfK, dfT), axis=0)

In [8]:
# Preview the first rows of the combined table.
df.head()

Unnamed: 0,Ensemble Acc,GeneSym,BTO,Tissue,Source,SampleInfo,Value
0,ENSP00000000233,ARF5,BTO:0000041,Medulla oblongata,GNF,103 Intensity units,0
1,ENSP00000000233,ARF5,BTO:0000045,Adrenal cortex,GNF,50 Intensity units,0
2,ENSP00000000233,ARF5,BTO:0000047,Adrenal gland,HPA-RNA,61.1 FPKM,1
3,ENSP00000000233,ARF5,BTO:0000047,Adrenal gland,HPM,6 peptides,0
4,ENSP00000000233,ARF5,BTO:0000084,Vermiform appendix,Exon array,151 intensity units,0


In [9]:
# (rows, columns) of the combined table.
df.shape

(1732815, 7)

# Get Only Gene-Tissue Data

In [10]:
# Keep only the gene symbol and tissue name; every other column is dropped.
df = df.loc[:, ['GeneSym', 'Tissue']]

In [11]:
# Collapse repeated (gene, tissue) rows down to one row per pair.
df = df.drop_duplicates()

In [12]:
# Move the gene symbol out of the columns and into the row index.
df = df.set_index('GeneSym')

In [13]:
# Preview the gene-indexed gene/tissue pairs.
df.head()

Unnamed: 0_level_0,Tissue
GeneSym,Unnamed: 1_level_1
ARF5,Medulla oblongata
ARF5,Adrenal cortex
ARF5,Adrenal gland
ARF5,Vermiform appendix
ARF5,Blood


In [14]:
# Number of de-duplicated gene-tissue rows before symbol mapping.
df.shape

(1045367, 1)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [15]:
# NOTE(review): the return value is not captured, so this presumably rewrites
# df in place (the recorded row count differs before and after the call) --
# confirm against my_functions.mapgenesymbols.
mf.mapgenesymbols(df)

Progeres: 100%  1045367 Out of 1045367   

In [16]:
# Row count after symbol mapping, for comparison with the count before it.
df.shape

(1030026, 1)

# Create Binary Matrix

In [17]:
# Turn the gene-symbol index back into an ordinary 'GeneSym' column.
df = df.reset_index()

In [18]:
# Build the gene-by-tissue matrix from the (GeneSym, Tissue) pairs.
# NOTE(review): per the recorded preview, rows are gene symbols and columns are
# tissue names with 0/1 entries -- confirm against my_functions.createBinaryMatix.
binary_matrix = mf.createBinaryMatix(df)

Progeres: 100%  18876 Out of 18876   

In [19]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,MOLT-3 cell,ib92,208f,Bolting stage,Parahippocampal region,Bony labyrinth,Colon sigmoideum,Ciliary epithelium,LAD-2 cell,Kurloff cell,...,mp12,bm87,ccrfhsb2,BT-549 cell,lm4,Intercostal muscle,b82,aml193,Vegetative cell,MALME-3M cell
IRAK3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
INHBC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ITGB8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PEX16,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MED17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# (rows, columns) of the binary matrix.
binary_matrix.shape

(18876, 4099)

# Save Binary Matrix

In [21]:
# Tag the output file with the current year and month (YYYY_MM).
stamp = datetime.date.today().strftime('%Y_%m')
# The payload is gzip-compressed, so the file is named .gz rather than .zip:
# a gzip stream saved as .zip cannot be opened as a zip archive, and pandas
# infers the codec from the extension when the file is read back.
filename = '~/./Documents/Harmonizome/Jensen/Tissues/Output/jensen_tissue_binary_matrix_%s.tsv.gz' % stamp
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [22]:
# Output directory handed to the my_functions library writer.
# NOTE(review): hard-coded absolute path into one user's home directory; the
# to_csv save cells in this notebook use '~'-relative paths instead.
path = '/Users/moshesilverstein/Documents/Harmonizome/Jensen/Tissues/Output/'

In [23]:
# Name passed to the gene set library writer (presumably the output file
# stem -- verify in my_functions).
name = 'jensen_tissue_gene_set'

In [24]:
# NOTE(review): the return value is discarded, so this presumably writes the
# gene set library under `path` using `name` -- confirm against
# my_functions.createUpGeneSetLib.
mf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  4099 Out of 4099   

# Create Attribute Library

In [25]:
# Output directory handed to the my_functions attribute library writer.
# NOTE(review): hard-coded absolute path into one user's home directory; the
# to_csv save cells in this notebook use '~'-relative paths instead.
path = '/Users/moshesilverstein/Documents/Harmonizome/Jensen/Tissues/Output/'

In [26]:
# Name passed to the attribute set library writer (presumably the output file
# stem -- verify in my_functions).
name = 'jensen_tissue_attribute_set'

In [27]:
# NOTE(review): the return value is discarded, so this presumably writes the
# attribute set library under `path` using `name` -- confirm against
# my_functions.createUpAttributeSetLib.
mf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  18876 Out of 18876   

# Create Gene Similarity Matrix

In [28]:
# Similarity matrix computed from binary_matrix with the 'jaccard' metric.
# NOTE(review): the recorded preview is labelled by gene symbol on both axes,
# so this looks like row-vs-row similarity -- confirm in my_functions.
gene_similarity_matix = mf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [29]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,IRAK3,INHBC,ITGB8,PEX16,MED17,GGTLC3,SYPL2,GYPA,INE1,KCNK3,...,STARD3NL,NAPA,ATP6V1A,BSND,C12orf50,GNA11,GPHA2,EWSAT1,CFHR2,WDR82
IRAK3,1.0,0.036364,0.6875,0.653333,0.447368,0.396552,0.722222,0.38961,0.358491,0.328125,...,0.53125,0.5,0.494949,0.218182,0.740741,0.511628,0.090909,0.0,0.298246,0.37
INHBC,0.036364,1.0,0.053571,0.041667,0.051724,0.066667,0.047619,0.035714,0.0,0.028571,...,0.031579,0.029703,0.03125,0.0,0.046512,0.038462,0.222222,0.0,0.086957,0.035294
ITGB8,0.6875,0.053571,1.0,0.68,0.513514,0.431034,0.610169,0.379747,0.275862,0.318182,...,0.536082,0.55,0.530612,0.210526,0.627119,0.571429,0.107143,0.0,0.333333,0.43299
PEX16,0.653333,0.041667,0.68,1.0,0.641026,0.375,0.541667,0.315789,0.267606,0.303797,...,0.683673,0.660194,0.66,0.164384,0.577465,0.626374,0.083333,0.0,0.243243,0.519608
MED17,0.447368,0.051724,0.513514,0.641026,1.0,0.307692,0.385714,0.275862,0.333333,0.271429,...,0.49505,0.495238,0.52,0.126984,0.324324,0.55814,0.103448,0.0,0.3,0.58427


## Save Gene Similarity Matrix 

In [30]:
# Tag the output file with the current year and month (YYYY_MM).
stamp = datetime.date.today().strftime('%Y_%m')
# The payload is gzip-compressed, so the file is named .gz rather than .zip:
# a gzip stream saved as .zip cannot be opened as a zip archive, and pandas
# infers the codec from the extension when the file is read back.
filename = '~/./Documents/Harmonizome/Jensen/Tissues/Output/jensen_tissue_gene_similarity_matix_%s.tsv.gz' % stamp
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [31]:
# Same helper and metric as the gene similarity step, but fed the transposed
# matrix so that the attributes (columns of binary_matrix) become the rows.
attribute_similarity_matix = mf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [32]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,MOLT-3 cell,ib92,208f,Bolting stage,Parahippocampal region,Bony labyrinth,Colon sigmoideum,Ciliary epithelium,LAD-2 cell,Kurloff cell,...,mp12,bm87,ccrfhsb2,BT-549 cell,lm4,Intercostal muscle,b82,aml193,Vegetative cell,MALME-3M cell
MOLT-3 cell,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ib92,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
208f,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bolting stage,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Parahippocampal region,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Save Attribute Similarity Matrix

In [33]:
# Tag the output file with the current year and month (YYYY_MM).
stamp = datetime.date.today().strftime('%Y_%m')
# The payload is gzip-compressed, so the file is named .gz rather than .zip:
# a gzip stream saved as .zip cannot be opened as a zip archive, and pandas
# infers the codec from the extension when the file is read back.
filename = '~/./Documents/Harmonizome/Jensen/Tissues/Output/jensen_tissue_attribute_similarity_matix_%s.tsv.gz' % stamp
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [34]:
# Build the gene table from binary_matrix.
# NOTE(review): the recorded preview shows GeneSym and GeneID columns; where
# the IDs come from is decided inside my_functions.createGeneList -- verify there.
gene_list = mf.createGeneList(binary_matrix)

Progeres: 100%  18876 Out of 18876   

In [35]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,IRAK3,11213
1,INHBC,3626
2,ITGB8,3696
3,PEX16,9409
4,MED17,9440


In [36]:
# (rows, columns) of the gene list.
gene_list.shape

(18876, 2)

### Save Gene List

In [37]:
# Tag the output file with the current year and month (YYYY_MM).
stamp = datetime.date.today().strftime('%Y_%m')
# The payload is gzip-compressed, so the file is named .gz rather than .zip:
# a gzip stream saved as .zip cannot be opened as a zip archive, and pandas
# infers the codec from the extension when the file is read back.
filename = '~/./Documents/Harmonizome/Jensen/Tissues/Output/jensen_tissue_gene_list_%s.tsv.gz' % stamp
# index=False: the row labels are not written to the file.
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [38]:
# Build the attribute table from binary_matrix.
# NOTE(review): the recorded preview shows a single 'Attributes' column holding
# the matrix's column names -- confirm against my_functions.createAttributeList.
attribute_list = mf.createAttributeList(binary_matrix)

In [39]:
# Preview the first rows of the attribute list.
attribute_list.head()

Unnamed: 0,Attributes
0,MOLT-3 cell
1,ib92
2,208f
3,Bolting stage
4,Parahippocampal region


In [40]:
# (rows, columns) of the attribute list.
attribute_list.shape

(4099, 1)

### Save Attribute List

In [41]:
# Tag the output file with the current year and month (YYYY_MM).
stamp = datetime.date.today().strftime('%Y_%m')
# The payload is gzip-compressed, so the file is named .gz rather than .zip:
# a gzip stream saved as .zip cannot be opened as a zip archive, and pandas
# infers the codec from the extension when the file is read back.
filename = '~/./Documents/Harmonizome/Jensen/Tissues/Output/jensen_tissue_attribute_list_%s.tsv.gz' % stamp
# index=False: the row labels are not written to the file.
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [42]:
# Build the long-format edge table from the binary matrix and the gene list.
# NOTE(review): the recorded row count equals genes x attributes, so zero-weight
# pairs appear to be included -- confirm against
# my_functions.createGeneAttributeEdgeList.
gene_attribute_edge_list = mf.createGeneAttributeEdgeList(binary_matrix, gene_list)

Progeres: 100%  4099 Out of 4099   

In [43]:
# Preview the first rows of the edge list.
gene_attribute_edge_list.head()

Unnamed: 0,Attribute,Gene,GeneID,Weight
0,MOLT-3 cell,IRAK3,11213,0.0
1,MOLT-3 cell,INHBC,3626,0.0
2,MOLT-3 cell,ITGB8,3696,0.0
3,MOLT-3 cell,PEX16,9409,0.0
4,MOLT-3 cell,MED17,9440,0.0


In [44]:
# (rows, columns) of the full edge list.
gene_attribute_edge_list.shape

(77372724, 4)

### Get Number of (Statistically Relevant) Gene-Attribute Associations

In [45]:
# Shape of the edge list restricted to rows whose Weight is not zero.
has_weight = gene_attribute_edge_list['Weight'].ne(0)
gene_attribute_edge_list[has_weight].shape

(1026484, 4)

### Save Gene-Attribute Edge List

In [46]:
# Tag the output file with the current year and month (YYYY_MM).
stamp = datetime.date.today().strftime('%Y_%m')
# The payload is gzip-compressed, so the file is named .gz rather than .zip:
# a gzip stream saved as .zip cannot be opened as a zip archive, and pandas
# infers the codec from the extension when the file is read back.
filename = '~/./Documents/Harmonizome/Jensen/Tissues/Output/jensen_tissue_gene_attribute_edge_list_%s.tsv.gz' % stamp
# index=False: the row labels are not written to the file.
gene_attribute_edge_list.to_csv(filename, sep='\t', index=False, compression='gzip')