# Bgee Mouse Anatomical Entity

Author: Moshe Silverstein  
Date: 08-18  
Data Source Home: https://bgee.org/     
Data Source Download: https://bgee.org/?page=download&action=expr_calls#id1 

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import utility_functions as uf
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

In [2]:
import seaborn as sns
# Apply seaborn's default plot styling; color_codes=True also remaps
# matplotlib's one-letter colour codes ("b", "g", "r", ...) to the seaborn palette.
sns.set(color_codes=True)
# Seed NumPy's global RNG with the sum of the code points of "distributions".
seed_value = sum(ord(letter) for letter in "distributions")
np.random.seed(seed_value)

# Path to Output Files

In [3]:
# Output directory for every file written by this notebook. Defaults to the
# Output/ folder relative to the working directory (the input file is likewise
# read from a relative Input/ folder); set BGEE_OUTPUT_DIR to redirect.
# The trailing separator is kept because file names are built as `path + ...`.
path = os.path.join(os.environ.get('BGEE_OUTPUT_DIR', 'Output'), '')

# Load Data

In [4]:
# Load the Bgee mouse expression calls (tab-separated).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; the warning tail left in this cell's output looks
# like the mixed-type DtypeWarning that chunked inference produces.
df = pd.read_csv('Input/Mus_musculus_expr_advanced_development.tsv', sep='\t', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Preview the raw table; only a few of its columns are kept further down.
df.head()

Unnamed: 0,Gene ID,Gene name,Anatomical entity ID,Anatomical entity name,Developmental stage ID,Developmental stage name,Expression,Call quality,Expression rank,Including observed data,...,In situ hybridization experiment count showing expression of this gene in this condition or in sub-conditions with a low quality,In situ hybridization experiment count showing absence of expression of this gene in this condition or valid parent conditions with a high quality,In situ hybridization experiment count showing absence of expression of this gene in this condition or valid parent conditions with a low quality,Including in situ hybridization observed data,RNA-Seq data,RNA-Seq experiment count showing expression of this gene in this condition or in sub-conditions with a high quality,RNA-Seq experiment count showing expression of this gene in this condition or in sub-conditions with a low quality,RNA-Seq experiment count showing absence of expression of this gene in this condition or valid parent conditions with a high quality,RNA-Seq experiment count showing absence of expression of this gene in this condition or valid parent conditions with a low quality,Including RNA-Seq observed data
0,ENSMUSG00000000001,Gnai3,CL:0000023,oocyte,UBERON:0000104,life cycle,present,gold quality,1120,yes,...,0,0,0,no,no data,0,0,0,0,no
1,ENSMUSG00000000001,Gnai3,CL:0000025,egg cell,UBERON:0000113,post-juvenile adult stage,present,silver quality,18600,yes,...,0,0,0,no,no data,0,0,0,0,no
2,ENSMUSG00000000001,Gnai3,CL:0000057,fibroblast,MmusDv:0000052,8 weeks (mouse),present,silver quality,1850,yes,...,0,0,0,no,present,1,0,0,0,yes
3,ENSMUSG00000000001,Gnai3,CL:0000365,zygote,UBERON:0000106,zygote stage,present,gold quality,1410,yes,...,0,0,0,no,no data,0,0,0,0,no
4,ENSMUSG00000000001,Gnai3,CL:0000510,paneth cell,MmusDv:0000050,6 weeks (mouse),present,silver quality,1820,yes,...,0,0,0,no,no data,0,0,0,0,no


In [6]:
# Size of the raw table (rows, columns) before any filtering.
df.shape

(15706235, 32)

# Get relevant data

In [7]:
# Keep only the gene, the anatomical entity, the developmental stage and the
# two columns describing the expression call.
relevant_columns = [
    'Gene name',
    'Anatomical entity name',
    'Developmental stage name',
    'Expression',
    'Call quality',
]
df = df[relevant_columns]

In [8]:
# Preview the reduced table.
df.head()

Unnamed: 0,Gene name,Anatomical entity name,Developmental stage name,Expression,Call quality
0,Gnai3,oocyte,life cycle,present,gold quality
1,Gnai3,egg cell,post-juvenile adult stage,present,silver quality
2,Gnai3,fibroblast,8 weeks (mouse),present,silver quality
3,Gnai3,zygote,zygote stage,present,gold quality
4,Gnai3,paneth cell,6 weeks (mouse),present,silver quality


In [9]:
# Retain only the rows whose expression call is exactly 'present'.
is_present = df['Expression'] == 'present'
df = df[is_present]

In [10]:
# From here on only the (gene, anatomical entity) pairs are needed.
pair_columns = ['Gene name', 'Anatomical entity name']
df = df[pair_columns]

In [11]:
# Preview the gene / anatomical-entity pairs.
df.head()

Unnamed: 0,Gene name,Anatomical entity name
0,Gnai3,oocyte
1,Gnai3,egg cell
2,Gnai3,fibroblast
3,Gnai3,zygote
4,Gnai3,paneth cell


In [12]:
# Number of pairs left after keeping only 'present' calls.
df.shape

(10506933, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [13]:
# Move the gene symbols into the index before the symbol-mapping step.
df = df.set_index('Gene name')

In [14]:
# Map the gene symbols (now the index) to approved symbols via the project helper.
# NOTE(review): the return value is discarded, so the helper presumably
# modifies df in place — confirm in utility_functions.
uf.mapgenesymbols(df)

Progeres: 99%  10506385 Out of 10506933   

# Drop Duplicates

In [15]:
# Turn the gene-symbol index back into an ordinary column.
df = df.reset_index()

In [16]:
# Collapse repeated (gene, anatomical entity) rows to a single row each.
df = df.drop_duplicates()

In [17]:
# Preview the mapped, de-duplicated pairs.
df.head()

Unnamed: 0,Gene name,Anatomical entity name
0,GNAI3,oocyte
1,GNAI3,egg cell
2,GNAI3,fibroblast
3,GNAI3,zygote
4,GNAI3,paneth cell


In [18]:
# Number of unique (gene, anatomical entity) pairs.
df.shape

(3163055, 2)

# Create Binary Matrix

In [19]:
# Build the matrix from the (gene, anatomical entity) pairs via the project helper.
# NOTE(review): judging by the preview of the result, rows are genes, columns are
# anatomical entities and cells are 0.0/1.0 — confirm in utility_functions.
binary_matrix = uf.createBinaryMatrix(df)

Progeres: 100%  16289 Out of 16289   

In [20]:
# Preview the first rows of the matrix.
binary_matrix.head()

Unnamed: 0,rhombomere lateral wall,pons,rete ovarii of mesonephros (mouse),bronchiole,mesenchyme of umbilical cord,posteromedial cortical amygdaloid nucleus,dorsal motor nucleus of vagus nerve,L5 vertebral cartilage condensation (mouse),subscapularis muscle,hilus of dentate gyrus,...,CA2 field of hippocampus,external female genitalia,ventral part of pharyngeal pouch 3,interdigital region between manual digits 4 and 5,acoustico-facial VII-VIII ganglion complex,inferior vestibular nucleus,liver primordium,internal capsule of telencephalon,layer of neocortex,mesenchyme of rest of paramesonephric duct of female (mouse)
HOXB4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRNAU1AP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LRBA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RNF181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SNRPF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Matrix dimensions (rows, columns).
binary_matrix.shape

(16289, 3142)

# Save Binary Matrix

In [22]:
# Save the matrix as a gzip-compressed TSV stamped with the current year and
# month (YYYY_MM). The suffix is .gz so that it matches the gzip compression
# actually used; the previous .zip suffix mislabelled the file format.
stamp = datetime.date.today().strftime('%Y_%m')
filename = path + 'bgee_mouse_anatomical_matrix_%s.tsv.gz' % stamp
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [27]:
# Name passed to the gene-set-library helper in the next cell.
name = 'bgee_mouse_anatomical_gene_set'

In [28]:
# Create the gene set library from the matrix via the project helper.
# NOTE(review): presumably written under `path` using `name`; the exact file
# name and format are decided inside utility_functions — confirm there.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  3142 Out of 3142   

# Create Attribute Library

In [29]:
# Name passed to the attribute-set-library helper in the next cell.
name = 'bgee_mouse_anatomical_attribute_set'

In [30]:
# Create the attribute set library from the matrix via the project helper.
# NOTE(review): presumably written under `path` using `name`; the exact file
# name and format are decided inside utility_functions — confirm there.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  16289 Out of 16289   

# Create Gene List

In [23]:
# Derive the gene table from the matrix via the project helper.
# NOTE(review): the preview of the result shows GeneSym and GeneID columns; where
# the IDs are looked up is not visible here — confirm in utility_functions.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  16289 Out of 16289   

In [24]:
# Preview the gene table.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,HOXB4,3214
1,TRNAU1AP,54952
2,LRBA,987
3,RNF181,51255
4,SNRPF,6636


In [25]:
# Gene table dimensions (rows, columns).
gene_list.shape

(16289, 2)

# Save Gene List

In [26]:
# Save the gene table as a gzip-compressed TSV stamped with the current year
# and month (YYYY_MM). The suffix is .gz so that it matches the gzip
# compression actually used; the previous .zip suffix mislabelled the format.
stamp = datetime.date.today().strftime('%Y_%m')
filename = path + 'bgee_mouse_anatomical_gene_list_%s.tsv.gz' % stamp
# The row index is left out of the file; only the table's columns are written.
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [31]:
# Derive the attribute (anatomical entity) table from the matrix via the project helper.
# NOTE(review): the displayed shape of the result has zero columns, so the
# attribute names appear to live in the index — confirm in utility_functions.
attribute_list = uf.createAttributeList(binary_matrix)

In [32]:
# Preview the attribute table.
attribute_list.head()

rhombomere lateral wall
pons
rete ovarii of mesonephros (mouse)
bronchiole
mesenchyme of umbilical cord


In [33]:
# Attribute table dimensions (rows, columns).
attribute_list.shape

(3142, 0)

# Save Attribute List

In [34]:
# Save the attribute table as a gzip-compressed TSV stamped with the current
# year and month (YYYY_MM). The suffix is .gz so that it matches the gzip
# compression actually used; the previous .zip suffix mislabelled the format.
stamp = datetime.date.today().strftime('%Y_%m')
filename = path + 'bgee_mouse_anatomical_attribute_list_%s.tsv.gz' % stamp
# The index must be written: as displayed, attribute_list has no columns and
# carries the attribute names in its index, so index=False would save a file
# without any attribute names.
attribute_list.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [35]:
# Attribute-by-attribute similarity using the 'jaccard' option of the project helper.
# The matrix is transposed first so that attributes are the rows handed to the helper.
# NOTE(review): assumes createSimilarityMatrix compares rows pairwise — confirm.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [36]:
# Preview the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,rhombomere lateral wall,pons,rete ovarii of mesonephros (mouse),bronchiole,mesenchyme of umbilical cord,posteromedial cortical amygdaloid nucleus,dorsal motor nucleus of vagus nerve,L5 vertebral cartilage condensation (mouse),subscapularis muscle,hilus of dentate gyrus,...,CA2 field of hippocampus,external female genitalia,ventral part of pharyngeal pouch 3,interdigital region between manual digits 4 and 5,acoustico-facial VII-VIII ganglion complex,inferior vestibular nucleus,liver primordium,internal capsule of telencephalon,layer of neocortex,mesenchyme of rest of paramesonephric duct of female (mouse)
,,,,,,,,,,,,,,,,,,,,,
rhombomere lateral wall,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pons,0.0,1.0,0.000907,0.0036,0.0,0.0,0.00181,0.000905,0.000904,0.006346,...,0.026564,0.004476,0.001808,0.002712,0.00361,0.000907,0.010453,0.002715,0.012456,0.003591
rete ovarii of mesonephros (mouse),0.0,0.000907,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bronchiole,0.0,0.0036,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mesenchyme of umbilical cord,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# Attribute similarity matrix dimensions (rows, columns).
attribute_similarity_matix.shape

(3142, 3142)

# Save Attribute Similarity Matrix

In [38]:
# Save the attribute similarity matrix as a gzip-compressed TSV stamped with
# the current year and month (YYYY_MM). The suffix is .gz so that it matches
# the gzip compression actually used; the previous .zip suffix mislabelled it.
# NOTE(review): the 'matix' spelling in the file stem is kept as-is so the
# stem stays the same as in previously generated files.
stamp = datetime.date.today().strftime('%Y_%m')
filename = path + 'bgee_mouse_anatomical_attribute_similarity_matix_%s.tsv.gz' % stamp
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Similarity Matrix

In [41]:
# Gene-by-gene similarity using the 'jaccard' option of the project helper.
# The matrix is passed untransposed, so genes are the rows handed to the helper.
# NOTE(review): assumes createSimilarityMatrix compares rows pairwise — confirm.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [42]:
# Preview the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,HOXB4,TRNAU1AP,LRBA,RNF181,SNRPF,TSEN34,TGFB1,NTN5,ZDHHC23,DNAJC13,...,ZIC5,MANEAL,ADAM29,ARHGEF3,ZG16,SLC29A4,TSSC4,OR13J1,ITGAM,FKBP7
,,,,,,,,,,,,,,,,,,,,,
HOXB4,1.0,0.319209,0.326316,0.318653,0.09589,0.085714,0.273743,0.062201,0.07619,0.317708,...,0.192691,0.136691,0.019324,0.319672,0.318584,0.177122,0.333333,0.114679,0.085714,0.324468
TRNAU1AP,0.319209,1.0,0.820261,0.801282,0.122605,0.080769,0.413889,0.057692,0.073077,0.814935,...,0.445614,0.387218,0.015385,0.798658,0.754386,0.367647,0.83612,0.129771,0.080769,0.777419
LRBA,0.326316,0.820261,1.0,0.932258,0.096346,0.070707,0.425587,0.050505,0.063973,0.947712,...,0.385802,0.348837,0.013468,0.897351,0.778146,0.349835,0.89644,0.121212,0.070707,0.849057
RNF181,0.318653,0.801282,0.932258,1.0,0.094771,0.069536,0.420103,0.049669,0.062914,0.964052,...,0.371601,0.338762,0.013245,0.907591,0.794702,0.352941,0.888179,0.119205,0.069536,0.86478
SNRPF,0.09589,0.122605,0.096346,0.094771,1.0,0.636364,0.105882,0.454545,0.575758,0.09571,...,0.178344,0.224138,0.121212,0.099644,0.105263,0.239316,0.098976,0.277778,0.636364,0.105802


In [43]:
# Gene similarity matrix dimensions (rows, columns).
gene_similarity_matix.shape

(16289, 16289)

# Save Gene Similarity Matrix

In [44]:
# Save the gene similarity matrix as a gzip-compressed TSV stamped with the
# current year and month (YYYY_MM). The suffix is .gz so that it matches the
# gzip compression actually used; the previous .zip suffix mislabelled it.
# NOTE(review): the 'matix' spelling in the file stem is kept as-is so the
# stem stays the same as in previously generated files.
stamp = datetime.date.today().strftime('%Y_%m')
filename = path + 'bgee_mouse_anatomical_gene_similarity_matix_%s.tsv.gz' % stamp
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene-Attribute Edge List

In [39]:
# Name passed to the edge-list helper in the next cell.
name = 'bgee_mouse_anatomical_gene_attribute_edge_list'

In [40]:
# Create the gene-attribute edge list from the matrix, the attribute table and
# the gene table via the project helper.
# NOTE(review): presumably written under `path` using `name`; the file layout
# is decided inside utility_functions — confirm there.
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, path, name)

Progeres: 100%  3142 Out of 3142   

 The number of statisticaly relevent gene-attribute associations is: 3163055
