# bgee Mouse Developmental Stage

Author: Moshe Silverstein  
Date: 08-18  
Data Source Home: https://bgee.org/     
Data Source Download: https://bgee.org/?page=download&action=expr_calls#id1 

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import utility_functions as uf
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

In [2]:
import seaborn as sns
# Switch on seaborn's styling for all subsequent plots (color_codes enabled).
sns.set(color_codes=True)

# Derive a fixed integer seed from the characters of a constant string so that
# any NumPy random draws are repeatable from one run to the next.
seed_value = sum(ord(character) for character in "distributions")
np.random.seed(seed_value)

# Path to Output Files

In [3]:
# Directory that receives every output file written by this notebook.
# It can be overridden with the BGEE_OUTPUT_DIR environment variable so the
# notebook is not tied to one machine; when the variable is unset the original
# location is used unchanged.
# os.path.join(..., '') guarantees a trailing separator, because later cells
# build file names by plain string concatenation (path + '...').
path = os.path.join(
    os.environ.get(
        'BGEE_OUTPUT_DIR',
        '/Users/moshesilverstein/Documents/Harmonizome/bgee/Output/',
    ),
    '',
)

# Load Data

In [4]:
# Load the Bgee expression-call table (tab-separated) into a DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so no column ends up with mixed inferred types.
# NOTE(review): the truncated warning in this cell's saved output looks like
# pandas' mixed-type DtypeWarning, which recommends this option - confirm on re-run.
# Parsing needs more memory with this setting.
df = pd.read_csv('Input/Mus_musculus_expr_advanced_development.tsv', sep='\t', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Preview the first rows of the raw table to check that the columns loaded as expected.
df.head()

Unnamed: 0,Gene ID,Gene name,Anatomical entity ID,Anatomical entity name,Developmental stage ID,Developmental stage name,Expression,Call quality,Expression rank,Including observed data,...,In situ hybridization experiment count showing expression of this gene in this condition or in sub-conditions with a low quality,In situ hybridization experiment count showing absence of expression of this gene in this condition or valid parent conditions with a high quality,In situ hybridization experiment count showing absence of expression of this gene in this condition or valid parent conditions with a low quality,Including in situ hybridization observed data,RNA-Seq data,RNA-Seq experiment count showing expression of this gene in this condition or in sub-conditions with a high quality,RNA-Seq experiment count showing expression of this gene in this condition or in sub-conditions with a low quality,RNA-Seq experiment count showing absence of expression of this gene in this condition or valid parent conditions with a high quality,RNA-Seq experiment count showing absence of expression of this gene in this condition or valid parent conditions with a low quality,Including RNA-Seq observed data
0,ENSMUSG00000000001,Gnai3,CL:0000023,oocyte,UBERON:0000104,life cycle,present,gold quality,1120,yes,...,0,0,0,no,no data,0,0,0,0,no
1,ENSMUSG00000000001,Gnai3,CL:0000025,egg cell,UBERON:0000113,post-juvenile adult stage,present,silver quality,18600,yes,...,0,0,0,no,no data,0,0,0,0,no
2,ENSMUSG00000000001,Gnai3,CL:0000057,fibroblast,MmusDv:0000052,8 weeks (mouse),present,silver quality,1850,yes,...,0,0,0,no,present,1,0,0,0,yes
3,ENSMUSG00000000001,Gnai3,CL:0000365,zygote,UBERON:0000106,zygote stage,present,gold quality,1410,yes,...,0,0,0,no,no data,0,0,0,0,no
4,ENSMUSG00000000001,Gnai3,CL:0000510,paneth cell,MmusDv:0000050,6 weeks (mouse),present,silver quality,1820,yes,...,0,0,0,no,no data,0,0,0,0,no


In [6]:
# (rows, columns) of the raw table as loaded.
df.shape

(15706235, 32)

# Get relevant data

In [7]:
# Narrow the table to the descriptive columns: gene, anatomical entity,
# developmental stage, expression call and call quality.
relevant_columns = [
    'Gene name',
    'Anatomical entity name',
    'Developmental stage name',
    'Expression',
    'Call quality',
]
df = df[relevant_columns]

In [8]:
# Preview the table after narrowing it to the relevant columns.
df.head()

Unnamed: 0,Gene name,Anatomical entity name,Developmental stage name,Expression,Call quality
0,Gnai3,oocyte,life cycle,present,gold quality
1,Gnai3,egg cell,post-juvenile adult stage,present,silver quality
2,Gnai3,fibroblast,8 weeks (mouse),present,silver quality
3,Gnai3,zygote,zygote stage,present,gold quality
4,Gnai3,paneth cell,6 weeks (mouse),present,silver quality


In [9]:
# Keep only the rows whose expression call is exactly 'present'.
is_present = df['Expression'] == 'present'
df = df[is_present]

In [10]:
# From here on only the (gene, developmental stage) pairing is needed.
gene_stage_columns = ['Gene name', 'Developmental stage name']
df = df[gene_stage_columns]

In [11]:
# Preview the remaining gene / developmental-stage pairs.
df.head()

Unnamed: 0,Gene name,Developmental stage name
0,Gnai3,life cycle
1,Gnai3,post-juvenile adult stage
2,Gnai3,8 weeks (mouse)
3,Gnai3,zygote stage
4,Gnai3,6 weeks (mouse)


In [12]:
# (rows, columns) after keeping only 'present' calls and the two columns of interest.
df.shape

(10506933, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [13]:
# Move the gene symbols into the index before the symbol-mapping step that follows.
df = df.set_index('Gene name')

In [14]:
# Map the gene symbols in df to up-to-date approved symbols.
# The return value is not captured, yet later previews show changed symbols
# (e.g. Gnai3 -> GNAI3), so the helper evidently updates df in place.
# NOTE(review): mapgenesymbols is defined in utility_functions (not shown here);
# presumably it works on df's index - confirm, and check how unmapped symbols are handled.
uf.mapgenesymbols(df)

Progeres: 100%  10506933 Out of 10506933   

# Drop Duplicates

In [15]:
# Turn the gene-symbol index back into an ordinary column so that duplicate
# (gene, stage) rows can be detected across both columns.
df = df.reset_index()

In [16]:
# Collapse repeated (gene, developmental stage) rows to a single occurrence each.
df = df.drop_duplicates()

In [17]:
# Preview the de-duplicated gene / stage pairs (symbols now mapped).
df.head()

Unnamed: 0,Gene name,Developmental stage name
0,GNAI3,life cycle
1,GNAI3,post-juvenile adult stage
2,GNAI3,8 weeks (mouse)
3,GNAI3,zygote stage
4,GNAI3,6 weeks (mouse)


In [18]:
# (rows, columns) after removing duplicate gene / stage pairs.
df.shape

(660778, 2)

# Create Binary Matrix

In [19]:
# Build the binary gene-by-attribute matrix from the de-duplicated (gene, stage) pairs.
# NOTE(review): createBinaryMatrix is defined in utility_functions (not shown here);
# the saved preview shows genes as rows, developmental stages as columns and
# 0.0/1.0 values - confirm the exact contract against the helper.
binary_matrix = uf.createBinaryMatrix(df)

Progeres: 100%  16289 Out of 16289   

In [20]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,10 weeks (mouse),2 month-old stage (mouse),Theiler stage 06 (mouse),late embryonic stage,Theiler stage 13 (mouse),Theiler stage 16 (mouse),Theiler stage 04 (mouse),cleavage stage,Theiler stage 10 (mouse),middle aged stage (mouse),...,Theiler stage 07 (mouse),Theiler stage 14 (mouse),6 weeks (mouse),gastrula stage,Theiler stage 02 (mouse),Theiler stage 21 (mouse),2 weeks (mouse),Theiler stage 24 (mouse),Theiler stage 19 (mouse),Theiler stage 25 (mouse)
CHTF8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
NLGN3,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
ATP8B3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
EDRF1,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
VAC14,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
# (rows, columns) of the binary matrix.
binary_matrix.shape

(16289, 58)

# Save Binary Matrix

In [22]:
# Stamp the output file name with the current year and month (YYYY_MM).
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = path + 'bgee_mouse_developmental_binary_matrix_%s.tsv.zip' % date_stamp

# The file name ends in .zip, so write a real zip archive. The previous
# compression='gzip' stored gzip data under a .zip name, which unzip tools and
# extension-based readers cannot open.
binary_matrix.to_csv(filename, sep='\t', compression='zip')

# Create Gene Set Library

In [23]:
# Name passed to the gene set library helper in the next cell.
name = 'bgee_mouse_developmental_gene_set'

In [24]:
# Create the gene set library from the binary matrix.
# NOTE(review): createUpGeneSetLib is defined in utility_functions (not shown here);
# presumably it writes a file under `path` named after `name` - confirm against the helper.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  58 Out of 58   

# Create Attribute Library

In [25]:
# Name passed to the attribute set library helper in the next cell
# (rebinds the `name` variable used for the gene set library above).
name = 'bgee_mouse_developmental_attribute_set'

In [26]:
# Create the attribute set library from the binary matrix.
# NOTE(review): createUpAttributeSetLib is defined in utility_functions (not shown here);
# presumably it writes a file under `path` named after `name` - confirm against the helper.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  16289 Out of 16289   

# Create Gene List

In [27]:
# Build the gene list for the genes in the binary matrix.
# NOTE(review): createGeneList is defined in utility_functions (not shown here);
# the saved preview shows GeneSym and GeneID columns - how IDs are looked up should be confirmed there.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  16289 Out of 16289   

In [28]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,CHTF8,54921
1,NLGN3,54413
2,ATP8B3,148229
3,EDRF1,26098
4,VAC14,55697


In [29]:
# (rows, columns) of the gene list.
gene_list.shape

(16289, 2)

# Save Gene List

In [30]:
# Stamp the output file name with the current year and month (YYYY_MM).
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = path + 'bgee_mouse_developmental_gene_list_%s.tsv.zip' % date_stamp

# The file name ends in .zip, so write a real zip archive. The previous
# compression='gzip' stored gzip data under a .zip name.
# index=False: the row index is not written, only the gene list's columns.
gene_list.to_csv(filename, sep='\t', index=False, compression='zip')

# Create Attribute List

In [31]:
# Build the attribute list for the attributes (developmental stages) in the binary matrix.
# NOTE(review): createAttributeList is defined in utility_functions (not shown here);
# the saved outputs show a frame with attribute names as its index and zero columns - confirm against the helper.
attribute_list = uf.createAttributeList(binary_matrix)

In [32]:
# Preview the first entries of the attribute list.
attribute_list.head()

10 weeks (mouse)
2 month-old stage (mouse)
Theiler stage 06 (mouse)
late embryonic stage
Theiler stage 13 (mouse)


In [33]:
# (rows, columns) of the attribute list; a column count of 0 means the names are held only in the index.
attribute_list.shape

(58, 0)

# Save Attribute List

In [34]:
# Stamp the output file name with the current year and month (YYYY_MM).
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = path + 'bgee_mouse_developmental_attribute_list_%s.tsv.zip' % date_stamp

# attribute_list keeps the attribute names in its index and has no data columns
# (its shape reports zero columns), so the index MUST be written: with
# index=False the saved file would contain no attribute names at all.
# The file name ends in .zip, so write a real zip archive rather than gzip data
# under a .zip name.
attribute_list.to_csv(filename, sep='\t', compression='zip')

# Create Attribute Similarity Matrix

In [35]:
# Compute attribute-to-attribute similarity. The binary matrix is transposed first,
# so the helper receives attributes rather than genes as rows.
# NOTE(review): createSimilarityMatrix is defined in utility_functions (not shown here);
# 'jaccard' presumably selects the Jaccard metric - confirm whether values are similarities or distances.
# The spelling "matix" is kept because later cells refer to this variable name.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [36]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,10 weeks (mouse),2 month-old stage (mouse),Theiler stage 06 (mouse),late embryonic stage,Theiler stage 13 (mouse),Theiler stage 16 (mouse),Theiler stage 04 (mouse),cleavage stage,Theiler stage 10 (mouse),middle aged stage (mouse),...,Theiler stage 07 (mouse),Theiler stage 14 (mouse),6 weeks (mouse),gastrula stage,Theiler stage 02 (mouse),Theiler stage 21 (mouse),2 weeks (mouse),Theiler stage 24 (mouse),Theiler stage 19 (mouse),Theiler stage 25 (mouse)
,,,,,,,,,,,,,,,,,,,,,
10 weeks (mouse),1.0,0.843117,0.276083,0.85708,0.817138,0.771833,0.719219,0.000897,0.13836,0.935659,...,0.027299,0.834025,0.932636,7.5e-05,0.723626,0.902056,0.918049,0.912507,0.882626,0.907062
2 month-old stage (mouse),0.843117,1.0,0.235828,0.741472,0.723245,0.678918,0.644767,0.000946,0.121178,0.843307,...,0.025348,0.744484,0.891614,6.3e-05,0.657923,0.859268,0.899565,0.875031,0.815312,0.826369
Theiler stage 06 (mouse),0.276083,0.235828,1.0,0.305619,0.314508,0.333578,0.363299,0.0016,0.209223,0.276688,...,0.063415,0.307001,0.263567,0.0,0.334903,0.272661,0.261204,0.267706,0.285824,0.279114
late embryonic stage,0.85708,0.741472,0.305619,1.0,0.861514,0.831938,0.75006,0.001105,0.155464,0.855382,...,0.030419,0.858912,0.824143,0.0,0.732069,0.854358,0.819319,0.84423,0.870619,0.884202
Theiler stage 13 (mouse),0.817138,0.723245,0.314508,0.861514,1.0,0.855273,0.764672,0.001219,0.165638,0.815897,...,0.033481,0.90062,0.791262,8.7e-05,0.737159,0.823222,0.789327,0.811314,0.846293,0.843537


In [37]:
# (rows, columns) of the attribute similarity matrix; expected to be square.
attribute_similarity_matix.shape

(58, 58)

# Save Attribute Similarity Matrix

In [38]:
# Stamp the output file name with the current year and month (YYYY_MM).
# The "matix" spelling in the file name is kept so the output name does not change.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = path + 'bgee_mouse_developmental_attribute_similarity_matix_%s.tsv.zip' % date_stamp

# The file name ends in .zip, so write a real zip archive. The previous
# compression='gzip' stored gzip data under a .zip name.
# The index is written because it carries the row labels of the matrix.
attribute_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# Create Gene Similarity Matrix

In [41]:
# Compute gene-to-gene similarity from the rows of the binary matrix.
# NOTE(review): createSimilarityMatrix is defined in utility_functions (not shown here);
# 'jaccard' presumably selects the Jaccard metric - confirm against the helper.
# The result is square in the number of genes, so it is far larger than the other outputs.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [42]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,CHTF8,NLGN3,ATP8B3,EDRF1,VAC14,F2R,HIST1H4I,LRRC31,GPR37,OSBPL11,...,SAA1,KCNN4,LYPD3,CCNF,SMIM20,UTS2B,RGS16,C16orf74,DLL4,SARNP
,,,,,,,,,,,,,,,,,,,,,
CHTF8,1.0,0.735849,0.396226,0.924528,0.925926,0.981132,0.830189,0.54717,0.811321,0.944444,...,0.830189,0.90566,0.811321,0.888889,0.907407,0.113208,0.924528,0.264151,0.849057,0.226415
NLGN3,0.735849,1.0,0.538462,0.795918,0.764706,0.75,0.804348,0.581395,0.863636,0.75,...,0.804348,0.8125,0.822222,0.72549,0.78,0.153846,0.795918,0.358974,0.826087,0.307692
ATP8B3,0.396226,0.538462,1.0,0.428571,0.411765,0.403846,0.477273,0.470588,0.454545,0.403846,...,0.444444,0.4375,0.488372,0.428571,0.42,0.173913,0.428571,0.25,0.434783,0.222222
EDRF1,0.924528,0.795918,0.428571,1.0,0.960784,0.942308,0.897959,0.591837,0.877551,0.942308,...,0.897959,0.979592,0.877551,0.921569,0.941176,0.122449,1.0,0.285714,0.88,0.244898
VAC14,0.925926,0.764706,0.411765,0.960784,1.0,0.943396,0.862745,0.568627,0.843137,0.980769,...,0.862745,0.941176,0.843137,0.960784,0.980392,0.117647,0.960784,0.27451,0.846154,0.235294


In [43]:
# (rows, columns) of the gene similarity matrix; expected to be square.
gene_similarity_matix.shape

(16289, 16289)

# Save Gene Similarity Matrix

In [44]:
# Stamp the output file name with the current year and month (YYYY_MM).
# The "matix" spelling in the file name is kept so the output name does not change.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = path + 'bgee_mouse_developmental_gene_similarity_matix_%s.tsv.zip' % date_stamp

# The file name ends in .zip, so write a real zip archive. The previous
# compression='gzip' stored gzip data under a .zip name.
# The index is written because it carries the row labels of the matrix.
gene_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# Create Gene-Attribute Edge List

In [39]:
# Name passed to the gene-attribute edge list helper in the next cell.
name = 'bgee_mouse_developmental_gene_attribute_edge_list'

In [40]:
# Create the gene-attribute edge list from the binary matrix, the attribute list and the gene list.
# NOTE(review): createGeneAttributeEdgeList is defined in utility_functions (not shown here);
# presumably it writes a file under `path` named after `name`, and it prints the number of associations found - confirm against the helper.
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, path, name)

Progeres: 100%  58 Out of 58   

 The number of statisticaly relevent gene-attribute associations is: 660778
