# Bgee Human Developmental Stage

Author: Moshe Silverstein  
Date: 07-18  
Data Source Home: https://bgee.org/     
Data Source Download: https://bgee.org/?page=download&action=expr_calls#id1 

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import utility_functions as uf
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

In [2]:
import seaborn as sns

# Seed NumPy's global random generator with a fixed integer: the sum of the
# character code points of the word "distributions".
np.random.seed(sum(ord(ch) for ch in "distributions"))

# Apply seaborn's default theme and remap matplotlib's one-letter colour codes
# to the seaborn palette.
sns.set(color_codes=True)

# Path to Output Files

In [3]:
# Directory that receives every file this notebook writes. Later cells build
# file names with `path + '<name>'`, so the value must end with a path
# separator; os.path.join(..., '') appends one when it is missing.
# Set the BGEE_OUTPUT_PATH environment variable to redirect the output without
# editing the notebook; when it is unset the original location is used.
path = os.path.join(
    os.environ.get('BGEE_OUTPUT_PATH', '/Users/moshesilverstein/Documents/Harmonizome/bgee/Output/'),
    '',
)

# Load Data

In [4]:
# Load the Bgee expression-call export (tab-separated).
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. NOTE(review): the warning tail recorded under
# this cell looks like a pandas DtypeWarning (mixed types within a column),
# which chunked inference produces — confirm it disappears on re-run. Parsing
# this way uses more memory while the file is being read.
df = pd.read_csv('Input/Homo_sapiens_expr_advanced_development.tsv', sep='\t', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Preview the first rows of the freshly loaded table (head() shows five by default).
df.head()

Unnamed: 0,Gene ID,Gene name,Anatomical entity ID,Anatomical entity name,Developmental stage ID,Developmental stage name,Expression,Call quality,Expression rank,Including observed data,...,In situ hybridization experiment count showing expression of this gene in this condition or in sub-conditions with a low quality,In situ hybridization experiment count showing absence of expression of this gene in this condition or valid parent conditions with a high quality,In situ hybridization experiment count showing absence of expression of this gene in this condition or valid parent conditions with a low quality,Including in situ hybridization observed data,RNA-Seq data,RNA-Seq experiment count showing expression of this gene in this condition or in sub-conditions with a high quality,RNA-Seq experiment count showing expression of this gene in this condition or in sub-conditions with a low quality,RNA-Seq experiment count showing absence of expression of this gene in this condition or valid parent conditions with a high quality,RNA-Seq experiment count showing absence of expression of this gene in this condition or valid parent conditions with a low quality,Including RNA-Seq observed data
0,ENSG00000000003,TSPAN6,CL:0000015,male germ cell,HsapDv:0000092,human middle aged stage (human),present,silver quality,5940,yes,...,0,0,0,no,no data,0,0,0,0,no
1,ENSG00000000003,TSPAN6,CL:0000019,sperm,HsapDv:0000088,human early adulthood stage (human),present,silver quality,23,yes,...,0,0,0,no,no data,0,0,0,0,no
2,ENSG00000000003,TSPAN6,CL:0000023,oocyte,HsapDv:0000087,human adult stage (human),absent,silver quality,20500,yes,...,0,0,0,no,no data,0,0,0,0,no
3,ENSG00000000003,TSPAN6,CL:0000083,epithelial cell of pancreas,UBERON:0000104,life cycle,present,silver quality,6050,yes,...,0,0,0,no,no data,0,0,0,0,no
4,ENSG00000000003,TSPAN6,CL:0000115,endothelial cell,HsapDv:0000092,human middle aged stage (human),present,silver quality,11800,yes,...,0,0,0,no,no data,0,0,0,0,no


In [6]:
# (rows, columns) of the raw table, before any column selection or filtering.
df.shape

(32962486, 32)

# Get relevant data

In [7]:
# Narrow the table to the five descriptive columns; all other columns of the
# raw export are discarded from here on.
df = df[[
    'Gene name',
    'Anatomical entity name',
    'Developmental stage name',
    'Expression',
    'Call quality',
]]

In [8]:
# Check that only the five selected columns remain.
df.head()

Unnamed: 0,Gene name,Anatomical entity name,Developmental stage name,Expression,Call quality
0,TSPAN6,male germ cell,human middle aged stage (human),present,silver quality
1,TSPAN6,sperm,human early adulthood stage (human),present,silver quality
2,TSPAN6,oocyte,human adult stage (human),absent,silver quality
3,TSPAN6,epithelial cell of pancreas,life cycle,present,silver quality
4,TSPAN6,endothelial cell,human middle aged stage (human),present,silver quality


In [9]:
# Keep only rows whose expression call is exactly 'present'; rows with any
# other call value are dropped.
df = df[df['Expression'].eq('present')]

In [10]:
# Reduce the table to gene / developmental-stage pairs; the anatomical entity,
# expression call and call quality columns are no longer needed.
df = df[[
    'Gene name',
    'Developmental stage name',
]]

In [11]:
# Preview the two-column table. The original row labels are kept, so gaps in
# the index mark rows removed by the 'present' filter.
df.head()

Unnamed: 0,Gene name,Developmental stage name
0,TSPAN6,human middle aged stage (human)
1,TSPAN6,human early adulthood stage (human)
3,TSPAN6,life cycle
4,TSPAN6,human middle aged stage (human)
7,TSPAN6,human middle aged stage (human)


In [12]:
# Row count after keeping only 'present' calls (two columns remain).
df.shape

(19631082, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [13]:
# Move the gene symbols into the row index; uf.mapgenesymbols is called on the
# frame in this shape in the next cell.
df = df.set_index('Gene name')

In [14]:
# Map the gene symbols to current approved symbols (per the section heading).
# NOTE(review): the return value is not captured, so this presumably updates
# `df` in place via its 'Gene name' index — confirm in utility_functions.
uf.mapgenesymbols(df)

Progeres: 100%  19631082 Out of 19631082   

# Drop Duplicates

In [16]:
# Turn the 'Gene name' index back into an ordinary column so that duplicate
# detection considers the gene together with the stage.
df = df.reset_index()

In [17]:
# Collapse repeated rows: each (gene, developmental stage) pair is kept once,
# retaining the first occurrence and its original row label.
df = df.drop_duplicates()

In [18]:
# Preview the de-duplicated gene / stage pairs.
df.head()

Unnamed: 0,Gene name,Developmental stage name
0,TSPAN6,human middle aged stage (human)
1,TSPAN6,human early adulthood stage (human)
2,TSPAN6,life cycle
5,TSPAN6,young adult stage (human)
6,TSPAN6,25-44 year-old human stage (human)


In [19]:
# Number of distinct (gene, developmental stage) rows left after de-duplication.
df.shape

(603839, 2)

# Create Binary Matrix

In [20]:
# Build the binary matrix from the gene / stage pair table.
# NOTE(review): createBinaryMatrix is defined in utility_functions and not
# visible here; the recorded output suggests rows are genes, columns are
# developmental stages and cells are 0.0 / 1.0 — confirm.
binary_matrix = uf.createBinaryMatrix(df)

Progeres: 100%  34668 Out of 34668   

In [21]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,CS17,65-79 year-old human stage (human),sixth LMP month human stage (human),juvenile stage,human aged stage,human middle aged stage (human),2-5 year-old child stage (human),38th week post-fertilization and over human stage (human),6-12 year-old child stage (human),Carnegie stage 10 (human),...,CS09,infant stage,ninth LMP month human stage (human),adolescent stage (human),Carnegie stage 12 (human),Carnegie stage 14 (human),80 year-old and over human stage (human),fifth LMP month human stage (human),young adult stage (human),CS11
IGLV3-17,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
EMC4,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
CRISP2,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
ODC1,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
NAT16,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0


In [22]:
# (rows, columns) of the binary matrix.
binary_matrix.shape

(34668, 33)

# Save Binary Matrix

In [23]:
# Save the binary matrix as a gzip-compressed TSV, tagged with the current
# year and month (YYYY_MM).
# The suffix is '.gz' rather than '.zip': the file is written with gzip
# compression, and a '.zip' name would lead tools that pick the format from
# the extension to treat it as a ZIP archive.
filename = path + 'bgee_human_developmental_matrix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [24]:
# Base name handed to uf.createUpGeneSetLib in the following cell.
name = 'bgee_human_developmental_gene_set'

In [25]:
# Create the gene set library from the binary matrix.
# NOTE(review): called only for its side effect; given the `path` and `name`
# arguments it presumably writes a file under the output directory — the
# format and exact file name are decided inside utility_functions.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  33 Out of 33   

# Create Attribute Library

In [26]:
# Base name handed to uf.createUpAttributeSetLib in the following cell
# (rebinds the `name` variable used for the gene set library).
name = 'bgee_human_developmental_attribute_set'

In [27]:
# Create the attribute set library from the binary matrix.
# NOTE(review): called only for its side effect; given the `path` and `name`
# arguments it presumably writes a file under the output directory — verify
# in utility_functions.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  34668 Out of 34668   

# Create Gene List

In [28]:
# Derive the gene list from the binary matrix.
# NOTE(review): the recorded output shows two columns, GeneSym and GeneID;
# how GeneID is looked up is internal to utility_functions.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  34668 Out of 34668   

In [29]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,IGLV3-17,28798
1,EMC4,51234
2,CRISP2,7180
3,ODC1,4953
4,NAT16,375607


In [30]:
# (rows, columns) of the gene list.
gene_list.shape

(34668, 2)

# Save Gene List

In [31]:
# Save the gene list as a gzip-compressed TSV without the row index, tagged
# with the current year and month (YYYY_MM).
# The suffix is '.gz' rather than '.zip': the file is written with gzip
# compression, and a '.zip' name would lead tools that pick the format from
# the extension to treat it as a ZIP archive.
filename = path + 'bgee_human_developmental_gene_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [32]:
# Derive the attribute (developmental stage) list from the binary matrix.
# NOTE(review): in the recorded run the result has shape (33, 0), i.e. the
# attribute names appear to be held in the index with no data columns —
# confirm against utility_functions.
attribute_list = uf.createAttributeList(binary_matrix)

In [33]:
# Preview the first entries of the attribute list.
attribute_list.head()

CS17
65-79 year-old human stage (human)
sixth LMP month human stage (human)
juvenile stage
human aged stage


In [34]:
# (rows, columns) of the attribute list.
attribute_list.shape

(33, 0)

# Save Attribute List

In [35]:
# Save the attribute list as a gzip-compressed TSV, tagged with the current
# year and month (YYYY_MM).
# The index IS written: the recorded shape of attribute_list is (33, 0), so
# the attribute names live in the index and there are no data columns.
# Writing with index=False would therefore produce a file with no attribute
# names in it.
# The suffix is '.gz' rather than '.zip': the file is written with gzip
# compression, and a '.zip' name would lead tools that pick the format from
# the extension to treat it as a ZIP archive.
filename = path + 'bgee_human_developmental_attribute_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_list.to_csv(filename, sep='\t', index=True, compression='gzip')

# Create Attribute Similarity Matrix

In [36]:
# Compute attribute-to-attribute similarity. The binary matrix is transposed
# so that attributes become the rows handed to the helper; 'jaccard' selects
# the metric.
# NOTE(review): how createSimilarityMatrix interprets the metric name is
# internal to utility_functions — confirm.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [37]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,CS17,65-79 year-old human stage (human),sixth LMP month human stage (human),juvenile stage,human aged stage,human middle aged stage (human),2-5 year-old child stage (human),38th week post-fertilization and over human stage (human),6-12 year-old child stage (human),Carnegie stage 10 (human),...,CS09,infant stage,ninth LMP month human stage (human),adolescent stage (human),Carnegie stage 12 (human),Carnegie stage 14 (human),80 year-old and over human stage (human),fifth LMP month human stage (human),young adult stage (human),CS11
,,,,,,,,,,,,,,,,,,,,,
CS17,1.0,0.000194,0.000237,0.000268,0.000341,0.000177,0.000242,0.000235,0.000247,0.000467,...,0.000479,0.000265,0.000242,0.000228,0.000502,0.00047,0.000206,0.000229,0.000188,0.000487
65-79 year-old human stage (human),0.000194,1.0,0.273384,0.48255,0.475242,0.909633,0.656132,0.413572,0.646233,0.208155,...,0.202714,0.60956,0.401885,0.69028,0.193516,0.206503,0.775402,0.802237,0.829965,0.199411
sixth LMP month human stage (human),0.000237,0.273384,1.0,0.549144,0.566222,0.249667,0.407094,0.602567,0.415458,0.566702,...,0.555556,0.446004,0.620291,0.38294,0.560004,0.580817,0.347609,0.322742,0.318168,0.561617
juvenile stage,0.000268,0.48255,0.549144,1.0,0.840027,0.441214,0.68659,0.807281,0.70892,0.426452,...,0.415319,0.750609,0.793923,0.644717,0.397178,0.421239,0.604038,0.558849,0.558041,0.409067
human aged stage,0.000341,0.475242,0.566222,0.840027,1.0,0.434102,0.674379,0.796494,0.685922,0.432401,...,0.421288,0.744551,0.805611,0.6416,0.402866,0.42894,0.598637,0.551368,0.551389,0.414652


In [39]:
# (rows, columns) of the attribute similarity matrix.
attribute_similarity_matix.shape

(33, 33)

# Save Attribute Similarity Matrix

In [38]:
# Save the attribute similarity matrix as a gzip-compressed TSV (index
# included), tagged with the current year and month (YYYY_MM).
# The suffix is '.gz' rather than '.zip': the file is written with gzip
# compression, and a '.zip' name would lead tools that pick the format from
# the extension to treat it as a ZIP archive.
filename = path + 'bgee_human_developmental_attribute_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Similarity Matrix

In [42]:
# Compute gene-to-gene similarity from the binary matrix as-is (no transpose),
# with 'jaccard' as the metric.
# NOTE(review): the binary matrix has 34668 rows in the recorded run, so the
# result is presumably 34668 x 34668 and memory-heavy — confirm before
# re-running on a small machine.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [43]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,IGLV3-17,EMC4,CRISP2,ODC1,NAT16,TMLHE-AS1,SH3BP2,RNU6-245P,PSPC1,LINC00443,...,HIST2H4A,MIR6835,FDX2,CRYBA2,CTAGE15,SNX20,NDUFB9P1,GPR182,ICOSLG,ALOX15B
,,,,,,,,,,,,,,,,,,,,,
IGLV3-17,1.0,0.32,0.380952,0.258065,0.363636,0.5,0.32,0.538462,0.258065,0.875,...,0.5,0.8,0.5,0.571429,0.666667,0.333333,0.6,0.444444,0.421053,0.470588
EMC4,0.32,1.0,0.84,0.806452,0.88,0.64,1.0,0.48,0.806452,0.28,...,0.64,0.4,0.64,0.56,0.48,0.96,0.32,0.72,0.76,0.68
CRISP2,0.380952,0.84,1.0,0.677419,0.869565,0.681818,0.84,0.5,0.677419,0.333333,...,0.681818,0.47619,0.681818,0.590909,0.5,0.875,0.318182,0.772727,0.818182,0.809524
ODC1,0.258065,0.806452,0.677419,1.0,0.709677,0.516129,0.806452,0.387097,1.0,0.225806,...,0.516129,0.322581,0.516129,0.451613,0.387097,0.774194,0.258065,0.580645,0.612903,0.548387
NAT16,0.363636,0.88,0.869565,0.709677,1.0,0.652174,0.88,0.478261,0.709677,0.318182,...,0.652174,0.454545,0.652174,0.636364,0.545455,0.916667,0.363636,0.818182,0.782609,0.772727


# Save Gene Similarity Matrix

In [44]:
# Save the gene similarity matrix as a gzip-compressed TSV (index included),
# tagged with the current year and month (YYYY_MM).
# The suffix is '.gz' rather than '.zip': the file is written with gzip
# compression, and a '.zip' name would lead tools that pick the format from
# the extension to treat it as a ZIP archive.
filename = path + 'bgee_human_developmental_gene_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene-Attribute Edge List

In [40]:
# Base name handed to uf.createGeneAttributeEdgeList in the following cell.
name = 'bgee_human_developmental_gene_attribute_edge_list'

In [41]:
# Build the gene-attribute edge list from the binary matrix together with the
# attribute and gene lists.
# NOTE(review): called only for its side effect; given the `path` and `name`
# arguments it presumably writes the edge list under the output directory and
# prints the association count — verify in utility_functions.
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, path, name)

Progeres: 100%  33 Out of 33   

 The number of statisticaly relevent gene-attribute associations is: 603839
