# Bgee Human Anatomical Entity

Author: Moshe Silverstein  
Date: 07-18  
Data Source Home: https://bgee.org/     
Data Source Download: https://bgee.org/?page=download&action=expr_calls#id1 

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import utility_functions as uf
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

In [2]:
import seaborn as sns

# Switch on seaborn's plot styling (with color_codes enabled).
sns.set(color_codes=True)
# Seed NumPy's global RNG with the summed character codes of "distributions".
np.random.seed(sum(ord(ch) for ch in "distributions"))

# Path to Output Files

In [3]:
# Output directory for every file this notebook writes.
# Defaults to the original author's location; set the BGEE_OUTPUT_DIR
# environment variable to redirect output on another machine without editing
# the notebook. Joining with '' guarantees the trailing separator that the
# later `path + filename` concatenations rely on.
path = os.path.join(
    os.environ.get(
        'BGEE_OUTPUT_DIR',
        '/Users/moshesilverstein/Documents/Harmonizome/bgee/Output/',
    ),
    '',
)

# Load Data

In [4]:
# Read the full tab-separated Bgee expression-call table into memory.
# low_memory=False makes pandas determine each column's dtype from the whole
# file instead of chunk by chunk. The warning fragment left in the recorded
# output looks like pandas' mixed-dtype DtypeWarning, which this setting
# avoids, at the cost of higher peak memory while parsing.
df = pd.read_csv('Input/Homo_sapiens_expr_advanced_development.tsv', sep='\t', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Peek at the first rows to see which columns the raw table provides.
df.head()

Unnamed: 0,Gene ID,Gene name,Anatomical entity ID,Anatomical entity name,Developmental stage ID,Developmental stage name,Expression,Call quality,Expression rank,Including observed data,...,In situ hybridization experiment count showing expression of this gene in this condition or in sub-conditions with a low quality,In situ hybridization experiment count showing absence of expression of this gene in this condition or valid parent conditions with a high quality,In situ hybridization experiment count showing absence of expression of this gene in this condition or valid parent conditions with a low quality,Including in situ hybridization observed data,RNA-Seq data,RNA-Seq experiment count showing expression of this gene in this condition or in sub-conditions with a high quality,RNA-Seq experiment count showing expression of this gene in this condition or in sub-conditions with a low quality,RNA-Seq experiment count showing absence of expression of this gene in this condition or valid parent conditions with a high quality,RNA-Seq experiment count showing absence of expression of this gene in this condition or valid parent conditions with a low quality,Including RNA-Seq observed data
0,ENSG00000000003,TSPAN6,CL:0000015,male germ cell,HsapDv:0000092,human middle aged stage (human),present,silver quality,5940,yes,...,0,0,0,no,no data,0,0,0,0,no
1,ENSG00000000003,TSPAN6,CL:0000019,sperm,HsapDv:0000088,human early adulthood stage (human),present,silver quality,23,yes,...,0,0,0,no,no data,0,0,0,0,no
2,ENSG00000000003,TSPAN6,CL:0000023,oocyte,HsapDv:0000087,human adult stage (human),absent,silver quality,20500,yes,...,0,0,0,no,no data,0,0,0,0,no
3,ENSG00000000003,TSPAN6,CL:0000083,epithelial cell of pancreas,UBERON:0000104,life cycle,present,silver quality,6050,yes,...,0,0,0,no,no data,0,0,0,0,no
4,ENSG00000000003,TSPAN6,CL:0000115,endothelial cell,HsapDv:0000092,human middle aged stage (human),present,silver quality,11800,yes,...,0,0,0,no,no data,0,0,0,0,no


In [6]:
# (rows, columns) of the raw table before any filtering.
df.shape

(32962486, 32)

# Get Relevant Data

In [7]:
# Narrow the table to the gene, anatomical entity, stage, call and quality columns.
relevant_columns = [
    'Gene name',
    'Anatomical entity name',
    'Developmental stage name',
    'Expression',
    'Call quality',
]
df = df[relevant_columns]

In [8]:
# Check the reduced table.
df.head()

Unnamed: 0,Gene name,Anatomical entity name,Developmental stage name,Expression,Call quality
0,TSPAN6,male germ cell,human middle aged stage (human),present,silver quality
1,TSPAN6,sperm,human early adulthood stage (human),present,silver quality
2,TSPAN6,oocyte,human adult stage (human),absent,silver quality
3,TSPAN6,epithelial cell of pancreas,life cycle,present,silver quality
4,TSPAN6,endothelial cell,human middle aged stage (human),present,silver quality


In [9]:
# Keep only the rows whose expression call is 'present'.
is_present = df['Expression'] == 'present'
df = df[is_present]

In [10]:
# From here on only the gene / anatomical-entity pairs are used.
# NOTE(review): 'Developmental stage name' and 'Call quality' were selected
# earlier but are dropped here without being used as filters.
pair_columns = ['Gene name', 'Anatomical entity name']
df = df[pair_columns]

In [11]:
# Preview the gene / anatomical-entity pairs that remain.
df.head()

Unnamed: 0,Gene name,Anatomical entity name
0,TSPAN6,male germ cell
1,TSPAN6,sperm
3,TSPAN6,epithelial cell of pancreas
4,TSPAN6,endothelial cell
7,TSPAN6,leukocyte


In [12]:
# Number of 'present' rows left after filtering (duplicates not yet removed).
df.shape

(19631082, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [13]:
# Move the gene symbols into the index before the symbol-mapping step.
df = df.set_index('Gene name')

In [14]:
# Map the gene symbols in df's index to current approved symbols.
# The return value is discarded, so the helper presumably edits df in place —
# TODO confirm in utility_functions.
# NOTE(review): in the recorded run the helper's progress printing triggered
# the notebook's "IOPub message rate exceeded" warning.
uf.mapgenesymbols(df)

Progeres: 10%  2028828 Out of 19631082   

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Progeres: 99%  19630607 Out of 19631082   

# Drop Duplicates

In [17]:
# Turn the gene-symbol index back into an ordinary column.
df = df.reset_index()

In [18]:
# Collapse repeated gene / anatomical-entity pairs to a single row each.
df = df.drop_duplicates()

In [19]:
# Preview the de-duplicated pairs.
df.head()

Unnamed: 0,Gene name,Anatomical entity name
0,TSPAN6,male germ cell
1,TSPAN6,sperm
2,TSPAN6,epithelial cell of pancreas
3,TSPAN6,endothelial cell
4,TSPAN6,leukocyte


In [20]:
# Number of unique gene / anatomical-entity pairs.
df.shape

(4308050, 2)

# Create Binary Matrix

In [21]:
# Build the gene-by-anatomical-entity matrix from the pair table.
# uf.createBinaryMatrix is defined outside this notebook; in the recorded
# output the result has gene symbols as rows, anatomical entities as columns
# and 0.0 / 1.0 values.
binary_matrix = uf.createBinaryMatrix(df)

Progeres: 100%  34668 Out of 34668   

In [22]:
# Preview the first rows of the matrix.
binary_matrix.head()

Unnamed: 0,dorsal root ganglion,quadriceps femoris,pleura,saliva-secreting gland,Brodmann (1909) area 23,fundus of stomach,colonic mucosa,dorsolateral prefrontal cortex,left uterine tube,biceps brachii,...,visceral pleura,thyroid gland,pituitary gland,tibial nerve,cervix epithelium,superficial temporal artery,lower esophagus mucosa,esophagus mucosa,oocyte,tibia
AQP7P1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
ASS1P10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
LINC01496,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
CFH,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
SNORD96B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [23]:
# (genes, anatomical entities) in the matrix.
binary_matrix.shape

(34668, 308)

# Save Binary Matrix

In [24]:
# Write the binary matrix as a gzip-compressed TSV stamped with the current
# year and month (e.g. 2018_07).
# The original paired a '.zip' suffix with gzip compression, so the file could
# not be opened as a zip archive and readers could not infer its compression
# from the name; the suffix now matches the actual format.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = path+'bgee_human_anatomical_matrix_%s.tsv.gz' % date_stamp
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [25]:
# Label handed to uf.createUpGeneSetLib in the next cell
# (presumably the base name of the file it writes — confirm in utility_functions).
name = 'bgee_human_anatomical_gene_set'

In [26]:
# Build the gene set library from the binary matrix; the helper receives the
# output directory and name, so it presumably writes the file itself — confirm.
# The recorded progress counter runs to 308, the number of matrix columns.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  308 Out of 308   

# Create Attribute Library

In [27]:
# Label handed to uf.createUpAttributeSetLib in the next cell
# (presumably the base name of the file it writes — confirm in utility_functions).
# Note that this rebinds the same `name` variable used for the gene set library.
name = 'bgee_human_anatomical_attribute_set'

In [28]:
# Build the attribute set library from the binary matrix; the helper receives
# the output directory and name, so it presumably writes the file itself — confirm.
# The recorded progress counter runs to 34668, the number of matrix rows.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  34668 Out of 34668   

# Create Gene List

In [29]:
# Derive the gene table from the binary matrix. In the recorded output it has
# one row per gene with GeneSym and GeneID columns.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  34668 Out of 34668   

In [30]:
# Preview the gene table.
# NOTE(review): GeneID is displayed as a float (e.g. 455.0) in the recorded
# output — presumably because some IDs are missing; confirm before relying on
# the saved values being integers.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,AQP7P1,375719.0
1,ASS1P10,455.0
2,LINC01496,102723000.0
3,CFH,3075.0
4,SNORD96B,692226.0


In [31]:
# Should have one row per row of binary_matrix.
gene_list.shape

(34668, 2)

# Save Gene List

In [32]:
# Write the gene table (without its positional index) as a gzip-compressed TSV
# stamped with the current year and month.
# The original paired a '.zip' suffix with gzip compression, so the file could
# not be opened as a zip archive and readers could not infer its compression
# from the name; the suffix now matches the actual format.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = path+'bgee_human_anatomical_gene_list_%s.tsv.gz' % date_stamp
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [33]:
# Derive the attribute (anatomical entity) table from the binary matrix.
# In the recorded run the result carries the attribute names in its index and
# has no columns.
attribute_list = uf.createAttributeList(binary_matrix)

In [34]:
# Preview the attribute names (shown as the index).
attribute_list.head()

dorsal root ganglion
quadriceps femoris
pleura
saliva-secreting gland
Brodmann (1909) area 23


In [35]:
# A column count of 0 here means the attribute names exist only in the index.
attribute_list.shape

(308, 0)

# Save Attribute List

In [36]:
# Write the attribute table as a gzip-compressed TSV stamped with the current
# year and month.
# Two fixes relative to the original cell:
#   * The index is now written. attribute_list has shape (308, 0) in the
#     recorded run — the attribute names live only in the index — so
#     index=False produced a file containing no attribute names at all.
#   * The original paired a '.zip' suffix with gzip compression; the suffix
#     now matches the actual format.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = path+'bgee_human_anatomical_attribute_list_%s.tsv.gz' % date_stamp
attribute_list.to_csv(filename, sep='\t', index=True, compression='gzip')

# Create Attribute Similarity Matrix

In [37]:
# Pairwise Jaccard similarity between attributes. The matrix is transposed so
# that attributes are the rows handed to the helper; the recorded output is an
# attribute-by-attribute table with 1.0 on the diagonal.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [38]:
# Preview the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,dorsal root ganglion,quadriceps femoris,pleura,saliva-secreting gland,Brodmann (1909) area 23,fundus of stomach,colonic mucosa,dorsolateral prefrontal cortex,left uterine tube,biceps brachii,...,visceral pleura,thyroid gland,pituitary gland,tibial nerve,cervix epithelium,superficial temporal artery,lower esophagus mucosa,esophagus mucosa,oocyte,tibia
,,,,,,,,,,,,,,,,,,,,,
dorsal root ganglion,1.0,0.790052,0.635409,0.691557,0.801105,0.529852,0.740784,0.436862,0.552239,0.796877,...,0.795575,0.430613,0.457686,0.416033,0.588608,0.68464,0.616856,0.416108,0.572003,0.79556
quadriceps femoris,0.790052,1.0,0.618021,0.72835,0.751667,0.554178,0.767797,0.462397,0.574602,0.879464,...,0.774268,0.458539,0.482007,0.441707,0.569185,0.643732,0.635057,0.442069,0.553612,0.772187
pleura,0.635409,0.618021,1.0,0.544827,0.596111,0.415475,0.591384,0.339121,0.438192,0.629506,...,0.702801,0.336791,0.356257,0.324827,0.802813,0.593532,0.492303,0.326988,0.500127,0.650086
saliva-secreting gland,0.691557,0.72835,0.544827,1.0,0.702379,0.693158,0.864609,0.597126,0.70056,0.684437,...,0.698678,0.596623,0.62042,0.582468,0.480879,0.51718,0.710224,0.586597,0.486699,0.688437
Brodmann (1909) area 23,0.801105,0.751667,0.596111,0.702379,1.0,0.534606,0.73645,0.450177,0.554763,0.753665,...,0.76366,0.438311,0.469287,0.425776,0.53505,0.61468,0.608746,0.424728,0.572179,0.774411


# Save Attribute Similarity Matrix

In [39]:
# Write the attribute similarity matrix as a gzip-compressed TSV stamped with
# the current year and month.
# The original paired a '.zip' suffix with gzip compression, so the file could
# not be opened as a zip archive and readers could not infer its compression
# from the name; the suffix now matches the actual format.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = path+'bgee_human_anatomical_attribute_similarity_matix_%s.tsv.gz' % date_stamp
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Similarity Matrix

In [42]:
# Pairwise Jaccard similarity between genes (rows of the binary matrix).
# NOTE(review): with 34,668 genes in the recorded run, a full gene-by-gene
# result would hold about 1.2 billion values — confirm this fits in memory on
# the target machine.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [43]:
# Preview the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,AQP7P1,ASS1P10,LINC01496,CFH,SNORD96B,RN7SL339P,RNU6-1338P,KMT2E-AS1,TEKT4,SON,...,ZNF223,FAM89B,COX5B,TMED5,RSL24D1P2,MIR4261,PGM1,PTH,HSPB11,RNU6-1040P
,,,,,,,,,,,,,,,,,,,,,
AQP7P1,1.0,0.369048,0.7,0.342857,0.634409,0.711111,0.534091,0.865979,0.860215,0.3,...,0.521739,0.857143,0.309963,0.313433,0.44186,0.517241,0.307692,0.574074,0.312268,0.271739
ASS1P10,0.369048,1.0,0.333333,0.126531,0.394366,0.402778,0.344262,0.319588,0.348315,0.110714,...,0.192547,0.316327,0.114391,0.115672,0.365385,0.410714,0.113553,0.3,0.115242,0.207547
LINC01496,0.7,0.333333,1.0,0.281633,0.75641,0.7375,0.621622,0.693878,0.698925,0.246429,...,0.428571,0.704082,0.254613,0.257463,0.453333,0.56,0.252747,0.55,0.256506,0.378378
CFH,0.342857,0.126531,0.281633,1.0,0.277551,0.285714,0.208163,0.395918,0.363265,0.875,...,0.650407,0.4,0.869565,0.872263,0.163265,0.195918,0.876812,0.35102,0.862319,0.134694
SNORD96B,0.634409,0.394366,0.75641,0.277551,1.0,0.725,0.565789,0.683673,0.688172,0.242857,...,0.42236,0.693878,0.250923,0.253731,0.459459,0.633803,0.249084,0.524752,0.252788,0.442857


# Save Gene Similarity Matrix

In [44]:
# Write the gene similarity matrix as a gzip-compressed TSV stamped with the
# current year and month.
# The original paired a '.zip' suffix with gzip compression, so the file could
# not be opened as a zip archive and readers could not infer its compression
# from the name; the suffix now matches the actual format.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = path+'bgee_human_anatomical_gene_similarity_matix_%s.tsv.gz' % date_stamp
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene-Attribute Edge List

In [40]:
# Label handed to uf.createGeneAttributeEdgeList in the next cell
# (presumably the base name of the file it writes — confirm in utility_functions).
name = 'bgee_human_anatomical_gene_attribute_edge_list'

In [41]:
# Build the gene-attribute edge list from the binary matrix plus the attribute
# and gene tables; the helper receives the output directory and name, so it
# presumably writes the file itself — confirm in utility_functions.
# NOTE(review): the execution counts show this cell and the one before it ran
# as In [40]/[41], i.e. before the gene similarity cells (In [42]-[44]) that
# precede them in the notebook. Re-run top to bottom to confirm the notebook
# works in document order.
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, path, name)

Progeres: 100%  308 Out of 308   

 The number of statisticaly relevent gene-attribute associations is: 4308050
