# Bgee Rat Anatomical Entity

Author: Moshe Silverstein  
Date: 08-18  
Data Source Home: https://bgee.org/     
Data Source Download: https://bgee.org/?page=download&action=expr_calls#id1 

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import utility_functions as uf
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

In [2]:
import seaborn as sns
# Apply seaborn's default plot styling. NOTE(review): no figure is drawn in the
# visible part of this notebook, so this only matters if plots are added.
sns.set(color_codes=True)
# Seed NumPy's global random generator with a fixed value (the summed character
# codes of the word below) so any random draws repeat between runs.
seed_value = sum(ord(letter) for letter in "distributions")
np.random.seed(seed_value)

# Path to Output Files

In [3]:
# Directory prefix for every output file written by this notebook. It is
# concatenated directly with file names, so it must end with a path separator;
# os.path.join(..., '') guarantees that. Set the BGEE_OUTPUT_DIR environment
# variable to run on another machine; the original location stays the default.
path = os.path.join(
    os.environ.get('BGEE_OUTPUT_DIR', '/Users/moshesilverstein/Documents/Harmonizome/bgee/Output/'),
    '',
)

# Load Data

In [4]:
# Load the rat expression-call table (tab-separated) from the Input directory.
# low_memory=False makes pandas read each column in one pass so its dtype is
# inferred consistently. NOTE(review): the recorded output of this cell is the
# tail of a warning, presumably pandas' mixed-dtype warning from chunked
# parsing — confirm it disappears on the next run.
df = pd.read_csv('Input/Rattus_norvegicus_expr_advanced_development.tsv', sep='\t', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Preview the first rows of the freshly loaded table to check that it parsed.
df.head()

Unnamed: 0,Gene ID,Gene name,Anatomical entity ID,Anatomical entity name,Developmental stage ID,Developmental stage name,Expression,Call quality,Expression rank,Including observed data,...,In situ hybridization experiment count showing expression of this gene in this condition or in sub-conditions with a low quality,In situ hybridization experiment count showing absence of expression of this gene in this condition or valid parent conditions with a high quality,In situ hybridization experiment count showing absence of expression of this gene in this condition or valid parent conditions with a low quality,Including in situ hybridization observed data,RNA-Seq data,RNA-Seq experiment count showing expression of this gene in this condition or in sub-conditions with a high quality,RNA-Seq experiment count showing expression of this gene in this condition or in sub-conditions with a low quality,RNA-Seq experiment count showing absence of expression of this gene in this condition or valid parent conditions with a high quality,RNA-Seq experiment count showing absence of expression of this gene in this condition or valid parent conditions with a low quality,Including RNA-Seq observed data
0,ENSRNOG00000000001,AABR07013255.1,UBERON:0000082,adult mammalian kidney,UBERON:0000113,post-juvenile adult stage,present,gold quality,17400,yes,...,0,0,0,no,present,2,0,0,0,yes
1,ENSRNOG00000000001,AABR07013255.1,UBERON:0000473,testis,UBERON:0000113,post-juvenile adult stage,absent,silver quality,22800,yes,...,0,0,0,no,absent,0,0,1,0,yes
2,ENSRNOG00000000001,AABR07013255.1,UBERON:0000948,heart,UBERON:0000113,post-juvenile adult stage,present,silver quality,13400,yes,...,0,0,0,no,present,1,0,0,0,yes
3,ENSRNOG00000000001,AABR07013255.1,UBERON:0000955,brain,UBERON:0000113,post-juvenile adult stage,present,gold quality,16400,yes,...,0,0,0,no,present,2,0,0,0,yes
4,ENSRNOG00000000001,AABR07013255.1,UBERON:0001134,skeletal muscle tissue,UBERON:0000113,post-juvenile adult stage,present,silver quality,13400,yes,...,0,0,0,no,present,1,0,0,0,yes


In [6]:
# Size of the raw table as (rows, columns); the recorded run shows (272914, 32).
df.shape

(272914, 32)

# Get relevant data

In [7]:
# Narrow the table to the gene, tissue, stage and call columns.
columns_of_interest = [
    'Gene name',
    'Anatomical entity name',
    'Developmental stage name',
    'Expression',
    'Call quality',
]
df = df[columns_of_interest]

In [8]:
# Preview the table after narrowing it to the selected columns.
df.head()

Unnamed: 0,Gene name,Anatomical entity name,Developmental stage name,Expression,Call quality
0,AABR07013255.1,adult mammalian kidney,post-juvenile adult stage,present,gold quality
1,AABR07013255.1,testis,post-juvenile adult stage,absent,silver quality
2,AABR07013255.1,heart,post-juvenile adult stage,present,silver quality
3,AABR07013255.1,brain,post-juvenile adult stage,present,gold quality
4,AABR07013255.1,skeletal muscle tissue,post-juvenile adult stage,present,silver quality


In [9]:
# Keep only the rows whose expression call is exactly 'present'.
is_present = df['Expression'] == 'present'
df = df[is_present]

In [10]:
# From here on only the (gene, anatomical entity) pairs are needed.
gene_tissue_columns = ['Gene name', 'Anatomical entity name']
df = df[gene_tissue_columns]

In [11]:
# Preview the filtered pairs. Row labels are the original ones, so gaps appear
# where non-'present' rows were removed.
df.head()

Unnamed: 0,Gene name,Anatomical entity name
0,AABR07013255.1,adult mammalian kidney
2,AABR07013255.1,heart
3,AABR07013255.1,brain
4,AABR07013255.1,skeletal muscle tissue
5,AABR07013255.1,colon


In [12]:
# Number of 'present' gene/entity rows; the recorded run shows (201567, 2).
df.shape

(201567, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [13]:
# Move the gene names into the index (rebinding df rather than mutating it).
df = df.set_index('Gene name')

In [14]:
# Map the gene names held in df to current approved symbols using the project
# helper. The return value is not captured, so the helper is relied on to
# update df in place — NOTE(review): confirm against utility_functions.
uf.mapgenesymbols(df)

Progeres: 100%  201567 Out of 201567   

# Drop Duplicates

In [15]:
# Turn the gene-name index back into an ordinary column.
df = df.reset_index()

In [16]:
# Collapse repeated (gene, anatomical entity) rows into a single row each.
df = df.drop_duplicates()

In [17]:
# Preview the de-duplicated gene/entity pairs after symbol mapping.
df.head()

Unnamed: 0,Gene name,Anatomical entity name
0,GAD1,testis
1,GAD1,heart
2,GAD1,brain
3,GAD1,colon
4,GAD1,Ammon's horn


In [18]:
# Number of unique gene/entity pairs; the recorded run shows (132835, 2).
df.shape

(132835, 2)

# Create Binary Matrix

In [19]:
# Build the indicator matrix from the (gene, entity) pairs in df.
# NOTE(review): layout inferred from the recorded preview (genes as rows,
# entities as columns, 1.0/0.0 cells); the helper itself is not shown here.
binary_matrix = uf.createBinaryMatrix(df)

Progeres: 100%  15799 Out of 15799   

In [20]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,heart,liver,spleen,lung,Ammon's horn,brain,testis,skeletal muscle tissue,colon,adult mammalian kidney
AMDHD2,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
CLTB,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CIAO1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
NFIA,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
BUD23,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [21]:
# Matrix size; the recorded run shows (15799, 10).
binary_matrix.shape

(15799, 10)

# Save Binary Matrix

In [22]:
# Write the binary matrix as a gzip-compressed TSV stamped with the current
# year and month (YYYY_MM). The name ends in '.tsv.gz' because the payload is a
# gzip stream; a '.zip' suffix would mislabel it and break tools that choose
# the decompressor from the file extension.
filename = path + 'bgee_rat_anatomical_matrix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [23]:
# Base name handed to the gene set library helper in the next cell.
name = 'bgee_rat_anatomical_gene_set'

In [24]:
# Generate the gene set library from the binary matrix via the project helper,
# which receives the output directory and base name.
# NOTE(review): presumably one gene set per matrix column (the recorded
# progress counter ran to 10) — confirm in utility_functions.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  10 Out of 10   

# Create Attribute Library

In [25]:
# Base name handed to the attribute set library helper in the next cell.
name = 'bgee_rat_anatomical_attribute_set'

In [26]:
# Generate the attribute set library from the binary matrix via the project
# helper, which receives the output directory and base name.
# NOTE(review): presumably one attribute set per gene (the recorded progress
# counter ran to 15799) — confirm in utility_functions.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  15799 Out of 15799   

# Create Gene List

In [27]:
# Build the gene table for the genes in the binary matrix. The recorded preview
# shows two columns, GeneSym and GeneID; how IDs are looked up is internal to
# the helper and not visible here.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  15799 Out of 15799   

In [28]:
# Preview the first rows of the gene table.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,AMDHD2,51005
1,CLTB,1212
2,CIAO1,9391
3,NFIA,4774
4,BUD23,114049


In [29]:
# Gene table size; the recorded run shows (15799, 2).
gene_list.shape

(15799, 2)

# Save Gene List

In [30]:
# Write the gene table as a gzip-compressed TSV stamped with the current year
# and month (YYYY_MM). The name ends in '.tsv.gz' so the extension matches the
# gzip payload instead of advertising a zip archive. The row index is omitted
# because the gene data is held in regular columns.
filename = path + 'bgee_rat_anatomical_gene_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [31]:
# Build the attribute (anatomical entity) table from the binary matrix.
# NOTE(review): in the recorded run the result has shape (10, 0), i.e. the
# attribute names appear to be stored in the index with no data columns.
attribute_list = uf.createAttributeList(binary_matrix)

In [32]:
# Preview the first attributes; only index labels are shown in the recorded run.
attribute_list.head()

heart
liver
spleen
lung
Ammon's horn


In [33]:
# Attribute table size; the recorded run shows (10, 0) — ten rows, no columns.
attribute_list.shape

(10, 0)

# Save Attribute List

In [34]:
# Write the attribute table as a gzip-compressed TSV stamped with the current
# year and month (YYYY_MM). The name ends in '.tsv.gz' so the extension matches
# the gzip payload instead of advertising a zip archive.
filename = path + 'bgee_rat_anatomical_attribute_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
# The attribute names live in the index (the table has zero data columns), so
# the index must be written; with index=False the file would contain no names.
attribute_list.to_csv(filename, sep='\t', index=True, compression='gzip')

# Create Attribute Similarity Matrix

In [35]:
# Compute pairwise similarity between attributes with the 'jaccard' option of
# the project helper. The matrix is transposed first so attributes are the
# rows being compared (the recorded result is 10 x 10).
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [36]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,heart,liver,spleen,lung,Ammon's horn,brain,testis,skeletal muscle tissue,colon,adult mammalian kidney
,,,,,,,,,,
heart,1.0,0.888312,0.902607,0.910264,0.468741,0.88105,0.857143,0.913377,0.89388,0.901816
liver,0.888312,1.0,0.897345,0.890806,0.467595,0.858542,0.844225,0.871915,0.884728,0.894913
spleen,0.902607,0.897345,1.0,0.910662,0.465066,0.871564,0.85446,0.888843,0.905771,0.896272
lung,0.910264,0.890806,0.910662,1.0,0.455819,0.897146,0.873434,0.896969,0.916172,0.915515
Ammon's horn,0.468741,0.467595,0.465066,0.455819,1.0,0.453462,0.451111,0.463616,0.455767,0.449932


In [37]:
# Attribute similarity matrix size; the recorded run shows (10, 10).
attribute_similarity_matix.shape

(10, 10)

# Save Attribute Similarity Matrix

In [38]:
# Write the attribute similarity matrix as a gzip-compressed TSV stamped with
# the current year and month (YYYY_MM). The name ends in '.tsv.gz' so the
# extension matches the gzip payload instead of advertising a zip archive.
# NOTE(review): the 'matix' spelling in the file stem is kept unchanged.
filename = path + 'bgee_rat_anatomical_attribute_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Similarity Matrix

In [39]:
# Compute pairwise similarity between genes with the 'jaccard' option of the
# project helper, comparing the rows of the binary matrix.
# NOTE(review): the recorded result is 15799 x 15799, so this cell and the
# save that follows are likely the most memory- and time-hungry steps here.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [40]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,AMDHD2,CLTB,CIAO1,NFIA,BUD23,ADAMTSL5,CENPQ,OSMR,HSD3B7,ST8SIA4,...,SCRT2,CRYGN,ETV1,MAPK8IP3,NACC1,ITGB1,RTCA,CD200R1,RREB1,SEC14L2
,,,,,,,,,,,,,,,,,,,,,
AMDHD2,1.0,0.9,0.9,0.9,1.0,1.0,1.0,1.0,0.9,1.0,...,0.222222,0.111111,1.0,1.0,0.9,0.9,0.9,0.888889,0.9,0.9
CLTB,0.9,1.0,1.0,1.0,0.9,0.9,0.9,0.9,1.0,0.9,...,0.2,0.1,0.9,0.9,1.0,1.0,1.0,0.8,1.0,1.0
CIAO1,0.9,1.0,1.0,1.0,0.9,0.9,0.9,0.9,1.0,0.9,...,0.2,0.1,0.9,0.9,1.0,1.0,1.0,0.8,1.0,1.0
NFIA,0.9,1.0,1.0,1.0,0.9,0.9,0.9,0.9,1.0,0.9,...,0.2,0.1,0.9,0.9,1.0,1.0,1.0,0.8,1.0,1.0
BUD23,1.0,0.9,0.9,0.9,1.0,1.0,1.0,1.0,0.9,1.0,...,0.222222,0.111111,1.0,1.0,0.9,0.9,0.9,0.888889,0.9,0.9


In [41]:
# Gene similarity matrix size; the recorded run shows (15799, 15799).
gene_similarity_matix.shape

(15799, 15799)

# Save Gene Similarity Matrix

In [42]:
# Write the gene similarity matrix as a gzip-compressed TSV stamped with the
# current year and month (YYYY_MM). The name ends in '.tsv.gz' so the extension
# matches the gzip payload instead of advertising a zip archive.
# NOTE(review): the 'matix' spelling in the file stem is kept unchanged.
filename = path + 'bgee_rat_anatomical_gene_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene-Attribute Edge List

In [43]:
# Base name handed to the edge list helper in the next cell.
name = 'bgee_rat_anatomical_gene_attribute_edge_list'

In [44]:
# Generate the gene-attribute edge list from the binary matrix together with
# the attribute and gene tables; the helper receives the output directory and
# base name. The recorded run reported 132835 associations, the same count as
# the de-duplicated gene/entity pairs. NOTE(review): output format is defined
# in utility_functions and is not visible here.
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, path, name)

Progeres: 100%  10 Out of 10   

 The number of statisticaly relevent gene-attribute associations is: 132835
