# Bgee Rat Sample

Author: Moshe Silverstein  
Date: 08-18  
Data Source Home: https://bgee.org/     
Data Source Download: https://bgee.org/?page=download&action=expr_calls#id1 

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import utility_functions as uf
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

In [2]:
import seaborn as sns
# Apply seaborn's default theme with color_codes enabled.
sns.set(color_codes=True)
# Seed NumPy's global RNG with a fixed value (the sum of the character codes
# of "distributions") so any random draws repeat between runs.
seed_value = sum(ord(ch) for ch in "distributions")
np.random.seed(seed_value)

# Path to Output Files

In [3]:
# Directory that every output file of this notebook is written to.
# The default is the original author's location; set the BGEE_OUTPUT_DIR
# environment variable to redirect the outputs without editing the notebook.
# os.path.join(..., '') guarantees a trailing separator, because later cells
# build file names with plain string concatenation (path + '...').
path = os.path.join(
    os.environ.get(
        'BGEE_OUTPUT_DIR',
        '/Users/moshesilverstein/Documents/Harmonizome/bgee/Output/',
    ),
    '',
)

# Load Data

In [4]:
# Load the Bgee rat expression-call table (tab-separated).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the recorded output of this cell
# looks like the tail of pandas' mixed-type DtypeWarning, which this option
# is meant to avoid — confirm on the next run.
df = pd.read_csv(
    'Input/Rattus_norvegicus_expr_advanced_development.tsv',
    sep='\t',
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Preview the first rows of the raw table to check how the TSV columns were parsed.
df.head()

Unnamed: 0,Gene ID,Gene name,Anatomical entity ID,Anatomical entity name,Developmental stage ID,Developmental stage name,Expression,Call quality,Expression rank,Including observed data,...,In situ hybridization experiment count showing expression of this gene in this condition or in sub-conditions with a low quality,In situ hybridization experiment count showing absence of expression of this gene in this condition or valid parent conditions with a high quality,In situ hybridization experiment count showing absence of expression of this gene in this condition or valid parent conditions with a low quality,Including in situ hybridization observed data,RNA-Seq data,RNA-Seq experiment count showing expression of this gene in this condition or in sub-conditions with a high quality,RNA-Seq experiment count showing expression of this gene in this condition or in sub-conditions with a low quality,RNA-Seq experiment count showing absence of expression of this gene in this condition or valid parent conditions with a high quality,RNA-Seq experiment count showing absence of expression of this gene in this condition or valid parent conditions with a low quality,Including RNA-Seq observed data
0,ENSRNOG00000000001,AABR07013255.1,UBERON:0000082,adult mammalian kidney,UBERON:0000113,post-juvenile adult stage,present,gold quality,17400,yes,...,0,0,0,no,present,2,0,0,0,yes
1,ENSRNOG00000000001,AABR07013255.1,UBERON:0000473,testis,UBERON:0000113,post-juvenile adult stage,absent,silver quality,22800,yes,...,0,0,0,no,absent,0,0,1,0,yes
2,ENSRNOG00000000001,AABR07013255.1,UBERON:0000948,heart,UBERON:0000113,post-juvenile adult stage,present,silver quality,13400,yes,...,0,0,0,no,present,1,0,0,0,yes
3,ENSRNOG00000000001,AABR07013255.1,UBERON:0000955,brain,UBERON:0000113,post-juvenile adult stage,present,gold quality,16400,yes,...,0,0,0,no,present,2,0,0,0,yes
4,ENSRNOG00000000001,AABR07013255.1,UBERON:0001134,skeletal muscle tissue,UBERON:0000113,post-juvenile adult stage,present,silver quality,13400,yes,...,0,0,0,no,present,1,0,0,0,yes


In [6]:
# Dimensions of the raw table as (rows, columns).
df.shape

(272914, 32)

# Get relevant data

In [7]:
# Keep only the columns used downstream: the gene, the tissue, the
# developmental stage, and the expression call with its quality label.
relevant_columns = [
    'Gene name',
    'Anatomical entity name',
    'Developmental stage name',
    'Expression',
    'Call quality',
]
df = df[relevant_columns]

In [8]:
# Confirm that only the five selected columns remain.
df.head()

Unnamed: 0,Gene name,Anatomical entity name,Developmental stage name,Expression,Call quality
0,AABR07013255.1,adult mammalian kidney,post-juvenile adult stage,present,gold quality
1,AABR07013255.1,testis,post-juvenile adult stage,absent,silver quality
2,AABR07013255.1,heart,post-juvenile adult stage,present,silver quality
3,AABR07013255.1,brain,post-juvenile adult stage,present,gold quality
4,AABR07013255.1,skeletal muscle tissue,post-juvenile adult stage,present,silver quality


In [9]:
# Keep only the rows whose expression call is 'present'.
is_present = df['Expression'].eq('present')
df = df[is_present]

In [10]:
# Build the sample label as "<anatomical entity>_<developmental stage>" and
# attach it as a new trailing 'Sample' column.
tissue_label = df['Anatomical entity name'].map(str)
stage_label = df['Developmental stage name'].map(str)
df = df.assign(Sample=tissue_label + '_' + stage_label)

In [11]:
# Check the new Sample column; rows with non-'present' calls were removed, so the index has gaps.
df.head()

Unnamed: 0,Gene name,Anatomical entity name,Developmental stage name,Expression,Call quality,Sample
0,AABR07013255.1,adult mammalian kidney,post-juvenile adult stage,present,gold quality,adult mammalian kidney_post-juvenile adult stage
2,AABR07013255.1,heart,post-juvenile adult stage,present,silver quality,heart_post-juvenile adult stage
3,AABR07013255.1,brain,post-juvenile adult stage,present,gold quality,brain_post-juvenile adult stage
4,AABR07013255.1,skeletal muscle tissue,post-juvenile adult stage,present,silver quality,skeletal muscle tissue_post-juvenile adult stage
5,AABR07013255.1,colon,post-juvenile adult stage,present,silver quality,colon_post-juvenile adult stage


In [12]:
# Independent copy of the per-sample descriptive columns; later edits to
# meta_data therefore do not touch df.
meta_columns = ['Sample', 'Anatomical entity name', 'Developmental stage name']
meta_data = df[meta_columns].copy()

In [13]:
# Use the sample label as the row index of the metadata table.
meta_data = meta_data.set_index('Sample')

In [14]:
# Collapse repeated (anatomical entity, developmental stage) rows.
meta_data = meta_data.drop_duplicates()

In [15]:
# Preview the metadata table, indexed by the Sample label.
meta_data.head()

Unnamed: 0_level_0,Anatomical entity name,Developmental stage name
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1
adult mammalian kidney_post-juvenile adult stage,adult mammalian kidney,post-juvenile adult stage
heart_post-juvenile adult stage,heart,post-juvenile adult stage
brain_post-juvenile adult stage,brain,post-juvenile adult stage
skeletal muscle tissue_post-juvenile adult stage,skeletal muscle tissue,post-juvenile adult stage
colon_post-juvenile adult stage,colon,post-juvenile adult stage


In [16]:
# Reduce the table to (gene name, sample) pairs.
pair_columns = ['Gene name', 'Sample']
df = df[pair_columns]

In [17]:
# Each row is now a (gene name, sample) pair.
df.head()

Unnamed: 0,Gene name,Sample
0,AABR07013255.1,adult mammalian kidney_post-juvenile adult stage
2,AABR07013255.1,heart_post-juvenile adult stage
3,AABR07013255.1,brain_post-juvenile adult stage
4,AABR07013255.1,skeletal muscle tissue_post-juvenile adult stage
5,AABR07013255.1,colon_post-juvenile adult stage


In [18]:
# Number of 'present' gene-sample pairs before gene symbol mapping.
df.shape

(201567, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [19]:
# Move the gene names into the row index ahead of the symbol-mapping step.
df = df.set_index('Gene name')

In [20]:
# Map the gene names held in df's index to up-to-date approved gene symbols.
# NOTE(review): the return value is discarded, so the helper presumably
# rewrites df in place — confirm in utility_functions.
uf.mapgenesymbols(df)

Progeres: 100%  201567 Out of 201567   

# Drop Duplicates

In [21]:
# Turn the gene index back into an ordinary column.
df = df.reset_index()

In [22]:
# Remove repeated (gene, sample) rows.
df = df.drop_duplicates()

In [23]:
# Inspect the gene-sample pairs after symbol mapping and de-duplication.
df.head()

Unnamed: 0,Gene name,Sample
0,GAD1,testis_post-juvenile adult stage
1,GAD1,heart_post-juvenile adult stage
2,GAD1,brain_post-juvenile adult stage
3,GAD1,colon_post-juvenile adult stage
4,GAD1,Ammon's horn_4-6-month-old stage (rat)


In [24]:
# Number of gene-sample pairs left after symbol mapping and de-duplication.
df.shape

(139136, 2)

# Create Binary Matrix

In [25]:
# Build the gene-by-sample matrix from the (gene, sample) pairs in df.
# The recorded preview shows genes on rows, samples on columns, and 0.0/1.0 entries.
binary_matrix = uf.createBinaryMatrix(df)

Progeres: 100%  15799 Out of 15799   

In [26]:
# Preview the matrix: genes on rows, samples on columns.
binary_matrix.head()

Unnamed: 0,skeletal muscle tissue_post-juvenile adult stage,spleen_post-juvenile adult stage,liver_post-juvenile adult stage,testis_post-juvenile adult stage,Ammon's horn_post-juvenile adult stage,brain_post-juvenile adult stage,colon_post-juvenile adult stage,lung_post-juvenile adult stage,Ammon's horn_4-6-month-old stage (rat),heart_post-juvenile adult stage,adult mammalian kidney_post-juvenile adult stage
CD177,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
HTR7,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
RSL24D1,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
CKS2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
UBE2R2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [27]:
# Dimensions of the binary matrix as (genes, samples).
binary_matrix.shape

(15799, 11)

# Save Binary Matrix

In [28]:
# Save the binary matrix as a tab-separated file whose name carries the
# current year and month (YYYY_MM).
# The file name ends in .zip, so write an actual zip archive: the previous
# compression='gzip' stored gzip bytes under a .zip name, which zip tools and
# pandas' extension-based compression inference cannot open.
stamp = datetime.date.today().strftime('%Y_%m')
filename = path + 'bgee_rat_sample_binary_matrix_%s.tsv.zip' % stamp
binary_matrix.to_csv(filename, sep='\t', compression='zip')

# Create Gene Set Library

In [29]:
# Name handed to uf.createUpGeneSetLib in the next cell.
name = 'bgee_rat_sample_gene_set'

In [30]:
# Create the gene set library from the binary matrix.
# NOTE(review): presumably writes its output under `path` using `name` as the
# file stem; nothing is returned to the notebook — confirm in utility_functions.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  11 Out of 11   

# Create Attribute Library

In [31]:
# Name handed to uf.createUpAttributeSetLib in the next cell (rebinds `name`).
name = 'bgee_rat_sample_attribute_set'

In [32]:
# Create the attribute set library from the binary matrix.
# NOTE(review): presumably writes its output under `path` using `name` as the
# file stem; nothing is returned to the notebook — confirm in utility_functions.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  15799 Out of 15799   

# Create Gene List

In [33]:
# Build the gene list for the genes in binary_matrix.
# The recorded preview shows two columns, GeneSym and GeneID.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  15799 Out of 15799   

In [34]:
# Preview the gene symbols and their IDs.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,CD177,57126
1,HTR7,3363
2,RSL24D1,51187
3,CKS2,1164
4,UBE2R2,54926


In [35]:
# Row count should match the number of genes (rows) in binary_matrix.
gene_list.shape

(15799, 2)

# Save Gene List

In [36]:
# Save the gene list as a tab-separated file whose name carries the current
# year and month (YYYY_MM). The positional index is not written.
# The file name ends in .zip, so write an actual zip archive: the previous
# compression='gzip' stored gzip bytes under a .zip name, which zip tools and
# pandas' extension-based compression inference cannot open.
stamp = datetime.date.today().strftime('%Y_%m')
filename = path + 'bgee_rat_sample_gene_list_%s.tsv.zip' % stamp
gene_list.to_csv(filename, sep='\t', index=False, compression='zip')

# Create Attribute List

In [37]:
# Build the attribute (sample) list and attach the per-sample metadata.
# The recorded preview shows an 'Attributes' index with the two meta_data columns.
attribute_list = uf.createAttributeList(binary_matrix, metaData=meta_data)

Progeres: 100%  11 Out of 11   

In [38]:
# Preview the attribute list; the attribute names are held in the index.
attribute_list.head()

Unnamed: 0_level_0,Anatomical entity name,Developmental stage name
Attributes,Unnamed: 1_level_1,Unnamed: 2_level_1
skeletal muscle tissue_post-juvenile adult stage,skeletal muscle tissue,post-juvenile adult stage
spleen_post-juvenile adult stage,spleen,post-juvenile adult stage
liver_post-juvenile adult stage,liver,post-juvenile adult stage
testis_post-juvenile adult stage,testis,post-juvenile adult stage
Ammon's horn_post-juvenile adult stage,Ammon's horn,post-juvenile adult stage


In [39]:
# Row count should match the number of samples (columns) in binary_matrix.
attribute_list.shape

(11, 2)

# Save Attribute List

In [40]:
# Save the attribute list as a tab-separated file whose name carries the
# current year and month (YYYY_MM).
# Two fixes compared with the previous version:
#  * The index is written. attribute_list keeps the attribute names in its
#    'Attributes' index, so index=False dropped them from the saved file.
#  * The file name ends in .zip, so write an actual zip archive; the previous
#    compression='gzip' stored gzip bytes under a .zip name.
stamp = datetime.date.today().strftime('%Y_%m')
filename = path + 'bgee_rat_sample_attribute_list_%s.tsv.zip' % stamp
attribute_list.to_csv(filename, sep='\t', compression='zip')

# Create Attribute Similarity Matrix

In [41]:
# Jaccard similarity between samples. The matrix is transposed so that samples
# are on rows; the recorded result is a square sample-by-sample matrix.
# NOTE(review): presumably the helper compares rows pairwise — confirm in utility_functions.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [42]:
# Preview the sample-by-sample similarities (1.0 on the diagonal).
attribute_similarity_matix.head()

Unnamed: 0,skeletal muscle tissue_post-juvenile adult stage,spleen_post-juvenile adult stage,liver_post-juvenile adult stage,testis_post-juvenile adult stage,Ammon's horn_post-juvenile adult stage,brain_post-juvenile adult stage,colon_post-juvenile adult stage,lung_post-juvenile adult stage,Ammon's horn_4-6-month-old stage (rat),heart_post-juvenile adult stage,adult mammalian kidney_post-juvenile adult stage
,,,,,,,,,,,
skeletal muscle tissue_post-juvenile adult stage,1.0,0.888843,0.871915,0.868675,0.463616,0.885579,0.893073,0.896969,0.437023,0.913377,0.898518
spleen_post-juvenile adult stage,0.888843,1.0,0.897345,0.85446,0.465066,0.871564,0.905771,0.910662,0.438288,0.902607,0.896272
liver_post-juvenile adult stage,0.871915,0.897345,1.0,0.844225,0.467595,0.858542,0.884728,0.890806,0.441196,0.888312,0.894913
testis_post-juvenile adult stage,0.868675,0.85446,0.844225,1.0,0.451111,0.882017,0.872455,0.873434,0.426275,0.857143,0.878317
Ammon's horn_post-juvenile adult stage,0.463616,0.465066,0.467595,0.451111,1.0,0.453462,0.455767,0.455819,0.940448,0.468741,0.449932


In [43]:
# Should be square, with one row and one column per sample.
attribute_similarity_matix.shape

(11, 11)

# Save Attribute Similarity Matrix

In [44]:
# Save the attribute similarity matrix as a tab-separated file whose name
# carries the current year and month (YYYY_MM).
# The file name ends in .zip, so write an actual zip archive: the previous
# compression='gzip' stored gzip bytes under a .zip name, which zip tools and
# pandas' extension-based compression inference cannot open.
stamp = datetime.date.today().strftime('%Y_%m')
filename = path + 'bgee_rat_sample_attribute_similarity_matix_%s.tsv.zip' % stamp
attribute_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# Create Gene Similarity Matrix

In [45]:
# Jaccard similarity between genes (the rows of binary_matrix).
# The recorded result is a square gene-by-gene matrix of 15799 x 15799
# (about 250 million entries), so this cell is memory-heavy.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [46]:
# Preview the first rows of the gene-by-gene similarities.
gene_similarity_matix.head()

Unnamed: 0,CD177,HTR7,RSL24D1,CKS2,UBE2R2,INCA1,SOS2,TTC23,UQCRQ,HIST3H2A,...,MFRP,UMODL1,DYNLL2,BCL7A,MINDY1,POLG,OXLD1,MICAL1,MCMDC2,PLA2R1
,,,,,,,,,,,,,,,,,,,,,
CD177,1.0,0.285714,0.444444,0.4,0.363636,0.444444,0.363636,0.444444,0.363636,0.285714,...,0.571429,0.0,0.363636,0.444444,0.363636,0.444444,0.444444,0.444444,0.444444,0.444444
HTR7,0.285714,1.0,0.555556,0.5,0.454545,0.555556,0.454545,0.555556,0.454545,0.25,...,0.714286,0.4,0.454545,0.555556,0.454545,0.555556,0.555556,0.555556,0.555556,0.555556
RSL24D1,0.444444,0.555556,1.0,0.9,0.818182,1.0,0.818182,1.0,0.818182,0.555556,...,0.777778,0.222222,0.818182,1.0,0.818182,1.0,1.0,1.0,1.0,1.0
CKS2,0.4,0.5,0.9,1.0,0.909091,0.9,0.909091,0.9,0.909091,0.5,...,0.7,0.2,0.909091,0.9,0.909091,0.9,0.9,0.9,0.9,0.9
UBE2R2,0.363636,0.454545,0.818182,0.909091,1.0,0.818182,1.0,0.818182,1.0,0.454545,...,0.636364,0.181818,1.0,0.818182,1.0,0.818182,0.818182,0.818182,0.818182,0.818182


In [47]:
# Should be square, with one row and one column per gene.
gene_similarity_matix.shape

(15799, 15799)

# Save Gene Similarity Matrix

In [48]:
# Save the gene similarity matrix as a tab-separated file whose name carries
# the current year and month (YYYY_MM).
# The file name ends in .zip, so write an actual zip archive: the previous
# compression='gzip' stored gzip bytes under a .zip name, which zip tools and
# pandas' extension-based compression inference cannot open.
stamp = datetime.date.today().strftime('%Y_%m')
filename = path + 'bgee_rat_sample_gene_similarity_matix_%s.tsv.zip' % stamp
gene_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# Create Gene-Attribute Edge List

In [49]:
# Name handed to uf.createGeneAttributeEdgeList in the next cell (rebinds `name`).
name = 'bgee_rat_sample_gene_attribute_edge_list'

In [50]:
# Create the gene-attribute edge list from the binary matrix, the attribute
# list and the gene list. The recorded output reports 139136 associations,
# the same as the number of de-duplicated gene-sample pairs.
# NOTE(review): presumably writes its output under `path` using `name` as the
# file stem — confirm in utility_functions.
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, path, name)

Progeres: 100%  11 Out of 11   

 The number of statisticaly relevent gene-attribute associations is: 139136
