---
**Bgee**

---

Files documentation: https://bgee.org/?page=doc&action=call_files#single

In [1]:
import os
import pandas as pd
import yaml
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import shutil
import urllib.request as request
from contextlib import closing
import gzip

from toolbox import *

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default theme and the "notebook" plotting context.
sns.set()
sns.set_context("notebook")
# Re-apply the seaborn theme with a wider default figure size (14 x 6 inches).
sns.set(rc={'figure.figsize':(14,6)})

In [3]:
# Project configuration (helper presumably provided by `toolbox` — TODO confirm);
# indexed below by directory keys such as 'rawDataBgee' and 'outputPreprocessingBgee'.
cfg = load_cfg()

# Version log, indexed below as logVersions[<source>][<stage>],
# e.g. logVersions['Bgee']['rawData'] and logVersions['UniProt']['rawData'].
logVersions = load_LogVersions()

# Download data

The human expression-call file is downloaded from the Bgee FTP server and then decompressed locally.

In [13]:
# Bgee release label: used to name the downloaded files and recorded in logVersions.
versionBgee = '14-2'

In [14]:
# Local file names carry the Bgee release label.
gzName = 'Homo_sapiens_expr_simple_development_v{}.tsv.gz'.format(versionBgee)
tsvName = 'Homo_sapiens_expr_simple_development_v{}.tsv'.format(versionBgee)
gzPath = os.path.join(cfg['rawDataBgee'], gzName)
tsvPath = os.path.join(cfg['rawDataBgee'], tsvName)

# Download
# NOTE(review): the URL points at Bgee's `current` directory while the local
# copy is labelled with `versionBgee` — confirm both refer to the same release.
bgeeUrl = 'ftp://ftp.bgee.org/current/download/calls/expr_calls/Homo_sapiens_expr_simple_development.tsv.gz'
with closing(request.urlopen(bgeeUrl)) as response, open(gzPath, 'wb') as gzFile:
    shutil.copyfileobj(response, gzFile)

# Unzip
with gzip.open(gzPath, 'rb') as compressed, open(tsvPath, 'wb') as plain:
    shutil.copyfileobj(compressed, plain)

In [15]:
# Record which Bgee release the raw file corresponds to. `setdefault` creates
# the 'Bgee' entry on first use, so a log that does not yet contain that key
# no longer raises KeyError (previously a line had to be un-commented by hand).
# Assumes logVersions is a plain dict of dicts — TODO confirm against toolbox.
logVersions.setdefault('Bgee', dict())['rawData'] = versionBgee

# Hand the updated log to the toolbox helper (presumably persists it — TODO confirm).
dump_LogVersions(logVersions)

In [17]:
# Read the decompressed call file whose release label is recorded in logVersions.
bgeeTsvPath = os.path.join(
    cfg['rawDataBgee'],
    'Homo_sapiens_expr_simple_development_v{}.tsv'.format(logVersions['Bgee']['rawData']),
)
# low_memory=False: infer each column's dtype from the whole file rather than per chunk.
dfBgee = pd.read_csv(bgeeTsvPath, sep="\t", low_memory=False)

glance(dfBgee)

DataFrame: 38,541,088 rows 	 9 columns


Unnamed: 0,Gene ID,Gene name,Anatomical entity ID,Anatomical entity name,Developmental stage ID,Developmental stage name,Expression,Call quality,Expression rank
0,ENSG00000000003,TSPAN6,CL:0000015,male germ cell,HsapDv:0000092,human middle aged stage (human),present,silver quality,6020.0
1,ENSG00000000003,TSPAN6,CL:0000019,sperm,HsapDv:0000088,human early adulthood stage (human),present,silver quality,23.3
2,ENSG00000000003,TSPAN6,CL:0000023,oocyte,HsapDv:0000087,human adult stage (human),absent,silver quality,20700.0
3,ENSG00000000003,TSPAN6,CL:0000083,epithelial cell of pancreas,UBERON:0000104,life cycle,present,silver quality,6130.0
4,ENSG00000000003,TSPAN6,CL:0000094,granulocyte,HsapDv:0000090,25-44 year-old human stage (human),absent,silver quality,24000.0


## EDA

In [18]:
# Report the number of distinct values in each identifier column.
for countTemplate, idColumn in (
    ("> Number of genes: {:,}", 'Gene ID'),
    ("> Number of anatomical entities: {:,}", 'Anatomical entity ID'),
    ("> Number of developmental stages: {:,}", 'Developmental stage ID'),
):
    print(countTemplate.format(len(set(dfBgee[idColumn]))))

> Number of genes: 59,777
> Number of anatomical entities: 320
> Number of developmental stages: 33


In [19]:
# Row count per call-quality label.
dfBgee['Call quality'].value_counts()

silver quality    30787752
gold quality       7753336
Name: Call quality, dtype: int64

In [20]:
# Row count per expression call (present / absent).
dfBgee['Expression'].value_counts()

present    22750808
absent     15790280
Name: Expression, dtype: int64

In [21]:
# Some sanity checks
assert len(set(list(dfBgee['Anatomical entity ID']))) == len(set(list(dfBgee['Anatomical entity name'])))
assert len(set(list(dfBgee['Developmental stage ID']))) == len(set(list(dfBgee['Developmental stage name'])))
assert ~any(dfBgee[['Gene ID', 'Anatomical entity name', 'Developmental stage name']].duplicated())

assert set(dfBgee.Expression) == set(['absent', 'present'])
assert set(dfBgee.loc[:,'Call quality']) == set(['silver quality', 'gold quality'])

# Preprocess data


**Create expression variable**

In [24]:
def mappingExpression(x):
    """Return the sign of an expression call: 1 for 'present', -1 for anything else."""
    return 1 if x == 'present' else -1
    
def mappingQuality(x):
    """Return the weight of a call quality: 1 for 'silver quality', 2 for anything else."""
    return 1 if x == 'silver quality' else 2

# Signed score per call: the sign comes from Expression, the magnitude from Call quality,
# giving values in {-2, -1, 1, 2}.
expressionSign = dfBgee['Expression'].apply(mappingExpression)
qualityWeight = dfBgee['Call quality'].apply(mappingQuality)

# Adds the score as a new column on dfBgee itself.
dfBgee['ExpressionQuant'] = expressionSign * qualityWeight

glance(dfBgee, n=20)

DataFrame: 38,541,088 rows 	 10 columns


Unnamed: 0,Gene ID,Gene name,Anatomical entity ID,Anatomical entity name,Developmental stage ID,Developmental stage name,Expression,Call quality,Expression rank,ExpressionQuant
0,ENSG00000000003,TSPAN6,CL:0000015,male germ cell,HsapDv:0000092,human middle aged stage (human),present,silver quality,6020.0,1
1,ENSG00000000003,TSPAN6,CL:0000019,sperm,HsapDv:0000088,human early adulthood stage (human),present,silver quality,23.3,1
2,ENSG00000000003,TSPAN6,CL:0000023,oocyte,HsapDv:0000087,human adult stage (human),absent,silver quality,20700.0,-1
3,ENSG00000000003,TSPAN6,CL:0000083,epithelial cell of pancreas,UBERON:0000104,life cycle,present,silver quality,6130.0,1
4,ENSG00000000003,TSPAN6,CL:0000094,granulocyte,HsapDv:0000090,25-44 year-old human stage (human),absent,silver quality,24000.0,-1
5,ENSG00000000003,TSPAN6,CL:0000115,endothelial cell,HsapDv:0000092,human middle aged stage (human),present,silver quality,12000.0,1
6,ENSG00000000003,TSPAN6,CL:0000169,type B pancreatic cell,HsapDv:0000087,human adult stage (human),absent,silver quality,20400.0,-1
7,ENSG00000000003,TSPAN6,CL:0000576,monocyte,UBERON:0000113,post-juvenile adult stage,present,silver quality,28500.0,1
8,ENSG00000000003,TSPAN6,CL:0000655,secondary oocyte,UBERON:0000104,life cycle,absent,silver quality,18200.0,-1
9,ENSG00000000003,TSPAN6,CL:0000738,leukocyte,HsapDv:0000092,human middle aged stage (human),present,silver quality,18600.0,1


---
**Pivot table**

In [25]:
# Wide matrix: one row per gene, one column per (anatomical entity, stage) pair.
# No aggfunc is passed, so pandas applies its default aggregation (mean).
BgeePivot = dfBgee.pivot_table(
    values='ExpressionQuant',
    index='Gene ID',
    columns=['Anatomical entity name', 'Developmental stage name'],
)

glance(BgeePivot)

# Flatten the two-level column index into "<entity> <stage>" labels.
flatColumnNames = []
for levelValues in BgeePivot.columns.values:
    flatColumnNames.append(' '.join(levelValues).strip())
BgeePivot.columns = flatColumnNames

glance(BgeePivot)

DataFrame: 59,777 rows 	 1,147 columns


Anatomical entity name,Ammon's horn,Ammon's horn,Ammon's horn,Ammon's horn,Ammon's horn,Ammon's horn,Ammon's horn,Ammon's horn,Ammon's horn,Ammon's horn,...,visceral pleura,visceral pleura,zone of skin,zone of skin,zone of skin,zone of skin,zone of skin,zone of skin,zone of skin,zone of skin
Developmental stage name,2-5 year-old child stage (human),25-44 year-old human stage (human),65-79 year-old human stage (human),80 year-old and over human stage (human),adolescent stage (human),human adult stage (human),human aged stage,human late adulthood stage (human),human middle aged stage (human),life cycle,...,human middle aged stage (human),young adult stage (human),25-44 year-old human stage (human),adolescent stage (human),human adult stage (human),human middle aged stage (human),late embryonic stage,life cycle,post-juvenile adult stage,young adult stage (human)
Gene ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
ENSG00000000003,,2.0,2.0,2.0,1.0,2.0,,2.0,2.0,2.0,...,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0
ENSG00000000005,,-2.0,-2.0,-2.0,-1.0,1.0,,1.0,1.0,1.0,...,-1.0,-1.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0
ENSG00000000419,,2.0,2.0,2.0,1.0,2.0,,2.0,2.0,2.0,...,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0
ENSG00000000457,,2.0,2.0,2.0,1.0,2.0,,2.0,2.0,2.0,...,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0
ENSG00000000460,,1.0,1.0,,-1.0,2.0,,2.0,2.0,2.0,...,-1.0,-1.0,1.0,1.0,2.0,2.0,,2.0,2.0,1.0


DataFrame: 59,777 rows 	 1,147 columns


Unnamed: 0_level_0,Ammon's horn 2-5 year-old child stage (human),Ammon's horn 25-44 year-old human stage (human),Ammon's horn 65-79 year-old human stage (human),Ammon's horn 80 year-old and over human stage (human),Ammon's horn adolescent stage (human),Ammon's horn human adult stage (human),Ammon's horn human aged stage,Ammon's horn human late adulthood stage (human),Ammon's horn human middle aged stage (human),Ammon's horn life cycle,...,visceral pleura human middle aged stage (human),visceral pleura young adult stage (human),zone of skin 25-44 year-old human stage (human),zone of skin adolescent stage (human),zone of skin human adult stage (human),zone of skin human middle aged stage (human),zone of skin late embryonic stage,zone of skin life cycle,zone of skin post-juvenile adult stage,zone of skin young adult stage (human)
Gene ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003,,2.0,2.0,2.0,1.0,2.0,,2.0,2.0,2.0,...,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0
ENSG00000000005,,-2.0,-2.0,-2.0,-1.0,1.0,,1.0,1.0,1.0,...,-1.0,-1.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0
ENSG00000000419,,2.0,2.0,2.0,1.0,2.0,,2.0,2.0,2.0,...,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0
ENSG00000000457,,2.0,2.0,2.0,1.0,2.0,,2.0,2.0,2.0,...,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0
ENSG00000000460,,1.0,1.0,,-1.0,2.0,,2.0,2.0,2.0,...,-1.0,-1.0,1.0,1.0,2.0,2.0,,2.0,2.0,1.0


___
**Match Ensembl IDs with UniProt IDs**

In [26]:
# Load the UniProt entry <-> Bgee gene ID table for the UniProt version in logVersions.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
uniprotPicklePath = os.path.join(
    cfg['rawDataUniProt'],
    "uniprot_allProteinsBgee_Human_v{}.pkl".format(logVersions['UniProt']['rawData']),
)
uniprotBgeeMatching = pd.read_pickle(uniprotPicklePath)
glance(uniprotBgeeMatching)

# Rename the two columns; the gene column takes the 'Gene ID' name used for the merge key.
uniprotBgeeMatching.columns = ['uniprotID', 'Gene ID']

glance(uniprotBgeeMatching)

DataFrame: 20,386 rows 	 2 columns


Unnamed: 0,Entry,Cross-reference (bgee)
0,A0A024RBG1,ENSG00000173598
1,A0A075B6H7,ENSG00000243063
2,A0A075B6H8,ENSG00000211633
3,A0A075B6H9,ENSG00000211637
4,A0A075B6I0,ENSG00000211638


DataFrame: 20,386 rows 	 2 columns


Unnamed: 0,uniprotID,Gene ID
0,A0A024RBG1,ENSG00000173598
1,A0A075B6H7,ENSG00000243063
2,A0A075B6H8,ENSG00000211633
3,A0A075B6H9,ENSG00000211637
4,A0A075B6I0,ENSG00000211638


In [27]:
# How many UniProt entries have no Bgee gene ID.
missingGeneIds = uniprotBgeeMatching['Gene ID'].isna().sum()
print("Number of missing genes IDs: {:,}/{:,}".format(missingGeneIds, len(uniprotBgeeMatching)))

Number of missing genes IDs: 1,296/20,386


In [28]:
# Keep only genes found in both tables. 'Gene ID' is the pivot's index name
# and a regular column of the UniProt table.
BgeePivot_2 = pd.merge(BgeePivot, uniprotBgeeMatching, on="Gene ID", how="inner")

glance(BgeePivot_2)

DataFrame: 19,090 rows 	 1,149 columns


Unnamed: 0,Gene ID,Ammon's horn 2-5 year-old child stage (human),Ammon's horn 25-44 year-old human stage (human),Ammon's horn 65-79 year-old human stage (human),Ammon's horn 80 year-old and over human stage (human),Ammon's horn adolescent stage (human),Ammon's horn human adult stage (human),Ammon's horn human aged stage,Ammon's horn human late adulthood stage (human),Ammon's horn human middle aged stage (human),...,visceral pleura young adult stage (human),zone of skin 25-44 year-old human stage (human),zone of skin adolescent stage (human),zone of skin human adult stage (human),zone of skin human middle aged stage (human),zone of skin late embryonic stage,zone of skin life cycle,zone of skin post-juvenile adult stage,zone of skin young adult stage (human),uniprotID
0,ENSG00000000003,,2.0,2.0,2.0,1.0,2.0,,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,O43657
1,ENSG00000000005,,-2.0,-2.0,-2.0,-1.0,1.0,,1.0,1.0,...,-1.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,Q9H2S6
2,ENSG00000000419,,2.0,2.0,2.0,1.0,2.0,,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,O60762
3,ENSG00000000457,,2.0,2.0,2.0,1.0,2.0,,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,Q8IZE3
4,ENSG00000000460,,1.0,1.0,,-1.0,2.0,,2.0,2.0,...,-1.0,1.0,1.0,2.0,2.0,,2.0,2.0,1.0,Q9NSG2


In [29]:
# Sanity check
assert BgeePivot_2.uniprotID.isna().sum() == 0
assert len(set(BgeePivot_2.uniprotID)) == len(BgeePivot_2)

---
**Fill missing values with 0**

In [30]:
# Replace every missing value in the merged table with 0
# (applies to all columns, not only the expression scores).
BgeePivot_3 = BgeePivot_2.fillna(value=0)
glance(BgeePivot_3)

DataFrame: 19,090 rows 	 1,149 columns


Unnamed: 0,Gene ID,Ammon's horn 2-5 year-old child stage (human),Ammon's horn 25-44 year-old human stage (human),Ammon's horn 65-79 year-old human stage (human),Ammon's horn 80 year-old and over human stage (human),Ammon's horn adolescent stage (human),Ammon's horn human adult stage (human),Ammon's horn human aged stage,Ammon's horn human late adulthood stage (human),Ammon's horn human middle aged stage (human),...,visceral pleura young adult stage (human),zone of skin 25-44 year-old human stage (human),zone of skin adolescent stage (human),zone of skin human adult stage (human),zone of skin human middle aged stage (human),zone of skin late embryonic stage,zone of skin life cycle,zone of skin post-juvenile adult stage,zone of skin young adult stage (human),uniprotID
0,ENSG00000000003,0.0,2.0,2.0,2.0,1.0,2.0,0.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,0.0,2.0,2.0,2.0,O43657
1,ENSG00000000005,0.0,-2.0,-2.0,-2.0,-1.0,1.0,0.0,1.0,1.0,...,-1.0,2.0,2.0,2.0,2.0,0.0,2.0,2.0,2.0,Q9H2S6
2,ENSG00000000419,0.0,2.0,2.0,2.0,1.0,2.0,0.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,0.0,2.0,2.0,2.0,O60762
3,ENSG00000000457,0.0,2.0,2.0,2.0,1.0,2.0,0.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,0.0,2.0,2.0,2.0,Q8IZE3
4,ENSG00000000460,0.0,1.0,1.0,0.0,-1.0,2.0,0.0,2.0,2.0,...,-1.0,1.0,1.0,2.0,2.0,0.0,2.0,2.0,1.0,Q9NSG2


---
**Remove gene ID**

In [31]:
# Drop the Ensembl gene column; 'uniprotID' remains as the identifier.
BgeePivot_4 = BgeePivot_3.drop(columns=['Gene ID'])
glance(BgeePivot_4)

DataFrame: 19,090 rows 	 1,148 columns


Unnamed: 0,Ammon's horn 2-5 year-old child stage (human),Ammon's horn 25-44 year-old human stage (human),Ammon's horn 65-79 year-old human stage (human),Ammon's horn 80 year-old and over human stage (human),Ammon's horn adolescent stage (human),Ammon's horn human adult stage (human),Ammon's horn human aged stage,Ammon's horn human late adulthood stage (human),Ammon's horn human middle aged stage (human),Ammon's horn life cycle,...,visceral pleura young adult stage (human),zone of skin 25-44 year-old human stage (human),zone of skin adolescent stage (human),zone of skin human adult stage (human),zone of skin human middle aged stage (human),zone of skin late embryonic stage,zone of skin life cycle,zone of skin post-juvenile adult stage,zone of skin young adult stage (human),uniprotID
0,0.0,2.0,2.0,2.0,1.0,2.0,0.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,0.0,2.0,2.0,2.0,O43657
1,0.0,-2.0,-2.0,-2.0,-1.0,1.0,0.0,1.0,1.0,1.0,...,-1.0,2.0,2.0,2.0,2.0,0.0,2.0,2.0,2.0,Q9H2S6
2,0.0,2.0,2.0,2.0,1.0,2.0,0.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,0.0,2.0,2.0,2.0,O60762
3,0.0,2.0,2.0,2.0,1.0,2.0,0.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,0.0,2.0,2.0,2.0,Q8IZE3
4,0.0,1.0,1.0,0.0,-1.0,2.0,0.0,2.0,2.0,2.0,...,-1.0,1.0,1.0,2.0,2.0,0.0,2.0,2.0,1.0,Q9NSG2


---
**Export**
- v2.1 = quantitative expression, using Bgee v14.2

In [32]:
# Name the frame to export; plain assignment, so this is the same object as BgeePivot_4 (not a copy).
BgeePivot_export = BgeePivot_4

In [33]:
# Version label of this preprocessing output; also used in the export file name.
myVersionBgee = '2-1'

# Record the preprocessing version under the same 'Bgee' entry as the raw-data version.
logVersions['Bgee']['preprocessed']=myVersionBgee

dump_LogVersions(logVersions)

In [34]:
# Write the processed table to the preprocessing output directory, versioned by myVersionBgee.
bgeeExportPath = os.path.join(
    cfg['outputPreprocessingBgee'],
    "Bgee_processed_v{}.pkl".format(myVersionBgee),
)
BgeePivot_export.to_pickle(bgeeExportPath)