 # Bgee

 Author: Moshe Silverstein <br>
 Date: 07-18 <br>
 Data Source Home: https://bgee.org/ <br>
 Data Source Download: https://bgee.org/?page=download&action=expr_calls#id1

 Reviewer: Charles Dai <br>
 Updated: 6-20

In [1]:
# appyter init
# Hands appyter a callable that returns this notebook's global namespace:
# the lambda's default argument captures the builtin `globals` and calls it.
# NOTE(review): presumably this is what enables the appyter cell magics used
# in later cells -- confirm against the appyter documentation.
from appyter import magic
magic.init(lambda _=globals: _())

In [2]:
import sys
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [3]:
# from clustergrammer_widget import *
# net = Network(clustergrammer_widget)

In [4]:
# Reload imported modules automatically before each cell executes, so edits
# to local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

 ### Python Version

In [5]:
# Record the interpreter version this notebook was executed with.
sys.version

'3.8.0 (default, Oct 28 2019, 16:14:01) \n[GCC 8.3.0]'

 # Initialization

 ### Choose Attributes

In [56]:
%%appyter code_eval

# Form field selecting which Bgee annotation columns become attributes.
# Both choices may be selected; only 'Anatomical entity' is preselected.
{% set attributes = MultiChoiceField(
    name='attributes',
    label='Attributes',
    choices=['Anatomical entity', 'Developmental stage'],
    default=['Anatomical entity'],
    section='data'
) %}

In [57]:
%%appyter code_exec

# Short tag for each selectable attribute; the tags are joined later to
# build the output file stem and the output directory name.
name_dict = {'Anatomical entity': 'Anat', 'Developmental stage': 'Dev'}
# Tags for the attributes chosen in the form.
output_names = [name_dict[a] for a in {{attributes}}]
# Column headers to read from the Bgee file: '<attribute> name'.
headers = [a + ' name' for a in {{attributes}}]

```python
name_dict = {'Anatomical entity': 'Anat', 'Developmental stage': 'Dev'}
output_names = [name_dict[a] for a in ['Anatomical entity', 'Developmental stage']]
headers = [a + ' name' for a in ['Anatomical entity', 'Developmental stage']]
```

### Load Mapping Dictionaries

In [58]:
# Fetch the two mapping tables used further down: symbol_lookup is passed to
# uf.mapgenesymbols and geneid_lookup is passed to uf.createGeneList.
symbol_lookup, geneid_lookup = lookup.get_lookups()

Gathering sources: 100%|██████████| 3/3 [00:08<00:00,  2.99s/it]


 ### Output Path

In [59]:
# File stem shared by every saved artifact: the selected attribute tags,
# joined with '_' and lower-cased (e.g. 'anat_dev').
output_name = '_'.join(output_names).lower()

# Output directory for this attribute selection (e.g. 'Output/Bgee-Anat-Dev').
path = 'Output/Bgee-' + '-'.join(output_names)
# exist_ok=True replaces the separate existence check: it avoids the
# check-then-create race and keeps the cell safe to re-run.
os.makedirs(path, exist_ok=True)

In [60]:
%%appyter hide_code
# Declares the 'data' form section that groups the attribute selector and
# the expression file upload. The subtitle names the Bgee data set, which is
# the source this notebook processes.
{% do SectionField(
    name='data',
    title='Load Data',
    subtitle='Upload Files from the Bgee Data Set',
) %}

 # Load Data

In [61]:
%%appyter code_exec

# Read the tab-separated Bgee expression file chosen in the form (must match
# the .tsv constraint; defaults to the rat development file), loading only
# the gene name, the expression call, and the selected attribute columns.
df = pd.read_csv({{FileField(
    constraint='.*\.tsv$',
    name='expression', 
    label='Bgee Expression Data', 
    default='Input/Bgee/Rattus_norvegicus_expr_simple_development.tsv',
    section='data')
}}, sep='\t', usecols=['Gene name', 'Expression'] + headers)

```python

df = pd.read_csv('Input/Bgee/Rattus_norvegicus_expr_simple_development.tsv', sep='\t', usecols=['Gene name', 'Expression'] + headers)
```

In [62]:
# Preview the first rows of the table as loaded.
df.head()

Unnamed: 0,Gene name,Anatomical entity name,Developmental stage name,Expression
0,AABR07013255.1,adult mammalian kidney,post-juvenile adult stage,present
1,AABR07013255.1,testis,life cycle,absent
2,AABR07013255.1,testis,post-juvenile adult stage,absent
3,AABR07013255.1,stomach,post-juvenile adult stage,present
4,AABR07013255.1,heart,post-juvenile adult stage,present


In [63]:
# (rows, columns) of the table as loaded, before any filtering.
df.shape

(758213, 4)

 # Pre-process Data

## Only Present Expression

In [64]:
# Keep only the rows whose expression call is 'present', discard the
# now-constant 'Expression' column, and index the result by gene name.
df = (
    df.loc[df['Expression'] == 'present']
      .drop(columns='Expression')
      .set_index('Gene name')
)
df.head()

## Concatenate Attributes

In [69]:
# When two attribute columns were selected, fuse them into one 'Attributes'
# column of the form '<first>, <second>' and remove the originals. With a
# single selected attribute the frame is left unchanged.
if len(headers) > 1:
    df = (
        df.assign(Attributes=df[headers[0]] + ', ' + df[headers[1]])
          .drop(columns=headers)
    )
df.head()

Unnamed: 0_level_0,Attributes
Gene name,Unnamed: 1_level_1
Gad1,"testis, life cycle"
Gad1,"testis, post-juvenile adult stage"
Gad1,"stomach, post-juvenile adult stage"
Gad1,"heart, post-juvenile adult stage"
Gad1,"brain, life cycle"


In [70]:
# (rows, columns) of the table after the pre-processing cells above.
df.shape

(446088, 1)

 # Filter Data

 ## Map Gene Symbols to Up-to-date Approved Gene Symbols

In [71]:
# Run the gene names through symbol_lookup. Per the section heading this maps
# them to up-to-date approved symbols; the exact rules live in
# uf.mapgenesymbols, which is not shown here.
df = uf.mapgenesymbols(df, symbol_lookup)
# Display the size afterwards so any change in row count is visible.
df.shape

100%|██████████| 446088/446088 [00:00<00:00, 825952.92it/s]


(446088, 1)

 # Analyze Data

 ## Create Binary Matrix

In [None]:
# Turn the gene/attribute table into a binary matrix.
# NOTE(review): presumably genes are rows and attributes are columns (a later
# cell transposes it to compare attributes) -- confirm in uf.createBinaryMatrix.
binary_matrix = uf.createBinaryMatrix(df)
binary_matrix.head()

In [None]:
# Dimensions of the binary matrix.
binary_matrix.shape

In [None]:
# Save the binary matrix into the output directory, requesting 'npz'
# compression and a boolean dtype.
# The builtin `bool` is used instead of `np.bool`: that alias was deprecated
# in NumPy 1.20 and removed in 1.24, where accessing it raises AttributeError.
uf.saveData(binary_matrix, path, output_name + '_binary_matrix',
            compression='npz', dtype=bool)

 ## Create Gene List

In [None]:
# Derive the gene table from the binary matrix using geneid_lookup.
# NOTE(review): presumably attaches a gene ID to each symbol -- confirm in
# uf.createGeneList.
gene_list = uf.createGeneList(binary_matrix, geneid_lookup)
gene_list.head()

In [None]:
# Dimensions of the gene table.
gene_list.shape

In [None]:
# Save the gene table into the output directory, requesting a 'tsv' extension,
# gzip compression, and no index column.
uf.saveData(gene_list, path, output_name + '_gene_list',
            ext='tsv', compression='gzip', index=False)

 ## Create Attribute List

In [None]:
# Derive the attribute table from the binary matrix (details are in
# uf.createAttributeList, which is not shown here).
attribute_list = uf.createAttributeList(binary_matrix)
attribute_list.head()

In [None]:
# Dimensions of the attribute table.
attribute_list.shape

In [None]:
# Save the attribute table into the output directory, requesting a 'tsv'
# extension and gzip compression (the index is kept, unlike the gene list).
uf.saveData(attribute_list, path, output_name + '_attribute_list',
            ext='tsv', compression='gzip')

 ## Create Gene and Attribute Set Libraries

In [None]:
# Build the gene set library from the binary matrix and write it into the
# output directory under the '<output_name>_gene_up_set' stem.
# NOTE(review): file format is decided inside uf.createUpGeneSetLib -- confirm.
uf.createUpGeneSetLib(binary_matrix, path, output_name + '_gene_up_set')

In [None]:
# Build the attribute set library from the binary matrix and write it into
# the output directory under the '<output_name>_attribute_up_set' stem.
# NOTE(review): file format is decided inside uf.createUpAttributeSetLib -- confirm.
uf.createUpAttributeSetLib(binary_matrix, path, 
                           output_name + '_attribute_up_set')

 ## Create Attribute Similarity Matrix

In [None]:
# Compute pairwise Jaccard similarity on the transposed binary matrix, with
# the helper's sparse mode enabled.
# NOTE(review): the transpose presumably makes attributes the compared rows
# -- confirm the orientation expected by uf.createSimilarityMatrix.
attribute_similarity_matrix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

In [None]:
# Save the attribute similarity matrix into the output directory, requesting
# 'npz' compression and float32 values, and flagging it as symmetric.
# NOTE(review): how symmetric=True affects storage is decided in uf.saveData.
uf.saveData(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

In [None]:
# net.load_df(attribute_similarity_matrix.iloc[:,:].copy())
# net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()

 ## Create Gene Similarity Matrix

In [None]:
# Compute pairwise Jaccard similarity on the binary matrix as-is (no
# transpose, in contrast to the attribute similarity cell), with the helper's
# sparse mode enabled.
gene_similarity_matrix = uf.createSimilarityMatrix(binary_matrix, 'jaccard', sparse=True)
gene_similarity_matrix.head()

In [None]:
# Save the gene similarity matrix into the output directory, requesting 'npz'
# compression and float32 values, and flagging it as symmetric.
# NOTE(review): how symmetric=True affects storage is decided in uf.saveData.
uf.saveData(gene_similarity_matrix, path, 
            output_name + '_gene_similarity_matrix',
            compression='npz', symmetric=True, dtype=np.float32)

 ## Create Gene-Attribute Edge List

In [None]:
# Build the gene-attribute edge list from the binary matrix together with the
# attribute and gene tables, and write it into the output directory under the
# '<output_name>_gene_attribute_edge_list' stem.
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, 
                               path, output_name + '_gene_attribute_edge_list')

 # Create Downloadable Save File

In [None]:
# Package the output directory for download.
# NOTE(review): presumably this produces the output_archive.zip linked in the
# final markdown cell -- confirm the archive name and location in uf.createArchive.
uf.createArchive(path)

 ### Link to download output files: [click here](./output_archive.zip)