 # Allen Brain Atlas Developing Human Brain

 Author: Moshe Silverstein <br/>
 Date: 3-18 <br/>
 Data Source: http://www.brainspan.org/static/download.html

 Reviewer: Charles Dai <br>
 Updated: 6-20

In [1]:
# appyter init
from appyter import magic
# Hand appyter a callable that returns this notebook's globals() when invoked:
# the default argument binds the built-in `globals`, and the body calls it.
magic.init(lambda _=globals: _())

In [2]:
import sys
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import harmonizome-helper.utility_functions as uf
import harmonizome-helper.lookup as lookup

In [3]:
# from clustergrammer_widget import *
# net = Network(clustergrammer_widget)

In [4]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules (e.g. the helper package) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

 ### Python Version

In [5]:
# Record the interpreter version this notebook was executed with.
sys.version

'3.8.0 (default, Oct 28 2019, 16:14:01) \n[GCC 8.3.0]'

 # Initialization

### Set Data Grouping Options

In [25]:
%%appyter code_eval
{# Dataset flavour selector: each form label maps to the short code ('RS' or 'MA') that is substituted wherever data_type is rendered in later cells. #}
{% set data_type = ChoiceField(
    name='data_type',
    label='Data Type',
    choices={
        'RNA-Seq': 'RS', 
        'Microarray': 'MA',
    },
    default='RNA-Seq',
    section='data'
) %}

{# Column-grouping selector: the values carry their own quotes so they render as Python string literals, which later cells use as keys into the options dict. #}
{% set data_grouping = ChoiceField(
    name='data_grouping',
    label='Data Grouped by',
    choices={
        'Age': "'age'", 
        'Sample': "'sample'", 
        'Tissue': "'tissue'"
    },
    default='Age',
    section='data'
) %}

In [7]:
%%appyter code_exec

options = {
    'age': {
        'output_name': 'aba_dhb_{{data_type}}_age',
        'path': 'ABA-DHB-{{data_type}}-A',
        'grouping': lambda mat: mat['age']
    },
    'sample': {
        'output_name': 'aba_dhb_{{data_type}}_samp',
        'path': 'ABA-DHB-{{data_type}}-S',
        'grouping': 
            lambda mat: mat.apply(
                lambda row: '_'.join([
                    row['structure_name'], 
                    str(row['age']), 
                    row['gender'], 
                    str(row['donor_id'])
                ]), axis=1)
    },
    'tissue': {
        'output_name': 'aba_dhb_{{data_type}}_tissue',
        'path': 'ABA-DHB-{{data_type}}-T',
        'grouping': lambda mat: mat['tissue']

    }
}

```python
options = {
    'age': {
        'output_name': 'aba_dhb_RS_age',
        'path': 'ABA-DHB-RS-A',
        'grouping': lambda mat: mat['age']
    },
    'sample': {
        'output_name': 'aba_dhb_RS_samp',
        'path': 'ABA-DHB-RS-S',
        'grouping':
            lambda mat: mat.apply(
                lambda row: '_'.join([
                    row['structure_name'],
                    str(row['age']),
                    row['gender'],
                    str(row['donor_id'])
                ]), axis=1)
    },
    'tissue': {
        'output_name': 'aba_dhb_RS_tissue',
        'path': 'ABA-DHB-RS-T',
        'grouping': lambda mat: mat['tissue']
    }
}
```

 ### Load Mapping Dictionaries

In [None]:
# Build the two lookup tables used further down: symbol_lookup is passed to
# uf.mapgenesymbols and geneid_lookup to uf.createGeneList.
symbol_lookup, geneid_lookup = lookup.get_lookups()

 ### Output Path

In [36]:
%%appyter code_exec

output_name = options[{{data_grouping}}]['output_name']

path = options[{{data_grouping}}]['path']
if not os.path.exists(path):
    os.makedirs(path)

```python
output_name = options['age']['output_name']
path = options['age']['path']
if not os.path.exists(path):
    os.makedirs(path)
```

In [None]:
%%appyter hide_code
{# Declares the form section named 'data'; the fields in this notebook attach to it through section='data'. #}
{% do SectionField(
    name='data',
    title='Load Data',
    subtitle='Upload Files from Developing Human Brain Data Sets',
) %}

 # Load Data

In [24]:
%%appyter code_exec

matrix = pd.read_csv({{FileField(
    constraint='.*\.csv$',
    name='expression_matrix', 
    label='Expression Matrix', 
    default='Input/ABA-DHB-MA/gene_array_matrix_csv/expression_matrix.csv',
    section='data')
}}, header=None, index_col=0)

```python

matrix = pd.read_csv('Input/ABA-DHB-MA/gene_array_matrix_csv/expression_matrix.csv', header=None, index_col=0)
```

In [38]:
# Preview the raw matrix; rows and columns are still positional numbers here.
matrix.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,483,484,485,486,487,488,489,490,491,492
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,9.16596,9.35303,9.12842,9.28837,9.1717,9.52198,9.2666,9.31059,9.16732,9.125,...,9.07099,8.7787,8.85986,8.34032,9.35445,8.7037,9.64959,8.52157,9.22952,9.64878
2,8.40401,8.43084,8.87691,8.81414,8.8316,8.56038,8.42721,8.85843,8.54096,8.34537,...,8.3476,8.45158,8.50765,8.19684,8.14229,8.54619,8.15847,8.77172,8.64942,8.18061
3,9.35942,9.58458,9.76375,9.53778,9.61363,9.37657,9.55275,9.83603,9.5596,9.36183,...,8.43173,8.54623,9.02475,8.21754,8.91127,8.77168,9.11248,8.40019,9.05908,8.82711
4,6.30357,6.73438,6.50582,7.24431,6.62531,6.96386,6.74376,6.67833,7.06941,5.86169,...,6.82257,7.54365,7.09902,6.71188,7.16314,7.18352,7.36694,6.70071,7.54916,6.78094
5,4.57404,5.22911,4.66054,5.03038,5.02629,4.77163,4.89984,5.13641,5.27419,4.73643,...,4.53204,4.72392,4.04599,5.12808,4.59776,4.47878,4.37817,4.59131,4.96854,4.35579


In [39]:
# (rows, columns) of the raw matrix, to compare with the metadata tables below.
matrix.shape

(17604, 492)

 ## Load Sample Metadata

In [40]:
%%appyter code_exec

sample_meta = pd.read_csv({{FileField(
    constraint='.*\.csv$',
    name='columns_metadata', 
    label='Sample Metadata', 
    default='Input/ABA-DHB-{{data_type}}/gene_array_matrix_csv/columns_metadata.csv',
    section='data')
}}, index_col=0)

```python

sample_meta = pd.read_csv('Input/ABA-DHB-MA-S/gene_array_matrix_csv/columns_metadata.csv', index_col=0)
```

In [41]:
# Preview the sample annotations (donor, age, gender, brain structure).
sample_meta.head()

Unnamed: 0_level_0,donor_id,donor_name,age,gender,structure_id,structure_acronym,structure_name
column_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,13058,H376.IIA.51,8 pcw,M,10361,AMY,amygdaloid complex
2,13058,H376.IIA.51,8 pcw,M,10552,CGE,caudal ganglionic eminence
3,13058,H376.IIA.51,8 pcw,M,10173,DFC,dorsolateral prefrontal cortex
4,13058,H376.IIA.51,8 pcw,M,10391,DTH,dorsal thalamus
5,13058,H376.IIA.51,8 pcw,M,10294,HIP,hippocampus (hippocampal formation)


In [42]:
# Row count here should equal the number of matrix columns (one row per sample).
sample_meta.shape

(492, 7)

 ## Load Gene Metadata

In [43]:
%%appyter code_exec

gene_meta = pd.read_csv({{FileField(
    constraint='.*\.csv$',
    name='rows_metadata', 
    label='Gene Metadata', 
    default='Input/ABA-DHB-{{data_type}}/gene_array_matrix_csv/rows_metadata.csv',
    section='data')
}}, index_col=0)

```python

gene_meta = pd.read_csv('Input/ABA-DHB-MA-S/gene_array_matrix_csv/rows_metadata.csv', index_col=0)
```

In [44]:
# Preview the gene annotations (ids, Ensembl id, symbol, Entrez id).
gene_meta.head()

Unnamed: 0_level_0,gene_id,ensembl_gene_id,gene_symbol,entrez_id
row_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,9633,ENSG00000101337,TM9SF4,9777
2,22692,ENSG00000114650,SCAP,22937
3,22952,ENSG00000113194,FAF2,23197
4,34497,ENSG00000150401,DCUN1D2,55208
5,35224,ENSG00000204444,APOM,55937


In [45]:
# Row count here should equal the number of matrix rows (one row per gene).
gene_meta.shape

(17604, 4)

 # Pre-process Data

 ## Map Sample Metadata to Sample ID

In [46]:
%%appyter code_exec

# Fetch the label function for the chosen grouping and relabel every matrix
# column with the label derived from the sample metadata.
# NOTE(review): the assignment is positional, so it relies on sample_meta rows
# being in the same order as the matrix columns - confirm.
map_func = options[{{data_grouping}}]['grouping']
matrix.columns = map_func(sample_meta)
matrix.head()

```python
map_func = options['age']['grouping']
matrix.columns = map_func(sample_meta)
matrix.head()
```

 ## Map Gene to Row

In [47]:
# Label rows with gene symbols, then drop the index name inherited from the
# 'gene_symbol' column.
# NOTE(review): the assignment is positional, so it relies on gene_meta rows
# being in the same order as the matrix rows - confirm.
matrix.index = gene_meta['gene_symbol']
matrix.index.name = None
matrix.head()

age,8 pcw,8 pcw.1,8 pcw.2,8 pcw.3,8 pcw.4,8 pcw.5,8 pcw.6,8 pcw.7,8 pcw.8,8 pcw.9,...,40 yrs,40 yrs.1,40 yrs.2,40 yrs.3,40 yrs.4,40 yrs.5,40 yrs.6,40 yrs.7,40 yrs.8,40 yrs.9
TM9SF4,9.16596,9.35303,9.12842,9.28837,9.1717,9.52198,9.2666,9.31059,9.16732,9.125,...,9.07099,8.7787,8.85986,8.34032,9.35445,8.7037,9.64959,8.52157,9.22952,9.64878
SCAP,8.40401,8.43084,8.87691,8.81414,8.8316,8.56038,8.42721,8.85843,8.54096,8.34537,...,8.3476,8.45158,8.50765,8.19684,8.14229,8.54619,8.15847,8.77172,8.64942,8.18061
FAF2,9.35942,9.58458,9.76375,9.53778,9.61363,9.37657,9.55275,9.83603,9.5596,9.36183,...,8.43173,8.54623,9.02475,8.21754,8.91127,8.77168,9.11248,8.40019,9.05908,8.82711
DCUN1D2,6.30357,6.73438,6.50582,7.24431,6.62531,6.96386,6.74376,6.67833,7.06941,5.86169,...,6.82257,7.54365,7.09902,6.71188,7.16314,7.18352,7.36694,6.70071,7.54916,6.78094
APOM,4.57404,5.22911,4.66054,5.03038,5.02629,4.77163,4.89984,5.13641,5.27419,4.73643,...,4.53204,4.72392,4.04599,5.12808,4.59776,4.47878,4.37817,4.59131,4.96854,4.35579


In [48]:
# Relabelling must not change the dimensions.
matrix.shape

(17604, 492)

 ## Save Unfiltered Matrix to file

In [None]:
# Persist the relabelled, still-unfiltered matrix; gzip compression and a
# float32 dtype are requested from the helper.
uf.saveData(
    matrix,
    path,
    f'{output_name}_matrix_unfiltered',
    compression='gzip',
    dtype=np.float32,
)

 # Filter Data

 ## Remove Data that is More Than 95% Missing and Impute Missing Data

In [None]:
# Filtering and imputation are delegated to the helper. Per the section
# heading it drops data that is more than 95% missing and imputes the rest;
# the exact threshold and imputation method live in uf.removeAndImpute.
matrix = uf.removeAndImpute(matrix)
matrix.head()

In [None]:
# Dimensions after filtering, to compare with the unfiltered shape above.
matrix.shape

 ## Map Gene Symbols to Up-to-date Approved Gene Symbols

In [None]:
# Map row labels through symbol_lookup (built by lookup.get_lookups()); per
# the section heading this yields up-to-date approved gene symbols.
# The shape is displayed to show the dimensions after mapping.
matrix = uf.mapgenesymbols(matrix, symbol_lookup)
matrix.shape

 ## Merge Duplicate Genes By Rows and Duplicate Columns

In [None]:
# Merge duplicate labels along rows first, then along columns, passing 'mean'
# as the combine method. Column labels repeat whenever several samples share
# a grouping label (e.g. many samples with the same age).
matrix = uf.merge(matrix, 'row', 'mean')
matrix = uf.merge(matrix, 'column', 'mean')
matrix.shape

 ## Log2 Transform

In [None]:
# Log2 transform via the helper (see uf.log2 for the exact formula).
# NOTE(review): the microarray values previewed above (roughly 4-10) look
# already log-scaled - confirm this step is wanted for that data type.
matrix = uf.log2(matrix)
matrix.head()

 ## Normalize Matrix (Quantile Normalize the Matrix by Column)

In [None]:
# Quantile normalization via the helper; per the section heading it is
# applied column-wise.
matrix = uf.quantileNormalize(matrix)
matrix.head()

 ## Normalize Matrix (Z-Score the Rows)

In [None]:
# Z-score via the helper with the 'row' option, i.e. per gene according to
# the section heading.
matrix = uf.zscore(matrix, 'row')
matrix.head()

 ## Histogram of First Sample

In [None]:
# Distribution of the first column's values across all rows, in 100 bins.
matrix.iloc[:, 0].hist(bins=100)

 ## Histogram of First Gene

In [None]:
# Distribution of the first row's values across all columns, in 100 bins.
matrix.iloc[0, :].hist(bins=100)

 ## Save Filtered Matrix

In [None]:
# Persist the filtered, normalized matrix; a gzip-compressed TSV is requested.
uf.saveData(
    matrix,
    path,
    f'{output_name}_matrix_filtered',
    ext='tsv',
    compression='gzip',
)

 # Analyze Data

 ## Create Gene List

In [None]:
# Build the gene table from the matrix and geneid_lookup (built by
# lookup.get_lookups()); the table layout is defined by the helper.
gene_list = uf.createGeneList(matrix, geneid_lookup)
gene_list.head()

In [None]:
# Dimensions of the gene table.
gene_list.shape

In [None]:
# Persist the gene table; a gzip-compressed TSV without the index is requested.
uf.saveData(
    gene_list,
    path,
    f'{output_name}_gene_list',
    ext='tsv',
    compression='gzip',
    index=False,
)

 ## Create Attribute List

In [None]:
# Build the attribute table from the matrix; the table layout is defined by
# the helper.
attribute_list = uf.createAttributeList(matrix)
attribute_list.head()

In [None]:
# Dimensions of the attribute table.
attribute_list.shape

In [None]:
# Persist the attribute table; a gzip-compressed TSV is requested.
uf.saveData(
    attribute_list,
    path,
    f'{output_name}_attribute_list',
    ext='tsv',
    compression='gzip',
)

 ## Create Matrix of Standardized Values (Values Between -1 and 1)

In [None]:
# Derive the standardized matrix from the normalized one; per the section
# heading its values lie between -1 and 1.
standard_matrix = uf.createStandardizedMatrix(matrix)
standard_matrix.head()

In [None]:
# Persist the standardized matrix; a gzip-compressed TSV is requested.
uf.saveData(
    standard_matrix,
    path,
    f'{output_name}_standard_matrix',
    ext='tsv',
    compression='gzip',
)

 ## Plot of a Single Attribute, Normalized Value vs. Standardized Value

In [None]:
# Scatter the first column of the normalized matrix against the first column
# of the standardized matrix ('bo' draws blue circle markers); the plot is
# titled with that column's label.
plt.plot(matrix[matrix.columns[0]],
         standard_matrix[standard_matrix.columns[0]], 'bo')
plt.xlabel('Normalized Values')
plt.ylabel('Standardized Values')
plt.title(standard_matrix.columns[0])
plt.grid(True)

 ## Create Ternary Matrix

In [None]:
# Derive the ternary matrix from the standardized matrix; the thresholding
# rule is defined by the helper.
ternary_matrix = uf.createTernaryMatrix(standard_matrix)
ternary_matrix.head()

In [None]:
# Persist the ternary matrix; a gzip-compressed TSV is requested.
uf.saveData(
    ternary_matrix,
    path,
    f'{output_name}_ternary_matrix',
    ext='tsv',
    compression='gzip',
)

 ## Create Gene and Attribute Set Libraries

In [None]:
# Write the "up" gene-set library derived from the ternary matrix.
uf.createUpGeneSetLib(
    ternary_matrix,
    path,
    f'{output_name}_gene_up_set',
)

In [None]:
# Write the "down" gene-set library derived from the ternary matrix.
uf.createDownGeneSetLib(
    ternary_matrix,
    path,
    f'{output_name}_gene_down_set',
)

In [None]:
# Write the "up" attribute-set library derived from the ternary matrix.
uf.createUpAttributeSetLib(
    ternary_matrix,
    path,
    f'{output_name}_attribute_up_set',
)

In [None]:
# Write the "down" attribute-set library derived from the ternary matrix.
uf.createDownAttributeSetLib(
    ternary_matrix,
    path,
    f'{output_name}_attribute_down_set',
)

 ## Create Attribute Similarity Matrix

In [None]:
# The matrix is transposed so that attributes are the rows handed to the
# helper, which is asked for the 'cosine' metric.
# NOTE(review): presumably the helper compares rows pairwise - confirm.
attribute_similarity_matrix = uf.createSimilarityMatrix(matrix.T, 'cosine')
attribute_similarity_matrix.head()

In [None]:
# Persist the attribute similarity matrix; npz compression, float32 and the
# helper's symmetric option are requested.
uf.saveData(
    attribute_similarity_matrix,
    path,
    f'{output_name}_attribute_similarity_matrix',
    compression='npz',
    symmetric=True,
    dtype=np.float32,
)

In [None]:
# net.load_df(attribute_similarity_matrix.iloc[:,:].copy())
# net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()

 ## Create Gene Similarity Matrix

In [None]:
# Genes are already the rows, so the matrix is passed as-is; the helper is
# asked for the 'cosine' metric.
# NOTE(review): presumably the helper compares rows pairwise - confirm.
gene_similarity_matrix = uf.createSimilarityMatrix(matrix, 'cosine')
gene_similarity_matrix.head()

In [None]:
# Persist the gene similarity matrix; npz compression, float32 and the
# helper's symmetric option are requested.
uf.saveData(
    gene_similarity_matrix,
    path,
    f'{output_name}_gene_similarity_matrix',
    compression='npz',
    symmetric=True,
    dtype=np.float32,
)

 ## Create Gene-Attribute Edge List

In [None]:
# Write the gene-attribute edge list built from the standardized matrix and
# the attribute and gene tables.
uf.createGeneAttributeEdgeList(
    standard_matrix,
    attribute_list,
    gene_list,
    path,
    f'{output_name}_gene_attribute_edge_list',
)

 # Create Downloadable Save File

In [None]:
# Bundle the output directory via the helper.
# NOTE(review): the download link in this notebook points at
# ./output_archive.zip - confirm that is the file createArchive produces.
uf.createArchive(path)

 ### Link to download output files: [click here](./output_archive.zip)