# Harmonizome ETL: BrainSpan - Atlas of the Developing Human Brain

Created by: Charles Dai <br>
Credit to: Moshe Silverstein

Data Source: http://www.brainspan.org/static/download.html

In [None]:
# appyter init
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
import sys
import os
import zipfile
from datetime import date

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [None]:
%load_ext autoreload
%autoreload 2

### Notebook Information

In [None]:
print('This notebook was run on:', date.today(), '\nPython version:', sys.version)

# Initialization

### Set Data Grouping Options

In [None]:
%%appyter hide_code

{% do SectionField(
    name='data',
    title='Upload Data',
    img='load_icon.png'
) %}

{% do SectionField(
    name='settings',
    title='Settings',
    img='setting_icon.png'
) %}

In [None]:
%%appyter hide_code

{% do DescriptionField(
    name='description',
    text='The following examples were sourced from <a href="http://www.brainspan.org/static/download.html" target="_blank">www.brainspan.org</a>. The first file field has examples for developmental brain RNA-Seq data, developmental brain microarray data, and prenatal brain microarray data, respectively. The other fields only have examples for prenatal brain microarray data. If clicking on the examples does not work, they should be downloaded directly from the source website. At least one file must be provided.',
    section='data'
)%}

In [None]:
%%appyter code_eval

{% set file_1 = FileField(
    constraint='(.*\.zip$)?', name='dataset1', label='File 1', 
    default='Input/BrainSpan/genes_matrix_csv.zip',
    section='data',
    examples={
        'genes_matrix_csv.zip (Developmental Brain RNA-Seq)': 'http://www.brainspan.org/api/v2/well_known_file_download/267666525',
        'genes_array_matrix_csv.zip (Developmental Brain Microarray)': 'http://www.brainspan.org/api/v2/well_known_file_download/267666527',
        'lmd_matrix_12840.zip (Prenatal Brain Microarray)': 'http://www.brainspan.org/api/v2/well_known_file_download/278442900'
    }) 
%}

{% set file_2 = FileField(
    constraint='(.*\.zip$)?', name='dataset2', label='File 2', 
    default='',
    section='data',
    examples={
        'lmd_matrix_14751.zip (Prenatal Brain Microarray)': 'http://www.brainspan.org/api/v2/well_known_file_download/278444085'
    })
%}

{% set file_3 = FileField(
    constraint='(.*\.zip$)?', name='dataset3', label='File 3', 
    default='',
    section='data',
    examples={
        'lmd_matrix_12566.zip (Prenatal Brain Microarray)': 'http://www.brainspan.org/api/v2/well_known_file_download/278444090'
    })
%}

{% set file_4 = FileField(constraint='(.*\.zip$)?', name='dataset4', label='File 4', 
    default='',
    section='data',
    examples={
        'lmd_matrix_12690.zip (Prenatal Brain Microarray)': 'http://www.brainspan.org/api/v2/well_known_file_download/278444094'
    })
%}

In [None]:
%%appyter code_eval

{% set dataset = ChoiceField(
    name='dataset',
    label='Dataset',
    choices={
        'Developmental Brain (Microarray)': 'DMA',
        'Developmental Brain (RNA-Seq)': 'DRS', 
        'Prenatal Brain (Microarray)': 'PMA'
    },
    default='Developmental Brain (Microarray)',
    section='settings'
) %}

{% set attribute = ChoiceField(
    name='attribute',
    label='Attribute',
    description='The prenatal microarray dataset cannot take the age attribute. All other combinations of dataset and attribute work.',
    choices=['Age', 'Sample', 'Tissue'],
    default='Sample',
    section='settings'
) %}

### Load Mapping Dictionaries

In [None]:
symbol_lookup, geneid_lookup = lookup.get_lookups()

### Output Path

In [None]:
%%appyter code_exec

# Base name shared by every file this notebook writes.
output_name = 'brainspan_{{dataset}}'.lower()

# Output directory for the selected dataset. The resource is called
# "BrainSpan"; the previous spelling "BrainSpain" was a typo.
path = 'Output/BrainSpan-{{dataset}}'
# exist_ok avoids the separate existence check (and its race) on re-runs.
os.makedirs(path, exist_ok=True)

# Load Data

In [None]:
%%appyter code_exec

file_list = [
    {{file_1}}, 
    {{file_2}}, 
    {{file_3}}, 
    {{file_4}}
]

In [None]:
# One (matrix, sample_meta, gene_meta) tuple is collected per uploaded archive.
data = []

for archive_path in file_list:
    # Empty strings are file fields that were left blank; nothing to load.
    if archive_path == '':
        continue
    print('Loading file:', archive_path)
    with zipfile.ZipFile(archive_path) as archive:
        # The expression matrix is read without a header row; its first
        # column becomes the index.
        with archive.open('expression_matrix.csv') as handle:
            matrix = pd.read_csv(handle, header=None, index_col=0)
        with archive.open('columns_metadata.csv') as handle:
            sample_meta = pd.read_csv(handle, index_col=0)
        with archive.open('rows_metadata.csv') as handle:
            gene_meta = pd.read_csv(handle, index_col=0)
    data.append((matrix, sample_meta, gene_meta))

# Pre-process Data

## Map Genes and Attribute to Matrix

In [None]:
%%appyter code_exec

# Label each expression matrix with gene symbols (rows) and the selected
# attribute (columns), then join the per-file matrices side by side.
matrices = []
# Re-indexed sample metadata, one entry per file (Sample attribute only).
sample_metas = []

for matrix, sample_meta, gene_meta in data:
    matrix.index = gene_meta['gene_symbol']
    if '{{attribute}}' == 'Age':
        matrix.columns = sample_meta['age']
    elif '{{attribute}}' == 'Tissue':
        matrix.columns = sample_meta['structure_name']
    elif '{{attribute}}' == 'Sample':
        # A sample is identified by the (donor, structure) pair.
        sample_meta = sample_meta.set_index(['donor_id', 'structure_id'])
        matrix.columns = sample_meta.index
        sample_metas.append(sample_meta)
    matrices.append(matrix)
matrix = pd.concat(matrices, axis=1)

# `sample_meta` is read again when the attribute list is built. Keeping only
# the loop's last value would drop the metadata of every file but the last
# when several archives are loaded, so expose the metadata of all files.
if sample_metas:
    sample_meta = pd.concat(sample_metas)

In [None]:
%%appyter code_exec

matrix.index.name = 'Gene Symbol'
matrix.columns.name = '{{attribute}}'

In [None]:
matrix.head()

In [None]:
matrix.shape

## Save Unfiltered Matrix to File

In [None]:
uf.save_data(matrix, path, output_name + '_matrix_unfiltered',
            compression='gzip', dtype=np.float32)

# Filter Data

## Map Gene Symbols to Up-to-date Approved Gene Symbols

In [None]:
matrix = uf.map_symbols(matrix, symbol_lookup)
matrix.shape

## Merge Duplicate Genes By Rows and Duplicate Columns

In [None]:
matrix = uf.merge(matrix, 'row')
matrix = uf.merge(matrix, 'column')
matrix.shape

## Remove Data that is More Than 95% Missing and Impute Missing Data

In [None]:
matrix = uf.remove_impute(matrix)
matrix.head()

In [None]:
matrix.shape

## Log2 Transform

In [None]:
matrix = uf.log2(matrix)
matrix.head()

## Normalize Matrix (Quantile Normalize the Matrix by Column)

In [None]:
matrix = uf.quantile_normalize(matrix)
matrix.head()

## Normalize Matrix (Z-Score the Rows)

In [None]:
matrix = uf.zscore(matrix)
matrix.head()

## Histogram of First Sample

In [None]:
matrix.iloc[:, 0].hist(bins=100)

## Histogram of First Gene

In [None]:
matrix.iloc[0, :].hist(bins=100)

## Save Filtered Matrix

In [None]:
uf.save_data(matrix, path, output_name + '_matrix_filtered', 
            ext='tsv', compression='gzip')

# Analyze Data

## Create Gene List

In [None]:
gene_list = uf.gene_list(matrix, geneid_lookup)
gene_list.head()

In [None]:
gene_list.shape

In [None]:
uf.save_data(gene_list, path, output_name + '_gene_list',
            ext='tsv', compression='gzip', index=False)

## Create Attribute List

In [None]:
%%appyter code_exec

# Sample metadata is passed along only when the columns are keyed by sample;
# for the other attributes the list is built from the matrix alone.
use_sample_meta = '{{attribute}}' == 'Sample'
attribute_list = (
    uf.attribute_list(matrix, sample_meta)
    if use_sample_meta
    else uf.attribute_list(matrix)
)

In [None]:
attribute_list.head()

In [None]:
attribute_list.shape

In [None]:
uf.save_data(attribute_list, path, output_name + '_attribute_list',
            ext='tsv', compression='gzip')

## Create Matrix of Standardized Values (values between -1 and 1)

In [None]:
standard_matrix = uf.standardized_matrix(matrix)
standard_matrix.head()

In [None]:
uf.save_data(standard_matrix, path, output_name + '_standard_matrix',
            ext='tsv', compression='gzip')

## Plot of a Single Attribute, Normalized Value vs. Standardized Value

In [None]:
# Compare the first column of the normalized matrix against the first column
# of the standardized matrix, one blue dot per gene.
normalized_label = matrix.columns[0]
standardized_label = standard_matrix.columns[0]

plt.plot(matrix[normalized_label], standard_matrix[standardized_label], 'bo')
plt.xlabel('Normalized Values')
plt.ylabel('Standardized Values')
plt.title(standardized_label)
plt.grid(True)

## Create Ternary Matrix

In [None]:
ternary_matrix = uf.ternary_matrix(standard_matrix)
ternary_matrix.head()

In [None]:
uf.save_data(ternary_matrix, path, output_name + '_ternary_matrix',
            ext='tsv', compression='gzip')

## Create Gene and Attribute Set Libraries

In [None]:
uf.save_setlib(ternary_matrix, 'gene', 'up', path, output_name + '_gene_up_set')

In [None]:
uf.save_setlib(ternary_matrix, 'gene', 'down', path, output_name + '_gene_down_set')

In [None]:
uf.save_setlib(ternary_matrix, 'attribute', 'up', path, 
                           output_name + '_attribute_up_set')

In [None]:
uf.save_setlib(ternary_matrix, 'attribute', 'down', path, 
                             output_name + '_attribute_down_set')

## Create Attribute Similarity Matrix

In [None]:
attribute_similarity_matrix = uf.similarity_matrix(standard_matrix.T, 'cosine')
attribute_similarity_matrix.head()

In [None]:
uf.save_data(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

## Create Gene Similarity Matrix

In [None]:
gene_similarity_matrix = uf.similarity_matrix(standard_matrix, 'cosine')
gene_similarity_matrix.head()

In [None]:
uf.save_data(gene_similarity_matrix, path, 
            output_name + '_gene_similarity_matrix',
            compression='npz', symmetric=True, dtype=np.float32)

## Create Gene-Attribute Edge List

In [None]:
edge_list = uf.edge_list(standard_matrix)
uf.save_data(edge_list, path, output_name + '_edge_list', 
        ext='tsv', compression='gzip')

# Create Downloadable Save File

In [None]:
uf.archive(path)

### Link to download output files: [click here](./output_archive.zip)