# Harmonizome ETL: Allen Brain Atlas (ABA) - Adult Human Brain

Created by: Charles Dai <br>
Credit to: Moshe Silverstein

Data Source: http://human.brain-map.org/static/download

In [None]:
# appyter init
# Presumably this is what makes the %%appyter cell magics used below available.
# The lambda binds the builtin `globals` as a default argument and calls it on
# demand, so appyter receives a callable that returns this notebook's namespace.
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
import sys
import os
from datetime import date
import zipfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [None]:
# Reload imported modules before each cell runs (mode 2 = all modules), so
# edits to the local harmonizome helpers are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

### Notebook Information

In [None]:
# Record the execution date and interpreter version for provenance.
print(
    'This notebook was run on:', date.today(),
    '\nPython version:', sys.version,
)

# Initialization

In [None]:
%%appyter hide_code

# Declares the two form sections that the fields in the following cells attach
# to through their `section=` argument: 'data' (file uploads) and 'settings'.

{% do SectionField(
    name='data',
    title='Upload Data',
    img='load_icon.png'
) %}

{% do SectionField(
    name='settings',
    title='Settings',
    img='setting_icon.png'
) %}

In [None]:
%%appyter code_eval

# Form inputs for the 'data' section: one description plus six donor uploads
# (file_1 .. file_6). Every FileField shares the constraint '(.*\.zip$)?',
# whose optional group accepts either a path ending in .zip or an empty value.
# Donors 1 and 2 default to local microarray archives and offer both RNA-Seq
# and microarray examples; donors 3-6 default to empty and offer microarray
# examples only.

{% do DescriptionField(
    name='description',
    text='The examples below were sourced from <a href="http://human.brain-map.org/static/download" target="_blank">human.brain-map.org</a>. There are only two donors for RNA-Seq data, and six donors for microarray data. If clicking on the examples does not work, it should be downloaded directly from the source website. At least one file must be provided.',
    section='data'
) %}

{% set file_1 = FileField(
    constraint='(.*\.zip$)?', name='donor1', label='Donor 1', 
    default='Input/ABA-AHB-MA/normalized_microarray_donor9861.zip',
    section='data',
    examples={
        'rnaseq_donor9861.zip (RNA-Seq)': 'http://human.brain-map.org/api/v2/well_known_file_download/278447594',
        'normalized_microarray_donor9861.zip (Microarray)': 'http://human.brain-map.org/api/v2/well_known_file_download/178238387'
    }) 
%}

{% set file_2 = FileField(
    constraint='(.*\.zip$)?', name='donor2', label='Donor 2', 
    default='Input/ABA-AHB-MA/normalized_microarray_donor10021.zip',
    section='data',
    examples={
        'rnaseq_donor10021.zip (RNA-Seq)': 'http://human.brain-map.org/api/v2/well_known_file_download/278448166',
        'normalized_microarray_donor10021.zip (Microarray)': 'http://human.brain-map.org/api/v2/well_known_file_download/178238373'
    })
%}

{% set file_3 = FileField(
    constraint='(.*\.zip$)?', name='donor3', label='Donor 3', 
    default='',
    section='data',
    examples={
        'normalized_microarray_donor12876.zip (Microarray)': 'http://human.brain-map.org/api/v2/well_known_file_download/178238359'
    })
%}

{% set file_4 = FileField(constraint='(.*\.zip$)?', name='donor4', label='Donor 4', 
    default='',
    section='data',
    examples={
        'normalized_microarray_donor14380.zip (Microarray)': 'http://human.brain-map.org/api/v2/well_known_file_download/178238316'
    })
%}

{% set file_5 = FileField(constraint='(.*\.zip$)?', name='donor5', label='Donor 5', 
    default='',
    section='data',
    examples={
        'normalized_microarray_donor15496.zip (Microarray)': 'http://human.brain-map.org/api/v2/well_known_file_download/178238266'
    })
%}

{% set file_6 = FileField(constraint='(.*\.zip$)?', name='donor6', label='Donor 6', 
    default='',
    section='data',
    examples={
        'normalized_microarray_donor15697.zip (Microarray)': 'http://human.brain-map.org/api/v2/well_known_file_download/178236545'
    })
%}

In [None]:
%%appyter code_eval

# Platform selector for the 'settings' section. The choices map the displayed
# labels to the short codes 'MA' / 'RS'; later cells compare the rendered
# {{data_type}} against those codes.
# NOTE(review): assumes the field renders as the code, not the label — confirm.

{% set data_type = ChoiceField(
    name='data_type',
    label='Data Type',
    choices={ 
        'Microarray': 'MA',
        'RNA-Seq': 'RS',
    },
    default='Microarray',
    section='settings'
) %}

In [None]:
%%appyter code_exec

# Archive member names per platform, selected by the rendered data type.
# Order matters: the loader reads index 0 as the expression matrix, index 1 as
# the sample annotation and index 2 as the metadata table (structure ontology
# for RNA-Seq, probe table for microarray).
file_names = {
    'RS': ['RNAseqCounts.csv', 'SampleAnnot.csv', 'Ontology.csv'],
    'MA': ['MicroarrayExpression.csv', 'SampleAnnot.csv', 'Probes.csv']
}['{{data_type}}']

### Load Mapping Dictionaries

In [None]:
# Two lookup tables from the project helper: symbol_lookup is passed to
# uf.map_symbols and geneid_lookup to uf.gene_list later in the notebook.
# NOTE(review): their exact structure is defined in harmonizome.lookup — not visible here.
symbol_lookup, geneid_lookup = lookup.get_lookups()

### Output Path

In [None]:
%%appyter code_exec

# Base name prepended to every saved file, e.g. 'aba_ma' or 'aba_rs'.
output_name = 'aba_{{data_type}}'.lower()

# Directory that receives all outputs. This used to be a chained assignment
# (`path = output_name = ...`) that overwrote output_name with the directory
# path, so saved file names were prefixed with 'Output/ABA-...' instead.
path = 'Output/ABA-{{data_type}}'
# exist_ok avoids the separate existence check and its race window.
os.makedirs(path, exist_ok=True)

# Load Data

In [None]:
%%appyter code_exec

# Rendered values of the six donor upload fields, in donor order. The loading
# loop skips entries equal to '' (the default for donors 3-6).
file_list = [
    {{file_1}}, 
    {{file_2}}, 
    {{file_3}}, 
    {{file_4}}, 
    {{file_5}}, 
    {{file_6}}
]

In [None]:
# One (matrix, sample_meta, meta) tuple per donor archive that was provided.
data = []

for f in file_list:
    # Donor slots left blank in the form arrive as empty values; skip them.
    if not f:
        continue
    print('Loading file:', f)
    with zipfile.ZipFile(f) as zipf:
        # Expression values: read without a header row, first column as index.
        with zipf.open(file_names[0]) as matrix_file:
            matrix = pd.read_csv(matrix_file, header=None, index_col=0)
        # Per-sample annotation, used later to label the matrix columns.
        with zipf.open(file_names[1]) as sample_file:
            sample_meta = pd.read_csv(sample_file, index_col=0)
        # Ontology (RNA-Seq) or probe table (microarray), indexed by first column.
        with zipf.open(file_names[2]) as meta_file:
            meta = pd.read_csv(meta_file, index_col=0)
    data.append((matrix, sample_meta, meta))

# Fail early with a clear message; otherwise the later pd.concat would raise
# an opaque "No objects to concatenate" error.
if not data:
    raise ValueError(
        'No donor files were provided; at least one donor .zip archive is required.')

# Pre-process Data

## Map Genes and Tissues to Matrix

In [None]:
%%appyter code_exec

# Relabel each donor's matrix and collect the results for concatenation in the
# next cell. The concat that used to end this cell was repeated verbatim at the
# top of the next cell, so it was redundant work here and has been dropped.
#
# NOTE: labels are assigned in place on the frames held in `data`. For
# microarray data the probe-ID index is replaced by gene symbols, so re-running
# this cell without re-running the loading cell would look up gene symbols in
# the probe table and produce missing labels.
matrices = []

for donor_matrix, sample_meta, meta in data:
    if '{{data_type}}' == 'RS':
        # Columns: structure name looked up in the ontology table via each
        # sample's ontology_structure_id. Row labels are left as loaded.
        donor_matrix.columns = meta.reindex(sample_meta['ontology_structure_id'])['name']
    elif '{{data_type}}' == 'MA':
        # Rows: gene symbol looked up in the probe table via the matrix index.
        donor_matrix.index = meta.reindex(donor_matrix.index)['gene_symbol']
        # Columns: structure name taken directly from the sample annotation.
        donor_matrix.columns = sample_meta['structure_name']
    matrices.append(donor_matrix)

In [None]:
# Join the per-donor matrices side by side (axis=1): rows are aligned on the
# index and each donor contributes additional sample columns.
matrix = pd.concat(matrices, axis=1)
# Axis names used for the rest of the pipeline: genes as rows, tissues as columns.
matrix.index.name = 'Gene Symbol'
matrix.columns.name = 'Tissue Name'
matrix.head()

In [None]:
# (rows, columns) of the combined, unfiltered matrix.
matrix.shape

## Save Unfiltered Matrix to file

In [None]:
# Persist the raw combined matrix before any filtering, requesting npz
# compression and float32 values from the project save helper.
uf.save_data(matrix, path, output_name + '_matrix_unfiltered', compression='npz', dtype=np.float32)

# Filter Data

## Map Gene Symbols to Up-to-date Approved Gene Symbols

In [None]:
# Map row labels through symbol_lookup using the project helper.
# NOTE(review): presumably rows without an approved symbol are dropped, which
# is why the shape is displayed afterwards — confirm in uf.map_symbols.
matrix = uf.map_symbols(matrix, symbol_lookup)
matrix.shape

## Merge Duplicate Genes By Rows and Duplicate Columns

In [None]:
# Collapse duplicate labels, first along rows (genes) and then along columns.
# NOTE(review): the aggregation used to merge duplicates is defined in uf.merge — not visible here.
matrix = uf.merge(matrix, 'row')
matrix = uf.merge(matrix, 'column')
matrix.shape

## Remove Data that is More Than 95% Missing and Impute Missing Data

In [None]:
# Per the section heading: drop data that is more than 95% missing, then
# impute the remaining gaps. Threshold and imputation method live in
# uf.remove_impute (not visible here).
matrix = uf.remove_impute(matrix)
matrix.head()

In [None]:
# (rows, columns) after removing sparse data and imputing.
matrix.shape

## Log2 Transform

In [None]:
# Log2-transform the values via the project helper.
# NOTE(review): any pseudocount or zero handling is defined in uf.log2 — confirm there.
matrix = uf.log2(matrix)
matrix.head()

## Normalize Matrix (Quantile Normalize the Matrix by Column)

In [None]:
# Quantile-normalize across columns (per the section heading) so samples
# become comparable.
matrix = uf.quantile_normalize(matrix)
matrix.head()

## Normalize Matrix (Z-Score the Rows)

In [None]:
# Z-score each row (per the section heading), expressing every gene relative
# to its own distribution across columns.
matrix = uf.zscore(matrix)
matrix.head()

## Histogram of First Sample

In [None]:
# Sanity check: distribution of the first column's values across all rows.
matrix.iloc[:, 0].hist(bins=100)

## Histogram of First Gene

In [None]:
# Sanity check: distribution of the first row's values across all columns.
matrix.iloc[0, :].hist(bins=100)

## Save Filtered Matrix

In [None]:
# Persist the filtered, normalized matrix as a gzip-compressed TSV.
uf.save_data(matrix, path, output_name + '_matrix_filtered', 
            ext='tsv', compression='gzip')

# Analyze Data

## Create Gene List

In [None]:
# Build the gene table from the matrix rows, using geneid_lookup.
# NOTE(review): presumably this attaches a gene ID to each symbol — confirm in uf.gene_list.
gene_list = uf.gene_list(matrix, geneid_lookup)
gene_list.head()

In [None]:
# (rows, columns) of the gene table.
gene_list.shape

In [None]:
# Persist the gene table as a gzip-compressed TSV; index=False is forwarded
# to the save helper.
uf.save_data(gene_list, path, output_name + '_gene_list',
            ext='tsv', compression='gzip', index=False)

## Create Attribute List

In [None]:
# Build the attribute table from the matrix.
# NOTE(review): presumably derived from the column labels (tissue names) — confirm in uf.attribute_list.
attribute_list = uf.attribute_list(matrix)
attribute_list.head()

In [None]:
# (rows, columns) of the attribute table.
attribute_list.shape

In [None]:
# Persist the attribute table as a gzip-compressed TSV.
uf.save_data(attribute_list, path, output_name + '_attribute_list',
            ext='tsv', compression='gzip')

## Create Matrix of Standardized Values (Values Between -1 and 1)

In [None]:
# Rescale the normalized matrix to standardized values, which the section
# heading states lie between -1 and 1. The scaling method lives in
# uf.standardized_matrix (not visible here).
standard_matrix = uf.standardized_matrix(matrix)
standard_matrix.head()

In [None]:
# Persist the standardized matrix as a gzip-compressed TSV.
uf.save_data(standard_matrix, path, output_name + '_standard_matrix',
            ext='tsv', compression='gzip')

## Plot of a Single Tissue, Normalized Value vs. Standardized Value

In [None]:
# Compare the first column before and after standardization: each blue dot is
# one row, x = normalized value, y = standardized value.
first_normalized = matrix[matrix.columns[0]]
first_standardized = standard_matrix[standard_matrix.columns[0]]
plt.plot(first_normalized, first_standardized, 'bo')
plt.title(standard_matrix.columns[0])
plt.xlabel('Normalized Values')
plt.ylabel('Standardized Values')
plt.grid(True)

## Create Ternary Matrix

In [None]:
# Discretize the standardized matrix into a ternary matrix.
# NOTE(review): presumably values become -1 / 0 / 1 using a cutoff defined in
# uf.ternary_matrix — confirm there.
ternary_matrix = uf.ternary_matrix(standard_matrix)
ternary_matrix.head()

In [None]:
# Persist the ternary matrix as a gzip-compressed TSV.
uf.save_data(ternary_matrix, path, output_name + '_ternary_matrix',
            ext='tsv', compression='gzip')

## Create Gene and Attribute Set Libraries

In [None]:
# Gene set library for the 'up' direction, derived from the ternary matrix.
uf.save_setlib(ternary_matrix, 'gene', 'up', path, output_name + '_gene_up_set')

In [None]:
# Gene set library for the 'down' direction, derived from the ternary matrix.
uf.save_setlib(ternary_matrix, 'gene', 'down', path, output_name + '_gene_down_set')

In [None]:
# Attribute set library for the 'up' direction, derived from the ternary matrix.
uf.save_setlib(ternary_matrix, 'attribute', 'up', path, 
                           output_name + '_attribute_up_set')

In [None]:
# Attribute set library for the 'down' direction, derived from the ternary matrix.
uf.save_setlib(ternary_matrix, 'attribute', 'down', path, 
                             output_name + '_attribute_down_set')

## Create Attribute Similarity Matrix

In [None]:
# Cosine similarity between attributes: the standardized matrix is transposed
# so that attributes (columns) become the rows handed to the helper.
attribute_similarity_matrix = uf.similarity_matrix(standard_matrix.T, 'cosine')
attribute_similarity_matrix.head()

In [None]:
# Persist the attribute similarity matrix as a gzip-compressed TSV.
uf.save_data(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            ext='tsv', compression='gzip')

## Create Gene Similarity Matrix

In [None]:
# Cosine similarity between genes, computed on the rows of the standardized
# matrix (no transpose, unlike the attribute similarity).
gene_similarity_matrix = uf.similarity_matrix(standard_matrix, 'cosine')
gene_similarity_matrix.head()

In [None]:
# Persist the gene similarity matrix with npz compression and float32 values.
# symmetric=True is forwarded to the save helper.
# NOTE(review): presumably lets it store only one triangle — confirm in uf.save_data.
uf.save_data(gene_similarity_matrix, path, 
            output_name + '_gene_similarity_matrix',
            compression='npz', symmetric=True, dtype=np.float32)

## Create Gene-Attribute Edge List

In [None]:
# Derive the gene-attribute edge list from the standardized matrix and persist
# it as a gzip-compressed TSV.
edge_list = uf.edge_list(standard_matrix)
uf.save_data(edge_list, path, output_name + '_edge_list', 
        ext='tsv', compression='gzip')

# Create Downloadable Save File

In [None]:
# Bundle the output directory for download.
# NOTE(review): presumably produces the archive linked below this cell —
# confirm the file name and location in uf.archive.
uf.archive(path)

### Link to download output files: [click here](./output_archive.zip)