# Harmonizome ETL: Aging, Dementia and Traumatic Brain Injury Study

Created by: Charles Dai <br>
Credit to: Moshe Silverstein

Data Source Home: http://www.brain-map.org/ <br>
Data Source Download: http://aging.brain-map.org/download/index

In [None]:
# appyter init
# magic.init receives a zero-argument callable: the lambda's default argument
# binds the builtin `globals`, so calling it returns the global namespace of
# the module in which the lambda is defined (this notebook).
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
import sys
import os
import zipfile
from datetime import date

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [None]:
%load_ext autoreload
%autoreload 2

### Notebook Information

In [None]:
# Record when, and under which interpreter, the notebook was executed.
run_date = date.today()
interpreter = sys.version
print('This notebook was run on:', run_date, '\nPython version:', interpreter)

# Initialization

In [None]:
%%appyter hide_code

{# Declare the two form sections ('data' and 'settings') that the input fields reference through their `section` argument. #}
{% do SectionField(
    name='data',
    title='Upload Data',
    img='load_icon.png'
) %}

{% do SectionField(
    name='settings',
    title='Settings',
    img='setting_icon.png'
) %}

In [None]:
%%appyter code_eval

{# Help text attached to the 'data' section. #}
{% do DescriptionField(
    name='description',
    text='The examples below were sourced from <a href="http://aging.brain-map.org/download/index" target="_blank">aging.brain-map.org</a>. If clicking on the examples does not work, they should be downloaded directly from the source.',
    section='data'
) %}

{# Expression matrix input: the file name must match the .zip constraint. The value is substituted later as {{ '{{m_file}}' }}. #}
{% set m_file = FileField(
    constraint='.*\.zip$',
    name='expression_matrix', 
    label='Expression Matrix', 
    default='Input/ABA-AGING/gene_expression_matrix_2016-03-03.zip',
    examples={
        'gene_expression_matrix_2016-03-03.zip': 'http://aging.brain-map.org/api/v2/well_known_file_download/502999992'
    },
    section='data'
) %}

{# Donor metadata input: the file name must match the .csv constraint. The value is substituted later as {{ '{{donor_file}}' }}. #}
{% set donor_file = FileField(
    constraint='.*\.csv$',
    name='donor_metadata', 
    label='Donor Metadata', 
    default='Input/ABA-AGING/DonorInformation.csv',
    examples={
        'DonorInformation.csv': 'http://aging.brain-map.org/api/v2/data/query.csv?criteria=model::ApiTbiDonorDetail,rma::options[num_rows$eqall]'
    },
    section='data'
) %}

In [None]:
%%appyter code_eval

{# Cohort selector for the 'settings' section. The later cells compare the substituted value against 'All', 'Disease' and 'Healthy', i.e. the values (not the keys) of `choices`; presumably the keys are the labels shown in the form. #}
{% set cohort = ChoiceField(
    name='cohort',
    label='Cohort',
    description='All: all patients. Dementia/TBI: patients with dementia or traumatic brain injury. No Disease: healthy patients.',
    choices={
        'All': 'All',
        'Dementia/TBI': 'Disease',
        'No Disease': 'Healthy'
    },
    default='All',
    section='settings'
) %}

### Load Mapping Dictionaries

In [None]:
# Fetch the two lookup tables used further down: `symbol_lookup` is handed to
# uf.map_symbols and `geneid_lookup` to uf.gene_list.
symbol_lookup, geneid_lookup = lookup.get_lookups()

### Output Path

In [None]:
%%appyter code_exec

# Base name shared by every output file; lower-cased after the cohort value
# has been substituted into the template.
output_name = 'aba_aging_{{cohort}}'.lower()

# Cohort-specific output directory. exist_ok=True makes re-running the cell a
# no-op and avoids the check-then-create race of os.path.exists + os.makedirs.
path = 'Output/ABA-Aging-{{cohort}}'
os.makedirs(path, exist_ok=True)

# Load Data

In [None]:
%%appyter code_exec

# Read the three CSV members of the uploaded archive, each with its first
# column as the index: the expression matrix, the per-sample metadata and the
# per-gene metadata.
archive_members = (
    'fpkm_table_normalized.csv',
    'columns-samples.csv',
    'rows-genes.csv',
)
loaded_tables = []
with zipfile.ZipFile({{m_file}}) as archive:
    for member_name in archive_members:
        with archive.open(member_name) as member_handle:
            loaded_tables.append(pd.read_csv(member_handle, index_col=0))
matrix, sample_meta, gene_meta = loaded_tables

In [None]:
matrix.head()

In [None]:
matrix.shape

In [None]:
sample_meta.head()

In [None]:
sample_meta.shape

In [None]:
gene_meta.head()

## Load Donor Metadata

In [None]:
%%appyter code_exec

# Donor table, indexed by its first column (later matched against
# sample_meta['donor_id']).
donor_meta = pd.read_csv({{donor_file}}, index_col=0)

In [None]:
donor_meta.head()

In [None]:
donor_meta.shape

# Pre-process Data

## Select Cohort

In [None]:
%%appyter code_exec

# Build a boolean mask over the donors for the cohort chosen in the form.
# Exactly one branch runs; an unexpected value fails here with a clear message
# instead of leaving `cohort` undefined for the lines below.
if '{{cohort}}' == 'Healthy':
    # Healthy: TBI flag is 'N' AND the donor is recorded as 'No Dementia'.
    cohort = np.logical_and(donor_meta['ever_tbi_w_loc'] == 'N',
                            donor_meta['act_demented'] == 'No Dementia')
elif '{{cohort}}' == 'Disease':
    # Disease: TBI flag is 'Y' OR the donor is recorded as 'Dementia'.
    cohort = np.logical_or(donor_meta['ever_tbi_w_loc'] == 'Y',
                           donor_meta['act_demented'] == 'Dementia')
elif '{{cohort}}' == 'All':
    # Keep every donor. An explicit all-True mask replaces the previous
    # astype('bool') cast, which only worked because non-empty strings are truthy.
    cohort = np.ones(len(donor_meta), dtype=bool)
else:
    raise ValueError('Unrecognized cohort: {{cohort}}')

# Keep the selected donors, then keep only the samples that belong to them.
donor_meta = donor_meta[cohort]
sample_cohort = sample_meta['donor_id'].isin(donor_meta.index)
sample_meta = sample_meta[sample_cohort]

## Map Sample Meta to Sample ID

In [None]:
# Column labels are read from the CSV header; cast them to integers so they can
# be compared with the sample_meta index, then keep only the columns whose ID
# belongs to the selected cohort.
profile_ids = matrix.columns.astype('int')
matrix.columns = profile_ids
matrix_cohort = profile_ids.isin(sample_meta.index)
cohort_profile_ids = profile_ids[matrix_cohort]
matrix = matrix[cohort_profile_ids]
matrix.head()

## Map Gene to Row

In [None]:
# Label the rows with gene symbols and give both axes descriptive names.
# NOTE(review): the symbols are assigned by position, so this assumes the rows
# of gene_meta are in the same order as the rows of the matrix — confirm.
matrix.index = pd.Index(gene_meta['gene_symbol'], name='Gene Symbol')
matrix.columns = matrix.columns.rename('RNA-Seq Profile ID')
matrix.head()

## Save Unfiltered Matrix to File

In [None]:
# Persist the matrix before any filtering step runs. The keyword arguments
# request 'npz' compression with float32 values; the on-disk layout and final
# file name are decided inside uf.save_data.
uf.save_data(matrix, path, output_name + '_matrix_unfiltered', 
            compression='npz', dtype=np.float32)

# Filter Data

## Map Gene Symbols to Up-to-date Approved Gene Symbols

In [None]:
# Map the matrix through `symbol_lookup`, then display the resulting shape.
# NOTE(review): presumably rows whose symbol has no entry in the lookup are
# dropped — confirm in uf.map_symbols.
matrix = uf.map_symbols(matrix, symbol_lookup)
matrix.shape

## Merge Duplicate Genes By Rows and Duplicate Columns

In [None]:
# Merge duplicates along the rows first, then along the columns, and display
# the resulting shape.
# NOTE(review): how duplicates are combined is decided inside uf.merge.
matrix = uf.merge(matrix, 'row')
matrix = uf.merge(matrix, 'column')
matrix.shape

## Remove Data that is More Than 95% Missing and Impute Missing Data

In [None]:
# Remove mostly-missing data and impute the remaining gaps in a single call.
# NOTE(review): the 95% threshold named in the section heading is not passed
# here, so it is presumably the default inside uf.remove_impute — confirm.
matrix = uf.remove_impute(matrix)
matrix.head()

In [None]:
matrix.shape

## Log2 Transform

In [None]:
# Log2-transform the matrix.
# NOTE(review): whether a pseudocount is added before taking the log is
# decided inside uf.log2 — confirm.
matrix = uf.log2(matrix)
matrix.head()

## Normalize Matrix (Quantile Normalize the Matrix by Column)

In [None]:
# Quantile-normalize the matrix. Per the section heading this is done by
# column; the axis is not passed here, so it is fixed inside
# uf.quantile_normalize.
matrix = uf.quantile_normalize(matrix)
matrix.head()

## Normalize Matrix (Z-Score the Rows)

In [None]:
# Z-score the matrix. Per the section heading this is done across each row;
# the axis is not passed here, so it is fixed inside uf.zscore.
matrix = uf.zscore(matrix)
matrix.head()

## Histogram of First Sample

In [None]:
matrix.iloc[:, 0].hist(bins=100)

## Histogram of First Gene

In [None]:
matrix.iloc[0, :].hist(bins=100)

## Save Filtered Matrix

In [None]:
# Persist the filtered, normalized matrix; the keyword arguments request a
# gzip-compressed TSV.
uf.save_data(matrix, path, output_name + '_matrix_filtered', 
            ext='tsv', compression='gzip')

# Analyze Data

## Create Gene List

In [None]:
# Build the gene list from the filtered matrix, using `geneid_lookup`.
# NOTE(review): presumably the lookup supplies a gene ID for each symbol in
# the matrix index — confirm in uf.gene_list.
gene_list = uf.gene_list(matrix, geneid_lookup)
gene_list.head()

In [None]:
gene_list.shape

In [None]:
# Persist the gene list as a gzip-compressed TSV.
# NOTE(review): index=False presumably omits the DataFrame index from the
# written file — confirm in uf.save_data.
uf.save_data(gene_list, path, output_name + '_gene_list',
            ext='tsv', compression='gzip', index=False)

## Create Attribute List

In [None]:
# Build the attribute list from the filtered matrix together with the
# cohort-restricted sample metadata.
# NOTE(review): how sample_meta is joined to the matrix columns is decided
# inside uf.attribute_list.
attribute_list = uf.attribute_list(matrix, sample_meta)
attribute_list.head()

In [None]:
attribute_list.shape

In [None]:
# Persist the attribute list as a gzip-compressed TSV.
uf.save_data(attribute_list, path, output_name + '_attribute_list',
            ext='tsv', compression='gzip')

## Create Matrix of Standardized Values (Values Between -1 and 1)

In [None]:
# Derive the standardized matrix from the normalized one.
# NOTE(review): the section heading states the values lie between -1 and 1;
# the transformation itself is inside uf.standardized_matrix.
standard_matrix = uf.standardized_matrix(matrix)
standard_matrix.head()

In [None]:
# Persist the standardized matrix as a gzip-compressed TSV.
uf.save_data(standard_matrix, path, output_name + '_standard_matrix',
            ext='tsv', compression='gzip')

## Plot of a Single RNA-Seq Profile, Normalized Value vs. Standardized Value

In [None]:
# Scatter the first column of the normalized matrix against the first column
# of the standardized matrix, one blue dot per gene.
normalized_values = matrix[matrix.columns[0]]
first_standard_column = standard_matrix.columns[0]
standardized_values = standard_matrix[first_standard_column]

plt.plot(normalized_values, standardized_values, 'bo')
plt.xlabel('Normalized Values')
plt.ylabel('Standardized Values')
plt.title(first_standard_column)
plt.grid(True)

## Create Ternary Matrix

In [None]:
# Derive the ternary matrix from the standardized matrix.
# NOTE(review): presumably entries are -1, 0 or 1; the thresholding rule is
# inside uf.ternary_matrix — confirm.
ternary_matrix = uf.ternary_matrix(standard_matrix)
ternary_matrix.head()

In [None]:
# Persist the ternary matrix as a gzip-compressed TSV.
uf.save_data(ternary_matrix, path, output_name + '_ternary_matrix',
            ext='tsv', compression='gzip')

## Create Gene and Attribute Set Libraries

In [None]:
# Write the 'gene' / 'up' set library built from the ternary matrix.
uf.save_setlib(ternary_matrix, 'gene', 'up', path, output_name + '_gene_up_set')

In [None]:
# Write the 'gene' / 'down' set library built from the ternary matrix.
uf.save_setlib(ternary_matrix, 'gene', 'down', path, output_name + '_gene_down_set')

In [None]:
# Write the 'attribute' / 'up' set library built from the ternary matrix.
uf.save_setlib(ternary_matrix, 'attribute', 'up', path, 
                           output_name + '_attribute_up_set')

In [None]:
# Write the 'attribute' / 'down' set library built from the ternary matrix.
uf.save_setlib(ternary_matrix, 'attribute', 'down', path, 
                             output_name + '_attribute_down_set')

## Create Attribute Similarity Matrix

In [None]:
# Cosine similarity between attributes: the standardized matrix is transposed
# so that attributes (its columns) become the rows handed to the helper.
# NOTE(review): presumably uf.similarity_matrix compares rows — confirm.
attribute_similarity_matrix = uf.similarity_matrix(standard_matrix.T, 'cosine')
attribute_similarity_matrix.head()

In [None]:
# Persist the attribute similarity matrix as a gzip-compressed TSV.
uf.save_data(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            ext='tsv', compression='gzip')

## Create Gene Similarity Matrix

In [None]:
# Cosine similarity between genes: the standardized matrix is passed as-is,
# with genes on the rows.
# NOTE(review): presumably uf.similarity_matrix compares rows — confirm.
gene_similarity_matrix = uf.similarity_matrix(standard_matrix, 'cosine')
gene_similarity_matrix.head()

In [None]:
# Persist the gene similarity matrix with 'npz' compression and float32 values.
# NOTE(review): symmetric=True presumably lets uf.save_data store only one
# triangle of the matrix — confirm.
uf.save_data(gene_similarity_matrix, path, 
            output_name + '_gene_similarity_matrix',
            compression='npz', symmetric=True, dtype=np.float32)

## Create Gene-Attribute Edge List

In [None]:
# Build the gene-attribute edge list from the standardized matrix and persist
# it as a gzip-compressed TSV.
edge_list = uf.edge_list(standard_matrix)
uf.save_data(edge_list, path, output_name + '_edge_list', 
        ext='tsv', compression='gzip')

# Create Downloadable Save File

In [None]:
# Bundle the output directory for download.
# NOTE(review): the download link that follows points at ./output_archive.zip,
# presumably the file this call creates — confirm in uf.archive.
uf.archive(path)

### Link to download output files: [click here](./output_archive.zip)