# BrainSpan: Atlas of the Developing Human Brain

Created by: Charles Dai <br>
Credit to: Moshe Silverstein

Data Source: http://www.brainspan.org/static/download.html

In [1]:
# appyter init
# Hands appyter a zero-argument callable that returns this notebook's
# globals(); presumably this is what enables the %%appyter cell magics used
# below -- confirm against the appyter documentation.
from appyter import magic
magic.init(lambda _=globals: _())

/home/charlesdai/Projects/Harmonizome-Data-Processing-Appyters


In [2]:
import sys
import os
import zipfile
from datetime import date

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [3]:
# Optional clustergrammer widget setup, left disabled. The only use of `net`
# in this notebook (the attribute-similarity visualisation cell) is commented
# out as well.
# from clustergrammer_widget import *
# net = Network(clustergrammer_widget)

In [4]:
# Reload imported modules automatically before each cell executes, so edits to
# the local harmonizome helpers are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Notebook Information

In [5]:
# Record the run date and interpreter version next to the outputs for provenance.
print('This notebook was run on:', date.today(), '\nPython version:', sys.version)

This notebook was run on: 2020-07-04 
Python version: 3.8.0 (default, Oct 28 2019, 16:14:01) 
[GCC 8.3.0]


# Initialization

### Set Data Grouping Options

In [6]:
%%appyter hide_code
{# Declares the 'data' form section; the dataset, attribute and file fields in the following cells attach to it via section='data'. #}
{% do SectionField(
    name='data',
    title='Upload Data',
    subtitle='The following examples were sourced from <a href="http://www.brainspan.org/static/download.html">http://www.brainspan.org/static/download.html</a>.',
    img='load_icon.png'
) %}

In [7]:
%%appyter code_eval

{# Dataset selector: each choice maps a display label to a short code; the `dataset` variable is interpolated into the output name and output directory in a later cell. #}
{% set dataset = ChoiceField(
    name='dataset',
    label='Dataset',
    choices={
        'Developmental Microarray': 'DMA',
        'Developmental RNA-Seq': 'DRS', 
        'Prenatal Microarray': 'PMA'
    },
    default='Developmental Microarray',
    section='data'
) %}

In [8]:
%%appyter code_eval

{# Attribute selector: the chosen value decides how matrix columns are labelled later on (age, structure name, or donor/structure sample pairs). #}
{% set attribute = ChoiceField(
    name='attribute',
    label='Attribute',
    description='The prenatal microarray dataset cannot take the age attribute. All other combinations of dataset and attribute work.',
    choices=['Age', 'Sample', 'Tissue'],
    default='Sample',
    section='data'
) %}

In [22]:
%%appyter code_eval

{# Up to four zipped uploads. The constraint accepts either an empty value or a name ending in .zip. Only Dataset 1 has a default; the other slots default to the empty string. #}
{% set file_1 = FileField(
    constraint='(.*\.zip$)?', name='dataset1', label='Dataset 1', 
    default='Input/BrainSpan/genes_matrix_csv.zip',
    section='data',
    examples={
    }) 
%}

{% set file_2 = FileField(
    constraint='(.*\.zip$)?', name='dataset2', label='Dataset 2', 
    default='',
    section='data',
    examples={
    })
%}

{% set file_3 = FileField(
    constraint='(.*\.zip$)?', name='dataset3', label='Dataset 3', 
    default='',
    section='data',
    examples={
    })
%}

{% set file_4 = FileField(constraint='(.*\.zip$)?', name='dataset4', label='Dataset 4', 
    default='',
    section='data',
    examples={
    })
%}

### Load Mapping Dictionaries

In [10]:
# Fetch the two lookup tables used later in the notebook: symbol_lookup is
# passed to uf.map_symbols and geneid_lookup to uf.gene_list. Their structure
# is defined in harmonizome.lookup (not visible here).
symbol_lookup, geneid_lookup = lookup.get_lookups()

Gathering sources: 100%|██████████| 3/3 [00:14<00:00,  4.94s/it]


### Output Path

In [19]:
%%appyter code_exec

# Prefix shared by every file this notebook writes, e.g. 'brainspan_dma'.
output_name = 'brainspan_{{dataset}}'.lower()

# Directory that receives all outputs. The folder name was previously
# misspelled 'BrainSpain'; it now matches the resource name (BrainSpan).
path = 'Output/BrainSpan-{{dataset}}'
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(path, exist_ok=True)

```python
output_name = 'brainspan_DMA'.lower()
path = 'Output/BrainSpain-DMA'
if not os.path.exists(path):
    os.makedirs(path)
```

# Load Data

In [24]:
%%appyter code_exec

# One entry per upload field. Fields left blank render as '' and are skipped
# by the loading loop in the next cell.
file_list = [
    {{file_1}}, 
    {{file_2}}, 
    {{file_3}}, 
    {{file_4}}
]

```python
file_list = [
    'Input/BrainSpan/genes_matrix_csv.zip',
    '',
    '',
    ''
]
```

In [25]:
# Each loaded archive contributes one (matrix, sample_meta, gene_meta) tuple.
data = []

for zip_path in file_list:
    # Empty strings mark upload slots that were left unfilled.
    if zip_path == '':
        continue

    print('Loading file:', zip_path)
    with zipfile.ZipFile(zip_path) as archive:
        # The expression matrix is read without a header row; its first
        # column becomes the row index.
        with archive.open('expression_matrix.csv') as matrix_handle:
            matrix = pd.read_csv(matrix_handle, header=None, index_col=0)
        # Column (sample) annotations.
        with archive.open('columns_metadata.csv') as sample_handle:
            sample_meta = pd.read_csv(sample_handle, index_col=0)
        # Row (gene) annotations.
        with archive.open('rows_metadata.csv') as gene_handle:
            gene_meta = pd.read_csv(gene_handle, index_col=0)
    data.append((matrix, sample_meta, gene_meta))

Loading file: Input/BrainSpan/genes_matrix_csv.zip


# Pre-process Data

## Map Genes and Attribute to Matrix

In [26]:
%%appyter code_exec

# Relabel every loaded matrix (rows -> gene symbols, columns -> the selected
# attribute) and join the per-file matrices side by side.
matrices = []
# Per-file sample metadata, kept so that the combined metadata covers every
# uploaded file. Previously only the last file's `sample_meta` survived the
# loop, so the attribute list built later from `sample_meta` missed the
# samples of all earlier files.
sample_metas = []

for matrix, sample_meta, gene_meta in data:
    # Rows of the expression matrix follow the order of rows_metadata.csv.
    matrix.index = gene_meta['gene_symbol']
    if '{{attribute}}' == 'Age':
        matrix.columns = sample_meta['age']
    elif '{{attribute}}' == 'Tissue':
        matrix.columns = sample_meta['structure_name']
    elif '{{attribute}}' == 'Sample':
        # A sample is identified by its (donor, structure) pair.
        sample_meta = sample_meta.set_index(['donor_id', 'structure_id'])
        matrix.columns = sample_meta.index
    matrices.append(matrix)
    sample_metas.append(sample_meta)

matrix = pd.concat(matrices, axis=1)
# Stack the metadata row-wise in the same file order as the matrix columns.
# With a single upload this is equivalent to that file's metadata.
sample_meta = pd.concat(sample_metas)

```python
matrices = []
for matrix, sample_meta, gene_meta in data:
    matrix.index = gene_meta['gene_symbol']
    if 'Sample' == 'Age':
        matrix.columns = sample_meta['age']
    elif 'Sample' == 'Tissue':
        matrix.columns = sample_meta['structure_name']
    elif 'Sample' == 'Sample':
        sample_meta = sample_meta.set_index(['donor_id', 'structure_id'])
        matrix.columns = sample_meta.index
    matrices.append(matrix)
matrix = pd.concat(matrices, axis=1)
```

In [27]:
%%appyter code_exec

# Name the axes: rows are gene symbols, columns are the selected attribute.
# NOTE(review): for the 'Sample' attribute the columns are a
# (donor_id, structure_id) MultiIndex, and the preview below still shows those
# level names rather than this label -- confirm this is intended.
matrix.index.name = 'Gene Symbol'
matrix.columns.name = '{{attribute}}'

```python
matrix.index.name = 'Gene Symbol'
matrix.columns.name = 'Sample'
```

In [28]:
# Preview the first rows of the combined, relabelled matrix.
matrix.head()

donor_id,13058,13058,13058,13058,13058,13058,13058,13058,13058,13058,...,12304,12304,12304,12304,12304,12304,12304,12304,12304,12304
structure_id,10268,10291,10361,10550,10243,10665,10552,10391,10278,10173,...,10236,10657,10269,10194,10243,10225,10163,10294,10333,10209
Gene Symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
TSPAN6,36.447128,24.251253,19.330479,27.668607,19.998231,14.680673,27.548101,16.580183,44.587799,44.943915,...,2.320932,1.781548,2.277359,1.832737,1.555696,2.081944,3.484685,4.816781,3.034464,3.08282
TNMD,0.044081,0.067338,0.0,0.145466,0.185188,0.31118,0.0,0.0,0.473831,0.18122,...,0.758571,0.0,0.061869,0.026876,0.100691,0.140675,0.300576,0.126526,0.0,0.424134
DPM1,34.373239,20.765661,18.734947,22.366394,19.228431,11.020365,25.394607,17.671327,32.9031,38.157569,...,23.769167,20.142132,20.063257,16.575379,18.783516,21.631293,28.00612,28.731717,16.679597,28.866042
SCYL3,4.379337,4.227521,2.551825,3.603764,2.948976,2.405183,3.613642,2.573935,3.483817,3.60983,...,1.593009,1.563377,1.648571,2.231466,2.040326,2.161741,1.275352,1.184766,1.735579,1.500363
C1orf112,3.957119,3.520794,2.037805,3.487035,2.177235,0.999693,3.481555,1.747568,3.74158,3.56065,...,0.583488,0.797376,0.607141,0.575555,0.606445,0.683625,0.495084,0.761265,0.766482,0.468859


In [29]:
# (rows, columns) of the combined matrix: gene rows by attribute columns.
matrix.shape

(52376, 524)

## Save Unfiltered Matrix to File

In [None]:
# Persist the combined matrix before any filtering, passing gzip compression
# and a float32 dtype to uf.save_data. No `ext` is given here, unlike the
# later saves -- the helper's default applies (not visible here).
uf.save_data(matrix, path, output_name + '_matrix_unfiltered',
            compression='gzip', dtype=np.float32)

# Filter Data

## Map Gene Symbols to Up-to-date Approved Gene Symbols

In [40]:
# Map row labels through symbol_lookup (per the section heading: to up-to-date
# approved gene symbols), then show the resulting shape.
# NOTE(review): rebinds `matrix`, so re-running this cell operates on the
# already-mapped data rather than the original matrix.
matrix = uf.map_symbols(matrix, symbol_lookup)
matrix.shape

100%|██████████| 17604/17604 [00:00<00:00, 227420.23it/s]


(17368, 492)

## Merge Duplicate Genes By Rows and Duplicate Columns

In [41]:
# Merge duplicated gene rows first, then duplicated columns. How duplicates
# are combined is decided inside uf.merge (not visible here).
matrix = uf.merge(matrix, 'row')
matrix = uf.merge(matrix, 'column')
matrix.shape

(16631, 27)

## Remove Data that is More Than 95% Missing and Impute Missing Data

In [None]:
# Per the section heading: drops data that is more than 95% missing and
# imputes the remaining gaps. The threshold and imputation method live inside
# uf.remove_impute (not visible here).
matrix = uf.remove_impute(matrix)
matrix.head()

In [None]:
# Shape after removal/imputation, for comparison with the shape after merging.
matrix.shape

## Log2 Transform

In [None]:
# Log2-transform the matrix via uf.log2.
# NOTE(review): rebinds `matrix`; running this cell twice applies the
# transform twice. Re-run from the loading cells to reset.
matrix = uf.log2(matrix)
matrix.head()

## Normalize Matrix (Quantile Normalize the Matrix by Column)

In [None]:
# Quantile-normalize the matrix (per the section heading: by column).
# NOTE(review): rebinds `matrix` in place of the log-transformed values.
matrix = uf.quantile_normalize(matrix)
matrix.head()

## Normalize Matrix (Z-Score the Rows)

In [None]:
# Z-score the matrix (per the section heading: across each row).
# NOTE(review): rebinds `matrix`; from here on `matrix` holds the fully
# normalized values used by every downstream cell.
matrix = uf.zscore(matrix)
matrix.head()

## Histogram of First Sample

In [None]:
# Distribution of the normalized values in the first column of the matrix.
# The figure is titled and labelled so it stands alone when skimmed; the
# trailing semicolon suppresses the echoed return value of ax.set().
ax = matrix.iloc[:, 0].hist(bins=100)
ax.set(title=f'First column: {matrix.columns[0]}',
       xlabel='Normalized Values', ylabel='Count');

## Histogram of First Gene

In [None]:
# Distribution of the normalized values in the first row (gene) of the matrix.
# The figure is titled and labelled so it stands alone when skimmed; the
# trailing semicolon suppresses the echoed return value of ax.set().
ax = matrix.iloc[0, :].hist(bins=100)
ax.set(title=f'First gene: {matrix.index[0]}',
       xlabel='Normalized Values', ylabel='Count');

## Save Filtered Matrix

In [None]:
# Persist the filtered, normalized matrix, requesting a gzip-compressed TSV.
uf.save_data(matrix, path, output_name + '_matrix_filtered', 
            ext='tsv', compression='gzip')

# Analyze Data

## Create Gene List

In [None]:
# Build the gene list for the filtered matrix using geneid_lookup
# (presumably attaching gene IDs to each symbol -- confirm in uf.gene_list).
gene_list = uf.gene_list(matrix, geneid_lookup)
gene_list.head()

In [None]:
# Number of genes (rows) and fields (columns) in the gene list.
gene_list.shape

In [None]:
# Save the gene list as a gzip-compressed TSV; index=False is forwarded to
# uf.save_data for this table only.
uf.save_data(gene_list, path, output_name + '_gene_list',
            ext='tsv', compression='gzip', index=False)

## Create Attribute List

In [30]:
%%appyter code_exec

# Sample-level runs also hand the per-sample metadata to uf.attribute_list;
# for the other attributes the list is derived from the matrix alone.
attribute_list = (
    uf.attribute_list(matrix, sample_meta)
    if '{{attribute}}' == 'Sample'
    else uf.attribute_list(matrix)
)

```python
if 'Sample' == 'Sample':
    attribute_list = uf.attribute_list(matrix, sample_meta)
else:
    attribute_list = uf.attribute_list(matrix)
```

In [31]:
# Preview the first rows of the attribute list.
attribute_list.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,donor_name,age,gender,structure_acronym,structure_name
donor_id,structure_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
13058,10268,H376.IIA.51,8 pcw,M,Ocx,occipital neocortex
13058,10291,H376.IIA.51,8 pcw,M,M1C-S1C,primary motor-sensory cortex (samples)
13058,10361,H376.IIA.51,8 pcw,M,AMY,amygdaloid complex
13058,10550,H376.IIA.51,8 pcw,M,MGE,medial ganglionic eminence
13058,10243,H376.IIA.51,8 pcw,M,STC,posterior (caudal) superior temporal cortex (a...


In [32]:
# Number of attributes (rows) and annotation fields (columns).
attribute_list.shape

(524, 5)

In [None]:
# Save the attribute list as a gzip-compressed TSV.
uf.save_data(attribute_list, path, output_name + '_attribute_list',
            ext='tsv', compression='gzip')

## Create Matrix of Standardized Values (Values Between -1 and 1)

In [None]:
# Derive the standardized matrix from the normalized one (per the section
# heading: values between -1 and 1). The method is defined in
# uf.standardized_matrix (not visible here).
standard_matrix = uf.standardized_matrix(matrix)
standard_matrix.head()

In [None]:
# Save the standardized matrix as a gzip-compressed TSV.
uf.save_data(standard_matrix, path, output_name + '_standard_matrix',
            ext='tsv', compression='gzip')

## Plot of a Single Attribute, Normalized Value vs. Standardized Value

In [None]:
# Scatter the first column's normalized values against its standardized
# values, one blue dot per gene, using the explicit Axes interface.
fig, ax = plt.subplots()
first_column = standard_matrix.columns[0]
ax.plot(matrix[matrix.columns[0]], standard_matrix[first_column], 'bo')
ax.set_xlabel('Normalized Values')
ax.set_ylabel('Standardized Values')
ax.set_title(first_column)
ax.grid(True)

## Create Ternary Matrix

In [None]:
# Derive the ternary matrix from the standardized values (presumably
# discretised to -1/0/1 -- confirm the thresholds in uf.ternary_matrix).
ternary_matrix = uf.ternary_matrix(standard_matrix)
ternary_matrix.head()

In [None]:
# Save the ternary matrix as a gzip-compressed TSV.
uf.save_data(ternary_matrix, path, output_name + '_ternary_matrix',
            ext='tsv', compression='gzip')

## Create Gene and Attribute Set Libraries

In [None]:
# Write the 'gene' / 'up' set library derived from the ternary matrix.
uf.save_setlib(ternary_matrix, 'gene', 'up', path, output_name + '_gene_up_set')

In [None]:
# Write the 'gene' / 'down' set library derived from the ternary matrix.
uf.save_setlib(ternary_matrix, 'gene', 'down', path, output_name + '_gene_down_set')

In [None]:
# Write the 'attribute' / 'up' set library derived from the ternary matrix.
uf.save_setlib(ternary_matrix, 'attribute', 'up', path, 
                           output_name + '_attribute_up_set')

In [None]:
# Write the 'attribute' / 'down' set library derived from the ternary matrix.
uf.save_setlib(ternary_matrix, 'attribute', 'down', path, 
                             output_name + '_attribute_down_set')

## Create Attribute Similarity Matrix

In [None]:
# Cosine similarity between attributes: the standardized matrix is transposed
# before the call, whereas the gene similarity cell passes it untransposed.
attribute_similarity_matrix = uf.similarity_matrix(standard_matrix.T, 'cosine')
attribute_similarity_matrix.head()

In [None]:
# Save the attribute similarity matrix with compression='npz', flagged as
# symmetric and with a float32 dtype (how uf.save_data uses these options is
# not visible here).
uf.save_data(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

In [None]:
# Optional clustergrammer visualisation of the attribute similarity matrix,
# left disabled. It needs the `net` object from the commented-out setup cell
# near the top of the notebook.
# net.load_df(attribute_similarity_matrix.iloc[:,:].copy())
# net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()

## Create Gene Similarity Matrix

In [None]:
# Cosine similarity between genes, computed on the untransposed standardized
# matrix (genes are its rows).
gene_similarity_matrix = uf.similarity_matrix(standard_matrix, 'cosine')
gene_similarity_matrix.head()

In [None]:
# Save the gene similarity matrix with the same options as the attribute
# similarity matrix: compression='npz', symmetric=True, float32 dtype.
uf.save_data(gene_similarity_matrix, path, 
            output_name + '_gene_similarity_matrix',
            compression='npz', symmetric=True, dtype=np.float32)

## Create Gene-Attribute Edge List

In [None]:
# Build the gene-attribute edge list from the standardized matrix and save it
# as a gzip-compressed TSV.
edge_list = uf.edge_list(standard_matrix)
uf.save_data(edge_list, path, output_name + '_edge_list', 
        ext='tsv', compression='gzip')

# Create Downloadable Save File

In [None]:
# Bundle the output directory for download. The link below points at
# ./output_archive.zip, which is presumably what uf.archive writes -- confirm.
uf.archive(path)

### Link to download output files: [click here](./output_archive.zip)