# Allen Brain Atlas Adult Human Brain (RNA-Sequencing)

Author: Moshe Silverstein <br/>
Date: 5-17 <br/>
Data Source: http://human.brain-map.org/static/download

Reviewer: Charles Dai <br>
Updated: 6-20

In [1]:
import sys, datetime
import os
import shutil

import numpy as np
import pandas as pd
import importlib
import matplotlib.pyplot as plt
%matplotlib inline

import utility_functions as uf
import lookup

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


ModuleNotFoundError: No module named 'utility_functions'

In [None]:
#from clustergrammer_widget import *
#net = Network(clustergrammer_widget)

In [None]:
importlib.reload(uf)

### Versions Of Modules In Use

In [None]:
sys.version

# Functions

### Pre-process Data

In [None]:
def mapSampleData(inputDF, sampleMetaDF, Ontology):
    """Rename the columns of ``inputDF`` to ontology structure names, in place.

    One name is produced per row of ``sampleMetaDF``, in row order, by looking
    up the row's ``ontology_structure_id`` in the index of ``Ontology``. When
    that id labels a single ontology row, its ``name`` is used directly. When
    it labels several rows, the row whose ``hemisphere`` equals the sample's
    ``hemisphere`` is used.

    Parameters
    ----------
    inputDF : pandas.DataFrame
        Frame whose ``columns`` are overwritten. Must have exactly one column
        per row of ``sampleMetaDF``.
        NOTE(review): columns are assumed to be in the same order as the
        metadata rows -- confirm against the data files.
    sampleMetaDF : pandas.DataFrame
        Must contain ``ontology_structure_id`` and ``hemisphere`` columns.
    Ontology : pandas.DataFrame
        Indexed by structure id; must contain ``name`` and ``hemisphere``
        columns.

    Returns
    -------
    None
        ``inputDF`` is modified in place.

    Raises
    ------
    KeyError
        Propagated from ``Ontology.loc`` when a structure id is not in the
        ontology index.
    ValueError
        If a sample resolves to zero or to several distinct names, or if the
        number of names differs from the number of columns. Failing loudly
        here prevents the labels from silently shifting onto the wrong
        columns.
    """
    names = []
    samples = zip(sampleMetaDF['ontology_structure_id'],
                  sampleMetaDF['hemisphere'])
    for structure, hemisphere in samples:
        entry = Ontology.loc[structure, ['name', 'hemisphere']]

        # A single matching ontology row comes back as a Series.
        if isinstance(entry, pd.Series):
            names.append(str(entry['name']))
            continue

        # Several ontology rows share this id: disambiguate by hemisphere.
        same_side = entry.loc[entry['hemisphere'] == hemisphere, 'name']
        # dict.fromkeys de-duplicates while keeping first-seen order.
        candidates = list(dict.fromkeys(str(name) for name in same_side))
        if not candidates:
            raise ValueError(
                'No ontology entry for structure id %r with hemisphere %r'
                % (structure, hemisphere))
        if len(candidates) > 1:
            raise ValueError(
                'Ambiguous ontology entries for structure id %r with '
                'hemisphere %r: %s'
                % (structure, hemisphere, ', '.join(candidates)))
        names.append(candidates[0])

    if len(names) != len(inputDF.columns):
        raise ValueError(
            'Sample metadata describes %d samples but the matrix has %d '
            'columns' % (len(names), len(inputDF.columns)))
    inputDF.columns = names

### 

### Load Mapping Dictionaries

In [None]:
symbol_lookup, geneid_lookup = lookup.get_lookups()

### Output Path

In [None]:
# Directory passed to the save calls below as the output location.
path = 'Output/ABA'
# exist_ok=True creates the directory (and parents) if needed and is a no-op
# when it already exists, without a separate check-then-create step.
os.makedirs(path, exist_ok=True)

# Load Data

## Donor 9861

In [None]:
matrix_9861 = pd.read_csv('RNAseqCounts.csv', sep=',', header=None, index_col=0)

In [None]:
matrix_9861.head()

In [None]:
matrix_9861.shape

### Load Sample Metadata

In [None]:
sample_meta_9861 = pd.read_csv('SampleAnnot.csv', sep=',')

In [None]:
sample_meta_9861.head()

In [None]:
sample_meta_9861.shape

### Load Sample Ontology

In [None]:
sample_onto_9861 = pd.read_csv('Ontology.csv', sep=',', index_col=0)

In [None]:
sample_onto_9861.head()

In [None]:
sample_onto_9861.shape

### Map Sample Metadata to Sample ID

In [None]:
mapSampleData(matrix_9861, sample_meta_9861, sample_onto_9861)

In [None]:
matrix_9861.head()

## Donor 10021

In [None]:
matrix_10021 = pd.read_csv('RNAseqCounts-2.csv', sep=',', header=None, index_col=0)

In [None]:
matrix_10021.head()

In [None]:
matrix_10021.shape

### Load Sample Metadata

In [None]:
sample_meta_10021 = pd.read_csv('SampleAnnot-2.csv', sep=',')

In [None]:
sample_meta_10021.head()

In [None]:
sample_meta_10021.shape

### Load Sample Ontology

In [None]:
sample_onto_10021 = pd.read_csv('Ontology-2.csv', sep=',', index_col=0)

In [None]:
sample_onto_10021.head()

In [None]:
sample_onto_10021.shape

### Map Sample Metadata to Sample ID

In [None]:
mapSampleData(matrix_10021, sample_meta_10021, sample_onto_10021)

In [None]:
matrix_10021.head()

## Combine Donor Datasets

## Unfiltered

In [None]:
matrix = pd.concat([matrix_9861, matrix_10021], axis=1)

In [None]:
matrix.head()

In [None]:
matrix.shape

### Save Unfiltered to file

In [None]:
uf.saveData(matrix, path, 'aba_brain_matrix_unfiltered', ext='tsv', compression='gzip')

## Filtered

In [None]:
normalized_matrix = pd.concat([matrix_9861, matrix_10021], axis=1)

In [None]:
normalized_matrix.head()

In [None]:
normalized_matrix.shape

# Process Data

## Remove any data with more than 95% missing and impute remaining missing values with the matrix mean

In [None]:
normalized_matrix = uf.removeAndImpute(normalized_matrix)

In [None]:
normalized_matrix.head()

In [None]:
normalized_matrix.shape

## Map Gene Symbols To Up-to-date Approved Gene Symbols

In [None]:
normalized_matrix = uf.mapgenesymbols(normalized_matrix, symbol_lookup)

In [None]:
normalized_matrix.shape

### Merge Duplicate Genes By Rows

In [None]:
normalized_matrix = uf.merge(normalized_matrix, 'row', 'mean')

In [None]:
normalized_matrix.shape

### Merge Like Column (by taking the mean)

In [None]:
normalized_matrix = uf.merge(normalized_matrix, 'column', 'mean')

In [None]:
normalized_matrix.shape

## Log2 Transform

In [None]:
normalized_matrix = uf.log2(normalized_matrix)

In [None]:
normalized_matrix.head()

## Normalize Matrix (Quantile Normalize the matrix for the columns)

In [None]:
normalized_matrix = uf.quantileNormalize(normalized_matrix)

In [None]:
normalized_matrix.head()

## Normalize Matrix (z-score the rows)

In [None]:
normalized_matrix = uf.zscore(normalized_matrix, 'row')

In [None]:
normalized_matrix.head()

## Histogram of First Sample

In [None]:
normalized_matrix[normalized_matrix.columns[0]].hist(bins=100)

## Histogram of First Gene

In [None]:
normalized_matrix.loc[normalized_matrix.index[0]].hist(bins=100)

### Save Filtered Matrix

In [None]:
uf.saveData(normalized_matrix, path, 'aba_brain_matrix_filltered', ext='tsv', compression='gzip')

## Create Gene List

In [None]:
gene_list = uf.createGeneList(normalized_matrix, geneid_lookup)

In [None]:
gene_list.head()

In [None]:
gene_list.shape

### Save Gene List

In [None]:
uf.saveData(gene_list, path, 'aba_brain_gene_list', ext='tsv', compression='gzip', index=False)

## Create Attribute List

In [None]:
attribute_list = uf.createAttributeList(normalized_matrix)

In [None]:
attribute_list.head()

In [None]:
attribute_list.shape

### Save Attribute List

In [None]:
uf.saveData(attribute_list, path, 'aba_brain_attribute_list', ext='tsv', compression='gzip')

## Create Matrix of Standardized Values (values between -1 and 1)

In [None]:
standard_matrix = uf.createStandardizedMatrix(normalized_matrix)

In [None]:
standard_matrix.head()

## Plot of a Single Attribute (Brain Structure), Normalized Value vs. Standardized Value

In [None]:
# Scatter the first column of the normalized matrix against the first column
# of the standardized matrix, titled with that standardized column's label.
first_attribute = standard_matrix.columns[0]
normalized_values = normalized_matrix[normalized_matrix.columns[0]]
standardized_values = standard_matrix[first_attribute]

plt.plot(normalized_values, standardized_values, 'bo')
plt.title(first_attribute)
plt.ylabel('Standardized Values')
plt.xlabel('Normalized Values')
plt.grid(True)

### Save Standardized Matrix

In [None]:
uf.saveData(standard_matrix, path, 'aba_brain_standard_matrix', ext='tsv', compression='gzip')

## Create Tertiary Matrix

In [None]:
tertiary_matrix = uf.createTertiaryMatrix(standard_matrix)

In [None]:
tertiary_matrix.head()

### Save Tertiary Matrix

In [None]:
uf.saveData(tertiary_matrix, path, 'aba_brain_tertiary_matrix', ext='tsv', compression='gzip')

# Create Gene and Attribute Set Libraries

In [None]:
uf.createUpGeneSetLib(tertiary_matrix, path, 'aba_brain_gene_up_set')

In [None]:
uf.createDownGeneSetLib(tertiary_matrix, path, 'aba_brain_gene_down_set')

In [None]:
uf.createUpAttributeSetLib(tertiary_matrix, path, 'aba_brain_attribute_up_set')

In [None]:
uf.createDownAttributeSetLib(tertiary_matrix, path, 'aba_brain_attribute_down_set')

## Create Attribute Similarity matrix

In [None]:
attribute_similarity_matix = uf.createSimilarityMatrix(normalized_matrix.T, 'cosine')

In [None]:
attribute_similarity_matix.head()

### Save Attribute Similarity Matrix

In [None]:
uf.saveData(attribute_similarity_matix, path, 'aba_brain_attribute_similarity_matix', ext='tsv', compression='gzip')

In [None]:
# net.load_df(attribute_similarity_matix.iloc[:,:].copy())
# net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()

## Create Gene Similarity Matrix

In [None]:
gene_similarity_matix = uf.createSimilarityMatrix(normalized_matrix, 'cosine')

In [None]:
gene_similarity_matix.head()

### Save Gene Similarity Matrix

In [None]:
uf.saveData(gene_similarity_matix, path, 'aba_brain_gene_similarity_matix', compression='npz', axes=('gene', 'gene'), symmetric=True, dtype=np.float32)

## Create Gene-Attribute Edge List

In [None]:
uf.createGeneAttributeEdgeList(standard_matrix, attribute_list, gene_list, path, 'aba_brain_gene_attribute_edge_list')

# Create Downloadable Save File

In [None]:
# make_archive appends the format's extension to the base name itself, so the
# base name must not include '.zip'. This writes ./output.zip, which is the
# file the download link below points to (the old call wrote output.zip.zip).
shutil.make_archive('output', 'zip', path)

Link to download: [click to download](./output.zip)