 # GTEx Tissue

 Author: Moshe Silverstein <br/>
 Date: 12-2017 <br/>
 Data Source: https://www.gtexportal.org/home/datasets

 Reviewer: Charles Dai <br>
 Updated: 6-20

In [None]:
# appyter init
# Import appyter's notebook magics and initialise them; the lambda gives
# appyter a callable that returns this notebook's global namespace.
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
import sys
import os
import gzip

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import utility_functions as uf
import lookup

In [None]:
# from clustergrammer_widget import *
# net = Network(clustergrammer_widget)

In [None]:
# Load IPython's autoreload extension and switch it to mode 2.
# NOTE(review): presumably so edits to the local helper modules
# (utility_functions, lookup) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

 ### Python Version

In [None]:
# Display the version string of the running Python interpreter.
sys.version

 # Initialization

 ### Load Mapping Dictionaries

In [None]:
# Fetch the two mapping dictionaries used later in this notebook:
# symbol_lookup is passed to uf.mapgenesymbols and geneid_lookup to
# uf.createGeneList.
symbol_lookup, geneid_lookup = lookup.get_lookups()

 ### Output Path

In [None]:
# Prefix shared by the name of every file this notebook writes.
output_name = 'gtex'

# Directory that receives all outputs. exist_ok=True makes the creation
# idempotent and avoids the check-then-create race of a separate
# os.path.exists() test.
path = 'Output/GTEx'
os.makedirs(path, exist_ok=True)

In [None]:
%%appyter hide_code
# Declare the 'data' form section; the FileField inputs in the cells below
# attach themselves to it via section='data'.
{% do SectionField(
    name='data',
    title='Load Data',
    subtitle='Upload Files from the GTEx Portal',
) %}

 # Load Data

In [None]:
%%appyter code_exec

matrix_zip = {{FileField(
    constraint='.*\.gz$',
    name='expression_matrix', 
    label='RNA-Seq Gene TPMs (gz)', 
    default='Input/GTEx/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz',
    section='data')
}}
with gzip.open(matrix_zip, 'r') as matrix_file:
    matrix = pd.read_csv(matrix_file, sep='\t', skiprows=2)

In [None]:
# Preview the first rows of the freshly loaded expression table.
matrix.head()

In [None]:
# (rows, columns) of the freshly loaded expression table.
matrix.shape

 ## Load Sample Metadata

In [None]:
%%appyter code_exec

sample_meta = pd.read_csv({{FileField(
    constraint='.*\.txt$',
    name='sample_metadata', 
    label='Sample Metadata', 
    default='Input/GTEx/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt',
    section='data')
}}, sep='\t', index_col=0, usecols='SMTSD')

In [None]:
# Preview the first rows of the sample metadata table.
sample_meta.head()

In [None]:
# (rows, columns) of the sample metadata table.
sample_meta.shape

 ## Load Subject Metadata

In [None]:
%%appyter code_exec

subject_meta = pd.read_csv({{FileField(
    constraint='.*\.txt$',
    name='subject_metadata', 
    label='Subject Metadata', 
    default='Input/GTEx/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt',
    section='data')
}})

In [None]:
# Preview the first rows of the subject metadata table.
subject_meta.head()

In [None]:
# (rows, columns) of the subject metadata table.
subject_meta.shape

 # Pre-process Data

 ## Map Sample ID to Attribute

In [None]:
# Replace each sample-ID column label with that sample's SMTSD value.
# The SMTSD column is selected as a Series first: assigning a whole DataFrame
# to `matrix.columns` does not yield a flat, 1-D set of labels.
# NOTE(review): labels absent from sample_meta become NaN after reindex —
# confirm every column of `matrix` is a sample ID at this point.
matrix.columns = sample_meta['SMTSD'].reindex(matrix.columns).values
matrix.head()

 ## Save Unfiltered Matrix to file

In [None]:
# Write the relabelled matrix, before any filtering, through uf.saveData,
# requesting gzip compression and float32 values.
uf.saveData(matrix, path, output_name + '_matrix_unfiltered',
            compression='gzip', dtype=np.float32)

 # Filter Data

 ## Remove Data that is More Than 95% Missing and Impute Missing Data

In [None]:
# Drop mostly-missing data and impute what remains via the uf helper.
# NOTE(review): the 95% threshold named in the section heading lives inside
# uf.removeAndImpute and is not visible here — confirm.
matrix = uf.removeAndImpute(matrix)
matrix.head()

In [None]:
# (rows, columns) after removal and imputation.
matrix.shape

 ## Map Gene Symbols to Up-to-date Approved Gene Symbols

In [None]:
# Translate gene symbols to their current approved symbols using
# symbol_lookup, then show the resulting dimensions.
# NOTE(review): presumably rows without an approved symbol are dropped by the
# helper — verify in utility_functions.
matrix = uf.mapgenesymbols(matrix, symbol_lookup)
matrix.shape

 ## Merge Duplicate Genes By Rows and Duplicate Columns

In [None]:
# Collapse duplicate labels with the 'mean' method, rows first and then
# columns, and report the resulting dimensions.
for axis in ('row', 'column'):
    matrix = uf.merge(matrix, axis, 'mean')
matrix.shape

 ## Log2 Transform

In [None]:
# Apply the uf log2 transform to the matrix and preview the result.
matrix = uf.log2(matrix)
matrix.head()

 ## Normalize Matrix (Quantile Normalize the Matrix by Column)

In [None]:
# Quantile-normalize the matrix (by column, per the section heading) and
# preview the result.
matrix = uf.quantileNormalize(matrix)
matrix.head()

 ## Normalize Matrix (Z-Score the Rows)

In [None]:
# Z-score the matrix along the 'row' axis and preview the result.
matrix = uf.zscore(matrix, 'row')
matrix.head()

 ## Histogram of First Sample

In [None]:
# Distribution of the values in the first column, in 100 bins.
matrix.iloc[:, 0].hist(bins=100)

 ## Histogram of First Gene

In [None]:
# Distribution of the values in the first row, in 100 bins.
matrix.iloc[0, :].hist(bins=100)

 ## Save Filtered Matrix

In [None]:
# Write the filtered, normalized matrix through uf.saveData as a
# gzip-compressed TSV.
uf.saveData(matrix, path, output_name + '_matrix_filtered', 
            ext='tsv', compression='gzip')

 # Analyze Data

 ## Create Gene List

In [None]:
# Build the gene list from the matrix with the help of geneid_lookup and
# preview it.
# NOTE(review): presumably one entry per matrix row (gene) paired with its
# gene ID — verify in utility_functions.
gene_list = uf.createGeneList(matrix, geneid_lookup)
gene_list.head()

In [None]:
# Dimensions of the gene list.
gene_list.shape

In [None]:
# Write the gene list as a gzip-compressed TSV; index=False is passed so the
# index is not requested in the output.
uf.saveData(gene_list, path, output_name + '_gene_list',
            ext='tsv', compression='gzip', index=False)

 ## Create Attribute List

In [None]:
# Build the attribute list from the matrix and preview it.
# NOTE(review): presumably derived from the matrix column labels — verify in
# utility_functions.
attribute_list = uf.createAttributeList(matrix)
attribute_list.head()

In [None]:
# Dimensions of the attribute list.
attribute_list.shape

In [None]:
# Write the attribute list as a gzip-compressed TSV.
uf.saveData(attribute_list, path, output_name + '_attribute_list',
            ext='tsv', compression='gzip')

 ## Create Matrix of Standardized Values (values between -1 and 1)

In [None]:
# Derive the standardized matrix from the normalized matrix and preview it.
# The section heading states the values lie between -1 and 1; the scaling
# itself happens inside the uf helper.
standard_matrix = uf.createStandardizedMatrix(matrix)
standard_matrix.head()

In [None]:
# Write the standardized matrix as a gzip-compressed TSV.
uf.saveData(standard_matrix, path, output_name + '_standard_matrix',
            ext='tsv', compression='gzip')

 ## Plot of a Single Tissue, Normalized Value vs. Standardized Value

In [None]:
# Scatter the first column of the normalized matrix against the first column
# of the standardized matrix, using blue circle markers.
normalized_label = matrix.columns[0]
standardized_label = standard_matrix.columns[0]

plt.plot(matrix[normalized_label], standard_matrix[standardized_label], 'bo')
plt.title(standardized_label)
plt.xlabel('Normalized Values')
plt.ylabel('Standardized Values')
plt.grid(True)

 ## Create Ternary Matrix

In [None]:
# Derive the ternary matrix from the standardized matrix and preview it.
# NOTE(review): presumably entries are -1, 0 or 1; the cut-offs are defined
# inside uf.createTernaryMatrix — confirm.
ternary_matrix = uf.createTernaryMatrix(standard_matrix)
ternary_matrix.head()

In [None]:
# Write the ternary matrix as a gzip-compressed TSV.
uf.saveData(ternary_matrix, path, output_name + '_ternary_matrix',
            ext='tsv', compression='gzip')

 ## Create Gene and Attribute Set Libraries

In [None]:
# Build the "up" gene set library from the ternary matrix; the helper is
# given the output directory and file name to use.
uf.createUpGeneSetLib(ternary_matrix, path, output_name + '_gene_up_set')

In [None]:
# Build the "down" gene set library from the ternary matrix; the helper is
# given the output directory and file name to use.
uf.createDownGeneSetLib(ternary_matrix, path, output_name + '_gene_down_set')

In [None]:
# Build the "up" attribute set library from the ternary matrix; the helper
# is given the output directory and file name to use.
uf.createUpAttributeSetLib(ternary_matrix, path, 
                           output_name + '_attribute_up_set')

In [None]:
# Build the "down" attribute set library from the ternary matrix; the helper
# is given the output directory and file name to use.
uf.createDownAttributeSetLib(ternary_matrix, path, 
                             output_name + '_attribute_down_set')

 ## Create Attribute Similarity Matrix

In [None]:
# Cosine similarity between attributes. The matrix is transposed before the
# call — presumably because the helper compares rows, so attributes must be
# on the row axis (verify in utility_functions).
attribute_similarity_matrix = uf.createSimilarityMatrix(matrix.T, 'cosine')
attribute_similarity_matrix.head()

In [None]:
# Write the attribute similarity matrix through uf.saveData, requesting
# 'npz' compression, float32 values, and flagging the matrix as symmetric.
uf.saveData(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

In [None]:
# net.load_df(attribute_similarity_matrix.iloc[:,:].copy())
# net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()

 ## Create Gene Similarity Matrix

In [None]:
# Cosine similarity between genes, computed on the matrix as-is (no
# transpose, unlike the attribute similarity call).
gene_similarity_matrix = uf.createSimilarityMatrix(matrix, 'cosine')
gene_similarity_matrix.head()

In [None]:
# Write the gene similarity matrix through uf.saveData, requesting 'npz'
# compression, float32 values, and flagging the matrix as symmetric.
uf.saveData(gene_similarity_matrix, path, 
            output_name + '_gene_similarity_matrix',
            compression='npz', symmetric=True, dtype=np.float32)

 ## Create Gene-Attribute Edge List

In [None]:
# Build the gene-attribute edge list from the standardized matrix together
# with the attribute and gene lists; the helper is given the output directory
# and file name to use.
uf.createGeneAttributeEdgeList(standard_matrix, attribute_list, gene_list, 
                               path, output_name + '_gene_attribute_edge_list')

 # Create Downloadable Save File

In [None]:
# Bundle the contents of the output directory into a downloadable archive.
# NOTE(review): the markdown link below points at ./output_archive.zip —
# confirm that is the file name uf.createArchive produces.
uf.createArchive(path)

 ### Link to download output files: [click here](./output_archive.zip)