# GTEx Tissue

Created by: Charles Dai <br>
Credit to: Moshe Silverstein

Data Source: https://www.gtexportal.org/home/datasets

In [2]:
# appyter init — presumably sets up the %%appyter cell magics used further down.
from appyter import magic
# The lambda's default argument captures the globals builtin; calling it returns
# this notebook's global namespace at call time.
magic.init(lambda _=globals: _())

In [3]:
import sys
import os
from datetime import date
import gzip

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [4]:
# from clustergrammer_widget import *
# net = Network(clustergrammer_widget)

In [5]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# (e.g. the harmonizome helpers) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Notebook Information

In [6]:
# Record when, and under which interpreter, this notebook was executed.
run_date = date.today()
python_version = sys.version
print('This notebook was run on:', run_date, '\nPython version:', python_version)

This notebook was run on: 2020-06-28 
Python version: 3.8.0 (default, Oct 28 2019, 16:14:01) 
[GCC 8.3.0]


# Initialization

### Load Mapping Dictionaries

In [7]:
# Fetch the two mapping dictionaries used later: symbol_lookup is passed to
# uf.map_symbols and geneid_lookup to uf.gene_list.
symbol_lookup, geneid_lookup = lookup.get_lookups()

Gathering sources: 100%|██████████| 3/3 [00:10<00:00,  3.34s/it]


### Output Path

In [8]:
# Prefix used for every file written by this notebook.
output_name = 'gtex'

# Directory that receives all output files. exist_ok=True makes this cell safe
# to re-run and avoids the gap between checking for the directory and creating it.
path = 'Output/GTEx'
os.makedirs(path, exist_ok=True)

In [9]:
%%appyter hide_code
# Declares the 'data' form section; the three FileField inputs below attach to
# it through section='data'.
{% do SectionField(
    name='data',
    title='Load Data',
    subtitle='Upload Files from the GTEx Portal',
) %}

# Load Data

In [11]:
%%appyter code_exec

# Read the tab-separated GTEx gene TPM table: skip the first two lines of the
# file, drop the 'Name' column, and index rows by the first remaining column
# ('Description' in the output below).
# NOTE(review): nrows=100 reads only the first 100 data rows, so every later
# step runs on 100 genes. This looks like a development limit — confirm before
# producing the full dataset.
matrix = pd.read_csv({{FileField(
    constraint='.*\.gz$',
    name='expression_matrix', 
    label='RNA-Seq Gene TPMs (gct.gz)', 
    default='Input/GTEx/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz',
    section='data')
}}, sep='\t', skiprows=2, usecols=lambda c: c != 'Name', 
    index_col=0, nrows=100)

```python
matrix = pd.read_csv('Input/GTEx/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz', sep='\t', skiprows=2, usecols=lambda c: c != 'Name',
    index_col=0, nrows=100)
```

In [12]:
# Preview the first rows of the loaded expression matrix (genes x sample IDs).
matrix.head()

Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,GTEX-1117F-2926-SM-5GZYI,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
Description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DDX11L1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.03629,0.0,0.0,0.0,0.0,0.0,0.0,0.01965,0.02522
WASH7P,8.764,3.861,7.349,11.07,3.306,5.389,11.99,16.95,10.04,12.5,...,1.606,2.268,5.386,2.31,2.456,4.023,1.922,2.857,0.8696,2.167
MIR6859-1,0.0,0.0,1.004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MIR1302-2HG,0.07187,0.0,0.0,0.06761,0.0,0.0,0.0,0.0,0.0,0.06265,...,0.0,0.0,0.06073,0.0,0.08464,0.1435,0.0,0.05216,0.0,0.0
FAM138A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03904,0.0,0.0,...,0.02429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# (rows, columns) of the loaded matrix; rows are capped by nrows in the load cell.
matrix.shape

(100, 17382)

## Load Sample Metadata

In [15]:
%%appyter code_exec

# Read only the SAMPID and SMTSD columns of the tab-separated sample attributes
# file, indexed by SAMPID, so each sample ID can be looked up to its SMTSD value.
sample_meta = pd.read_csv({{FileField(
    constraint='.*\.txt$',
    name='sample_metadata', 
    label='Sample Metadata', 
    default='Input/GTEx/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt',
    section='data')
}}, sep='\t', index_col=0, usecols=['SAMPID', 'SMTSD'])

```python

sample_meta = pd.read_csv('Input/GTEx/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt', sep='\t', index_col=0, usecols=['SAMPID', 'SMTSD'])
```

In [16]:
# Preview the SAMPID -> SMTSD table.
sample_meta.head()

Unnamed: 0_level_0,SMTSD
SAMPID,Unnamed: 1_level_1
GTEX-1117F-0003-SM-58Q7G,Whole Blood
GTEX-1117F-0003-SM-5DWSB,Whole Blood
GTEX-1117F-0003-SM-6WBT7,Whole Blood
GTEX-1117F-0011-R10a-SM-AHZ7F,Brain - Frontal Cortex (BA9)
GTEX-1117F-0011-R10b-SM-CYKQ8,Brain - Frontal Cortex (BA9)


In [17]:
# (number of samples, 1): one SMTSD value per SAMPID.
sample_meta.shape

(22951, 1)

## Load Subject Metadata

In [18]:
%%appyter code_exec

subject_meta = pd.read_csv({{FileField(
    constraint='.*\.txt$',
    name='subject_metadata', 
    label='Subject Metadata', 
    default='Input/GTEx/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt',
    section='data')
}})

```python

subject_meta = pd.read_csv('Input/GTEx/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt')
```

In [19]:
# Preview the subject phenotype table.
subject_meta.head()

Unnamed: 0,SUBJID\tSEX\tAGE\tDTHHRDY
0,GTEX-1117F\t2\t60-69\t4
1,GTEX-111CU\t1\t50-59\t0
2,GTEX-111FC\t1\t60-69\t1
3,GTEX-111VG\t1\t60-69\t3
4,GTEX-111YS\t1\t60-69\t0


In [None]:
# (rows, columns) of the subject phenotype table.
subject_meta.shape

# Pre-process Data

## Map Sample ID to Attribute

In [None]:
# Relabel every sample column with its SMTSD value from the sample metadata.
# The SMTSD column is selected explicitly so that `columns` receives a 1-D array
# of labels; the one-column DataFrame itself is 2-D and is not a valid set of
# column labels. reindex keeps the labels in the same order as matrix.columns;
# sample IDs absent from sample_meta come back as NaN.
matrix.columns = sample_meta['SMTSD'].reindex(matrix.columns).to_numpy()
matrix.head()

## Save Unfiltered Matrix to file

In [None]:
# Write the relabeled but otherwise unprocessed matrix to the output directory,
# gzip-compressed and with dtype float32 passed to the helper.
uf.save_data(matrix, path, output_name + '_matrix_unfiltered',
            compression='gzip', dtype=np.float32)

# Filter Data

## Map Gene Symbols to Up-to-date Approved Gene Symbols

In [None]:
# Map the row gene symbols through symbol_lookup (logic lives in uf.map_symbols),
# then show the resulting shape to see how many rows remain.
matrix = uf.map_symbols(matrix, symbol_lookup)
matrix.shape

## Merge Duplicate Genes By Rows and Duplicate Columns

In [None]:
# Merge duplicated row labels first, then duplicated column labels. How the
# duplicates are combined is defined in uf.merge — TODO confirm (e.g. mean).
matrix = uf.merge(matrix, 'row')
matrix = uf.merge(matrix, 'column')
matrix.shape

## Remove Data that is More Than 95% Missing and Impute Missing Data

In [None]:
# Filter and impute with the helper's default settings. Per the section heading
# this removes data that is more than 95% missing; the threshold and imputation
# rule live in uf.remove_impute — confirm there.
matrix = uf.remove_impute(matrix)
matrix.head()

In [None]:
# Shape after filtering and imputation.
matrix.shape

## Log2 Transform

In [None]:
# Log2-transform through the harmonizome helper; any pseudocount or offset is
# defined in uf.log2 — TODO confirm.
matrix = uf.log2(matrix)
matrix.head()

## Normalize Matrix (Quantile Normalize the Matrix by Column)

In [None]:
# Quantile-normalize via the helper (by column, per the section heading).
matrix = uf.quantile_normalize(matrix)
matrix.head()

## Normalize Matrix (Z-Score the Rows)

In [None]:
# Z-score via the helper (across rows, per the section heading).
matrix = uf.zscore(matrix)
matrix.head()

## Histogram of First Sample

In [None]:
# Distribution of the first column's values across all rows, in 100 bins.
matrix.iloc[:, 0].hist(bins=100)

## Histogram of First Gene

In [None]:
# Distribution of the first row's values across all columns, in 100 bins.
matrix.iloc[0, :].hist(bins=100)

## Save Filtered Matrix

In [None]:
# Write the fully processed matrix as a gzip-compressed TSV.
uf.save_data(matrix, path, output_name + '_matrix_filtered', 
            ext='tsv', compression='gzip')

# Analyze Data

## Create Gene List

In [None]:
# Build the gene list from the processed matrix using geneid_lookup
# (presumably attaches a gene ID to each symbol — confirm in uf.gene_list).
gene_list = uf.gene_list(matrix, geneid_lookup)
gene_list.head()

In [None]:
# (rows, columns) of the gene list.
gene_list.shape

In [None]:
# Write the gene list as a gzip-compressed TSV; index=False is passed so the
# helper omits the index.
uf.save_data(gene_list, path, output_name + '_gene_list',
            ext='tsv', compression='gzip', index=False)

## Create Attribute List

In [None]:
# Build the attribute list from the processed matrix (see uf.attribute_list);
# the attributes here are the matrix's column labels — TODO confirm in helper.
attribute_list = uf.attribute_list(matrix)
attribute_list.head()

In [None]:
# (rows, columns) of the attribute list.
attribute_list.shape

In [None]:
# Write the attribute list as a gzip-compressed TSV.
uf.save_data(attribute_list, path, output_name + '_attribute_list',
            ext='tsv', compression='gzip')

## Create Matrix of Standardized Values (values between -1 and 1)

In [None]:
# Derive the standardized matrix from the processed matrix; per the section
# heading the values fall between -1 and 1 (method defined in uf.standardized_matrix).
standard_matrix = uf.standardized_matrix(matrix)
standard_matrix.head()

In [None]:
# Write the standardized matrix as a gzip-compressed TSV.
uf.save_data(standard_matrix, path, output_name + '_standard_matrix',
            ext='tsv', compression='gzip')

## Plot of a Single Tissue, Normalized Value vs. Standardized Value

In [None]:
# Scatter the first column of the normalized matrix against the first column of
# the standardized matrix, one blue dot per row.
first_attribute = standard_matrix.columns[0]
normalized_values = matrix[matrix.columns[0]]
standardized_values = standard_matrix[first_attribute]

plt.plot(normalized_values, standardized_values, 'bo')
plt.title(first_attribute)
plt.xlabel('Normalized Values')
plt.ylabel('Standardized Values')
plt.grid(True)

## Create Ternary Matrix

In [None]:
# Derive the ternary matrix from the standardized matrix; the cut-offs are
# defined in uf.ternary_matrix — TODO confirm. The 'up'/'down' set libraries
# below are built from it.
ternary_matrix = uf.ternary_matrix(standard_matrix)
ternary_matrix.head()

In [None]:
# Write the ternary matrix as a gzip-compressed TSV.
uf.save_data(ternary_matrix, path, output_name + '_ternary_matrix',
            ext='tsv', compression='gzip')

## Create Gene and Attribute Set Libraries

In [None]:
# Gene set library, 'up' direction, derived from the ternary matrix.
uf.save_setlib(ternary_matrix, 'gene', 'up', path, output_name + '_gene_up_set')

In [None]:
# Gene set library, 'down' direction, derived from the ternary matrix.
uf.save_setlib(ternary_matrix, 'gene', 'down', path, output_name + '_gene_down_set')

In [None]:
# Attribute set library, 'up' direction, derived from the ternary matrix.
uf.save_setlib(ternary_matrix, 'attribute', 'up', path, 
                           output_name + '_attribute_up_set')

In [None]:
# Attribute set library, 'down' direction, derived from the ternary matrix.
uf.save_setlib(ternary_matrix, 'attribute', 'down', path, 
                             output_name + '_attribute_down_set')

## Create Attribute Similarity Matrix

In [None]:
# Cosine similarity between attributes: the matrix is transposed so that the
# attributes (columns) become the rows handed to the helper.
attribute_similarity_matrix = uf.similarity_matrix(matrix.T, 'cosine')
attribute_similarity_matrix.head()

In [None]:
# Write the attribute similarity matrix with the 'npz' compression option,
# flagged as symmetric and with dtype float32 passed to the helper.
uf.save_data(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

In [None]:
# net.load_df(attribute_similarity_matrix.iloc[:,:].copy())
# net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()

## Create Gene Similarity Matrix

In [None]:
# Cosine similarity between genes: the matrix is passed untransposed, so its
# rows (genes) are compared.
gene_similarity_matrix = uf.similarity_matrix(matrix, 'cosine')
gene_similarity_matrix.head()

In [None]:
# Write the gene similarity matrix with the 'npz' compression option, flagged
# as symmetric and with dtype float32 passed to the helper.
uf.save_data(gene_similarity_matrix, path, 
            output_name + '_gene_similarity_matrix',
            compression='npz', symmetric=True, dtype=np.float32)

## Create Gene-Attribute Edge List

In [None]:
# Build the gene-attribute edge list from the standardized matrix and save it
# as a gzip-compressed TSV. `uf` is a module and cannot be called directly, so
# the helper function is named explicitly (uf.edge_list, following the
# uf.gene_list / uf.attribute_list naming — confirm against utility_functions).
# The file name gets the same '_' separator as every other output file.
edge_list = uf.edge_list(standard_matrix)
uf.save_data(edge_list, path, output_name + '_edge_list',
        ext='tsv', compression='gzip')

# Create Downloadable Save File

In [None]:
# Bundle the output directory via the harmonizome helper; the download link
# below points at ./output_archive.zip, presumably produced by this call — confirm.
uf.archive(path)

### Link to download output files: [click here](./output_archive.zip)