 # Harmonizome ETL: The Cancer Genome Atlas (TCGA)

 Created by: Charles Dai <br>
 Credit to: Moshe Silverstein

 Data Source: https://gdc.cancer.gov/

In [None]:
# appyter init
# Set up appyter's notebook magics. The lambda's default argument captures the
# built-in `globals` function, so calling it hands appyter this notebook's namespace.
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
import sys
import os
from datetime import date
import gzip
import io
import requests
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [None]:
# IPython autoreload in mode 2: re-import edited modules before each cell runs,
# which is convenient while the harmonizome helper package is being developed.
%load_ext autoreload
%autoreload 2

 ### Notebook Information

In [None]:
# Stamp the output with the execution date and the interpreter version.
run_date = date.today()
print('This notebook was run on:', run_date, '\nPython version:', sys.version)

 # Initialization

In [None]:
%%appyter hide_code

# Appyter form section 'settings'; the form fields declared in the next cell attach to it.
{% do SectionField(
    name='settings',
    title='Settings',
    img='setting_icon.png'
) %}

In [None]:
%%appyter code_eval

# Appyter form field: static description of the data source, shown in the 'settings' section.
{% do DescriptionField(
    name='description',
    text='The files used for analysis are downloaded from the NIH Genomic Data Commons Data Portal using the GDC API from <a href="https://gdc.cancer.gov/" target="_blank">gdc.cancer.gov</a>.',
    section='settings'
) %}

# Appyter form field: the TCGA project to process. The selected name is later
# substituted into the 'cases.project.name' clause of the GDC file query.
{% set project_name = ChoiceField(
    name='project_name',
    label='TCGA Project Name',
    choices=[
        'Acute Myeloid Leukemia',
        'Adrenocortical Carcinoma',
        'Bladder Urothelial Carcinoma',
        'Brain Lower Grade Glioma',
        'Breast Invasive Carcinoma',
        'Cervical Squamous Cell Carcinoma and Endocervical Adenocarcinoma',
        'Cholangiocarcinoma',
        'Colon Adenocarcinoma',
        'Esophageal Carcinoma',
        'Glioblastoma Multiforme',
        'Head and Neck Squamous Cell Carcinoma',
        'Kidney Chromophobe',
        'Kidney Renal Clear Cell Carcinoma',
        'Kidney Renal Papillary Cell Carcinoma',
        'Liver Hepatocellular Carcinoma',
        'Lung Adenocarcinoma',
        'Lung Squamous Cell Carcinoma',
        'Lymphoid Neoplasm Diffuse Large B-cell Lymphoma',
        'Mesothelioma',
        'Ovarian Serous Cystadenocarcinoma',
        'Pancreatic Adenocarcinoma',
        'Pheochromocytoma and Paraganglioma',
        'Prostate Adenocarcinoma',
        'Rectum Adenocarcinoma',
        'Sarcoma',
        'Skin Cutaneous Melanoma',
        'Stomach Adenocarcinoma',
        'Testicular Germ Cell Tumors',
        'Thymoma',
        'Thyroid Carcinoma',
        'Uterine Carcinosarcoma',
        'Uterine Corpus Endometrial Carcinoma',
        'Uveal Melanoma'
    ],
    default='Acute Myeloid Leukemia',
    section='settings'
) %}

In [None]:
# Root of the GDC REST API and the three resource endpoints derived from it.
base_url = 'https://api.gdc.cancer.gov/'
files_endpt, cases_endpt, data_endpt = (
    base_url + resource for resource in ('files/', 'cases/', 'data/')
)

 ### Load Mapping Dictionaries

In [None]:
# Load the two gene lookup tables from the harmonizome helper package.
# symbol_lookup is passed to uf.map_symbols and geneid_lookup to uf.gene_list later on.
# NOTE(review): mapfrom='ensembl' presumably keys the lookups by Ensembl gene ID — confirm in harmonizome.lookup.
symbol_lookup, geneid_lookup = lookup.get_lookups(mapfrom='ensembl')

 ### Output Path

In [None]:
# Prefix shared by every output file name written by this notebook.
output_name = 'tcga'

# Directory that receives all outputs. exist_ok=True makes re-runs a no-op and
# avoids the race between checking for the directory and creating it.
path = 'Output/TCGA'
os.makedirs(path, exist_ok=True)

 # Load Data

 ## Fetch Data From NCI

In [None]:
%%appyter code_exec

fields = [
    'cases.case_id'
]

filters = {
    'op': 'and',
    'content': [{
        'op': 'in',
        'content': {
            'field': 'experimental_strategy',
            'value': ['RNA-Seq'],
        }
    }, 
    {
        'op': 'in',
        'content': {
            'field': 'access',
            'value': ['open'],
        }
    },
    {
        'op': 'in',
        'content': {
            'field': 'file_name',
            'value': ['*htseq.counts.gz'],
        }
    },
    {
        'op': 'in',
        'content': {
            'field': 'cases.project.name',
            'value': ['{{project_name}}'],
        }
    }
    ],
}

In [None]:
# Query the GDC files endpoint for every file matching `filters`, asking only
# for the attributes listed in `fields`.
params = {
    'fields': ','.join(fields),
    'filters': json.dumps(filters),
    'size': 100000,  # presumably large enough to return every hit in one page
    'facets': 'cases.case_id'
}
# Reuse the endpoint constant instead of repeating the URL; stripping the
# trailing slash keeps the request URL identical to the previous literal.
response = requests.get(files_endpt.rstrip('/'), params=params)
# Surface HTTP failures here rather than as an opaque KeyError on the JSON below.
response.raise_for_status()
data = response.json()['data']['hits']

In [None]:
# One row per file hit: the file's ID paired with the ID of its first listed case.
file_rows = [
    {'file_id': hit['id'], 'case_id': hit['cases'][0]['case_id']}
    for hit in data
]
files = pd.DataFrame(file_rows, columns=['file_id', 'case_id'])
files = files.set_index('file_id')
print(files.shape)
files.head()

## Construct Expression Matrix

In [None]:
# Download each file's gzipped count table and assemble one matrix with a row
# per identifier in the files and a column per file (labelled by its case ID).
# The per-file columns are collected in a list and concatenated once: calling
# pd.concat inside the loop re-copies the growing matrix on every iteration.
sample_columns = []

for file_id in tqdm(files.index, unit='samples'):
    response = requests.get(data_endpt + file_id, headers = {"Content-Type": "application/json"})
    # Stop on the offending file instead of trying to gunzip an error body.
    response.raise_for_status()
    string_data = io.StringIO(str(gzip.decompress(response.content), 'utf-8'))
    sample_columns.append(
        pd.read_csv(string_data, sep='\t', header=None,
                    names=['ENSMBL ID', files.loc[file_id, 'case_id']], index_col=0))

# pd.concat rejects an empty list, so keep the empty-frame result when no files matched.
matrix = pd.concat(sample_columns, axis=1) if sample_columns else pd.DataFrame()
matrix.head()

In [None]:
def _without_version(ensembl_id):
    """Return the part of the identifier before its first '.'."""
    return ensembl_id.partition('.')[0]

# Drop the version suffix so the IDs can be matched against the lookup tables.
matrix.index = matrix.index.map(_without_version)
matrix.head()

In [None]:
# Display (rows, columns) of the assembled expression matrix.
matrix.shape

## Construct Sample Metadata

In [None]:
# Ask the cases endpoint which fields exist, then keep the demographic and
# diagnosis fields while leaving out anything treatment-related.
cases_fields = requests.get(cases_endpt + '_mapping').json()['fields']
wanted_groups = ('demographic', 'diagnoses')
keyfields = []
for field in cases_fields:
    if 'treatment' in field:
        continue
    if any(group in field for group in wanted_groups):
        keyfields.append(field)

In [None]:
# Fetch demographic and diagnosis metadata for every distinct case and build
# one metadata row per case, indexed by case_id.
# Rows are accumulated in a list and turned into a frame once, instead of
# re-copying the frame with pd.concat on every iteration.
case_records = []
for case_id in tqdm(files['case_id'].drop_duplicates(), unit='cases'):
    response = requests.get(cases_endpt + case_id, params={'fields': ','.join(keyfields)})
    # Fail on the HTTP error itself rather than on a missing 'data' key.
    response.raise_for_status()
    case = response.json()['data']
    # A case may come back without demographic or diagnosis information; treat
    # either as empty so that case still gets a (sparser) row instead of
    # aborting the whole loop with KeyError/IndexError.
    demographic = case.get('demographic') or {}
    diagnoses = case.get('diagnoses') or [{}]
    # Only the first listed diagnosis is used; its keys override demographic
    # keys of the same name.
    case_records.append({'case_id': case_id, **demographic, **diagnoses[0]})
sample_meta = pd.DataFrame(case_records).set_index('case_id')
sample_meta.head()

In [None]:
# Display (cases, metadata fields) of the sample metadata table.
sample_meta.shape

 # Pre-process Data

In [None]:
# Label the two axes before the matrix is written to disk.
# NOTE(review): at this point the index still holds the version-stripped
# Ensembl IDs read from the count files; uf.map_symbols is only applied after
# the unfiltered matrix is saved, so that file carries the 'Gene Symbol' label
# on Ensembl IDs — confirm this is intended.
matrix.index.name = 'Gene Symbol'
matrix.columns.name = 'Case'
matrix.head()

 ## Save Unfiltered Matrix to file

In [None]:
# Persist the raw matrix before any filtering, using the base name
# 'tcga_matrix_unfiltered' inside the output directory.
# NOTE(review): dtype=np.float32 presumably casts the values on save — confirm in uf.save_data.
uf.save_data(matrix, path, output_name + '_matrix_unfiltered',
            compression='gzip', dtype=np.float32)

 # Filter Data

 ## Map Gene Symbols to Up-to-date Approved Gene Symbols

In [None]:
# Translate the row identifiers through symbol_lookup.
# NOTE(review): rows without a lookup entry are presumably dropped — confirm in uf.map_symbols.
matrix = uf.map_symbols(matrix, symbol_lookup)
# Display the shape after mapping.
matrix.shape

 ## Merge Duplicate Genes By Rows and Duplicate Columns

In [None]:
# Collapse duplicate row labels first, then duplicate column labels (several
# files can share a case_id and therefore a column name).
# NOTE(review): how duplicates are combined (mean, sum, ...) is decided inside uf.merge — confirm there.
matrix = uf.merge(matrix, 'row')
matrix = uf.merge(matrix, 'column')
# Display the shape after merging.
matrix.shape

 ## Remove Genes That Are Missing or Zero in More Than 95% of Samples

In [None]:
# Keep a gene (row) only if it has a non-zero, non-missing value in at least
# 5% of the cases (columns).
# DataFrame.dropna documents `thresh` as an integer count, so round the 5%
# cut-off up; for integer counts `count >= x` and `count >= ceil(x)` select
# exactly the same rows.
min_cases = int(np.ceil(0.05 * matrix.shape[1]))
# Zeros are turned into NaN so dropna counts them as missing; whatever NaNs
# survive the filter are written back as 0 afterwards.
matrix = matrix.replace(0, np.nan).dropna(
    thresh=min_cases, axis=0).replace(np.nan, 0)
matrix.head()

In [None]:
# Display the shape after removing sparse genes.
matrix.shape

 ## Normalize Matrix (Quantile Normalize the Matrix by Column)

In [None]:
# Quantile-normalize the matrix with the harmonizome helper.
# NOTE(review): the section heading says normalization is by column — confirm in uf.quantile_normalize.
matrix = uf.quantile_normalize(matrix)
matrix.head()

 ## Normalize Matrix (Z-Score the Rows)

In [None]:
# Z-score the matrix with the harmonizome helper.
# NOTE(review): the section heading says rows are z-scored — confirm the axis in uf.zscore.
matrix = uf.zscore(matrix)
matrix.head()

 ## Histogram of First Sample

In [None]:
# Distribution of the normalized values in the first column (one case), in 100 bins.
matrix.iloc[:, 0].hist(bins=100)

 ## Histogram of First Gene

In [None]:
# Distribution of the normalized values in the first row (one gene), in 100 bins.
matrix.iloc[0, :].hist(bins=100)

 ## Save Filtered Matrix

In [None]:
# Persist the filtered, normalized matrix under the base name
# 'tcga_matrix_filtered' as gzip-compressed TSV.
uf.save_data(matrix, path, output_name + '_matrix_filtered', 
            ext='tsv', compression='gzip')

 # Analyze Data

 ## Create Gene List

In [None]:
# Build the gene table for the matrix rows using geneid_lookup.
# NOTE(review): presumably pairs each gene symbol with its gene ID — confirm in uf.gene_list.
gene_list = uf.gene_list(matrix, geneid_lookup)
gene_list.head()

In [None]:
# Display the size of the gene table.
gene_list.shape

In [None]:
# Save the gene table as gzip-compressed TSV; index=False is forwarded to the
# helper, presumably to omit the row index from the file.
uf.save_data(gene_list, path, output_name + '_gene_list',
            ext='tsv', compression='gzip', index=False)

 ## Create Attribute List

In [None]:
# Build the attribute (case) table from the matrix and the per-case metadata.
# NOTE(review): presumably one row per matrix column joined with sample_meta — confirm in uf.attribute_list.
attribute_list = uf.attribute_list(matrix, sample_meta)
attribute_list.head()

In [None]:
# Display the size of the attribute table.
attribute_list.shape

In [None]:
# Save the attribute table as gzip-compressed TSV.
uf.save_data(attribute_list, path, output_name + '_attribute_list',
            ext='tsv', compression='gzip')

 ## Create Matrix of Standardized Values (Values Between -1 and 1)

In [None]:
# Derive the standardized matrix from the normalized one.
# NOTE(review): the section heading states the values lie between -1 and 1 — confirm in uf.standardized_matrix.
standard_matrix = uf.standardized_matrix(matrix)
standard_matrix.head()

In [None]:
# Save the standardized matrix as gzip-compressed TSV.
uf.save_data(standard_matrix, path, output_name + '_standard_matrix',
            ext='tsv', compression='gzip')

 ## Plot of a Single Case, Normalized Value vs. Standardized Value

In [None]:
# Scatter the first column of the normalized matrix against the first column
# of the standardized matrix to show how standardization remaps the values.
normalized_label = matrix.columns[0]
standardized_label = standard_matrix.columns[0]
normalized_values = matrix[normalized_label]
standardized_values = standard_matrix[standardized_label]

plt.plot(normalized_values, standardized_values, 'bo')
plt.title(standardized_label)
plt.xlabel('Normalized Values')
plt.ylabel('Standardized Values')
plt.grid(True)

 ## Create Ternary Matrix

In [None]:
# Derive the ternary matrix from the standardized matrix.
# NOTE(review): presumably thresholds values into -1 / 0 / 1 — confirm in uf.ternary_matrix.
ternary_matrix = uf.ternary_matrix(standard_matrix)
ternary_matrix.head()

In [None]:
# Save the ternary matrix as gzip-compressed TSV.
uf.save_data(ternary_matrix, path, output_name + '_ternary_matrix',
            ext='tsv', compression='gzip')

 ## Create Gene and Attribute Set Libraries

In [None]:
# Write the four set libraries derived from the ternary matrix: one per
# combination of axis ('gene' / 'attribute') and direction ('up' / 'down').
# NOTE(review): 'up'/'down' presumably select the +1 / -1 entries — confirm in uf.save_setlib.
uf.save_setlib(ternary_matrix, 'gene', 'up', path, output_name + '_gene_up_set')

In [None]:
uf.save_setlib(ternary_matrix, 'gene', 'down', path, output_name + '_gene_down_set')

In [None]:
uf.save_setlib(ternary_matrix, 'attribute', 'up', path, 
                           output_name + '_attribute_up_set')

In [None]:
uf.save_setlib(ternary_matrix, 'attribute', 'down', path, 
                             output_name + '_attribute_down_set')

 ## Create Attribute Similarity Matrix

In [None]:
# Cosine similarity computed on the transposed standardized matrix, so the
# attributes (cases) are the items being compared.
attribute_similarity_matrix = uf.similarity_matrix(standard_matrix.T, 'cosine')
attribute_similarity_matrix.head()

In [None]:
# Save as NPZ with float32 values; symmetric=True is forwarded to the helper.
# NOTE(review): presumably lets it store only one triangle — confirm in uf.save_data.
uf.save_data(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

 ## Create Gene Similarity Matrix

In [None]:
# Cosine similarity computed on the standardized matrix as-is, so the genes
# (rows) are the items being compared.
gene_similarity_matrix = uf.similarity_matrix(standard_matrix, 'cosine')
gene_similarity_matrix.head()

In [None]:
# Save as NPZ with float32 values; symmetric=True is forwarded to the helper.
# NOTE(review): presumably lets it store only one triangle — confirm in uf.save_data.
uf.save_data(gene_similarity_matrix, path, 
            output_name + '_gene_similarity_matrix',
            compression='npz', symmetric=True, dtype=np.float32)

 ## Create Gene-Attribute Edge List

In [None]:
# Build the gene-attribute edge list from the standardized matrix and save it
# as gzip-compressed TSV.
# NOTE(review): presumably one row per (gene, attribute, value) association — confirm in uf.edge_list.
edge_list = uf.edge_list(standard_matrix)
uf.save_data(edge_list, path, output_name + '_edge_list', 
        ext='tsv', compression='gzip')

 # Create Downloadable Save File

In [None]:
# Bundle the output directory into a downloadable archive.
# NOTE(review): the link below expects ./output_archive.zip — confirm uf.archive writes that file name.
uf.archive(path)

 ### Link to download output files: [click here](./output_archive.zip)