In [2]:
#%%appyter init
# Set up the appyter notebook magics used by the `%%appyter ...` cells below.
from appyter import magic
# The lambda, when called, returns this module's globals() dict.
# NOTE(review): presumably appyter uses it to run rendered cells in this
# notebook's namespace — confirm against the appyter documentation.
magic.init(lambda _=globals: _())

# TCGA Data Loader
---

An appyter that interfaces with The Cancer Genome Atlas (TCGA) API to simplify the process of obtaining this dataset's RNA-seq and clinical data tables.

The final output of this notebook is two dictionaries:
1. `cancer_rna_dfs`: maps each cancer type to a pandas DataFrame containing all RNA seq profiles obtained under that type (according to the chosen parameters).
    - For each DataFrame, the **rows** are genes and **columns** are cases (referenced by their case_id).<br>   


2. `cancer_clinical_dfs`: maps each cancer type to a pandas DataFrame containing all clinical data obtained under that type (according to the chosen parameters).
    - For each DataFrame, the **rows** are cases (referenced by case_id) and **columns** are clinical data fields.

For more information on the TCGA dataset, you can browse their [Data Dictionary](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/) and [API Reference](https://docs.gdc.cancer.gov/API/Users_Guide/Getting_Started/).

*Note*: it is recommended to download the notebook and run it locally, since that is required anyway to access the produced data objects for further analysis or to write them to disk.


In [5]:
# Script to acquire RNA-Seq and clinical data from TCGA
import requests
import json
import os
import re
import gzip
import shutil
import tarfile
import pathlib
import pandas as pd
import numpy as np
from maayanlab_bioinformatics.harmonization import ncbi_genes
import math
import io
from gzip import GzipFile
from IPython.display import display
from urllib.error import URLError


In [5]:
%%appyter hide

{% do SectionField(
    name="RNASeq",
    title="RNA-seq specifications",
    img = "tcga-logo.png"
) %}


{% do SectionField(
    name="CLINICAL",
    title="Clincial data specifications",
    img = "tcga-logo.png"
) %}

{% set rna_types =  MultiChoiceField(
    name = "RNA_types",
    label = "Types of RNAs to include in the output",
    choices = ['protein-coding','pseudo', 'other', 'unknown','ncRNA','tRNA','rRNA','scRNA','snoRNA','snRNA','biological-region'],
    section="RNASeq",
    default=[],
) %}

{% set clinical_fields = MultiChoiceField(
    name = "clinical_fields",
    label = "Clinical fields to load",
    choices = {
        'demographic': 'Demographics',
        'diagnoses': 'Diagnoses',
        'exposures': 'Exposures'
    },
    section="CLINICAL",
    default=[],
) %}


In [13]:
%%appyter code_eval

# Gene types selected in the form; map_ncbi_data keeps only rows whose
# NCBI type_of_gene is in this list.
rna_types = {{rna_types}}

# When True (and include_both_ids is False) the RNA tables are re-indexed by
# gene symbol instead of Ensembl ID.
map_ids = {{ BoolField(
    name = "map_ids",
    label = "Map ensembl IDs to Entrez gene symbols?",
    description = "If a gene symbol is unavailable (as in the case of non protein-coding RNAs), it will be left as its Ensembl ID.",
    default = True,
    section = "RNASeq"
) }}

# NOTE(review): `not map_ids` below sits inside the template expression, so it
# is evaluated by the template engine and not against the Python variable
# assigned above — confirm the resulting default is the intended one.
include_both_ids = {{BoolField(
    name = "include_both_ids",
    label = "Include both ensembl IDs and symbols in final output?",
    default = not map_ids,
    section = "RNASeq"
) }}

# When False, the "type_of_gene" column is removed from the RNA tables.
include_rna_types = {{BoolField(
    name = "include_rna_types",
    label = "Include RNA types in final output?",
    default = True,
    section = "RNASeq"
) }}
    
    
# Threshold compared against each diagnosis bucket's doc_count when the list of
# cancer types is built. Note the form field is named "min_documents".
min_cases = {{IntField(
    name = "min_documents",
    label = "Minimum cases per cancer",
    description = "The minimum number of cases under a single cancer type for that data to be loaded into the dataset.",
    section="RNASeq",
    default=150,
    min=0,
    max=100000
) }}

# Top-level clinical field groups selected in the form.
clinical_fields = {{clinical_fields}}



In [None]:
# GDC API endpoints, all derived from a single base URL
base_url = 'https://api.gdc.cancer.gov/'
files_endpt = f'{base_url}files/'
genes_endpt = f'{base_url}genes/'
cases_endpt = f'{base_url}cases/'
data_endpt = f'{base_url}data/'

# header sent with every request that carries a JSON body
json_header = dict([("Content-Type", "application/json")])

# analysis workflow whose output files are requested
# NOTE(review): confirm this workflow name is still served by the current GDC data release
workflow_type = "HTSeq - Counts"

### Download RNA-seq data

In [None]:
# Build the base search filter shared by every query: open-access RNA-Seq
# files produced by the configured workflow. Each (field, value) pair becomes
# one "=" clause, and all clauses are combined with "and".
filters = {
    "op": "and",
    "content": [
        {"op": "=", "content": {"field": field_name, "value": field_value}}
        for field_name, field_value in (
            ("files.experimental_strategy", "RNA-Seq"),
            ("access", "open"),
            ("files.analysis.workflow_type", workflow_type),
        )
    ],
}


In [None]:
# Get list of all cancer types returned from the search parameters.
# Only the facet (per-diagnosis counts) is needed, so no hits are requested (size 0).
params = {
    "filters": filters,
    "size": "0",
    "facets": "cases.diagnoses.primary_diagnosis",
}

facet_response = requests.post(files_endpt, data=json.dumps(params), headers=json_header)
# Fail here with the HTTP error itself rather than with a KeyError on "data" below.
facet_response.raise_for_status()
response = facet_response.json()
buckets = response["data"]["aggregations"]["cases.diagnoses.primary_diagnosis"]["buckets"]

print(f'{len(buckets)} total cancer types\n')

# keep only the diagnoses whose bucket count reaches the configured minimum
cancer_types = [bucket["key"] for bucket in buckets if bucket["doc_count"] >= min_cases]

print(f'{len(cancer_types)} cancer types with at least {min_cases} associated files')

In [None]:
def make_cancer_params(cancer_type):
    """Build the query parameters listing the RNA-seq files of one cancer type.

    The module-level `filters` clauses are extended (without being modified)
    by an equality clause on the primary diagnosis.

    Returns a dict with the requested fields, the JSON-encoded filter and the
    maximum result size.
    """
    diagnosis_clause = {
        "op": "=",
        "content": {
            "field": "cases.diagnoses.primary_diagnosis",
            "value": cancer_type,
        },
    }
    combined_filter = {
        "op": "and",
        "content": list(filters["content"]) + [diagnosis_clause],
    }

    requested_fields = ",".join(("file_id", "file_name", "cases.case_id"))

    query = {}
    query["fields"] = requested_fields
    query["filters"] = json.dumps(combined_filter)
    query["size"] = 100000  # large enough to act as "no limit"
    return query

  
def get_uuids(cancer_type):
    """List the RNA-seq files of one cancer type together with their cases.

    Parameters
    ----------
    cancer_type : value of `cases.diagnoses.primary_diagnosis` to query for.

    Returns
    -------
    (file_uuid_list, case_uuid_list, files_to_cases)
        Two parallel lists (one entry per file hit; a case id is repeated when
        a case has several files) and a dict mapping file uuid -> case uuid.
        Only the first case of each hit is used.

    Raises
    ------
    Whatever `response.raise_for_status()` raises for an HTTP error status.
    """
    response = requests.get(files_endpt, params=make_cancer_params(cancer_type))
    # Surface HTTP errors directly instead of a KeyError on "data" below.
    response.raise_for_status()
    data = json.loads(response.content.decode("utf-8"))

    # (file uuid, case uuid) for every hit, extracted in a single pass
    pairs = [(entry["file_id"], entry["cases"][0]["case_id"]) for entry in data["data"]["hits"]]

    file_uuid_list = [file_uuid for file_uuid, _ in pairs]
    case_uuid_list = [case_uuid for _, case_uuid in pairs]
    files_to_cases = dict(pairs)

    return file_uuid_list, case_uuid_list, files_to_cases

def get_files(uuid_list,files_to_cases):
    """Download the count files for `uuid_list` and merge them into one table.

    Parameters
    ----------
    uuid_list : list of file uuids to request from the data endpoint.
    files_to_cases : dict mapping file uuid -> case uuid, used to name columns.

    Returns
    -------
    pandas.DataFrame indexed by version-less Ensembl gene id ("ensembl_id"),
    with one column per downloaded file. A column is named after the case the
    file belongs to; when a case contributes several files, the later columns
    are named "<case_id>_<file_uuid>" so that names stay unique. Files missing
    from `files_to_cases` keep the leading part of their file name.

    Notes
    -----
    Assumes every archive member other than MANIFEST.txt is a gzipped,
    tab-separated, two-column file stored as "<file_uuid>/<file_name>" —
    TODO confirm the directory naming against the GDC documentation.
    NOTE(review): presumably a request for a single id is not answered with a
    tar archive — verify before calling this with a one-element list.
    """
    params = {"ids": uuid_list}

    # empty frame with a named, string-typed index to merge the files into
    df = pd.DataFrame(index=pd.Index([], dtype=object, name="ensembl_id"))

    # A POST is used, so the filter parameters can be passed directly as a Dict object.
    response = requests.post(data_endpt, data=json.dumps(params), headers=json_header)
    response.raise_for_status()

    used_columns = set()

    # the context manager closes the archive even if parsing a member fails
    with tarfile.open(fileobj=io.BytesIO(response.content)) as tar:
        for member in tar.getmembers():
            if member.name == "MANIFEST.txt" or not member.isfile():
                continue

            compressed = io.BytesIO(tar.extractfile(member).read())
            got_text = GzipFile(None, 'rb', fileobj=compressed).read().decode('utf-8')

            # name the column after the case this file belongs to
            path_parts = member.name.split("/")
            file_uuid = path_parts[0]
            fallback_name = path_parts[-1].split(".")[0]
            column_name = files_to_cases.get(file_uuid, fallback_name)
            if column_name in used_columns:
                # several files for one case: keep column names unique
                column_name = f"{column_name}_{file_uuid}"
            used_columns.add(column_name)

            new_df = pd.read_csv(io.StringIO(got_text), sep="\t", header=None, names=["ensembl_id", column_name])

            # collapse all versioned names of genes to just gene name so we can merge
            # (assigned back rather than replaced in place on an attribute access)
            new_df["ensembl_id"] = new_df["ensembl_id"].replace(to_replace=r'\..*$', value="", regex=True)
            new_df = new_df.set_index("ensembl_id")

            df = df.merge(new_df, how="outer", left_index=True, right_index=True)

    # drop rows not corresponding to genes (i.e. metadata)
    non_genes = [label for label in df.index if "ENSG" not in label]
    return df.drop(non_genes)

def get_ncbi_df():
    """Build a lookup table from Ensembl gene id to NCBI symbol and gene type.

    Returns
    -------
    pandas.DataFrame with columns "Symbol" and "type_of_gene", indexed by
    "ensembl". The index entry is None for genes whose cross-references do not
    contain exactly one Ensembl id.

    NOTE(review): confirm that ncbi_genes_fetch(), called with its defaults,
    returns every gene type offered in the RNA types form field.
    """
    # Map Ensembl ids to Entrez gene symbols
    ncbi = pd.DataFrame(ncbi_genes.ncbi_genes_fetch())
    all_ids = ncbi.dbXrefs.values

    # Stop at the next separator so that cross-references listed after the
    # Ensembl one are not swallowed into the captured id.
    ensembl_pattern = re.compile(r"Ensembl:([^|,;\s]+)")

    def get_ensembl_id(ids):
        # accept either one delimited string or an iterable of cross-references
        if not isinstance(ids, str):
            try:
                ids = "|".join(ids)
            except TypeError:
                return None  # e.g. a missing value
        ensembl = ensembl_pattern.findall(ids)
        return ensembl[0] if len(ensembl) == 1 else None

    ensembl_ids = [get_ensembl_id(ids) for ids in all_ids]

    # copy so that the new column is added to an independent frame, not a slice
    ncbi = ncbi[["Symbol", "type_of_gene"]].copy()
    ncbi["ensembl"] = ensembl_ids

    return ncbi.set_index("ensembl")


def map_ncbi_data(df, ncbi, rna_types):
    """Annotate a counts table with gene symbols and types, filtered by type.

    Parameters
    ----------
    df : DataFrame indexed by Ensembl gene id; it is left unmodified.
    ncbi : DataFrame indexed by Ensembl gene id with "Symbol" and
        "type_of_gene" columns (as returned by get_ncbi_df).
    rna_types : gene types to keep.

    Returns
    -------
    A new DataFrame holding only the rows whose gene type is in `rna_types`,
    with "symbol" and "type_of_gene" as the first two columns followed by the
    original columns. Genes without an entry in `ncbi` have no type and are
    therefore dropped; a missing symbol would fall back to the Ensembl id.
    """
    # convert only the two needed columns instead of the whole table twice
    ensembl_to_gene_type = ncbi["type_of_gene"].to_dict()
    ensembl_to_symbol = ncbi["Symbol"].to_dict()

    data_ensembl_ids = df.index.to_list()

    data_types = [ensembl_to_gene_type.get(key) for key in data_ensembl_ids]
    # if entrez symbol not found, keep as ensembl id
    data_symbols = [ensembl_to_symbol.get(key, key) for key in data_ensembl_ids]

    keep = pd.Series(data_types, dtype=object).isin(rna_types).to_numpy()

    # Filter first, then annotate the filtered copy: the caller's frame is
    # never written to.
    annotated = df.loc[keep].assign(
        type_of_gene=[value for value, wanted in zip(data_types, keep) if wanted],
        symbol=[value for value, wanted in zip(data_symbols, keep) if wanted],
    )

    leading = ['symbol', 'type_of_gene']
    remaining = [column for column in annotated.columns if column not in leading]
    return annotated.reindex(columns=leading + remaining)

In [12]:
%%appyter code_exec

{% if rna_types.value == [] %}
print("No RNA types were selected for data collection.")
{% else %}  
# note: this will take a while!


try:
    ncbi = get_ncbi_df()

    cancer_rna_dfs = {}

    cancer_cases = {}

    for cancer in cancer_types:
        file_uuid_list, case_uuid_list, files_to_cases = get_uuids(cancer)

        cancer_cases[cancer] = case_uuid_list # save the case ids to retrieve for clinical data

        num_files = len(file_uuid_list)
        num_cases = len(case_uuid_list)

        print(f"{cancer}: \n{num_files} files\n{num_cases} cases\n")

        df_rna = get_files(file_uuid_list,files_to_cases)

        df_rna = map_ncbi_data(df_rna, ncbi, rna_types)

        if (map_ids and not include_both_ids):
            df_rna = df_rna.drop("ensembl_id").set_index("symbol")

        if (not include_rna_types):
            df_rna = df_rna.drop("type_of_gene")

        print(f"Got table for {cancer} with {df_rna.shape[0]} genes")

        # display(df_rna.head())

        cancer_rna_dfs[cancer] = df_rna
except URLError, e:
    print e.code
    print('Network error occurred. Note that these scripts run more effectively if downloaded and run locally.')

    
{% endif %}

### Clinical Data

In [16]:
%%appyter code_exec

{% if clinical_fields.value == [] %}
print("No clinical data fields were selected.")

{% else %}

# get all clinical fields and convert to column names
cases_fields = requests.get(cases_endpt + "_mapping").json()["fields"]

def filter_field(x):
    for field in clinical_fields:
        if field in x: return True
    return False

all_clinical_fields = list(filter(filter_field, cases_fields))
columns = list(set([ x.split(".")[1] for x in all_clinical_fields]))

print(f'{len(columns)} total clinical data columns.')

{% endif %}

In [9]:
%%appyter code_exec

{% if not clinical_fields.value == [] %}

# note: this will also take a while!

cancer_clinical_dfs = {}

try:
    for cancer in cancer_types:

        df_clinical = pd.DataFrame({}, columns=columns)
        df_clinical["case_id"] = []

        # get demographics and diagnosis data for each case,
        # merging with pre-exisiting dataframe
        for case in cancer_cases[cancer]:
            fields=",".join(all_clinical_fields)
            params={
                "fields": fields
            }
            response = requests.get(cases_endpt + case, params=params).json()["data"]
            all_data = {}

            for field_group in clinical_fields:
                data = response[field_group]
                print(data)
                if field_group == "diagnoses":
                    data = response[field_group][0]
                    if "treatments" in data:
                        del data["treatments"] # do not load treatment data
                all_data = {**all_data, **data}

            df_case = pd.DataFrame(all_data, index=[case])
            df_case.head()
            df_case["case_id"] = case
            df_clinical = pd.concat([df_clinical, df_case], join="outer")

        df_clinical = df_clinical.set_index("case_id")

        # make first column "primary_diagnosis" for easy reference
        cols = ['primary_diagnosis']  + [col for col in df_clinical.columns.values if col != 'primary_diagnosis']
        df_clinical = df_clinical[cols]

        cancer_clinical_dfs[cancer] = df_clinical
except URLError, e:
    print e.code
    print('Network error occurred. Note that these scripts run more effectively if downloaded and run locally.')

    
{% endif %}

In [None]:
# utility functions to save the data
# All files are saved to the same directory, with the name "{cancer}_data.csv" and "{cancer}_clinical_data.csv"
# for RNA-seq and clinical data, respectively.

def save_rna_data(cancer_rna_dfs, path):
    """Write every RNA-seq table to "{path}/{cancer}_data.csv" (UTF-8).

    Iterates over the tables actually present in `cancer_rna_dfs`, so a
    partially filled dictionary (e.g. after a network error) can be saved.
    The directory `path` must already exist.
    """
    for cancer, df_rna in cancer_rna_dfs.items():
        df_rna.to_csv(f"{path}/{cancer}_data.csv", encoding='utf-8')

def save_clinical_data(cancer_clinical_dfs, path):
    """Write every clinical table to "{path}/{cancer}_clinical_data.csv" (UTF-8).

    Iterates over the tables actually present in `cancer_clinical_dfs`, so a
    partially filled dictionary (e.g. after a network error) can be saved.
    The directory `path` must already exist.
    """
    for cancer, df_clinical in cancer_clinical_dfs.items():
        df_clinical.to_csv(f"{path}/{cancer}_clinical_data.csv", encoding='utf-8')
