In [1]:
%%appyter init
from appyter import magic
magic.init(lambda _=globals: _())

UsageError: Cell magic `%%appyter` not found.


# tcgaDataLoader

An appyter that interfaces with the The Cancer Genome Atlas (TCGA) API to simplify the process of obtaining this dataset's RNA-seq and clinical data tables.

In [2]:
# Script to acquire RNA-Seq and clinical data from TCGA
import requests
import json
import os
import re
import gzip
import shutil
import tarfile
import pathlib
import pandas as pd
import numpy as np
from maayanlab_bioinformatics.harmonization import ncbi_genes
import math
import io

In [3]:
%%appyter hide_code

{% do SectionField(
    name="RNASeq",
    title="RNA-seq specifications",
    img = "tcga-logo.png"
) %}

{% do SectionField(
    name="Output",
    title="Parameters for the final output format",
    img = "tcga-logo.png"
) %}

{% do SectionField(
    name="CLINICAL",
    title="Clincial data specifications",
    img = "tcga-logo.png"
) %}

UsageError: Cell magic `%%appyter` not found.


In [4]:
%%appyter code_eval

workflow_type = {{ChoiceField(
    name = "workflow_type",
    label = "RNA-Seq workflow type",
    description = "Workflow used to process the RNA-Seq data",
    choices = ["CellRanger - 10x Filtered Counts",
               "CellRanger - 10x Raw Counts",
               "Cufflinks",
               "DEXSeq",
               "HTSeq - Counts",
               "HTSeq - FPKM",
               "HTSeq - FPKM-UQ",
               "Kallisto - HDF5",
               "Kallisto - Quantification",
               "RNA-SeQC - Counts",
               "RNA-SeQC - FPKM",
               "RSEM - Quantification",
               "STAR - Counts",
               "STAR - FPKM",
               "STAR - Smart-Seq2 Counts",
               "zUMIs - Smart-Seq2 Counts"],
    section="RNASeq",
    default="HTSeq - Counts"
) }}
    
rna_types = {{MultiChoiceField(
    name = "RNA_types",
    label = "Types of RNAs to include in the output",
    choices = ["Protein-coding genes", "Non-protein-coding genes"]
    section="RNASeq",
    default=[],
) }}
    
min_documents = {{IntField(
    name = "min_documents",
    label = "Minimum RNA-Seq files per cancer",
    description = "The minimum number of files required for a single cancer type to be loaded into the dataset."
    section="RNASeq",
    default=0,
    min=0
) }}

UsageError: Cell magic `%%appyter` not found.


In [5]:
%%appyter code_eval

UsageError: Cell magic `%%appyter` not found.


In [6]:
# Endpoints
base_url = 'https://api.gdc.cancer.gov/'
files_endpt = base_url + 'files/'
genes_endpt = base_url + 'genes/'
cases_endpt = base_url + 'cases/'
data_endpt = base_url + "data/"

json_header = {"Content-Type": "application/json"}

### Download RNA-seq data

In [7]:
workflow_type  = "HTSeq - Counts"
min_files  = 150
cancer_type = "Pleomorphic carcinoma"

In [8]:
# Build params 

filters = {
        "op": "and",
        "content":[
             {
                "op": "=",
                "content":
                 {
                     "field": "files.experimental_strategy",
                     "value": "RNA-Seq",
                 }
             },
             {
                "op": "=",
                "content":
                 {
                     "field": "access",
                     "value": "open",

                 }
             },
             {
                "op": "=",
                "content":
                 {
                     "field": "files.analysis.workflow_type",
                     "value": workflow_type,
                 }
             },
    ]
}


In [9]:
# Get list of all cancer types returned from the search parameters

params = {
    "filters": filters,
    "size":"0",
    "facets":"cases.diagnoses.primary_diagnosis",
}

response = requests.post(files_endpt, data = json.dumps(params), headers=json_header).json() # optionally also provide params argument
buckets = response["data"]["aggregations"]["cases.diagnoses.primary_diagnosis"]["buckets"]

print(f'{len(buckets)} total cancer types\n')

cancer_types = [ y["key"] for y in list(filter(lambda x: x["doc_count"] >= min_files, buckets)) ]

print(f'{len(cancer_types)} cancer types with at least {min_files} associated files')

198 total cancer types

19 cancer types with at least 150 associated files


In [52]:
def make_cancer_params(cancer_type):
    # filter results for this cancer type
    cancer_filters = {
        "op": "and",
        "content": [
            *filters["content"],
            {
                "op": "=",
                "content":
                 {
                     "field": "cases.diagnoses.primary_diagnosis",
                     "value": cancer_type,
                 }
             }
        ]
    }
    
    fields = "file_id,file_name,cases.case_id"
    
    # build parameters object
    return {
        "fields": fields,
        "filters": json.dumps(cancer_filters),
        "size": 100000 # do not limit size
    }

  
def get_uuids(cancer_type):
    # get list of all files with RNA-seq results
    response = requests.get(files_endpt, params = make_cancer_params(cancer_type))
    data = json.loads(response.content.decode("utf-8"))
    
    # get list of results
    results = data["data"]["hits"]
        
    # get list of file and case uuids
    file_uuid_list = [ entry["file_id"] for entry in results]
    case_uuid_list = [ entry["cases"][0]["case_id"] for entry in results]
    
    return file_uuid_list, case_uuid_list

def get_files(uuid_list):
    params = {"ids": uuid_list}
    
    # A POST is used, so the filter parameters can be passed directly as a Dict object.
    response = requests.post(data_endpt, data=json.dumps(params), headers=json_header)

    # filename is found in the Content-Disposition header of response
    response_head_cd = response.headers["Content-Disposition"]
    file_name = re.findall("filename=(.+)", response_head_cd)[0]
    
    # write the file content (bytes) and load the tar.gz file
    file_object = io.BytesIO(response.content)
    tar = tarfile.open(fileobj=file_object)
    
    # access files within the zipped file
    for file in tar.getmembers():
        if (file.name == "MANIFEST.txt"): continue
        f=tar.extractfile(file)
        content=io.BytesIO(f)
        print(content)
    tar.close()
    

In [53]:
# note: this will take a while!
print(cancer_types)
cancer_files = {}
os.makedirs("data", exist_ok = True)

for cancer in cancer_types[6:7]:
    
    file_uuid_list, case_uuid_list = get_uuids(cancer)

    num_files = len(file_uuid_list)
    num_cases = len(case_uuid_list)
    print(f"{cancer}: \n{num_files} files\n{num_cases} cases\n")
    
    file_name = get_files(file_uuid_list[0:10])
    

['adenocarcinoma, nos', 'squamous cell carcinoma, nos', 'infiltrating duct carcinoma, nos', 'multiple myeloma', 'papillary adenocarcinoma, nos', 'clear cell adenocarcinoma, nos', 'endometrioid adenocarcinoma, nos', 'diffuse large b-cell lymphoma, nos', 'serous cystadenocarcinoma, nos', 'malignant melanoma, nos', 'acute myeloid leukemia, nos', 'hepatocellular carcinoma, nos', 'transitional cell carcinoma', 'glioblastoma', 'precursor b-cell lymphoblastic leukemia', 't lymphoblastic leukemia/lymphoma', 'lobular carcinoma, nos', 'renal cell carcinoma, nos', 'mucinous adenocarcinoma']
endometrioid adenocarcinoma, nos: 
540 files
540 cases



TypeError: a bytes-like object is required, not 'ExFileObject'