In [1]:
%%appyter init
from appyter import magic
magic.init(lambda _=globals: _())

UsageError: Cell magic `%%appyter` not found.


# tcgaDataLoader

An appyter that interfaces with the The Cancer Genome Atlas (TCGA) API to simplify the process of obtaining this dataset's RNA-seq and clinical data tables.

In [133]:
# Script to acquire RNA-Seq and clinical data from TCGA
import requests
import json
import os
import re
import gzip
import shutil
import tarfile
import pathlib
import pandas as pd
import numpy as np
from maayanlab_bioinformatics.harmonization import ncbi_genes
import math
import io
from gzip import GzipFile
from IPython.display import display

In [135]:
%%appyter hide_code

{% do SectionField(
    name="RNASeq",
    title="RNA-seq specifications",
    img = "tcga-logo.png"
) %}


{% do SectionField(
    name="CLINICAL",
    title="Clincial data specifications",
    img = "tcga-logo.png"
) %}

UsageError: Cell magic `%%appyter` not found.


In [136]:
%%appyter code_eval

rna_types = {{MultiChoiceField(
    name = "RNA_types",
    label = "Types of RNAs to include in the output",
    choices = ['protein-coding','pseudo', 'other', 'unknown','ncRNA','tRNA','rRNA','scRNA','snoRNA','snRNA','biological-region'],
    section="RNASeq",
    default=[],
) }}

map_ids = {{BoolField(
    name = "map_ids",
    label = "Map ensembl IDs to Entrez gene symbols?",
    default = True,
    section = "RNASeq"
) }}
    
min_cases = {{IntField(
    name = "min_documents",
    label = "Minimum cases per cancer",
    description = "The minimum number of cases under a single cancer type for that data to be loaded into the dataset.",
    section="RNASeq",
    default=0,
    min=0,
    max=100000
) }}

UsageError: Cell magic `%%appyter` not found.


In [138]:
# Endpoints
base_url = 'https://api.gdc.cancer.gov/'
files_endpt = base_url + 'files/'
genes_endpt = base_url + 'genes/'
cases_endpt = base_url + 'cases/'
data_endpt = base_url + "data/"

json_header = {"Content-Type": "application/json"}

workflow_type = "HTSeq - Counts"

### Download RNA-seq data

In [140]:
# Build params 

filters = {
        "op": "and",
        "content":[
             {
                "op": "=",
                "content":
                 {
                     "field": "files.experimental_strategy",
                     "value": "RNA-Seq",
                 }
             },
             {
                "op": "=",
                "content":
                 {
                     "field": "access",
                     "value": "open",

                 }
             },
             {
                "op": "=",
                "content":
                 {
                     "field": "files.analysis.workflow_type",
                     "value": workflow_type,
                 }
             },
    ]
}


In [141]:
# Get list of all cancer types returned from the search parameters

params = {
    "filters": filters,
    "size":"0",
    "facets":"cases.diagnoses.primary_diagnosis",
}

response = requests.post(files_endpt, data = json.dumps(params), headers=json_header).json() # optionally also provide params argument
buckets = response["data"]["aggregations"]["cases.diagnoses.primary_diagnosis"]["buckets"]

print(f'{len(buckets)} total cancer types\n')

cancer_types = [ y["key"] for y in list(filter(lambda x: x["doc_count"] >= min_cases, buckets)) ]

print(f'{len(cancer_types)} cancer types with at least {min_cases} associated files')

198 total cancer types



NameError: name 'min_cases' is not defined

In [153]:
def make_cancer_params(cancer_type):
    # filter results for this cancer type
    cancer_filters = {
        "op": "and",
        "content": [
            *filters["content"],
            {
                "op": "=",
                "content":
                 {
                     "field": "cases.diagnoses.primary_diagnosis",
                     "value": cancer_type,
                 }
             }
        ]
    }
    
    fields = "file_id,file_name,cases.case_id"
    
    # build parameters object
    return {
        "fields": fields,
        "filters": json.dumps(cancer_filters),
        "size": 100000 # do not limit size
    }

  
def get_uuids(cancer_type):
    # get list of all files with RNA-seq results
    response = requests.get(files_endpt, params = make_cancer_params(cancer_type))
    data = json.loads(response.content.decode("utf-8"))
    
    # get list of results
    results = data["data"]["hits"]
        
    # get list of file and case uuids
    file_uuid_list = [ entry["file_id"] for entry in results]
    case_uuid_list = [ entry["cases"][0]["case_id"] for entry in results]
    
    # create a dictionary to map file uuids to case uuids
    files_to_cases = {}
    for entry in results:
        files_to_cases[entry["file_id"]] = entry["cases"][0]["case_id"]        
    
    return file_uuid_list, case_uuid_list, files_to_cases

def get_files(uuid_list,files_to_cases):
    params = {"ids": uuid_list}
    
    df = pd.DataFrame({"ensembl_id": []}).set_index("ensembl_id")
    
    # A POST is used, so the filter parameters can be passed directly as a Dict object.
    response = requests.post(data_endpt, data=json.dumps(params), headers=json_header)

    # filename is found in the Content-Disposition header of response
    response_head_cd = response.headers["Content-Disposition"]
    file_name = re.findall("filename=(.+)", response_head_cd)[0]
    
    # write the file content (bytes) and load the tar.gz file
    file_object = io.BytesIO(response.content)
    tar = tarfile.open(fileobj=file_object)
    
    # access files within the zipped file
    for file in tar.getmembers():
        if (file.name == "MANIFEST.txt"): continue
        f=tar.extractfile(file)
        bytestream=io.BytesIO(f.read())
        got_text = GzipFile(None, 'rb', fileobj=bytestream).read().decode('utf-8')
        
        # set column name to uuid of corresponding case for this file
        file_name = file.name.split("/")[1].split(".")[0]
        
        new_df = pd.read_csv(io.StringIO(got_text), sep="\t",header=None, names=["ensembl_id", file_name])
            
        # collapse all versioned names of genes to just gene name so we can merge
        new_df.ensembl_id.replace(to_replace = r'\..*$', value = "", regex=True, inplace=True)
        new_df = new_df.set_index("ensembl_id")
        
        df = pd.DataFrame.merge(df, new_df, how="outer", left_index = True, right_index = True)
    
    tar.close()
    
    # drop rows not corresponding to genes (i.e. metadata)
    non_genes = list(filter(lambda val: not "ENSG" in val, list(df.index.values)))
    df = df.drop(non_genes)
    
    return df

def get_ncbi_df():
    # Map Ensebml ids to Entrez gene symbols
    ncbi = pd.DataFrame(ncbi_genes.ncbi_genes_fetch())
    all_ids = ncbi.dbXrefs.values

    def get_ensembl_id(ids):
        ids = "".join(ids)
        ensembl = re.findall("Ensembl:(.*)", ids)
        if (len(ensembl) == 1):
            return ensembl[0]
        else:
            return None
        
    ensembl_ids = [ get_ensembl_id(ids) for ids in all_ids]

    ncbi = ncbi[["dbXrefs", "Symbol", "type_of_gene"]]
    ncbi["ensembl"] = ensembl_ids
    ncbi = ncbi.drop(columns=["dbXrefs"])
    ncbi = ncbi.set_index("ensembl")
    
    display(ncbi.head())
    print(ncbi["type_of_gene"].unique())
    return ncbi



def map_ncbi_data(df, ncbi, rna_types):
    
    ensembl_to_gene_type = ncbi.to_dict()["type_of_gene"]
    ensembl_to_symbol = ncbi.to_dict()["Symbol"]

    data_ensembl_ids = df.index.to_list()

    def id_to_type(key):
        if (key in ensembl_to_gene_type):
            return ensembl_to_gene_type[key]
        else:
            return None
    
    # if the key is present, return it; otherwise, set the index for the corresponding row as its ensembl id
    def id_to_symbol(key):
        if (key in ensembl_to_symbol):
            return ensembl_to_symbol[key]
        else:
            return key # if entrez symbol not found, keep as ensembl id

    data_types = [ id_to_type(key) for key in data_ensembl_ids ]
    data_symbols = [ id_to_symbol(key) for key in data_ensembl_ids ]

    df["type_of_gene"] = data_types
    df["symbol"] = data_symbols

    df = df[df['type_of_gene'].isin(rna_types)]
    df = df.reindex(columns=(['symbol','type_of_gene'] + list([a for a in df.columns if not a in ['symbol','type_of_gene']] )))
    
    return df

In [156]:
# note: this will take a while!
ncbi = get_ncbi_df()
cancer_dfs = {}

for cancer in cancer_types:
    file_uuid_list, case_uuid_list, files_to_cases = get_uuids(cancer)

    num_files = len(file_uuid_list)
    num_cases = len(case_uuid_list)
    
    print(f"{cancer}: \n{num_files} files\n{num_cases} cases\n")
    
    df_rna = get_files(file_uuid_list,files_to_cases)
        
    df_rna = map_ncbi_data(df_rna, ncbi, rna_types)
    
    if (map_ids):
        df_rna = df_rna.drop("ensembl_id").set_index("symbol")
    
    print(f"Got table for {cancer} with {df_rna.shape[0]} genes")

    display(df_rna.head())
    
    cancer_dfs[cancer] = df_rna
    
    

Unnamed: 0_level_0,Symbol,type_of_gene
ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000121410,A1BG,protein-coding
ENSG00000175899,A2M,protein-coding
ENSG00000256069,A2MP1,pseudo
ENSG00000171428,NAT1,protein-coding
ENSG00000156006,NAT2,protein-coding


['protein-coding' 'pseudo' 'other' 'unknown' 'ncRNA' 'tRNA' 'rRNA' 'scRNA'
 'snoRNA' 'snRNA' 'biological-region']
endometrioid adenocarcinoma, nos: 
540 files
540 cases

Got table for endometrioid adenocarcinoma, nos with 60483 genes


Unnamed: 0_level_0,dae5e5d9-b037-4d4a-90e4-b95fbfeed478,b95c9bdd-1cc0-4b8f-959b-4c1de981135c,e250c5cd-1786-4b25-b143-c8da28a3b77c,a638e045-eb92-4f48-afe8-e57f0e4307eb,f2e7119a-29c3-4f45-9af5-28c3610b0385,c13fc444-6e7b-4ae1-9c90-e539d78417ca,ecf78283-0eb3-4710-8e5a-30f56da510bd,51500805-d88a-4f67-9beb-04a7228033c8,98ac488f-bf05-4175-b9f2-b33cb46665e3,f40d273f-5343-4e57-995b-6adcfdc7c7d3
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ENSG00000000003,4511,1884,1374,1407,2747,3043,3054,558,4576,3712
ENSG00000000005,1,0,1,2,2,3,28,0,33,2
ENSG00000000419,661,470,146,349,584,591,1213,116,1172,456
ENSG00000000457,480,1251,272,227,379,343,523,728,257,161
ENSG00000000460,166,218,180,85,126,156,133,81,381,62


Unnamed: 0_level_0,symbol,type_of_gene,dae5e5d9-b037-4d4a-90e4-b95fbfeed478,b95c9bdd-1cc0-4b8f-959b-4c1de981135c,e250c5cd-1786-4b25-b143-c8da28a3b77c,a638e045-eb92-4f48-afe8-e57f0e4307eb,f2e7119a-29c3-4f45-9af5-28c3610b0385,c13fc444-6e7b-4ae1-9c90-e539d78417ca,ecf78283-0eb3-4710-8e5a-30f56da510bd,51500805-d88a-4f67-9beb-04a7228033c8,98ac488f-bf05-4175-b9f2-b33cb46665e3,f40d273f-5343-4e57-995b-6adcfdc7c7d3
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSG00000000003,TSPAN6,protein-coding,4511,1884,1374,1407,2747,3043,3054,558,4576,3712
ENSG00000000005,TNMD,protein-coding,1,0,1,2,2,3,28,0,33,2
ENSG00000000419,DPM1,protein-coding,661,470,146,349,584,591,1213,116,1172,456
ENSG00000000457,SCYL3,protein-coding,480,1251,272,227,379,343,523,728,257,161
ENSG00000000460,C1orf112,protein-coding,166,218,180,85,126,156,133,81,381,62
