In [8]:
import requests
import json
import os
import re
import gzip
import shutil
import tarfile
import pathlib
import pandas as pd
import numpy as np
from maayanlab_bioinformatics.harmonization import ncbi_genes
import math

In [9]:
# Endpoints
base_url = 'https://api.gdc.cancer.gov/'
files_endpt = base_url + 'files/'
genes_endpt = base_url + 'genes/'
cases_endpt = base_url + 'cases/'
data_endpt = base_url + "data/"

In [None]:
# Get info on properties of the returned objects for an endpoint

# files:

# data = requests.get(files_endpt + "_mapping").json()["fields"]

# print(json.dumps(data, indent=2))

# RNA-Seq Data
---

For now, we'll collect data for one cancer type to save space.

In [11]:

# Retrieve data from the TCGA API

# data type of files we want
data_type = "htseq.counts"

# The 'fields' parameter is passed as a comma-separated string of single names.
fields = "file_id,file_name,cases.case_id"

# filter files for only RNA-Seq results
filters = {
    "op": "and",
     "content":[
         {
            "op": "in",
            "content":
             {
                 "field": "files.experimental_strategy", 
                 "value": ["RNA-Seq"],
             }
         },
         {
            "op": "in",
            "content":
             {
                 "field": "access", 
                 "value": "open",
                 
             }
         },
         {
            "op": "in",
            "content":
             {
                 "field": "files.file_name", 
                 "value": ["*htseq.counts.gz"],
             }
         },
         {
            "op": "in",
            "content":
             {
                 "field": "cases.diagnoses.primary_diagnosis", 
                 "value": ["Adenocarcinoma, NOS"],
             }
         },
         
         
     ],
}

# build parameters object
params = {
    "fields": fields,
    "filters": json.dumps(filters),
    "size": 1000
}

# get list of all files with RNA-seq results
response = requests.get(files_endpt, params = params) # optionally also provide params argument
data = json.loads(response.content.decode("utf-8"))

# get list of results
results = data["data"]["hits"]

#results = list(filter(lambda x: data_type in x["file_name"], results))

file_uuid_list = [ entry["file_id"] for entry in results]
case_uuid_list = [ entry["cases"][0]["case_id"] for entry in results]
print(len(file_uuid_list))

df_files_cases=pd.DataFrame({"case": case_uuid_list },  index=file_uuid_list)
file_to_case = df_files_cases.to_dict()["case"] # dict mapping file id to case id
# df_files_cases.head()
# df_cases_files.to_csv('files_to_cases.csv', encoding='utf-8')



1000


In [12]:
params = {"ids": file_uuid_list}

# A POST is used, so the filter parameters can be passed directly as a Dict object.
response = requests.post(data_endpt,
                        data = json.dumps(params),
                        headers={
                            "Content-Type": "application/json"})

# filename is found in the Content-Disposition header of response
response_head_cd = response.headers["Content-Disposition"]
file_name = re.findall("filename=(.+)", response_head_cd)[0]

downloads_folder = "TCGA_downloads/"

# Save .tar.gz zipped file to TCGA_downloads folder
with open(downloads_folder + file_name, "wb") as f_out:
    f_out.write(response.content)

In [13]:
# extract the root tar archive
tar = tarfile.open(downloads_folder + file_name, "r:gz")
tar.extractall("./{}".format(downloads_folder))
folder = file_name.split(".tar.gz")[0]

for tarinfo in tar:
    if (tarinfo.name == "MANIFEST.txt"): continue
    file_id = tarinfo.name.split("/")[0]
    
    # unzip inner .gz files
    with gzip.open(downloads_folder + tarinfo.name, "rb") as f_in:
        with open("data/{}.txt".format(file_to_case[file_id]), "wb") as f_out:
            f_out.write(f_in.read())

tar.close()

In [14]:
# initialize empty df
df = pd.DataFrame({"gene": []})
df = df.set_index("gene")

size = 0
# loop over files, merging with pre-existing data
for file in pathlib.Path('data').glob('*.txt'):
    with open(file, "rb") as f_in:
        new_df = pd.read_csv(f_in, sep = "\t", header = None)
        file_id = re.findall("data/(.+).txt", f_in.name)[0]
        new_df.columns = ["gene", file_id]
        new_df.gene.replace(to_replace = r'\..*$', value = "", regex=True, 
           inplace=True) # collapse all versions of same gene to one gene
        new_df = new_df.set_index("gene")
        df = pd.DataFrame.merge(df, new_df, how="outer", left_on = "gene", right_on = "gene")

# drop rows not corresponding to genes        
non_genes = list(filter(lambda val: not "ENSG" in val,np.array(df.index.values))) 
df = df.drop(non_genes)

In [16]:
df.head(10)

Unnamed: 0_level_0,124c1d54-1836-4f8f-920f-14047376120f,f5319fd5-beae-4ca8-8f42-45fca5e6a2d2,92252560-5984-41f6-a4b5-a3aa8654c0c8,e5f956dd-f49f-435c-83c3-5b1b0a2050ed,17c0ce0f-8227-4119-9b39-ea8db18f5f4b,ae39e358-08d7-4367-ae68-82b469e791e4,ae6c307a-ca04-4618-b270-e8641afd1daa,1122fa92-0073-4f59-9880-3f1a34620758,570ddfb3-2721-42b6-9604-2de6b6090031,07b5663f-9a54-4462-b6c1-6fc8116b8714,...,0780eb43-d5b5-4c3d-9825-beacba5cc723,0a466142-c513-4257-85d4-4bd7cfd0ef29,261c3d74-706e-4751-bd15-8f3c1a402ff0,8ad78c4f-c84c-4f9e-88c9-42a05eb65498,d6974db6-2bd3-4082-b48e-9c60f7fe3a1f,0e747c7e-3621-4d88-847c-86811655d908,4c9e6085-41c3-4cfe-ad3d-5f94196a86e0,fcf64b55-5c4f-4d82-ac6a-4713d01143cb,93a337ae-2bd3-4464-b38f-93dff92d3fde,8e5b1d47-7d92-4d28-8186-4176ada96672
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003,2375,2172,4598,2559,2107,5955,1402,2817,3543,7734,...,8671,2753,2058,5650,2994,1977,3078,1268,1074,1895
ENSG00000000005,1,0,0,0,1,0,3,0,22,4,...,8,1,0,12,3,0,3,1,11,0
ENSG00000000419,2872,1593,1685,1373,786,1446,2978,1104,831,2765,...,3090,4190,1355,1977,1136,994,1876,1111,2206,1036
ENSG00000000457,2124,794,1230,747,378,1337,1226,424,499,2658,...,1963,1084,698,842,980,605,1495,520,1033,696
ENSG00000000460,1026,166,181,216,106,310,561,98,156,577,...,595,666,181,183,147,284,294,112,354,240
ENSG00000000938,433,161,114,1589,154,1494,1720,246,156,1216,...,1277,913,770,387,227,432,3323,171,1228,256
ENSG00000000971,4469,134,301,3946,250,3211,11164,1243,806,7710,...,5536,6127,1592,1267,754,893,23071,225,4071,586
ENSG00000001036,6397,2340,5959,3461,3097,4527,3978,1954,3301,6682,...,5484,6807,3145,7478,3089,1415,3715,2264,3584,4098
ENSG00000001084,3474,2194,2331,1217,2535,5286,3073,2821,1507,21488,...,2542,10800,2035,2765,1705,2025,3529,2259,2036,1492
ENSG00000001167,2303,1988,2513,2088,1397,1473,2132,1509,811,3387,...,4144,4591,1784,3237,1989,970,2996,1406,1371,2330


In [None]:
# API only accepts 1000 ids per call

# Convert Ensembl gene IDs to Entrez gene symbols
# mygene_endpt = "http://mygene.info/v3/gene/"
# headers = {'content-type': 'application/x-www-form-urlencoded'}

# ensembl_ids = df.index.to_list()
# build parameters object

# ids = ",".join(ensembl_ids)
# species = "human"
# fields = ",".join(["symbol","name","taxid","entrezgene"])

# params = "ids={}&species={}&fields={}".format(ids,species,fields)
# response = requests.post(mygene_endpt, data=params, headers=headers)
# data = response.json()

# create map of ensembl ID -> entrez symbol
# ensembl_to_symbol = {gene["query"]:gene["symbol"] for gene in data}

In [None]:
# Lots of missing values
# id_chart = pd.read_csv("gene_id_conversion.txt", sep="\t")
# id_chart = id_chart[["Ensembl gene ID", "Approved symbol"]]
# id_chart = id_chart.set_index("Ensembl gene ID")
# ensembl_to_symbol = id_chart.to_dict()["Approved symbol"]

In [17]:
ncbi = pd.DataFrame(ncbi_genes.ncbi_genes_fetch())

def get_ensembl_id(ids):
    ids = "".join(ids)
    ensembl = re.findall("Ensembl:(.*)", ids)
    if (len(ensembl) == 1):
        return ensembl[0]
    else:
        return None

    
all_ids = ncbi.dbXrefs.values
ensembl_ids = [ get_ensembl_id(ids) for ids in all_ids] 

ncbi = ncbi[["dbXrefs", "Symbol", "type_of_gene"]]
ncbi["ensembl"] = ensembl_ids
ncbi = ncbi.drop(columns=["dbXrefs"])
ncbi = ncbi.set_index("ensembl")

ensembl_to_symbol = ncbi.to_dict()["Symbol"]
ensembl_to_gene_type = ncbi.to_dict()["type_of_gene"]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [18]:
data_ensembl_ids = df.index.to_list()

# if the key is present, return it; otherwise, set the index for the corresponding row as its ensembl id
def id_to_symbol(key):
    if (key in ensembl_to_symbol): 
        return ensembl_to_symbol[key]
    else:
        return key # can return key here to maintain some gene identity

def id_to_type(key):
    if (key in ensembl_to_gene_type): 
        return ensembl_to_gene_type[key]
    else:
        return None
    
data_symbols = [ id_to_symbol(key) for key in data_ensembl_ids ]
data_types = [ id_to_type(key) for key in data_ensembl_ids ]

df["symbol"] = data_symbols
df["type_of_gene"] = data_types
df_symbol = df.set_index("symbol")

# drop non protein-coding genes
df_symbol = df_symbol[df_symbol["type_of_gene"] == "protein-coding"]
df_symbol = df_symbol.drop(columns=["type_of_gene"])
df_symbol.head()

# write the final csv, ready for normalization and further preprocessing
df_symbol.to_csv('data.csv', encoding='utf-8')



In [19]:
df_symbol = pd.read_csv("data.csv")
df_symbol.head()

Unnamed: 0,symbol,124c1d54-1836-4f8f-920f-14047376120f,f5319fd5-beae-4ca8-8f42-45fca5e6a2d2,92252560-5984-41f6-a4b5-a3aa8654c0c8,e5f956dd-f49f-435c-83c3-5b1b0a2050ed,17c0ce0f-8227-4119-9b39-ea8db18f5f4b,ae39e358-08d7-4367-ae68-82b469e791e4,ae6c307a-ca04-4618-b270-e8641afd1daa,1122fa92-0073-4f59-9880-3f1a34620758,570ddfb3-2721-42b6-9604-2de6b6090031,...,0780eb43-d5b5-4c3d-9825-beacba5cc723,0a466142-c513-4257-85d4-4bd7cfd0ef29,261c3d74-706e-4751-bd15-8f3c1a402ff0,8ad78c4f-c84c-4f9e-88c9-42a05eb65498,d6974db6-2bd3-4082-b48e-9c60f7fe3a1f,0e747c7e-3621-4d88-847c-86811655d908,4c9e6085-41c3-4cfe-ad3d-5f94196a86e0,fcf64b55-5c4f-4d82-ac6a-4713d01143cb,93a337ae-2bd3-4464-b38f-93dff92d3fde,8e5b1d47-7d92-4d28-8186-4176ada96672
0,TSPAN6,2375,2172,4598,2559,2107,5955,1402,2817,3543,...,8671,2753,2058,5650,2994,1977,3078,1268,1074,1895
1,TNMD,1,0,0,0,1,0,3,0,22,...,8,1,0,12,3,0,3,1,11,0
2,DPM1,2872,1593,1685,1373,786,1446,2978,1104,831,...,3090,4190,1355,1977,1136,994,1876,1111,2206,1036
3,SCYL3,2124,794,1230,747,378,1337,1226,424,499,...,1963,1084,698,842,980,605,1495,520,1033,696
4,C1orf112,1026,166,181,216,106,310,561,98,156,...,595,666,181,183,147,284,294,112,354,240


# Clinical Data
---

In [20]:
# get all demographic fields

cases_fields = requests.get(cases_endpt + "_mapping").json()["fields"]

diagnoses_fields = list(filter(lambda x: "diagnoses" in x, cases_fields))
print(diagnoses_fields)

demographic_fields = list(filter(lambda x: "demographic" in x, cases_fields))
print(demographic_fields)

['diagnoses.non_nodal_regional_disease', 'diagnoses.primary_diagnosis', 'diagnoses.figo_staging_edition_year', 'diagnoses.esophageal_columnar_dysplasia_degree', 'diagnoses.gross_tumor_weight', 'diagnoses.masaoka_stage', 'diagnoses.cog_neuroblastoma_risk_group', 'diagnoses.mitosis_karyorrhexis_index', 'diagnoses.tissue_or_organ_of_origin', 'diagnoses.goblet_cells_columnar_mucosa_present', 'diagnoses.lymph_nodes_tested', 'diagnoses.classification_of_tumor', 'diagnoses.days_to_best_overall_response', 'diagnoses.lymph_nodes_positive', 'diagnoses.micropapillary_features', 'diagnoses.method_of_diagnosis', 'diagnoses.gleason_grade_group', 'diagnoses.enneking_msts_metastasis', 'diagnoses.icd_10_code', 'diagnoses.created_datetime', 'diagnoses.percent_tumor_invasion', 'diagnoses.igcccg_stage', 'diagnoses.inpc_grade', 'diagnoses.peripancreatic_lymph_nodes_tested', 'diagnoses.cog_liver_stage', 'diagnoses.metastasis_at_diagnosis_site', 'diagnoses.lymphatic_invasion_present', 'diagnoses.breslow_thic

In [21]:
# initialize dataframe
demographic_column_names = [ field.split(".")[1] for field in demographic_fields ]
diagnoses_column_names = [ field.split(".")[1] for field in diagnoses_fields ]

columns = list(set([*demographic_column_names,*diagnoses_column_names]))
df_clinical = pd.DataFrame({}, columns=columns)

df_clinical["case_id"] = []

# get demographics and diagnosis data for each case, 
# merging with pre-exisiting dataframe

for case in case_uuid_list:
    fields=",".join([*demographic_fields, *diagnoses_fields])
    params={
        "fields": fields
    }
    response = requests.get(cases_endpt + case, params=params).json()["data"]
    demographic_data = response["demographic"]
    diagnoses_data = response["diagnoses"]
    diagnoses_data = diagnoses_data[0]
    del diagnoses_data["treatments"] # do not load treatment data
    df_case = pd.DataFrame({**demographic_data,**diagnoses_data}, index=[case])
    df_case.head()
    df_case["case_id"] = case
    df_clinical = pd.concat([df_clinical, df_case], join="outer")

df_clinical = df_clinical.set_index("case_id")


In [23]:
# make first column "primary_diagnosis"
cols = ['primary_diagnosis']  + [col for col in df_clinical.columns.values if col != 'primary_diagnosis']
df_clinical = df_clinical[cols]
df_clinical.head()

Unnamed: 0_level_0,primary_diagnosis,primary_gleason_grade,figo_staging_edition_year,updated_datetime,occupation_duration_years,ann_arbor_clinical_stage,best_overall_response,largest_extrapelvic_peritoneal_focus,esophageal_columnar_dysplasia_degree,perineural_invasion_present,...,lymph_node_involved_site,year_of_diagnosis,ann_arbor_b_symptoms,gastric_esophageal_junction_involvement,days_to_birth,treatments,ajcc_clinical_stage,ajcc_pathologic_stage,progression_or_recurrence,ajcc_pathologic_t
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7b47489f-c3cc-4388-b5d7-c7c02790a5f6,"Adenocarcinoma, NOS",,,2019-08-08T16:35:04.140343-05:00,,,,,,,...,,2007,,,-24045,,,Stage IIA,not reported,T3
c616f95f-4ea1-4019-8c17-041878085c88,"Adenocarcinoma, NOS",,,2019-08-08T16:34:38.138144-05:00,,,,,,,...,,2006,,,-22219,,,Stage III,not reported,T2
2b94070a-21ec-457b-88af-3d2a89b97ac6,"Adenocarcinoma, NOS",,,2019-08-08T16:34:38.138144-05:00,,,,,,,...,,2004,,,-23831,,,Stage III,not reported,T3
df5ab6cc-6f68-4b6b-95e2-954c6b57ba9c,"Adenocarcinoma, NOS",,,2019-08-08T16:35:57.350058-05:00,,,,,,,...,,2010,,,-17109,,,Stage IIA,not reported,T3
28d03569-8717-40da-8989-70baf2b18f9f,"Adenocarcinoma, NOS",,,2019-08-08T16:33:45.855164-05:00,,,,,,,...,,2010,,,-14756,,,Stage IIIB,not reported,T3


In [24]:
# save final .csv
df_clinical.to_csv("clinical_data.csv", encoding='utf-8')


In [25]:

# save a df containing all entries for one cancer type for each type
cancer_types = list(set(df_clinical.primary_diagnosis))

for cancer in cancer_types:
    df_cancer = df_clinical[df_clinical.primary_diagnosis == cancer]
    df_cancer.to_csv("data_cancer/{}.csv".format(cancer), encoding='utf-8')

