In [1]:
import requests
import json
import os
import re
import gzip
import shutil
import tarfile
import pathlib
import pandas as pd

In [71]:
# Get info on properties of the returned objects for an endpoint

# files:

data = requests.get(files_endpt + "_mapping").json()["fields"]
# print(json.dumps(data, indent=2))

In [72]:
# Retrieve data from the TCGA API

# Endpoints
base_url = 'https://api.gdc.cancer.gov/'
files_endpt = base_url + 'files/'
genes_endpt = base_url + 'genes/'
cases_endpt = base_url + 'cases/'
data_endpt = base_url + "data/"

# data type of files we want
data_type = "htseq.counts"

# The 'fields' parameter is passed as a comma-separated string of single names.
fields = "file_id,file_name,cases.submitter_id,cases.case_id,data_category,data_type,cases.samples.tumor_descriptor,cases.samples.tissue_type,cases.samples.sample_type,cases.samples.submitter_id,cases.samples.sample_id,analysis.workflow_type,cases.project.project_id,cases.samples.portions.analytes.aliquots.aliquot_id,cases.samples.portions.analytes.aliquots.submitter_id"
# fields = ','.join(fields)

# filter files for only RNA-Seq results
filters = {
    "op": "and",
     "content":[
         {
            "op": "in",
            "content":
             {
                 "field": "files.experimental_strategy", 
                 "value": ["RNA-Seq"],
             }
         },
         {
            "op": "in",
            "content":
             {
                 "field": "access", 
                 "value": ["open"],
                 
             }
         },
         
     ],
}

# build parameters object
params = {
    "filters": json.dumps(filters)
}

# get list of all files with RNA-seq results
response = requests.get(files_endpt, params = params) # optionally also provide params argument
data = json.loads(response.content.decode("utf-8"))
# print(json.dumps(data, indent=2))

# get list of results
results = data["data"]["hits"]

results = filter(lambda x: data_type in x["file_name"], results)

file_uuid_list = [ entry["file_id"] for entry in results]

# print(file_uuid_list)

In [73]:
params = {"ids": file_uuid_list}

# A POST is used, so the filter parameters can be passed directly as a Dict object.
response = requests.post(data_endpt,
                        data = json.dumps(params),
                        headers={
                            "Content-Type": "application/json"})

# filename is found in the Content-Disposition header of response
response_head_cd = response.headers["Content-Disposition"]
file_name = re.findall("filename=(.+)", response_head_cd)[0]

downloads_folder = "TCGA_downloads/"

# Save .tar.gz zipped file to TCGA_downloads folder
with open(downloads_folder + file_name, "wb") as f_out:
    f_out.write(response.content)

In [74]:
# extract the root tar archive
tar = tarfile.open(downloads_folder + file_name, "r:gz")
tar.extractall("./{}".format(downloads_folder))
folder = file_name.split(".tar.gz")[0]

for tarinfo in tar:
    if (tarinfo.name == "MANIFEST.txt"): continue
    file_id = tarinfo.name.split("/")[0]
    
    # unzip inner .gz files
    with gzip.open(downloads_folder + tarinfo.name, "rb") as f_in:
        with open("data/{}.txt".format(file_id), "wb") as f_out:
            f_out.write(f_in.read())

tar.close()

In [75]:
# initialize empty df
df = pd.DataFrame({"gene": []})
df = df.set_index("gene")

# loop over files, merging with pre-existing data
for file in pathlib.Path('data').glob('*.txt'):
    with open(file, "rb") as f_in:
        new_df = pd.read_csv(f_in, sep = "\t", header = None)
        file_id = re.findall("data/(.+).txt", f_in.name)[0]
        new_df.columns = ["gene", file_id]
        new_df = new_df.set_index("gene")
        df = pd.DataFrame.merge(df, new_df, how="outer", left_on = "gene", right_on = "gene")

# export to the data directory
df.to_csv('data.csv', encoding='utf-8')


In [70]:
df.head(10)

Unnamed: 0_level_0,cb76eaa5-36bd-49aa-8301-d752230ea9d6,cf1b82e4-b39a-4860-a8ad-c627bf37312c,b1fe9a4a-1173-4c9e-877d-3a47c1ea414c,a33105b9-57b1-4804-b0cd-de6d490b4bd0,77b33096-992a-4d33-9f1b-9b3eec64621c,15fca218-9237-4fc0-98fc-fb261177d45f,5509fe4d-d702-4eeb-a561-d7994bc629f3,f7abe018-46a4-4588-957e-36c31225182f,ba97be52-eb36-4008-8313-68d5d1e37f9b,a0449ff1-fdc5-44b0-8a9e-db35d1d8dabc
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ENSG00000242268.2,0.0,0.0,0.052219,0.022265,28,0.0,0,10,0.0,0
ENSG00000270112.3,285.1908,0.0,0.057965,0.704385,11,0.0,0,6,0.0,1
ENSG00000167578.15,48063.33,49255.5,5.10413,1.879191,2750,46775.86,162,484,3.280932,658
ENSG00000273842.1,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0
ENSG00000078237.5,121328.5,200266.4,1.606649,4.324719,354,241207.6,172,54,3.191028,1801
ENSG00000146083.10,625239.3,199240.8,30.230345,21.916833,2354,242443.0,1556,6529,13.549965,3152
ENSG00000225275.4,1724.966,0.0,0.0,0.0,0,1321.938,0,0,0.0,0
ENSG00000158486.12,2763.866,10297.73,0.245353,0.027475,10,6204.803,18,152,0.241807,44
ENSG00000198242.12,3021994.0,4083723.0,180.533342,60.289487,12336,2042142.0,5444,18880,154.80306,13221
ENSG00000259883.1,1261.843,1752.725,0.288526,0.123023,1,2901.063,0,7,0.128544,1
