In [1]:
import requests
import json
import re
import os
import pandas as pd

import gzip
import tarfile

from tqdm import tqdm

## Get TCGA Data

In [2]:
def get_files_ids(dataset_name):
    """Return the GDC file ids of the RNA-Seq gene-expression files of a TCGA project.

    Sends one GET request to the GDC ``/files`` endpoint, asking for files whose
    project is ``TCGA-<dataset_name>``, whose data category is
    "Transcriptome Profiling", whose data type is "Gene Expression
    Quantification" and whose experimental strategy is "RNA-Seq".

    Parameters
    ----------
    dataset_name : str
        Study abbreviation without the ``TCGA-`` prefix (e.g. ``"GBM"``).

    Returns
    -------
    list of str
        The ``file_id`` of every hit in the response. A single page of at most
        10000 hits is requested.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    print(f"\tGetting files ids for {dataset_name}")

    # Every clause is an "in" test on one field; all of them must hold ("and").
    required_values = {
        "cases.project.project_id": [f"TCGA-{dataset_name}"],
        "files.data_category": ["Transcriptome Profiling"],
        "files.data_type": ["Gene Expression Quantification"],
        "files.experimental_strategy": ["RNA-Seq"],
    }
    filters = {
        "op": "and",
        "content": [
            {"op": "in", "content": {"field": field, "value": value}}
            for field, value in required_values.items()
        ],
    }

    params = {
        "filters": json.dumps(filters),
        "fields": "file_id",
        "format": "JSON",
        "size": "10000",
    }
    response = requests.get("https://api.gdc.cancer.gov/files", params=params)
    # Fail with the HTTP error itself rather than a KeyError on a missing "data" key.
    response.raise_for_status()

    files_ids = [file_entry["file_id"] for file_entry in response.json()["data"]["hits"]]

    return files_ids

In [3]:
def download_files(dataset_name, files_ids, folder=""):
    """Download the requested GDC files into ``<folder>/<dataset_name>_raw_data.tar.gz``.

    Posts the list of file ids to the GDC ``/data`` endpoint and writes the
    response body to disk. The body is streamed in chunks so the archive never
    has to fit in memory.

    Parameters
    ----------
    dataset_name : str
        Used in the progress message and as the prefix of the output file name.
    files_ids : list of str
        File ids to request, sent as the ``ids`` field of the JSON payload.
    folder : str, optional
        Directory the archive is written to; the default ``""`` means the
        current working directory.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status. Nothing is written in that case.
    """
    print(f"\tDownloading files for {dataset_name}")
    archive_path = f"{os.path.join(folder, dataset_name)}_raw_data.tar.gz"
    payload = {"ids": files_ids}

    with requests.post(
        "https://api.gdc.cancer.gov/data",
        data=json.dumps(payload),
        headers={"Content-Type": "application/json"},
        stream=True,
    ) as response:
        # Check the status before creating the file so an error page is never
        # saved under the archive name.
        response.raise_for_status()
        with open(archive_path, "wb") as output_file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                output_file.write(chunk)

In [4]:
def download_samples_info(dataset_name, files_ids):
    """Fetch per-file sample metadata from the GDC ``/files`` endpoint as a DataFrame.

    Requests, in TSV format, the file id, file name, case id, aliquot submitter
    id and sample type of every file in ``files_ids``.

    Parameters
    ----------
    dataset_name : str
        Only used in the progress message.
    files_ids : list of str
        File ids to look up.

    Returns
    -------
    pandas.DataFrame
        One row per TSV data line, every value kept as a string. Each column is
        named after the last dot-separated component of its TSV header, and the
        ``id`` column is dropped.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    ValueError
        If the response body contains no header line.
    """
    print(f"\tDownloading samples info for {dataset_name}")
    filters = {
        "op": "and",
        "content": [
            {
                "op": "in",
                "content": {
                    "field": "files.file_id",
                    "value": files_ids,
                },
            }
        ],
    }
    fields = [
        "file_id",
        "file_name",
        "cases.case_id",
        "cases.samples.portions.analytes.aliquots.submitter_id",
        "cases.samples.sample_type",
    ]
    # Ask for at least one row per requested file: a fixed page of 2000 would
    # silently truncate projects with more files than that.
    page_size = max(len(files_ids), 2000)
    params = {
        "filters": filters,
        "fields": ",".join(fields),
        "format": "TSV",
        "size": str(page_size),
    }

    response = requests.post("https://api.gdc.cancer.gov/files", json=params)
    response.raise_for_status()

    # Drop blank lines instead of blindly slicing off the last line, so the
    # final record survives when the body has no trailing newline.
    lines = [line for line in response.content.decode("utf-8").splitlines() if line.strip()]
    if not lines:
        raise ValueError(f"Empty samples info response for {dataset_name}")

    column_names = [name.strip().split(".")[-1] for name in lines[0].split("\t")]
    # Split without stripping: leading/trailing empty fields must keep their
    # position, otherwise values shift into the wrong column.
    records = [line.split("\t") for line in lines[1:]]

    samples_info = pd.DataFrame(records, columns=column_names).drop("id", axis=1)

    return samples_info

In [5]:
def extract_raw_counts(dataset_name, samples_info, folder=""):
    """Build a genes x samples table of unstranded counts from a downloaded archive.

    Reads every ``.tsv`` member of ``<folder>/<dataset_name>_raw_data.tar.gz``,
    keeps the rows whose ``gene_type`` is ``protein_coding`` and takes their
    ``unstranded`` column, renamed to the sample id of that file. The combined
    table is also written to ``<folder>/<dataset_name>_raw_counts.tsv.gz``.

    Parameters
    ----------
    dataset_name : str
        Prefix of the input archive and of the output file.
    samples_info : pandas.DataFrame
        Must provide ``file_id``, ``file_name`` and ``submitter_id`` columns;
        archive members are matched on ``"<file_id>/<file_name>"``.
    folder : str, optional
        Directory holding the archive and receiving the output (default: the
        current working directory).

    Returns
    -------
    pandas.DataFrame
        Indexed by the first column of the count files, with a ``gene_name``
        column (taken from the first file read) followed by one column per file.

    Raises
    ------
    KeyError
        If a ``.tsv`` member has no matching row in ``samples_info``.
    ValueError
        If the archive contains no ``.tsv`` member.
    """
    filename2sampleID = dict(zip(samples_info["file_id"] + "/" + samples_info["file_name"], samples_info["submitter_id"]))
    archive_path = f"{os.path.join(folder, dataset_name)}_raw_data.tar.gz"

    per_sample = []
    with tarfile.open(archive_path, mode="r:gz") as tar:
        for member in tqdm(tar.getmembers(), desc=f"Extracting raw counts for {dataset_name}"):
            if not (member.isfile() and member.name.endswith(".tsv")):
                continue
            # Line 0 is skipped, line 1 is the header and lines 2-5 are dropped
            # before the per-gene rows start.
            counts = pd.read_csv(
                tar.extractfile(member), sep="\t", header=1, skiprows=range(2, 6), index_col=0
            ).query("gene_type == 'protein_coding'")
            # Gene names are only needed once, so they come from the first file.
            wanted = ["unstranded"] if per_sample else ["gene_name", "unstranded"]
            per_sample.append(counts[wanted].rename(columns={"unstranded": filename2sampleID[member.name]}))

    if not per_sample:
        raise ValueError(f"No .tsv count files found in {archive_path}")

    # A single concat is linear in the number of files; concatenating inside
    # the loop re-copies the growing table on every iteration.
    raw_counts = pd.concat(per_sample, axis=1)

    raw_counts.to_csv(f"{os.path.join(folder, dataset_name)}_raw_counts.tsv.gz", sep="\t", compression="gzip")

    return raw_counts

In [6]:
# TCGA_datasets = pd.read_html("https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations")[1].query("`Study Name` != 'Controls' and `Study Name` != 'Miscellaneous' and `Study Name` != 'FFPE Pilot Phase II'").reset_index(drop=True)
# if not os.path.isdir("TCGA"):
#         os.makedirs("TCGA")
# TCGA_datasets.to_csv("TCGA/datasets.tsv", sep="\t")

In [7]:
# dataset_names = TCGA_datasets["Study Abbreviation"]
# only_disease = []
# for dataset_name in dataset_names:
#     if not os.path.isdir("TCGA"):
#         os.makedirs("TCGA")
#     print(f"Dataset: {dataset_name}")
#     files_ids = get_files_ids(dataset_name)
#     download_files(dataset_name, files_ids, folder="TCGA")
#     samples_info = download_samples_info(dataset_name, files_ids)
#     # save phenotypes info
#     phenotypes = samples_info[["submitter_id", "sample_type"]].set_index("submitter_id").replace({"Primary Tumor":"Diseased", "Metastatic":"Diseased", "Recurrent Tumor":"Diseased", "Additional - New Primary":"Diseased", "Additional Metastatic":"Diseased", "Solid Tissue Normal":"Healthy"})
#     phenotypes.to_csv(f"TCGA/{dataset_name}_phenotypes.tsv", sep="\t")
#     if all(phenotypes["sample_type"] == "Diseased"):
#         only_disease.append(dataset_name)
#     raw_counts = extract_raw_counts(dataset_name, samples_info, folder="TCGA")
#     os.remove(f"TCGA/{dataset_name}_raw_data.tar.gz")

In [8]:
# Run the full TCGA pipeline for a single study, working in the current directory.
dataset_name = "GBM"
only_disease = []

print(f"Dataset: {dataset_name}")

# 1) locate the files, 2) download the archive, 3) fetch the per-file sample metadata
files_ids = get_files_ids(dataset_name)
download_files(dataset_name, files_ids)
samples_info = download_samples_info(dataset_name, files_ids)

# save phenotypes info: the listed tumour sample types become "Diseased" and
# solid normal tissue becomes "Healthy"; any other sample type is left as is
phenotypes = (
    samples_info[["submitter_id", "sample_type"]]
    .set_index("submitter_id")
    .replace(
        {
            "Primary Tumor": "Diseased",
            "Metastatic": "Diseased",
            "Recurrent Tumor": "Diseased",
            "Additional - New Primary": "Diseased",
            "Additional Metastatic": "Diseased",
            "Solid Tissue Normal": "Healthy",
        }
    )
)
phenotypes.to_csv(f"{dataset_name}_phenotypes.tsv", sep="\t")

# remember the studies in which every sample is labelled "Diseased"
is_diseased = phenotypes["sample_type"] == "Diseased"
if all(is_diseased):
    only_disease.append(dataset_name)

# build the count table, then delete the raw archive it was extracted from
raw_counts = extract_raw_counts(dataset_name, samples_info)
os.remove(f"{dataset_name}_raw_data.tar.gz")

Dataset: GBM
	Getting files ids for GBM
	Downloading files for GBM
	Downloading samples info for GBM


Extracting raw counts for GBM: 100%|██████████| 176/176 [01:45<00:00,  1.66it/s]


## Get HUGO Data

In [9]:
# Download the approved HGNC symbols and names as a tab-separated text file.
# The request is made and checked *before* the output file is opened, so a
# failed download neither leaves an empty hugo_genes.tsv behind nor saves an
# error page under that name.
response = requests.post("https://www.genenames.org/cgi-bin/download/custom?col=gd_app_sym&col=gd_app_name&status=Approved&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit")
response.raise_for_status()
with open("hugo_genes.tsv", "wb") as output_file:
    output_file.write(response.content)

## Get Ensembl Data

In [10]:
# BioMart query: Ensembl gene id -> external gene name for the human gene dataset,
# returned as TSV with a header row.
query = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query  virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
			
	<Dataset name = "hsapiens_gene_ensembl" interface = "default" >
		<Attribute name = "ensembl_gene_id" />
		<Attribute name = "external_gene_name" />
	</Dataset>
</Query>
"""
url = "http://www.ensembl.org/biomart/martservice"
# Pass the XML through `params` so requests percent-encodes it (newlines, tabs,
# quotes, angle brackets) instead of pasting raw XML into the URL.
response = requests.get(url, params={"query": query})
# Check the status before opening the file so a failed download does not
# overwrite ensemblId2geneName.tsv with an error page.
response.raise_for_status()
with open("ensemblId2geneName.tsv", "wb") as output_file:
    output_file.write(response.content)