# **Exploring the GDC API Endpoints**
GDC: Genomic Data Commons

# Importing Libraries

In [1]:
import io
import json

import requests

# Constants

In [2]:
# Root URL of the GDC (Genomic Data Commons) API; every endpoint below is derived from it
GDC_API_URL = 'https://api.gdc.cancer.gov'

# --- STATUS ---
# API status and version information
STATUS_ENDPOINT = GDC_API_URL + '/status'

# --- SEARCH & RETRIEVAL ---
# Search across all data generated by a project
PROJECTS_ENDPOINT = GDC_API_URL + '/projects'

# Find all files related to a specific case (sample donor)
CASES_ENDPOINT = GDC_API_URL + '/cases'

# Find files by characteristics such as file_name, md5sum, data_format and others
FILES_ENDPOINT = GDC_API_URL + '/files'

# Search the annotations added to data after curation
ANNOTATIONS_ENDPOINT = GDC_API_URL + '/annotations'

# --- DOWNLOAD ---
# Download GDC data
DATA_ENDPOINT = GDC_API_URL + '/data'

# Generate manifests for use with the GDC Data Transfer Tool
MANIFEST_ENDPOINT = GDC_API_URL + '/manifest'

# --- BAM SLICING ---
# Remote slicing of BAM format objects
SLICING_ENDPOINT = GDC_API_URL + '/slicing'

# --- SUBMISSION ---
# Resources available at the top level above programs, i.e. the registered programs
SUBMISSION_ENDPOINT = GDC_API_URL + '/submission'

# Endpoints of Interest

## `status`

In [3]:
# Get the API version and status.
# The timeout keeps the cell from hanging indefinitely if the API is unreachable,
# and raise_for_status() turns an HTTP error into an exception instead of
# printing an error body as if it were the status.
response = requests.get(STATUS_ENDPOINT, timeout=30)
response.raise_for_status()

# Print the raw response body
print(response.content.decode('utf-8'))

{"commit":"48add4be7ac46e7db10e0c6f0e3010d5bb2a50aa","data_release":"Data Release 42.0 - January 30, 2025","data_release_version":{"major":42,"minor":0,"release_date":"2025-01-30"},"status":"OK","tag":"7.7","version":1}



## `projects`

In [4]:
# Request the field mapping of the endpoint.
# The timeout avoids an indefinite hang; raise_for_status() fails fast on an
# HTTP error instead of a later KeyError on 'fields'.
response = requests.get(f'{PROJECTS_ENDPOINT}/_mapping', timeout=30)
response.raise_for_status()

# Parse the JSON body of the response
json_response = response.json()

# Print the names of all fields listed by the endpoint
json_response_str = json.dumps(json_response['fields'], indent=2)
print(json_response_str)

[
  "dbgap_accession_number",
  "disease_type",
  "intended_release_date",
  "name",
  "primary_site",
  "program.dbgap_accession_number",
  "program.name",
  "program.program_id",
  "project_autocomplete",
  "project_id",
  "releasable",
  "released",
  "state",
  "summary.case_count",
  "summary.data_categories.case_count",
  "summary.data_categories.data_category",
  "summary.data_categories.file_count",
  "summary.experimental_strategies.case_count",
  "summary.experimental_strategies.experimental_strategy",
  "summary.experimental_strategies.file_count",
  "summary.file_count",
  "summary.file_size"
]


In [5]:
# Fields available in the endpoint
field_names = [
    'dbgap_accession_number',
    'disease_type',
    'intended_release_date',
    'name',
    'primary_site',
    'program.dbgap_accession_number',
    'program.name',
    'program.program_id',
    'project_autocomplete',
    'project_id',
    'releasable',
    'released',
    'state',
    'summary.case_count',
    'summary.data_categories.case_count',
    'summary.data_categories.data_category',
    'summary.data_categories.file_count',
    'summary.experimental_strategies.case_count',
    'summary.experimental_strategies.experimental_strategy',
    'summary.experimental_strategies.file_count',
    'summary.file_count',
    'summary.file_size'
]
# The request takes the field names as one comma-separated string
fields = ','.join(field_names)

# Request a single object from the endpoint.
# The timeout avoids an indefinite hang; raise_for_status() reports an HTTP
# error directly instead of a KeyError on ['data']['hits'] below.
response = requests.post(
    url=PROJECTS_ENDPOINT,
    headers={'Content-Type': 'application/json'},
    json={'fields': fields, 'size': '1'},
    timeout=30
)
response.raise_for_status()

# Parse the JSON body and keep only the list of hits
json_response = response.json()['data']['hits']

# Print the hits
json_response_str = json.dumps(json_response, indent=2)
print(json_response_str)

[
  {
    "id": "HCMI-CMDC",
    "summary": {
      "file_count": 20289,
      "data_categories": [
        {
          "file_count": 8450,
          "case_count": 278,
          "data_category": "Simple Nucleotide Variation"
        },
        {
          "file_count": 3512,
          "case_count": 278,
          "data_category": "Sequencing Reads"
        },
        {
          "file_count": 278,
          "case_count": 278,
          "data_category": "Clinical"
        },
        {
          "file_count": 544,
          "case_count": 259,
          "data_category": "Biospecimen"
        },
        {
          "file_count": 1393,
          "case_count": 268,
          "data_category": "Copy Number Variation"
        },
        {
          "file_count": 1742,
          "case_count": 262,
          "data_category": "Transcriptome Profiling"
        },
        {
          "file_count": 1353,
          "case_count": 267,
          "data_category": "DNA Methylation"
        },
        {
 

## `cases`

In [6]:
# Request the field mapping of the endpoint.
# The timeout avoids an indefinite hang; raise_for_status() fails fast on an
# HTTP error instead of a later KeyError on 'fields'.
response = requests.get(f'{CASES_ENDPOINT}/_mapping', timeout=30)
response.raise_for_status()

# Parse the JSON body of the response
json_response = response.json()

# Print the names of all fields listed by the endpoint
json_response_str = json.dumps(json_response['fields'], indent=2)
print(json_response_str)

[
  "aliquot_ids",
  "analyte_ids",
  "annotations.annotation_id",
  "annotations.case_id",
  "annotations.case_submitter_id",
  "annotations.category",
  "annotations.classification",
  "annotations.created_datetime",
  "annotations.creator",
  "annotations.entity_id",
  "annotations.entity_submitter_id",
  "annotations.entity_type",
  "annotations.legacy_created_datetime",
  "annotations.legacy_updated_datetime",
  "annotations.notes",
  "annotations.state",
  "annotations.status",
  "annotations.submitter_id",
  "annotations.updated_datetime",
  "case_autocomplete",
  "case_id",
  "consent_type",
  "created_datetime",
  "days_to_consent",
  "days_to_lost_to_followup",
  "demographic.age_at_index",
  "demographic.age_is_obfuscated",
  "demographic.cause_of_death",
  "demographic.cause_of_death_source",
  "demographic.country_of_birth",
  "demographic.country_of_residence_at_enrollment",
  "demographic.created_datetime",
  "demographic.days_to_birth",
  "demographic.days_to_death",
  

In [7]:
# Fields of interest for the endpoint request
field_names = [
    'case_id',
    'disease_type',
    'files.access',
    'files.created_datetime',
    'files.data_category',
    'files.data_format',
    'files.data_type',
    'files.file_id',
    'files.experimental_strategy',
    'files.updated_datetime',
    'primary_site',
    'project.project_id',
    'samples.tissue_type',
    'samples.tumor_descriptor',
    'samples.sample_type',
    'submitter_id'
]
# The request takes the field names as one comma-separated string
fields = ','.join(field_names)

# Parameters for the endpoint request
params = {
    'fields': fields,
    'format': 'json'
}

# Request a single object from the endpoint.
# `params` was previously built but never sent; it is now the base of the
# payload, with the page size added on top.
# The timeout avoids an indefinite hang; raise_for_status() reports an HTTP
# error directly instead of a KeyError on ['data']['hits'] below.
response = requests.post(
    url=CASES_ENDPOINT,
    headers={'Content-Type': 'application/json'},
    json={**params, 'size': '1'},
    timeout=30
)
response.raise_for_status()

# Parse the JSON body and keep only the list of hits
json_response = response.json()['data']['hits']

# Print the hits
json_response_str = json.dumps(json_response, indent=2)
print(json_response_str)

[
  {
    "id": "69eced5b-1e76-45c9-bc9c-2aa71a921c57",
    "primary_site": "Brain",
    "disease_type": "Gliomas",
    "case_id": "69eced5b-1e76-45c9-bc9c-2aa71a921c57",
    "project": {
      "project_id": "HCMI-CMDC"
    },
    "submitter_id": "HCM-BROD-0011-C71",
    "files": [
      {
        "data_format": "BAM",
        "access": "controlled",
        "updated_datetime": "2023-07-12T10:35:29.312841-05:00",
        "file_id": "179ef928-eed6-4a72-abae-8860d0c1aab3",
        "data_type": "Aligned Reads",
        "data_category": "Sequencing Reads",
        "experimental_strategy": "RNA-Seq",
        "created_datetime": "2022-04-07T17:23:39.507516-05:00"
      },
      {
        "data_format": "VCF",
        "access": "controlled",
        "updated_datetime": "2024-07-27T06:57:09.082206-05:00",
        "file_id": "5cda0751-c3f6-4087-9aec-e0f96f51717c",
        "data_type": "Annotated Somatic Mutation",
        "data_category": "Simple Nucleotide Variation",
        "experimental_str

## `files`

In [8]:
# Request the field mapping of the endpoint.
# The timeout avoids an indefinite hang; raise_for_status() fails fast on an
# HTTP error instead of a later KeyError on 'fields'.
response = requests.get(f'{FILES_ENDPOINT}/_mapping', timeout=30)
response.raise_for_status()

# Parse the JSON body of the response
json_response = response.json()

# Print the names of all fields listed by the endpoint
json_response_str = json.dumps(json_response['fields'], indent=2)
print(json_response_str)

[
  "access",
  "acl",
  "analysis.analysis_id",
  "analysis.analysis_type",
  "analysis.created_datetime",
  "analysis.input_files.access",
  "analysis.input_files.average_base_quality",
  "analysis.input_files.average_insert_size",
  "analysis.input_files.average_read_length",
  "analysis.input_files.cancer_dna_fraction",
  "analysis.input_files.channel",
  "analysis.input_files.chip_id",
  "analysis.input_files.chip_position",
  "analysis.input_files.contamination",
  "analysis.input_files.contamination_error",
  "analysis.input_files.created_datetime",
  "analysis.input_files.data_category",
  "analysis.input_files.data_format",
  "analysis.input_files.data_type",
  "analysis.input_files.error_type",
  "analysis.input_files.experimental_strategy",
  "analysis.input_files.file_id",
  "analysis.input_files.file_name",
  "analysis.input_files.file_size",
  "analysis.input_files.genome_doubling",
  "analysis.input_files.imaging_date",
  "analysis.input_files.magnification",
  "analysis

In [9]:
# Fields of interest for the endpoint request
field_names = [
    'access',
    'cases.case_id',
    'cases.samples.tissue_type',
    'cases.samples.tumor_descriptor',
    'cases.samples.sample_type',
    'created_datetime',
    'data_category',
    'data_format',
    'data_type',
    'experimental_strategy',
    'file_id',
    'file_name',
    'state',
    'updated_datetime'
]
# The request takes the field names as one comma-separated string
fields = ','.join(field_names)

# Parameters for the endpoint request
params = {
    'fields': fields,
    'format': 'json'
}

# Request some objects from the endpoint.
# `params` was previously built but never sent; it is now the base of the
# payload, with the page size added on top.
# The timeout avoids an indefinite hang; raise_for_status() reports an HTTP
# error directly instead of a KeyError on ['data']['hits'] below.
response = requests.post(
    url=FILES_ENDPOINT,
    headers={'Content-Type': 'application/json'},
    json={**params, 'size': '3'},
    timeout=30
)
response.raise_for_status()

# Parse the JSON body and keep only the list of hits
json_response = response.json()['data']['hits']

# Print the hits
json_response_str = json.dumps(json_response, indent=2)
print(json_response_str)

[
  {
    "id": "6adbeaf6-a001-4923-a022-8beec17ccf74",
    "data_format": "BAM",
    "access": "controlled",
    "cases": [
      {
        "case_id": "3b348a82-127f-42ac-bc41-c55fdf8cac7a",
        "samples": [
          {
            "tumor_descriptor": "Primary",
            "sample_type": "Next Generation Cancer Model",
            "tissue_type": "Tumor"
          }
        ]
      }
    ],
    "updated_datetime": "2023-07-12T10:27:39.704969-05:00",
    "file_name": "a30070a7-6312-4a43-bb15-7559dcdb89e8_wgs_gdc_realn.bam",
    "file_id": "6adbeaf6-a001-4923-a022-8beec17ccf74",
    "data_type": "Aligned Reads",
    "data_category": "Sequencing Reads",
    "state": "released",
    "experimental_strategy": "WGS",
    "created_datetime": "2019-10-11T16:07:54.748320-05:00"
  },
  {
    "id": "025fb946-1d6f-47f6-a512-bbe674f9a09b",
    "data_format": "TXT",
    "cases": [
      {
        "case_id": "56c07b06-c6d3-4c03-9e57-7be636e7cc5c",
        "samples": [
          {
            "tum

## `annotations`

In [10]:
# Request the field mapping of the endpoint.
# The timeout avoids an indefinite hang; raise_for_status() fails fast on an
# HTTP error instead of a later KeyError on 'fields'.
response = requests.get(f'{ANNOTATIONS_ENDPOINT}/_mapping', timeout=30)
response.raise_for_status()

# Parse the JSON body of the response
json_response = response.json()

# Print the names of all fields listed by the endpoint
json_response_str = json.dumps(json_response['fields'], indent=2)
print(json_response_str)

[
  "annotation_autocomplete",
  "annotation_id",
  "case_id",
  "case_submitter_id",
  "category",
  "classification",
  "created_datetime",
  "entity_id",
  "entity_submitter_id",
  "entity_type",
  "legacy_created_datetime",
  "legacy_updated_datetime",
  "notes",
  "project.code",
  "project.dbgap_accession_number",
  "project.disease_type",
  "project.intended_release_date",
  "project.name",
  "project.primary_site",
  "project.program.dbgap_accession_number",
  "project.program.name",
  "project.program.program_id",
  "project.project_id",
  "project.releasable",
  "project.released",
  "project.state",
  "state",
  "status",
  "submitter_id",
  "updated_datetime"
]


In [11]:
# Fields available in the endpoint
field_names = [
    'annotation_autocomplete',
    'annotation_id',
    'case_id',
    'case_submitter_id',
    'category',
    'classification',
    'created_datetime',
    'entity_id',
    'entity_submitter_id',
    'entity_type',
    'legacy_created_datetime',
    'legacy_updated_datetime',
    'notes',
    'project.code',
    'project.dbgap_accession_number',
    'project.disease_type',
    'project.intended_release_date',
    'project.name',
    'project.primary_site',
    'project.program.dbgap_accession_number',
    'project.program.name',
    'project.program.program_id',
    'project.project_id',
    'project.releasable',
    'project.released',
    'project.state',
    'state',
    'status',
    'submitter_id',
    'updated_datetime'
]
# The request takes the field names as one comma-separated string
fields = ','.join(field_names)

# Request a single object from the endpoint.
# The timeout avoids an indefinite hang; raise_for_status() reports an HTTP
# error directly instead of a KeyError on ['data']['hits'] below.
response = requests.post(
    url=ANNOTATIONS_ENDPOINT,
    headers={'Content-Type': 'application/json'},
    json={'fields': fields, 'size': '1'},
    timeout=30
)
response.raise_for_status()

# Parse the JSON body and keep only the list of hits
json_response = response.json()['data']['hits']

# Print the hits
json_response_str = json.dumps(json_response, indent=2)
print(json_response_str)

[
  {
    "id": "f7db73c4-f4ab-4913-803a-7bef497741b4",
    "entity_submitter_id": "d16d1eef-e0df-4590-bcab-57bd3d5094f6",
    "notes": "Real somatic mutations were mistakenly labeled as LOH (Loss of Heterozygosity) in certain SomaticSniper VCF files.",
    "project": {
      "primary_site": [
        "Stomach",
        "Uterus, NOS",
        "Connective, subcutaneous and other soft tissues",
        "Skin",
        "Other and ill-defined sites",
        "Brain",
        "Bones, joints and articular cartilage of other and unspecified sites",
        "Ovary",
        "Other and unspecified parts of biliary tract",
        "Other and unspecified parts of tongue",
        "Corpus uteri",
        "Rectosigmoid junction",
        "Pancreas",
        "Small intestine",
        "Gallbladder",
        "Kidney",
        "Nasal cavity and middle ear",
        "Esophagus",
        "Liver and intrahepatic bile ducts",
        "Other and unspecified parts of mouth",
        "Rectum",
        "Colon

## `manifest`

In [12]:
# Probe the endpoint with the same `_mapping` path used for the other endpoints.
# The timeout avoids an indefinite hang. raise_for_status() is deliberately NOT
# called: in the recorded run the API answers with an error message instead of
# a mapping, and showing that message is the purpose of this cell.
response = requests.get(f'{MANIFEST_ENDPOINT}/_mapping', timeout=30)

# Parse the JSON body of the response
json_response = response.json()

# Print the whole response (not just a 'fields' entry)
json_response_str = json.dumps(json_response, indent=2)
print(json_response_str)

{
  "message": "Data with ids {'_mapping'} not found"
}


## `data`

In [13]:
# Probe the endpoint with the same `_mapping` path used for the other endpoints.
# The timeout avoids an indefinite hang. raise_for_status() is deliberately NOT
# called: in the recorded run the API answers with an error message instead of
# a mapping, and showing that message is the purpose of this cell.
response = requests.get(f'{DATA_ENDPOINT}/_mapping', timeout=30)

# Parse the JSON body of the response
json_response = response.json()

# Print the whole response (not just a 'fields' entry)
json_response_str = json.dumps(json_response, indent=2)
print(json_response_str)

{
  "message": "_mapping not found"
}


In [14]:
# UUID of an Isoform Expression Quantification file
file_id = '0cf6cded-942f-4141-a4a5-35afb7082f37'

# Request the file download from the endpoint.
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(
    url=f'{DATA_ENDPOINT}/{file_id}',
    headers={'Content-Type': 'application/json'},
    timeout=60
)
# Stop on an HTTP error so an error body is never decoded and printed as if
# it were the file content
response.raise_for_status()

# Create a file object in memory holding the downloaded bytes
file_content = io.BytesIO(response.content)

# Decode the in-memory file as UTF-8 text and print it
text_content = file_content.getvalue().decode('utf-8')
print(text_content)

miRNA_ID	isoform_coords	read_count	reads_per_million_miRNA_mapped	cross-mapped	miRNA_region
hsa-let-7a-1	hg38:chr9:94175943-94175961:+	2	0.208676	N	precursor
hsa-let-7a-1	hg38:chr9:94175944-94175961:+	1	0.104338	N	precursor
hsa-let-7a-1	hg38:chr9:94175959-94175981:+	1	0.104338	N	mature,MIMAT0000062
hsa-let-7a-1	hg38:chr9:94175959-94175982:+	1	0.104338	N	mature,MIMAT0000062
hsa-let-7a-1	hg38:chr9:94175959-94175983:+	3	0.313014	N	mature,MIMAT0000062
hsa-let-7a-1	hg38:chr9:94175960-94175979:+	3	0.313014	N	mature,MIMAT0000062
hsa-let-7a-1	hg38:chr9:94175960-94175981:+	1	0.104338	N	mature,MIMAT0000062
hsa-let-7a-1	hg38:chr9:94175960-94175982:+	41	4.277859	N	mature,MIMAT0000062
hsa-let-7a-1	hg38:chr9:94175960-94175983:+	83	8.660056	N	mature,MIMAT0000062
hsa-let-7a-1	hg38:chr9:94175960-94175984:+	2	0.208676	N	mature,MIMAT0000062
hsa-let-7a-1	hg38:chr9:94175961-94175977:+	7	0.730366	N	mature,MIMAT0000062
hsa-let-7a-1	hg38:chr9:94175961-94175978:+	9	0.939042	N	mature,MIMAT0000062
hsa-let-7a-1	h