# **Access to TCGA-BRCA Metadata from the GDC API**

This notebook produces the AT_OR family of data artifacts  

- TCGA: The Cancer Genome Atlas  
- BRCA: Breast Invasive Carcinoma  
- GDC: Genomic Data Commons

# Import Libraries and Configurations

In [1]:
import json
import os
import sys

import numpy as np
import pandas as pd
import requests

# Add project root to Python's path
sys.path.append(os.path.abspath(os.path.join('..')))

from config import (
    BRCA_PREPROCESSING_PARAMETERS,
    BRCA_RAW_FILES_PATHS,
    GDC_API_ENDPOINTS,
)

# Function

In [2]:
def gdc_api_request(endpoint, fields, filters=None, size=1, timeout=None):
    """
    Send a POST request to a Genomic Data Commons (GDC) API endpoint and return its hits.

    Parameters:
    -----------
    endpoint : str
        The URL of the endpoint to send the request to.
    fields : list of str
        A list of fields to include in the response.
    filters : dict, optional
        A dictionary of filters to apply to the request. Default is None, in which
        case an empty dictionary is sent (no filtering).
    size : int, optional
        The number of results to retrieve. Default is 1.
    timeout : float or tuple, optional
        Timeout forwarded unchanged to ``requests.post``. Default is None, which
        applies no timeout.

    Returns:
    --------
    list
        A list of hits (data items) retrieved from the endpoint response.

    Raises:
    -------
    requests.HTTPError
        If the endpoint answers with an error status code.
    """
    # A fresh dictionary per call avoids sharing one mutable default between calls
    if filters is None:
        filters = {}

    # Parameters used in the endpoint request; the size is sent as a string so
    # that any integer-like value (e.g. a NumPy integer) can be JSON-serialized
    params = {
        'fields': ','.join(fields),
        'filters': filters,
        'size': str(size)
    }

    # Request the objects of interest to the endpoint
    response = requests.post(
        url=endpoint,
        headers={'Content-Type': 'application/json'},
        json=params,
        timeout=timeout
    )

    # Fail early on HTTP errors instead of trying to parse an error payload
    response.raise_for_status()

    # Parse the JSON body and keep only the list of hits
    json_response = response.json()

    return json_response['data']['hits']

# GDC API Requests

## Status Endpoint

In [3]:
# Query the status endpoint for the API version and status information
response = requests.get(GDC_API_ENDPOINTS['status'])

# Decode the body once; it is shown pretty-printed when it is valid JSON
# and verbatim otherwise
status_text = response.content.decode('utf-8')
try:
    parsed = json.loads(status_text)
except json.JSONDecodeError:
    print('Response is not valid JSON:')
    print(status_text)
else:
    print(json.dumps(parsed, indent=4))

{
    "commit": "b46d9fe84b7f2803af806ac98e49e8b7199a84c7",
    "data_release": "Data Release 43.0 - May 07, 2025",
    "data_release_version": {
        "major": 43,
        "minor": 0,
        "release_date": "2025-05-07"
    },
    "status": "OK",
    "tag": "7.10",
    "version": 1
}


## Projects Endpoint

### Endpoint Response

In [4]:
# Project identifiers that drive both the filter and the page size
project_ids = BRCA_PREPROCESSING_PARAMETERS['project_ids']

# Fields requested from the projects endpoint: top-level attributes first,
# then the per-category and per-strategy summaries
fields = [
    'disease_type',
    'name',
    'primary_site',
    'summary.case_count',
    'summary.data_categories.case_count',
    'summary.data_categories.data_category',
    'summary.data_categories.file_count',
    'summary.experimental_strategies.case_count',
    'summary.experimental_strategies.experimental_strategy',
    'summary.experimental_strategies.file_count',
    'summary.file_count',
]

# Keep only the projects whose identifier is in the configured list
filters = dict(
    op='in',
    content=dict(field='projects.project_id', value=project_ids),
)

# Ask for as many results as there are configured projects
projects_response = gdc_api_request(
    endpoint=GDC_API_ENDPOINTS['projects'],
    fields=fields,
    filters=filters,
    size=len(project_ids),
)

### Project DataFrame

In [5]:
# Mapping from the flattened API column names to the names used in the stored table
project_column_names = {
    'id': 'project_id',
    'name': 'project_name',
    'summary.case_count': 'case_count',
    'summary.data_categories': 'data_categories',
    'summary.experimental_strategies': 'experimental_strategies',
    'summary.file_count': 'file_count',
}

# Columns kept in the project metadata table, in output order
project_column_order = [
    'project_id',
    'project_name',
    'primary_site',
    'disease_type',
    'experimental_strategies',
    'data_categories',
    'case_count',
    'file_count',
]

# Flatten the projects response, rename its columns and select the ones of interest
df_project = (
    pd.json_normalize(projects_response)
    .rename(columns=project_column_names)
    [project_column_order]
)

# Persist the project metadata as CSV, without the positional index
df_project.to_csv(BRCA_RAW_FILES_PATHS['project'], index=False)

In [6]:
# Raise the column width limit so the nested summary columns are not truncated,
# then display the TCGA-BRCA project metadata DataFrame
pd.options.display.max_colwidth = 900
df_project

Unnamed: 0,project_id,project_name,primary_site,disease_type,experimental_strategies,data_categories,case_count,file_count
0,TCGA-BRCA,Breast Invasive Carcinoma,[Breast],"[Fibroepithelial Neoplasms, Adnexal and Skin Appendage Neoplasms, Adenomas and Adenocarcinomas, Cystic, Mucinous and Serous Neoplasms, Basal Cell Neoplasms, Squamous Cell Neoplasms, Epithelial Neoplasms, NOS, Complex Epithelial Neoplasms, Ductal and Lobular Neoplasms]","[{'file_count': 11079, 'case_count': 1095, 'experimental_strategy': 'RNA-Seq'}, {'file_count': 17049, 'case_count': 1072, 'experimental_strategy': 'WXS'}, {'file_count': 3621, 'case_count': 1079, 'experimental_strategy': 'miRNA-Seq'}, {'file_count': 12688, 'case_count': 952, 'experimental_strategy': 'WGS'}, {'file_count': 75, 'case_count': 74, 'experimental_strategy': 'ATAC-Seq'}, {'file_count': 14329, 'case_count': 1098, 'experimental_strategy': 'Genotyping Array'}, {'file_count': 3714, 'case_count': 1097, 'experimental_strategy': 'Methylation Array'}, {'file_count': 919, 'case_count': 881, 'experimental_strategy': 'Reverse Phase Protein Array'}, {'file_count': 1133, 'case_count': 1062, 'experimental_strategy': 'Diagnostic Slide'}, {'file_count': 1979, 'case_count': 1093, 'experimental_strategy': 'Tissue Slide'}]","[{'file_count': 21437, 'case_count': 1098, 'data_category': 'Simple Nucleotide Variation'}, {'file_count': 9282, 'case_count': 1098, 'data_category': 'Sequencing Reads'}, {'file_count': 5317, 'case_count': 1098, 'data_category': 'Biospecimen'}, {'file_count': 2288, 'case_count': 1098, 'data_category': 'Clinical'}, {'file_count': 14346, 'case_count': 1098, 'data_category': 'Copy Number Variation'}, {'file_count': 4876, 'case_count': 1097, 'data_category': 'Transcriptome Profiling'}, {'file_count': 3714, 'case_count': 1097, 'data_category': 'DNA Methylation'}, {'file_count': 919, 'case_count': 881, 'data_category': 'Proteome Profiling'}, {'file_count': 3128, 'case_count': 927, 'data_category': 'Somatic Structural Variation'}, {'file_count': 5772, 'case_count': 1098, 'data_category': 'Structural Variation'}]",1098,71079


## Cases Endpoint

### Endpoint Response

In [7]:
# Fields requested for every case
fields = ['disease_type', 'submitter_id']

# Keep only the cases that belong to the configured projects
filters = dict(
    op='in',
    content=dict(
        field='project.project_id',
        value=BRCA_PREPROCESSING_PARAMETERS['project_ids'],
    ),
)

# Request as many results as the total case count reported by the projects endpoint
total_case_count = df_project['case_count'].sum()
cases_response = gdc_api_request(
    endpoint=GDC_API_ENDPOINTS['cases'],
    fields=fields,
    filters=filters,
    size=total_case_count,
)

### Cases DataFrame

In [8]:
# Flatten the cases response, expose the hit identifier as `case_id`
# and keep the three columns of interest
cases_column_order = ['case_id', 'submitter_id', 'disease_type']
df_cases = (
    pd.json_normalize(cases_response)
    .rename(columns={'id': 'case_id'})
    [cases_column_order]
)

# Persist the cases metadata as CSV, without the positional index
df_cases.to_csv(BRCA_RAW_FILES_PATHS['cases'], index=False)

In [9]:
# Display the TCGA-BRCA cases metadata DataFrame
# (columns: case_id, submitter_id, disease_type)
df_cases

Unnamed: 0,case_id,submitter_id,disease_type
0,b1d44c81-747d-471f-9093-aeb262a17975,TCGA-Z7-A8R6,"Epithelial Neoplasms, NOS"
1,b205bba0-1870-4458-9088-8817e20389fe,TCGA-A8-A09A,Ductal and Lobular Neoplasms
2,b205c89f-af62-4186-acad-ed23d243fa98,TCGA-A2-A0YL,Ductal and Lobular Neoplasms
3,b26d41cd-393b-4cd4-8925-a9488f7de576,TCGA-C8-A1HE,Ductal and Lobular Neoplasms
4,b2aac45b-2073-4c7a-adb9-769a4fdcc111,TCGA-E9-A1NH,Ductal and Lobular Neoplasms
...,...,...,...
1093,af5453a9-cf1f-40de-aec4-0e0710908fb7,TCGA-A8-A0AD,Ductal and Lobular Neoplasms
1094,af97e043-88cb-4f99-8f8b-9bcbcccdf842,TCGA-EW-A2FR,Ductal and Lobular Neoplasms
1095,b0700958-5f90-4546-b35f-635cd506889b,TCGA-PE-A5DD,Ductal and Lobular Neoplasms
1096,b0bcb829-cee0-4247-bb75-093a5bea89ee,TCGA-E2-A105,Ductal and Lobular Neoplasms


## Files Endpoint

### Endpoint Response

In [10]:
# Fields requested for every file, including the case and sample
# attributes nested under `cases`
fields = [
    'access',
    'cases.case_id',
    'cases.samples.sample_type',
    'cases.samples.tissue_type',
    'data_format',
    'data_type',
    'experimental_strategy',
]

# Keep only the files attached to cases of the configured projects
filters = dict(
    op='in',
    content=dict(
        field='cases.project.project_id',
        value=BRCA_PREPROCESSING_PARAMETERS['project_ids'],
    ),
)

# Request as many results as the total file count reported by the projects endpoint
total_file_count = df_project['file_count'].sum()
files_response = gdc_api_request(
    endpoint=GDC_API_ENDPOINTS['files'],
    fields=fields,
    filters=filters,
    size=total_file_count,
)

### Files DataFrame

In [11]:
# Create the TCGA-BRCA files metadata DataFrame; at this point every row
# still carries its case metadata as a list of dictionaries in `cases`
df_files = pd.json_normalize(files_response) \
    .rename(columns={'id': 'file_id'})

# Explode the lists so that there is one row per (file, case) pair.
# `ignore_index=True` gives the result a fresh 0..n-1 index; without it the
# exploded rows of a file share one index label, and the column-wise concat
# below would depend on how `json_normalize` indexes its output.
df_files = df_files.explode('cases', ignore_index=True)

# Expand each case dictionary into its own columns. A plain list is passed so
# the result is positionally indexed and lines up row-by-row with `df_files`.
files_case_columns = pd.json_normalize(df_files['cases'].tolist())

# Attach the expanded case columns next to the file columns
df_files = pd.concat(
    objs=[df_files, files_case_columns],
    axis='columns'
)

# Drop the now-redundant `cases` column and reorganize the DataFrame columns
df_files = df_files \
    .drop(columns=['cases']) \
    [[
        'file_id',
        'case_id',
        'access',
        'experimental_strategy',
        'data_type',
        'data_format',
        'samples',
    ]]

# Store the files metadata DataFrame in a CSV file
df_files.to_csv(BRCA_RAW_FILES_PATHS['files'], index=False)

In [12]:
# Set the column width limit used for the `samples` column,
# then display the TCGA-BRCA files metadata DataFrame
pd.options.display.max_colwidth = 300
df_files

Unnamed: 0,file_id,case_id,access,experimental_strategy,data_type,data_format,samples
0,75a8669a-692c-468c-8469-d2e61f6d37d8,cc348a26-ee11-47a4-8b51-de922967e175,open,Methylation Array,Masked Intensities,IDAT,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]"
1,7abd94ba-570e-43d4-9360-ad2d8399c960,f2bbfa9d-9a9d-4f46-9fde-378e4c44e2ad,controlled,WGS,Structural Rearrangement,BEDPE,"[{'sample_type': 'Blood Derived Normal', 'tissue_type': 'Normal'}, {'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]"
2,97ef35dc-9c03-49ce-8103-5e8c55be8805,f2bbfa9d-9a9d-4f46-9fde-378e4c44e2ad,controlled,RNA-Seq,Transcript Fusion,TSV,"[{'sample_type': 'Solid Tissue Normal', 'tissue_type': 'Normal'}]"
3,0518551d-4df2-4124-b68d-494200c5586b,a6edb6ca-ae9f-4da7-8ebe-92d83d2987fb,open,WXS,Masked Somatic Mutation,MAF,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}, {'sample_type': 'Blood Derived Normal', 'tissue_type': 'Normal'}]"
4,2044bb65-05cf-4ef7-9543-3b7c5c2ff4d5,ec0ab947-9341-4fff-bda4-fdfb9434d508,open,Methylation Array,Masked Intensities,IDAT,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]"
...,...,...,...,...,...,...,...
91917,c680a261-9fdf-48dd-9e87-274881d700c8,da70cf7e-0e61-4c72-b4c5-c408569d11b8,open,miRNA-Seq,miRNA Expression Quantification,TXT,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]"
91918,ad76f12b-1710-4e83-8cf2-21c1b6863d8e,d9627184-b972-4e00-8c11-b0b946ac357a,controlled,WGS,Aligned Reads,BAM,"[{'sample_type': 'Blood Derived Normal', 'tissue_type': 'Normal'}]"
91919,42eacf87-e962-4b33-859e-29944271e335,d9627184-b972-4e00-8c11-b0b946ac357a,open,Genotyping Array,Masked Copy Number Segment,TXT,"[{'sample_type': 'Blood Derived Normal', 'tissue_type': 'Normal'}]"
91920,b49a041c-2f46-412f-b10d-b0dcb40737cb,d9627184-b972-4e00-8c11-b0b946ac357a,open,,Biospecimen Supplement,BCR SSF XML,
