# **Preprocessing TCGA Data from GDC API**
TCGA: The Cancer Genome Atlas  
GDC: Genomic Data Commons

# Importing Libraries

In [1]:
import json

import pandas as pd
import requests

# Constants and Path

In [2]:
# Root URL of the GDC REST API; every endpoint below is derived from it
GDC_API_URL = 'https://api.gdc.cancer.gov'

# Status endpoint: reports the API status and version
STATUS_ENDPOINT = GDC_API_URL + '/status'

# Projects endpoint: projects related to programs
PROJECTS_ENDPOINT = GDC_API_URL + '/projects'

# Cases endpoint: cases related to projects
CASES_ENDPOINT = GDC_API_URL + '/cases'

# Files endpoint: files related to cases
FILES_ENDPOINT = GDC_API_URL + '/files'

# Folder (relative to this notebook) where the interim CSV files are written
INTERIM_DATA_PATH = '../../data/interim/gdc-api'

# Function

In [3]:
def gdc_api_request(endpoint, fields, filters=None, size=1):
    """
    Send a POST request to a GDC API endpoint and return the retrieved hits.

    Parameters:
    -----------
    endpoint : str
        The URL of the endpoint to send the request to.
    fields : list of str
        A list of fields to include in the response.
    filters : dict, optional
        A dictionary of filters to apply to the request. Default is None,
        which sends an empty filter dictionary (no filtering).
    size : int, optional
        The number of results to retrieve. Default is 1. The value is
        converted with ``str`` before being sent.

    Returns:
    --------
    list
        A list of hits (data items) retrieved from the endpoint response.

    Raises:
    -------
    requests.HTTPError
        If the endpoint answers with a 4xx or 5xx status code.
    """
    # Parameters used in the endpoint request
    params = {
        'fields': ','.join(fields),
        # A fresh dictionary is created per call so that no mutable default
        # object is shared (and possibly modified) between calls
        'filters': {} if filters is None else filters,
        'size': str(size)
    }

    # Request the objects of interest to the endpoint
    response = requests.post(
        url=endpoint,
        headers={'Content-Type': 'application/json'},
        json=params
    )

    # Fail with the HTTP error itself rather than with a KeyError or a JSON
    # decoding error raised while reading an error payload below
    response.raise_for_status()

    # Decode the JSON body and keep only the list of hits
    return response.json()['data']['hits']

# API Status

In [4]:
# Ask the status endpoint for the API status and version
response = requests.get(url=STATUS_ENDPOINT)

# Show the raw response body, decoded as UTF-8 text
print(str(response.content, encoding='utf-8'))

{"commit":"48add4be7ac46e7db10e0c6f0e3010d5bb2a50aa","data_release":"Data Release 42.0 - January 30, 2025","data_release_version":{"major":42,"minor":0,"release_date":"2025-01-30"},"status":"OK","tag":"7.7","version":1}



# Projects

## All Projects

In [5]:
# Request the default number of objects to the endpoint; only the pagination
# metadata of the response is used in this cell
response = requests.post(
    url=PROJECTS_ENDPOINT,
    headers={'Content-Type': 'application/json'},
    json={'fields': 'name'}
)

# Stop on an HTTP error instead of failing below with a confusing KeyError
response.raise_for_status()

# Convert the response content to JSON
json_response = response.json()

# Store the total number of projects contained in the GDC API. It is kept as
# an integer so it can be used directly as a request size or in arithmetic
total_projects = int(json_response['data']['pagination']['total'])

In [6]:
# Project attributes to retrieve from the projects endpoint
fields = ['disease_type', 'name', 'primary_site', 'summary.case_count']

# Fetch all the projects in a single request by asking for as many results
# as there are projects in the GDC API
projects_response = gdc_api_request(
    PROJECTS_ENDPOINT,
    fields,
    size=total_projects
)

## TCGA Projects DataFrame

In [7]:
# Create the TCGA projects DataFrame in a single chain:
#   1. flatten the JSON hits into columns,
#   2. prefix the column names with "project_",
#   3. keep only the projects whose identifier starts with "TCGA-",
#   4. sort by number of cases and renumber the rows.
# The row mask is built inside `.loc` so that no variable named `filter`
# (which would shadow the Python builtin) is left in the kernel namespace.
df_projects = (
    pd.json_normalize(projects_response)
    .rename(
        columns={
            'id': 'project_id',
            'disease_type': 'project_disease_type',
            'name': 'project_name',
            'primary_site': 'project_primary_site',
            'summary.case_count': 'project_case_count'
        }
    )
    .loc[lambda df: df['project_id'].str.startswith('TCGA-')]
    .sort_values(by='project_case_count')
    .reset_index(drop=True)
)

In [8]:
# Write the TCGA projects DataFrame to a CSV file in the interim data folder
file_name = 'tcga-projects.csv'
df_projects.to_csv(INTERIM_DATA_PATH + '/' + file_name, index=False)

# Preview the first three TCGA projects
df_projects.head(n=3)

Unnamed: 0,project_id,project_primary_site,project_disease_type,project_name,project_case_count
0,TCGA-CHOL,"[Other and unspecified parts of biliary tract,...",[Adenomas and Adenocarcinomas],Cholangiocarcinoma,51
1,TCGA-UCS,"[Corpus uteri, Uterus, NOS]","[Complex Mixed and Stromal Neoplasms, Basal Ce...",Uterine Carcinosarcoma,57
2,TCGA-DLBC,"[Testis, Brain, Breast, Heart, mediastinum, an...","[Mature B-Cell Lymphomas, Not Reported]",Lymphoid Neoplasm Diffuse Large B-cell Lymphoma,58


# Cases

## Cases of Interest

In [9]:
# Case attributes (and nested file attributes) to retrieve from the cases
# endpoint
fields = [
    'disease_type',
    'files.created_datetime',
    'files.data_format',
    'files.data_type',
    'files.file_id',
    'files.experimental_strategy',
    'files.updated_datetime',
    'primary_site',
    'project.project_id',
    'submitter_id'
]

# Accepted values per filtered field: cases must belong to one of the TCGA
# projects and have at least one miRNA-Seq or RNA-Seq file
filter_values = {
    'project.project_id': df_projects['project_id'].tolist(),
    'files.experimental_strategy': ['miRNA-Seq', 'RNA-Seq']
}

# Combine one "in" clause per field with a logical "and"
filters = {
    'op': 'and',
    'content': [
        {'op': 'in', 'content': {'field': field, 'value': values}}
        for field, values in filter_values.items()
    ]
}

# Request the cases; the sum of the per-project case counts is used as the
# number of results to retrieve
cases_response = gdc_api_request(
    CASES_ENDPOINT,
    fields,
    filters,
    size=df_projects['project_case_count'].sum()
)

## TCGA Cases of Interest DataFrame

In [10]:
# New names for the columns returned by the cases endpoint
case_columns = {
    'disease_type': 'case_disease_type',
    'id': 'case_id',
    'primary_site': 'case_primary_site',
    'project.project_id': 'project_id'
}

# TCGA cases of interest, each row still carrying its `files` column
df_cases_and_files = pd.json_normalize(cases_response)
df_cases_and_files = df_cases_and_files.rename(columns=case_columns)

# Same cases without the `files` column
df_cases = df_cases_and_files.drop(columns='files')

In [11]:
# Preview the first three TCGA cases of interest with their associated files
df_cases_and_files.head(n=3)

Unnamed: 0,case_id,case_primary_site,case_disease_type,submitter_id,files,project_id
0,4298ccdb-2e6d-4267-822d-75b021364084,Kidney,Adenomas and Adenocarcinomas,TCGA-B0-4710,"[{'data_format': 'IDAT', 'updated_datetime': '...",TCGA-KIRC
1,a2663a86-a006-4867-9e88-2b523df48303,Kidney,Adenomas and Adenocarcinomas,TCGA-B8-A54K,"[{'data_format': 'TSV', 'updated_datetime': '2...",TCGA-KIRC
2,439794a8-51bd-4c70-968c-34cf26b90148,Kidney,Adenomas and Adenocarcinomas,TCGA-B0-5113,"[{'data_format': 'CEL', 'updated_datetime': '2...",TCGA-KIRC


In [12]:
# Write the TCGA cases of interest to a CSV file in the interim data folder
file_name = 'tcga-cases-of-interest.csv'
df_cases.to_csv(INTERIM_DATA_PATH + '/' + file_name, index=False)

# Show the TCGA cases of interest
df_cases

Unnamed: 0,case_id,case_primary_site,case_disease_type,submitter_id,project_id
0,4298ccdb-2e6d-4267-822d-75b021364084,Kidney,Adenomas and Adenocarcinomas,TCGA-B0-4710,TCGA-KIRC
1,a2663a86-a006-4867-9e88-2b523df48303,Kidney,Adenomas and Adenocarcinomas,TCGA-B8-A54K,TCGA-KIRC
2,439794a8-51bd-4c70-968c-34cf26b90148,Kidney,Adenomas and Adenocarcinomas,TCGA-B0-5113,TCGA-KIRC
3,e865d40a-9989-436c-8426-88cc84c863e8,Kidney,Adenomas and Adenocarcinomas,TCGA-CJ-5689,TCGA-KIRC
4,305eaef4-4644-46e3-a696-d2e4a972f691,Kidney,Adenomas and Adenocarcinomas,TCGA-CZ-4865,TCGA-KIRC
...,...,...,...,...,...
10755,7e6cdbc7-26a7-44f2-96fa-094738cbccae,Ovary,"Cystic, Mucinous and Serous Neoplasms",TCGA-04-1370,TCGA-OV
10756,81005bae-686a-4598-8994-49d90ebac56f,Ovary,"Cystic, Mucinous and Serous Neoplasms",TCGA-23-1121,TCGA-OV
10757,818dc159-aba4-46bc-a4ed-68ee0f8c4461,Ovary,"Cystic, Mucinous and Serous Neoplasms",TCGA-25-2042,TCGA-OV
10758,82093ed9-a3c8-4e34-931f-4ec7ae745711,Ovary,"Cystic, Mucinous and Serous Neoplasms",TCGA-13-1512,TCGA-OV


# Files

## Files of Interest DataFrame

In [13]:
# Experimental strategies and data types that define the files of interest
strategies_of_interest = ('miRNA-Seq', 'RNA-Seq')
data_types_of_interest = [
    'Gene Expression Quantification',
    'Isoform Expression Quantification'
]

# Explode the lists of file metadata dictionaries into one row per file.
# The result is stored under a new name so that `df_cases_and_files` is left
# untouched and this cell gives the same result when it is run again
df_case_files = df_cases_and_files \
    .explode('files') \
    .reset_index(drop=True)

# Keep the files related to miRNA-Seq or RNA-Seq experimental strategies.
# Entries that are not dictionaries (e.g. a missing value) are discarded
# instead of raising inside the lambda
is_of_interest = df_case_files['files'].apply(
    lambda x: (
        isinstance(x, dict)
        and x.get('experimental_strategy') in strategies_of_interest
    )
)
df_case_files = df_case_files[is_of_interest].reset_index(drop=True)

# Explode dictionary contents with file metadata into columns. A plain list
# is passed because, depending on the pandas version, `json_normalize` may
# keep the index of a Series input; a list always yields a fresh 0..n-1
# index that lines up row by row with `df_case_files` in the concat below
df_file_metadata = pd.json_normalize(df_case_files['files'].to_list())

# Create the DataFrame of files of interest: attach the metadata columns,
# keep the expression quantification files and drop the columns that are
# either stored with the cases or no longer needed
df_files = pd.concat(
    objs=[df_case_files, df_file_metadata],
    axis='columns'
) \
    .loc[lambda df: df['data_type'].isin(data_types_of_interest)] \
    .drop(
        columns=[
            'case_disease_type',
            'case_primary_site',
            'experimental_strategy',
            'files',
            'project_id'
        ]
    ) \
    .reset_index(drop=True)

In [14]:
# Display the DataFrame of files of interest as the cell output
df_files

Unnamed: 0,case_id,submitter_id,data_format,updated_datetime,file_id,data_type,created_datetime
0,4298ccdb-2e6d-4267-822d-75b021364084,TCGA-B0-4710,TXT,2024-07-29T21:20:32.946706-05:00,7ebb3c64-2d8b-4388-8ad8-555fb7cea1b4,Isoform Expression Quantification,2018-03-20T00:16:05.303968-05:00
1,4298ccdb-2e6d-4267-822d-75b021364084,TCGA-B0-4710,TSV,2024-07-30T11:11:25.450940-05:00,6d0df9bf-69d6-447f-b2d9-76391e95d61d,Gene Expression Quantification,2021-12-13T19:31:32.504341-06:00
2,a2663a86-a006-4867-9e88-2b523df48303,TCGA-B8-A54K,TXT,2024-07-29T21:23:47.515560-05:00,9f7e5a27-9a09-474e-a2d6-ed478a124fe3,Isoform Expression Quantification,2018-03-19T23:43:21.888100-05:00
3,a2663a86-a006-4867-9e88-2b523df48303,TCGA-B8-A54K,TSV,2024-07-30T11:09:53.955851-05:00,d1cd1a00-c44a-4581-ad07-e234303a65b9,Gene Expression Quantification,2021-12-13T19:24:52.105085-06:00
4,439794a8-51bd-4c70-968c-34cf26b90148,TCGA-B0-5113,TXT,2024-07-29T21:20:18.451351-05:00,7de693eb-d430-4f0b-94d9-45c13b84724b,Isoform Expression Quantification,2018-03-19T23:58:10.630420-05:00
...,...,...,...,...,...,...,...
22935,818dc159-aba4-46bc-a4ed-68ee0f8c4461,TCGA-25-2042,TXT,2024-07-29T12:19:31.879399-05:00,838fbb4e-4e5e-41b0-8d19-a8ba28c9c503,Isoform Expression Quantification,2018-03-20T06:44:11.472655-05:00
22936,82093ed9-a3c8-4e34-931f-4ec7ae745711,TCGA-13-1512,TXT,2024-07-29T12:28:35.620233-05:00,9e68a248-4c95-4fbe-a0e6-a14aa5d0d199,Isoform Expression Quantification,2018-03-20T06:44:51.672601-05:00
22937,82093ed9-a3c8-4e34-931f-4ec7ae745711,TCGA-13-1512,TSV,2024-07-30T14:06:48.694159-05:00,e50173ef-c0f2-4b59-8ab4-d24997f38c5d,Gene Expression Quantification,2021-12-13T20:53:09.567951-06:00
22938,838693d5-1ae0-4d75-834b-e1b89b96b0ed,TCGA-29-1768,TXT,2024-07-29T11:58:33.147812-05:00,1b3823ea-12bc-4dd7-b941-a42d9a79078c,Isoform Expression Quantification,2018-03-20T07:07:09.485122-05:00


## Files Sample DataFrame

In [15]:
# UUIDs of the files of interest
file_ids = df_files['file_id'].tolist()

# Sample attributes to retrieve from the files endpoint
fields = ['cases.samples.tissue_type', 'cases.samples.sample_type']

# Accepted values per filtered field: only the files of interest, and only
# those related to primary tumor or solid normal tissue samples
filter_values = {
    'file_id': file_ids,
    'cases.samples.sample_type': ['Primary Tumor', 'Solid Tissue Normal']
}

# Combine one "in" clause per field with a logical "and"
filters = {
    'op': 'and',
    'content': [
        {'op': 'in', 'content': {'field': field, 'value': values}}
        for field, values in filter_values.items()
    ]
}

# Request the files, asking for as many results as there are file UUIDs
files_response = gdc_api_request(
    FILES_ENDPOINT,
    fields,
    filters,
    size=len(file_ids)
)

In [16]:
# Build one record per file hit: the file UUID followed by the metadata of
# the first sample of the first case related to the file. Building the
# records explicitly keeps each file UUID attached to its own sample
# metadata, instead of relying on the row order of separately exploded
# DataFrames staying aligned
sample_records = [
    {'file_id': hit['id'], **hit['cases'][0]['samples'][0]}
    for hit in files_response
]

# Create the DataFrame of samples related to the files
df_files_samples = pd.json_normalize(sample_records)

In [17]:
# Display the DataFrame of samples related to the files as the cell output
df_files_samples

Unnamed: 0,file_id,sample_type,tissue_type
0,3267dc27-9c0e-4572-a244-92fb9972ce6f,Primary Tumor,Tumor
1,be6dc660-6d65-443e-9865-432eea2ad1aa,Solid Tissue Normal,Normal
2,3df46342-b06f-46bf-8757-762eec5fee57,Solid Tissue Normal,Normal
3,568f568a-405a-4b79-a52b-a48b712cb4fd,Primary Tumor,Tumor
4,34c2a9e9-4019-48fb-b142-e035ac796e24,Primary Tumor,Tumor
...,...,...,...
21709,c8a68cbe-87a1-44b3-bf4b-68ba0de91994,Primary Tumor,Tumor
21710,5c3ec03d-d9eb-48e8-b132-5733dcf83131,Primary Tumor,Tumor
21711,42afded2-aa23-4f7f-b86f-ac47b324db1d,Primary Tumor,Tumor
21712,90667fe4-7e68-4bf0-b9cf-1b56439f1c3b,Primary Tumor,Tumor


## TCGA Files of Interest DataFrame

In [18]:
# Columns kept in the final table, in output order
output_columns = [
    'file_id',
    'data_format',
    'data_type',
    'sample_type',
    'tissue_type',
    'created_datetime',
    'updated_datetime',
    'case_id'
]

# Attach the sample metadata to the TCGA files of interest; the inner join
# keeps only the files that have a matching row in `df_files_samples`
df_files = df_files.merge(
    right=df_files_samples,
    on='file_id',
    how='inner'
)
df_files = df_files[output_columns]

In [19]:
# Write the TCGA files of interest to a CSV file in the interim data folder
file_name = 'tcga-files-of-interest.csv'
df_files.to_csv(INTERIM_DATA_PATH + '/' + file_name, index=False)

# Show the TCGA files of interest with their associated samples
df_files

Unnamed: 0,file_id,data_format,data_type,sample_type,tissue_type,created_datetime,updated_datetime,case_id
0,7ebb3c64-2d8b-4388-8ad8-555fb7cea1b4,TXT,Isoform Expression Quantification,Primary Tumor,Tumor,2018-03-20T00:16:05.303968-05:00,2024-07-29T21:20:32.946706-05:00,4298ccdb-2e6d-4267-822d-75b021364084
1,6d0df9bf-69d6-447f-b2d9-76391e95d61d,TSV,Gene Expression Quantification,Primary Tumor,Tumor,2021-12-13T19:31:32.504341-06:00,2024-07-30T11:11:25.450940-05:00,4298ccdb-2e6d-4267-822d-75b021364084
2,9f7e5a27-9a09-474e-a2d6-ed478a124fe3,TXT,Isoform Expression Quantification,Primary Tumor,Tumor,2018-03-19T23:43:21.888100-05:00,2024-07-29T21:23:47.515560-05:00,a2663a86-a006-4867-9e88-2b523df48303
3,d1cd1a00-c44a-4581-ad07-e234303a65b9,TSV,Gene Expression Quantification,Primary Tumor,Tumor,2021-12-13T19:24:52.105085-06:00,2024-07-30T11:09:53.955851-05:00,a2663a86-a006-4867-9e88-2b523df48303
4,7de693eb-d430-4f0b-94d9-45c13b84724b,TXT,Isoform Expression Quantification,Primary Tumor,Tumor,2018-03-19T23:58:10.630420-05:00,2024-07-29T21:20:18.451351-05:00,439794a8-51bd-4c70-968c-34cf26b90148
...,...,...,...,...,...,...,...,...
21709,838fbb4e-4e5e-41b0-8d19-a8ba28c9c503,TXT,Isoform Expression Quantification,Primary Tumor,Tumor,2018-03-20T06:44:11.472655-05:00,2024-07-29T12:19:31.879399-05:00,818dc159-aba4-46bc-a4ed-68ee0f8c4461
21710,9e68a248-4c95-4fbe-a0e6-a14aa5d0d199,TXT,Isoform Expression Quantification,Primary Tumor,Tumor,2018-03-20T06:44:51.672601-05:00,2024-07-29T12:28:35.620233-05:00,82093ed9-a3c8-4e34-931f-4ec7ae745711
21711,e50173ef-c0f2-4b59-8ab4-d24997f38c5d,TSV,Gene Expression Quantification,Primary Tumor,Tumor,2021-12-13T20:53:09.567951-06:00,2024-07-30T14:06:48.694159-05:00,82093ed9-a3c8-4e34-931f-4ec7ae745711
21712,1b3823ea-12bc-4dd7-b941-a42d9a79078c,TXT,Isoform Expression Quantification,Primary Tumor,Tumor,2018-03-20T07:07:09.485122-05:00,2024-07-29T11:58:33.147812-05:00,838693d5-1ae0-4d75-834b-e1b89b96b0ed
