# **Preprocessing Metadata from GDC API**
GDC: Genomic Data Commons

# Importing Libraries and Configurations

In [1]:
import json
import os
import sys

from urllib.parse import urljoin

import pandas as pd
import requests

# Add project root to Python's path
sys.path.append(os.path.abspath(os.path.join('..', '..')))

from config import (
    GDC_API_ENDPOINTS,
    GDC_INTERIM_FILE_PATHS,
)

# API Status

In [2]:
# Query the status endpoint for the API version and the current data release
response = requests.get(GDC_API_ENDPOINTS['status'])

# Show the raw payload returned by the API, decoded as UTF-8 text
status_payload = response.content.decode('utf-8')
print(status_payload)

{"commit":"4bb408881e6dc67eca93ff9fd913629a8f2d11c2","data_release":"Data Release 42.0 - January 30, 2025","data_release_version":{"major":42,"minor":0,"release_date":"2025-01-30"},"status":"OK","tag":"7.8.5","version":1}



# Projects

## All Projects
Total projects contained in the GDC API

In [3]:
# Request the field mapping of the projects endpoint
response = requests.get(urljoin(GDC_API_ENDPOINTS['projects'] + '/', '_mapping'))

# Stop here on an HTTP error instead of trying to parse an error reply
response.raise_for_status()

# Convert the response content to JSON
json_response = json.loads(response.content.decode('utf-8'))

# Store the fields available in the endpoint as one comma-separated string.
# Joining the names directly avoids slicing and stripping characters out of
# the serialized JSON list, which would silently mangle any name containing
# a quote or a space.
fields_projects = ','.join(json_response['fields'])

In [4]:
# Ask the projects endpoint for a default-sized page with a single field;
# only the pagination metadata of the reply is used below
response = requests.post(
    url=GDC_API_ENDPOINTS['projects'],
    headers={'Content-Type': 'application/json'},
    json={'fields': 'name'},
)

# Parse the JSON body of the reply
json_response = json.loads(response.content.decode('utf-8'))

# Keep the total number of projects reported by the API, serialized as a
# string so it can be passed as the request size later on
pagination = json_response['data']['pagination']
total_projects = json.dumps(pagination['total'])
print(f'Total number of projects: {total_projects}')

Total number of projects: 86


In [5]:
# Request body: every available field, for as many objects as there are projects
params = {'fields': fields_projects, 'size': total_projects}

# Fetch all project objects from the endpoint in a single request
response = requests.post(
    url=GDC_API_ENDPOINTS['projects'],
    headers={'Content-Type': 'application/json'},
    json=params,
)

# Parse the reply and keep only the list of project objects
json_response = json.loads(response.content.decode('utf-8'))['data']['hits']

# Replace top-level JSON booleans with their string form ('True' / 'False')
for project in json_response:
    for field, value in project.items():
        if isinstance(value, bool):
            project[field] = str(value)

# Flatten the project objects into a DataFrame ordered by project id
df_all_projects = pd.json_normalize(json_response)
df_all_projects = df_all_projects.sort_values(by='id').reset_index(drop=True)

In [6]:
# Save the table of all projects to its interim CSV file, without the index
all_projects_csv = GDC_INTERIM_FILE_PATHS['all-projects']
df_all_projects.to_csv(all_projects_csv, index=False)

# Cap the displayed column width, then preview the first rows
pd.set_option('display.max_colwidth', 100)
df_all_projects.head()

Unnamed: 0,id,primary_site,dbgap_accession_number,project_id,disease_type,name,releasable,state,released,summary.file_count,summary.data_categories,summary.experimental_strategies,summary.case_count,summary.file_size,program.dbgap_accession_number,program.program_id,program.name
0,APOLLO-LUAD,[Bronchus and lung],phs003011,APOLLO-LUAD,[Adenomas and Adenocarcinomas],APOLLO1: Proteogenomic characterization of lung adenocarcinoma,False,open,True,1679,"[{'file_count': 435, 'case_count': 87, 'data_category': 'Sequencing Reads'}, {'file_count': 194,...","[{'file_count': 783, 'case_count': 87, 'experimental_strategy': 'RNA-Seq'}, {'file_count': 896, ...",87,67389870223275,,bd5bea9d-29ff-52e4-92f5-463837c6f6cf,APOLLO
1,BEATAML1.0-COHORT,[Hematopoietic and reticuloendothelial systems],phs001657,BEATAML1.0-COHORT,"[Plasma Cell Tumors, Myelodysplastic Syndromes, Unknown, Chronic Myeloproliferative Disorders, M...",Functional Genomic Landscape of Acute Myeloid Leukemia,True,open,True,16794,"[{'file_count': 8777, 'case_count': 759, 'data_category': 'Simple Nucleotide Variation'}, {'file...","[{'file_count': 7478, 'case_count': 534, 'experimental_strategy': 'WXS'}, {'file_count': 2653, '...",826,42355372208215,,1eaac0a3-91f6-54b3-9a7c-e34468eaf527,BEATAML1.0
2,BEATAML1.0-CRENOLANIB,[Hematopoietic and reticuloendothelial systems],phs001628,BEATAML1.0-CRENOLANIB,[Myeloid Leukemias],Clinical Resistance to Crenolanib in Acute Myeloid Leukemia Due to Diverse Molecular Mechanisms,False,open,True,547,"[{'file_count': 428, 'case_count': 51, 'data_category': 'Simple Nucleotide Variation'}, {'file_c...","[{'file_count': 547, 'case_count': 56, 'experimental_strategy': 'WXS'}]",56,3608907348655,,1eaac0a3-91f6-54b3-9a7c-e34468eaf527,BEATAML1.0
3,CDDP_EAGLE-1,[Bronchus and lung],phs001239,CDDP_EAGLE-1,[Adenomas and Adenocarcinomas],CDDP Integrative Analysis of Lung Adenocarcinoma (Phase 2),True,open,True,1796,"[{'file_count': 788, 'case_count': 50, 'data_category': 'Simple Nucleotide Variation'}, {'file_c...","[{'file_count': 800, 'case_count': 50, 'experimental_strategy': 'WXS'}, {'file_count': 432, 'cas...",50,17540184214951,,8e1f8ca7-0a89-5f0e-ad9d-67ee0c0c9bdb,CDDP_EAGLE
4,CGCI-BLGSP,[Hematopoietic and reticuloendothelial systems],phs000527,CGCI-BLGSP,[Mature B-Cell Lymphomas],Burkitt Lymphoma Genome Sequencing Project,True,submitted,True,11236,"[{'file_count': 2031, 'case_count': 262, 'data_category': 'Simple Nucleotide Variation'}, {'file...","[{'file_count': 1847, 'case_count': 224, 'experimental_strategy': 'Targeted Sequencing'}, {'file...",324,168020390722014,phs000235,d339a424-ad52-5351-a499-493973249414,CGCI


## Projects of Interest
Released projects that have both miRNA-Seq and RNA-Seq cases

In [7]:
# Filter released projects and drop unnecessary columns.
# `released` holds the strings 'True' / 'False' at this point, not booleans.
df_projects = df_all_projects \
    .query('released == "True"') \
    .drop(
        columns=[
            'dbgap_accession_number',
            'id',
            'releasable',
            'released',
            'program.dbgap_accession_number',
            'program.name',
            'program.program_id',
            'state',
            'summary.data_categories',
            'summary.file_count',
            'summary.file_size'
        ]
    )


def _strategy_case_count(strategies, strategy):
    """Return the case count listed for ``strategy`` in a project summary.

    Parameters
    ----------
    strategies
        Value of one ``summary.experimental_strategies`` cell: a list of
        dictionaries with the keys ``experimental_strategy`` and
        ``case_count``. Any non-list value (for instance the NaN that fills
        a missing summary) is treated as "no strategies".
    strategy
        Name of the experimental strategy to look for.

    Returns
    -------
    The ``case_count`` of the matching entry, or 0 when there is none. If
    the strategy is listed more than once, the last entry wins.
    """
    count = 0
    if isinstance(strategies, list):
        for info in strategies:
            if info['experimental_strategy'] == strategy:
                count = info['case_count']
    return count


# Count the cases with miRNA-Seq and RNA-Seq in each project. The column
# values are iterated in row order rather than looked up by index label:
# after the filter above the labels may have gaps, and a label lookup with
# 0..n-1 would then fail or pick the wrong row.
strategies_per_project = df_projects['summary.experimental_strategies']
miRNA_Seq_case_count = [
    _strategy_case_count(strategies, 'miRNA-Seq')
    for strategies in strategies_per_project
]
RNA_Seq_case_count = [
    _strategy_case_count(strategies, 'RNA-Seq')
    for strategies in strategies_per_project
]

# Add the count of cases with miRNA-Seq and RNA-Seq to the DataFrame
df_projects['miRNA-Seq_case_count'] = miRNA_Seq_case_count
df_projects['RNA-Seq_case_count'] = RNA_Seq_case_count

# Keep the projects that have cases for both miRNA-Seq and RNA-Seq
df_projects = df_projects \
    .query('(`miRNA-Seq_case_count` > 0) & (`RNA-Seq_case_count` > 0)') \
    .drop(columns='summary.experimental_strategies') \
    .reset_index(drop=True)

# Rearrange and rename the DataFrame columns
columns = [
    'project_id',
    'name',
    'primary_site',
    'disease_type',
    'summary.case_count',
    'miRNA-Seq_case_count',
    'RNA-Seq_case_count'
]
df_projects = df_projects[columns] \
    .rename(
        columns={
            'name': 'project_name',
            'summary.case_count': 'case_count',
        }
    )

# Print the total number of projects with the desired characteristics
print(f'Total number of projects of interest: {df_projects.shape[0]}')

Total number of projects of interest: 47


In [8]:
# Save the projects of interest to their interim CSV file, without the index
projects_csv = GDC_INTERIM_FILE_PATHS['projects']
df_projects.to_csv(projects_csv, index=False)

# Cap the displayed column width, then preview the first rows
pd.set_option('display.max_colwidth', 100)
df_projects.head()

Unnamed: 0,project_id,project_name,primary_site,disease_type,case_count,miRNA-Seq_case_count,RNA-Seq_case_count
0,CGCI-BLGSP,Burkitt Lymphoma Genome Sequencing Project,[Hematopoietic and reticuloendothelial systems],[Mature B-Cell Lymphomas],324,278,283
1,CGCI-HTMCP-CC,HIV+ Tumor Molecular Characterization Project - Cervical Cancer,[Cervix uteri],"[Adenomas and Adenocarcinomas, Squamous Cell Neoplasms, Epithelial Neoplasms, NOS, Complex Epith...",212,118,123
2,CGCI-HTMCP-DLBCL,HIV+ Tumor Molecular Characterization Project - Diffuse Large B-Cell Lymphoma,[Hematopoietic and reticuloendothelial systems],[Mature B-Cell Lymphomas],70,60,55
3,CGCI-HTMCP-LC,HIV+ Tumor Molecular Characterization Project - Lung Cancer,[Bronchus and lung],"[Paragangliomas and Glomus Tumors, Squamous Cell Neoplasms, Adenomas and Adenocarcinomas, Epithe...",39,37,38
4,CPTAC-2,"CPTAC-Breast, Colon, Ovary","[Colon, Ovary, Rectum, Other and unspecified female genital organs, Retroperitoneum and peritone...","[Ductal and Lobular Neoplasms, Cystic, Mucinous and Serous Neoplasms, Squamous Cell Neoplasms, N...",342,310,340


# Cases and Files

## Cases of Interest 
Cases from the projects of interest that have miRNA-Seq or RNA-Seq files

In [9]:
# Fields of interest for the endpoint request, as a comma-separated string
fields = ','.join([
    'disease_type',
    'files.access',
    'files.data_category',
    'files.data_format',
    'files.data_type',
    'files.file_id',
    'files.experimental_strategy',
    'primary_site',
    'project.project_id',
    'submitter_id'
])

# Per-project DataFrames, concatenated once after the loop so the accumulated
# rows are not copied again on every iteration
project_case_frames = []

# Search each project of interest for cases with miRNA-Seq or RNA-Seq files
for project_id, case_count in zip(
    df_projects['project_id'], df_projects['case_count']
):
    # Filters used in the endpoint request
    filters = {
        'op': 'and',
        'content': [
            {
                'op': '=',
                'content': {
                    'field': 'project.project_id',
                    'value': project_id
                }
            },
            {
                'op': 'in',
                'content': {
                    'field': 'files.experimental_strategy',
                    'value': ['miRNA-Seq', 'RNA-Seq']
                }
            }
        ]
    }

    # Parameters for the endpoint request. The size is the project's total
    # case count, sent as a string as before.
    params = {
        'fields': fields,
        'filters': filters,
        'size': str(case_count)
    }

    # Request all matching cases of the project from the endpoint
    response = requests.post(
        url=GDC_API_ENDPOINTS['cases'],
        headers={'Content-Type': 'application/json'},
        json=params
    )

    # Stop here on an HTTP error instead of trying to parse an error reply
    response.raise_for_status()

    # Transform the response content to a DataFrame
    json_response = json.loads(response.content.decode('utf-8'))
    df_project_cases = pd.json_normalize(json_response['data']['hits'])

    # Projects without matching cases contribute nothing to the result
    if not df_project_cases.empty:
        project_case_frames.append(df_project_cases)

# Build the DataFrame of cases and files; it stays empty when no project
# returned any case (pd.concat rejects an empty list)
if project_case_frames:
    df_cases_and_files = pd.concat(project_case_frames, ignore_index=True)
else:
    df_cases_and_files = pd.DataFrame()

# Rename some columns of the DataFrame
df_cases_and_files = df_cases_and_files.rename(
    columns={'id': 'case_id', 'project.project_id': 'project_id'}
)

In [10]:
# Case-level table: every column except the nested list of files
df_cases = df_cases_and_files.drop(columns=['files'])

# Save the cases to their interim CSV file, without the index
cases_csv = GDC_INTERIM_FILE_PATHS['cases']
df_cases.to_csv(cases_csv, index=False)

# Display the DataFrame
df_cases

Unnamed: 0,case_id,primary_site,disease_type,submitter_id,project_id
0,cbb553da-c61c-4734-ac58-ef4580aaef47,Hematopoietic and reticuloendothelial systems,Mature B-Cell Lymphomas,BLGSP-71-30-00668,CGCI-BLGSP
1,681ff2e8-6073-4dab-a398-1f2203c2d7f5,Hematopoietic and reticuloendothelial systems,Mature B-Cell Lymphomas,BLGSP-71-06-00254,CGCI-BLGSP
2,886e1609-e89c-4d5e-ab9d-4f9ae18c937a,Hematopoietic and reticuloendothelial systems,Mature B-Cell Lymphomas,BLGSP-71-06-00097,CGCI-BLGSP
3,fdc9dadf-33c7-4b64-8bad-d6bb56fb2209,Hematopoietic and reticuloendothelial systems,Mature B-Cell Lymphomas,BLGSP-71-08-00213,CGCI-BLGSP
4,edf2e5ba-5702-4f96-97f7-124d70bb16aa,Hematopoietic and reticuloendothelial systems,Mature B-Cell Lymphomas,BLGSP-71-19-00128,CGCI-BLGSP
...,...,...,...,...,...
16785,e21d8018-8fe9-4c92-8b36-28d7d3f7df2b,Eye and adnexa,Nevi and Melanomas,TCGA-V4-A9F3,TCGA-UVM
16786,ebdabfcb-1c68-4ca1-8b7f-3ea9cdac3a6a,Eye and adnexa,Nevi and Melanomas,TCGA-YZ-A985,TCGA-UVM
16787,edbf6720-467e-4928-b1cb-0e7336358d18,Eye and adnexa,Nevi and Melanomas,TCGA-V4-A9EE,TCGA-UVM
16788,eeab0529-31e6-4edb-bd2c-a21f2a169381,Eye and adnexa,Nevi and Melanomas,TCGA-V4-A9EC,TCGA-UVM


## Files of Interest 
Files from miRNA-Seq and RNA-Seq

In [11]:
# Give every file of every case its own row
df_cases_and_files = df_cases_and_files.explode('files')

# Keep the rows whose file entry reports miRNA-Seq or RNA-Seq. The entry must
# be a dictionary: exploding a case with an empty file list leaves NaN in
# 'files', and a membership test on NaN would raise a TypeError.
key = 'experimental_strategy'
df_cases_and_files = df_cases_and_files[
    df_cases_and_files['files'].apply(
        lambda x: isinstance(x, dict) and x.get(key) in ('miRNA-Seq', 'RNA-Seq')
    )
]

# Create the files DataFrame: the case columns next to the flattened file
# fields. The exploded frame repeats index labels, so the case side is
# re-indexed and the file dictionaries are normalized from a plain list.
# Both sides then carry the same fresh 0..n-1 index and align row by row.
df_case_columns = df_cases_and_files.reset_index(drop=True)
df_file_fields = pd.json_normalize(df_cases_and_files['files'].tolist())
df_files = pd.concat(
    objs=[df_case_columns, df_file_fields],
    axis='columns'
)
df_files = df_files.drop(
    columns=['disease_type', 'files', 'primary_site', 'project_id', 'submitter_id']
)

In [12]:
# Save the files table to its interim CSV file, without the index
files_csv = GDC_INTERIM_FILE_PATHS['files']
df_files.to_csv(files_csv, index=False)

# Display the DataFrame
df_files

Unnamed: 0,case_id,data_format,access,file_id,data_type,data_category,experimental_strategy
0,cbb553da-c61c-4734-ac58-ef4580aaef47,TSV,controlled,7fad3cc2-cc77-4f0f-b63c-a94a274f10e2,Transcript Fusion,Structural Variation,RNA-Seq
1,cbb553da-c61c-4734-ac58-ef4580aaef47,TSV,open,030842d9-69b2-4938-9475-31c43cce6f7a,Gene Expression Quantification,Transcriptome Profiling,RNA-Seq
2,cbb553da-c61c-4734-ac58-ef4580aaef47,TSV,controlled,862a63ae-d83e-431d-94fb-613570e01477,Splice Junction Quantification,Transcriptome Profiling,RNA-Seq
3,cbb553da-c61c-4734-ac58-ef4580aaef47,TSV,controlled,44816e46-1474-4c6c-a4c8-c2cee21329d0,Transcript Fusion,Structural Variation,RNA-Seq
4,cbb553da-c61c-4734-ac58-ef4580aaef47,BEDPE,controlled,a93709d9-9a6f-4ceb-aeca-a9afd48c13b1,Transcript Fusion,Structural Variation,RNA-Seq
...,...,...,...,...,...,...,...
232548,f78d8541-b317-4c8a-aa66-f172a8eaf14f,BEDPE,controlled,8a7a812c-cb90-492a-982f-f27e3ce39794,Transcript Fusion,Structural Variation,RNA-Seq
232549,f78d8541-b317-4c8a-aa66-f172a8eaf14f,TSV,controlled,c948d42f-8eb8-4425-b8dd-27b973ccc6b7,Splice Junction Quantification,Transcriptome Profiling,RNA-Seq
232550,f78d8541-b317-4c8a-aa66-f172a8eaf14f,BAM,controlled,c5a9c2d9-1971-4eaf-923e-f2d1df90fdcb,Aligned Reads,Sequencing Reads,RNA-Seq
232551,f78d8541-b317-4c8a-aa66-f172a8eaf14f,TSV,open,e1849623-895a-4981-b8c1-25ad0ec1143b,Gene Expression Quantification,Transcriptome Profiling,RNA-Seq
