# **Preprocessing Metadata from TCGA-BRCA Project**
TCGA: The Cancer Genome Atlas  
BRCA: Breast Invasive Carcinoma

# Importing Libraries and Configurations

In [1]:
import json
import os
import sys

import numpy as np
import pandas as pd
import requests

# Get the project root (two levels above)
sys.path.append(os.path.abspath(os.path.join('..', '..')))

from config import (
    BRCA_INTERIM_DATA_DIR,
    BRCA_PAPER_FILE_PATH,
    GDC_API_ENDPOINTS,
    GDC_API_REQUESTS_CONSTANTS,
)

# Function

In [2]:
def gdc_api_request(endpoint, fields, filters=None, size=1, timeout=None):
    """
    Send a request to a Genomic Data Commons (GDC) API endpoint with the specified parameters.

    Parameters:
    -----------
    endpoint : str
        The URL of the endpoint to send the request to.
    fields : list of str
        A list of fields to include in the response.
    filters : dict, optional
        A dictionary of filters to apply to the request. When omitted (or None),
        an empty dictionary is sent.
    size : int, optional
        The number of results to retrieve. Default is 1.
    timeout : float or tuple, optional
        Timeout forwarded to ``requests.post``. Default is None, which keeps the
        previous behavior of waiting without a time limit.

    Returns:
    --------
    list
        A list of hits (data items) retrieved from the endpoint response.

    Raises:
    -------
    requests.HTTPError
        If the endpoint answers with an error status code.
    """
    # Build a fresh dictionary per call instead of sharing one mutable default
    if filters is None:
        filters = {}

    # Parameters used in the endpoint request (sent as the JSON body)
    params = {
        'fields': ','.join(fields),
        'filters': filters,
        'size': str(size)
    }

    # Request the objects of interest to the endpoint
    response = requests.post(
        url=endpoint,
        headers={'Content-Type': 'application/json'},
        json=params,
        timeout=timeout
    )
    response.raise_for_status()

    # Convert the response content to a JSON
    json_response = json.loads(response.content.decode('utf-8'))

    return json_response['data']['hits']

# TCGA-BRCA Paper - Supplementary Table 1
The Cancer Genome Atlas Network. Comprehensive molecular portraits of human breast tumours. Nature 490, 61–70 (2012). https://doi.org/10.1038/nature11412.

## Loading and Filtering Data

In [3]:
# Load 'Supplementary Table 1' from the TCGA-BRCA paper, skipping the first
# row of the sheet so that the following row is used as the header
df_paper_data = pd.read_excel(
    BRCA_PAPER_FILE_PATH, sheet_name='SuppTable1', skiprows=1
)

# Normalize the column names: spaces become underscores and letters lowercase
df_paper_data = df_paper_data.rename(
    columns=lambda name: name.replace(' ', '_').lower()
)

# Keep only the cases classified into the PAM50 molecular subtypes of interest
subtypes_of_interest = ['Basal-like', 'HER2-enriched', 'Luminal A', 'Luminal B']
df_paper_cases = df_paper_data[
    df_paper_data['pam50_mrna'].isin(subtypes_of_interest)
]

## Paper Cases DataFrame

In [4]:
# Persist the filtered paper cases as an interim CSV file (row index omitted)
file_name = 'brca-paper-cases.csv'
output_path = f'{BRCA_INTERIM_DATA_DIR}/{file_name}'
df_paper_cases.to_csv(output_path, index=False)

In [5]:
# Display the DataFrame of cases of interest from the TCGA-BRCA paper
# (the bare last expression is rendered by the notebook; nothing is printed)
df_paper_cases

Unnamed: 0,complete_tcga_id,gender,age_at_initial_pathologic_diagnosis,er_status,pr_status,her2_final_status,tumor,tumor--t1_coded,node,node-coded,...,pam50_mrna,sigclust_unsupervised_mrna,sigclust_intrinsic_mrna,mirna_clusters,methylation_clusters,rppa_clusters,cn_clusters,integrated_clusters_(with_pam50),integrated_clusters_(no_exp),integrated_clusters_(unsup_exp)
0,TCGA-A2-A0T2,FEMALE,66.0,Negative,Negative,Negative,T3,T_Other,N3,Positive,...,Basal-like,0.0,-13.0,3.0,5.0,Basal,3.0,2.0,2.0,2.0
1,TCGA-A2-A04P,FEMALE,36.0,Negative,Negative,Negative,T2,T_Other,N3,Positive,...,Basal-like,0.0,-13.0,5.0,5.0,Basal,1.0,2.0,2.0,2.0
2,TCGA-A1-A0SK,FEMALE,54.0,Negative,Negative,Negative,T2,T_Other,N0,Negative,...,Basal-like,-6.0,-13.0,5.0,5.0,Basal,1.0,2.0,2.0,2.0
3,TCGA-A2-A0CM,FEMALE,40.0,Negative,Negative,Negative,T2,T_Other,N0,Negative,...,Basal-like,-12.0,-13.0,4.0,4.0,Basal,4.0,2.0,1.0,1.0
4,TCGA-AR-A1AR,FEMALE,50.0,Negative,Negative,Negative,T1,T1,N2,Positive,...,Basal-like,0.0,-13.0,5.0,5.0,,1.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509,TCGA-E2-A15K,FEMALE,58.0,Positive,Positive,Negative,T2,T_Other,N1,Positive,...,Luminal B,-3.0,-12.0,4.0,2.0,LumA,2.0,4.0,3.0,3.0
510,TCGA-E2-A15L,FEMALE,65.0,Positive,Positive,Negative,T2,T_Other,N0,Negative,...,Luminal B,-3.0,-12.0,,2.0,ReacI,2.0,,,
511,TCGA-E2-A15M,FEMALE,66.0,Positive,Positive,Negative,T2,T_Other,N0,Negative,...,Luminal B,-4.0,-4.0,6.0,1.0,ReacI,3.0,3.0,3.0,4.0
512,TCGA-E2-A15S,FEMALE,34.0,Positive,Negative,Negative,T2,T_Other,N1,Positive,...,Luminal B,-3.0,-10.0,6.0,4.0,LumA/B,,,,


# GDC API Metadata
GDC: Genomic Data Commons

## Status

In [6]:
# Ask the status endpoint for the API version and data release information
response = requests.get(GDC_API_ENDPOINTS['status'])

# Decode the body once so the same text serves both the JSON and fallback paths
status_text = response.content.decode('utf-8')

# Pretty-print the payload when it is JSON; otherwise show the raw text
try:
    print(json.dumps(json.loads(status_text), indent=4))
except json.JSONDecodeError:
    print('Response is not valid JSON:')
    print(status_text)

{
    "commit": "4bb408881e6dc67eca93ff9fd913629a8f2d11c2",
    "data_release": "Data Release 42.0 - January 30, 2025",
    "data_release_version": {
        "major": 42,
        "minor": 0,
        "release_date": "2025-01-30"
    },
    "status": "OK",
    "tag": "7.8.5",
    "version": 1
}


## Projects

### Endpoint Response

In [7]:
# Project identifiers configured for this analysis
project_ids = GDC_API_REQUESTS_CONSTANTS['projects']

# Fields requested from the projects endpoint
fields = [
    'disease_type',
    'name',
    'primary_site',
    'summary.case_count',
    'summary.data_categories.case_count',
    'summary.data_categories.data_category',
    'summary.data_categories.file_count',
    'summary.experimental_strategies.case_count',
    'summary.experimental_strategies.experimental_strategy',
    'summary.experimental_strategies.file_count',
    'summary.file_count',
]

# Restrict the request to the configured projects
filters = dict(
    op='in',
    content=dict(field='projects.project_id', value=project_ids),
)

# Query the projects endpoint, asking for one result per configured project
projects_response = gdc_api_request(
    GDC_API_ENDPOINTS['projects'],
    fields,
    filters=filters,
    size=len(project_ids),
)

### Project DataFrame

In [8]:
# Friendlier names for the columns produced by flattening the response
project_column_names = {
    'id': 'project_id',
    'name': 'project_name',
    'summary.case_count': 'case_count',
    'summary.data_categories': 'data_categories',
    'summary.experimental_strategies': 'experimental_strategies',
    'summary.file_count': 'file_count',
}

# Columns kept in the project DataFrame, in display order
project_columns = [
    'project_id',
    'project_name',
    'primary_site',
    'disease_type',
    'case_count',
    'file_count',
    'experimental_strategies',
    'data_categories',
]

# Flatten the projects response, rename its columns and keep the ones of interest
df_project = pd.json_normalize(projects_response)
df_project = df_project.rename(columns=project_column_names)
df_project = df_project[project_columns]

# Persist the project DataFrame as an interim CSV file (row index omitted)
file_name = 'brca-api-project.csv'
df_project.to_csv(f'{BRCA_INTERIM_DATA_DIR}/{file_name}', index=False)

In [9]:
# Display the TCGA-BRCA project DataFrame
# Raise the column width limit first so the long list-valued columns are not
# truncated; pd.set_option keeps this setting until it is changed again
pd.set_option('display.max_colwidth', 900)
df_project

Unnamed: 0,project_id,project_name,primary_site,disease_type,case_count,file_count,experimental_strategies,data_categories
0,TCGA-BRCA,Breast Invasive Carcinoma,[Breast],"[Epithelial Neoplasms, NOS, Adnexal and Skin Appendage Neoplasms, Squamous Cell Neoplasms, Adenomas and Adenocarcinomas, Complex Epithelial Neoplasms, Fibroepithelial Neoplasms, Cystic, Mucinous and Serous Neoplasms, Basal Cell Neoplasms, Ductal and Lobular Neoplasms]",1098,68962,"[{'file_count': 11079, 'case_count': 1095, 'experimental_strategy': 'RNA-Seq'}, {'file_count': 17049, 'case_count': 1072, 'experimental_strategy': 'WXS'}, {'file_count': 3621, 'case_count': 1079, 'experimental_strategy': 'miRNA-Seq'}, {'file_count': 10572, 'case_count': 952, 'experimental_strategy': 'WGS'}, {'file_count': 75, 'case_count': 74, 'experimental_strategy': 'ATAC-Seq'}, {'file_count': 14329, 'case_count': 1098, 'experimental_strategy': 'Genotyping Array'}, {'file_count': 3714, 'case_count': 1097, 'experimental_strategy': 'Methylation Array'}, {'file_count': 919, 'case_count': 881, 'experimental_strategy': 'Reverse Phase Protein Array'}, {'file_count': 1133, 'case_count': 1062, 'experimental_strategy': 'Diagnostic Slide'}, {'file_count': 1978, 'case_count': 1093, 'experimental_strategy': 'Tissue Slide'}]","[{'file_count': 19753, 'case_count': 1098, 'data_category': 'Simple Nucleotide Variation'}, {'file_count': 9282, 'case_count': 1098, 'data_category': 'Sequencing Reads'}, {'file_count': 5316, 'case_count': 1098, 'data_category': 'Biospecimen'}, {'file_count': 2288, 'case_count': 1098, 'data_category': 'Clinical'}, {'file_count': 14346, 'case_count': 1098, 'data_category': 'Copy Number Variation'}, {'file_count': 4876, 'case_count': 1097, 'data_category': 'Transcriptome Profiling'}, {'file_count': 3714, 'case_count': 1097, 'data_category': 'DNA Methylation'}, {'file_count': 919, 'case_count': 881, 'data_category': 'Proteome Profiling'}, {'file_count': 2696, 'case_count': 784, 'data_category': 'Somatic Structural Variation'}, {'file_count': 5772, 'case_count': 1098, 'data_category': 'Structural Variation'}]"


## Cases

### Endpoint Response

In [10]:
# Submitter ids (TCGA barcodes) of the paper cases of interest
submitter_ids = df_paper_cases['complete_tcga_id'].tolist()

# Case-level fields and per-file metadata requested from the cases endpoint
fields = [
    'disease_type',
    'files.data_format',
    'files.data_type',
    'files.file_id',
    'files.experimental_strategy',
    'submitter_id',
]

# Accepted values per filtered field; every entry becomes one 'in' clause
case_filter_values = {
    'project.project_id': GDC_API_REQUESTS_CONSTANTS['projects'],
    'disease_type': GDC_API_REQUESTS_CONSTANTS['disease_types'],
    'submitter_id': submitter_ids,
}

# A case must satisfy all of the 'in' clauses
filters = {
    'op': 'and',
    'content': [
        {'op': 'in', 'content': {'field': field, 'value': values}}
        for field, values in case_filter_values.items()
    ]
}

# Query the cases endpoint, allowing one result per submitter id
cases_response = gdc_api_request(
    GDC_API_ENDPOINTS['cases'],
    fields,
    filters=filters,
    size=len(submitter_ids),
)

### Cases of Interest DataFrame

In [11]:
# Identifying columns shared by both versions of the cases DataFrame
case_columns = ['case_id', 'submitter_id', 'disease_type']

# Flatten the cases response; each case keeps its list of file dictionaries
df_api_cases = pd.json_normalize(cases_response)
df_api_cases = df_api_cases.rename(columns={'id': 'case_id'})
df_api_cases = df_api_cases[case_columns + ['files']]

# Attach the PAM50 molecular subtype from the paper, matching on the submitter id
df_api_cases = pd.merge(
    left=df_api_cases,
    right=df_paper_cases,
    how='inner',
    left_on='submitter_id',
    right_on='complete_tcga_id',
)
df_api_cases = df_api_cases[case_columns + ['pam50_mrna', 'files']]

In [12]:
# Display the DataFrame of TCGA-BRCA cases of interest with associated files
# Lower the column width limit so the long 'files' lists are truncated;
# pd.set_option keeps this setting until it is changed again
pd.set_option('display.max_colwidth', 200)
df_api_cases

Unnamed: 0,case_id,submitter_id,disease_type,pam50_mrna,files
0,de75d0b9-0f47-4732-8df5-05c350cfcd32,TCGA-AN-A0FY,Ductal and Lobular Neoplasms,Luminal B,"[{'data_format': 'VCF', 'file_id': '9fe6ffdd-7490-452e-9a68-accf26dda31d', 'data_type': 'Annotated Somatic Mutation', 'experimental_strategy': 'WXS'}, {'data_format': 'BEDPE', 'file_id': 'da517742..."
1,1fef9454-b3e8-4d9d-a9aa-aa1f1a32b80c,TCGA-E2-A15J,Ductal and Lobular Neoplasms,Luminal A,"[{'data_format': 'BCR Biotab', 'file_id': '0a40467f-9495-4c5b-b56e-3347a3ee0572', 'data_type': 'Clinical Supplement'}, {'data_format': 'VCF', 'file_id': '9e8b82c0-9bac-4489-b32e-faf591e01f86', 'da..."
2,2021ed1f-dc75-4701-b8b8-1386466e4802,TCGA-BH-A0H7,Ductal and Lobular Neoplasms,Luminal A,"[{'data_format': 'BAM', 'file_id': 'aba03be3-9b5a-44b3-97a7-d89c37c8b721', 'data_type': 'Aligned Reads', 'experimental_strategy': 'RNA-Seq'}, {'data_format': 'TXT', 'file_id': '70a0fb72-e7ca-4fe5-..."
3,20e8106b-1290-4735-abe4-7621e08e3dc8,TCGA-BH-A0E0,Ductal and Lobular Neoplasms,Basal-like,"[{'data_format': 'VCF', 'file_id': '7dc9bd4d-3ee6-42af-8b44-fdd29978b769', 'data_type': 'Annotated Somatic Mutation', 'experimental_strategy': 'WGS'}, {'data_format': 'TSV', 'file_id': 'ba1d64b0-3..."
4,214a4507-d974-4b3e-8525-7408fccc6a0f,TCGA-BH-A0B7,Ductal and Lobular Neoplasms,HER2-enriched,"[{'data_format': 'BEDPE', 'file_id': 'a1a6f518-7460-42cb-b32e-350b0398cfdc', 'data_type': 'Structural Rearrangement', 'experimental_strategy': 'WGS'}, {'data_format': 'TSV', 'file_id': 'f50fdcd8-7..."
...,...,...,...,...,...
500,33919e92-4ea9-47e5-a6f4-36e51845d50f,TCGA-A8-A09N,Ductal and Lobular Neoplasms,Luminal B,"[{'data_format': 'TAR', 'file_id': '15fc9a4b-e7d4-4426-b1db-76c714a0ed5e', 'data_type': 'Intermediate Analysis Archive', 'experimental_strategy': 'WGS'}, {'data_format': 'VCF', 'file_id': '9c9b561..."
501,34ab0cb8-bf19-4cbb-b4eb-3c45ee5aa60b,TCGA-B6-A0IG,Ductal and Lobular Neoplasms,Luminal A,"[{'data_format': 'TXT', 'file_id': '54d189b0-90e9-4770-8983-b9ed9ccb9d7d', 'data_type': 'Allele-specific Copy Number Segment', 'experimental_strategy': 'Genotyping Array'}, {'data_format': 'TAR', ..."
502,360d1d6d-cf3e-4c78-b36f-ef5e3160aa6b,TCGA-BH-A0AU,Ductal and Lobular Neoplasms,Luminal B,"[{'data_format': 'VCF', 'file_id': 'ec73e799-e6d8-44b4-b47c-abfe573c8f39', 'data_type': 'Structural Rearrangement', 'experimental_strategy': 'WGS'}, {'data_format': 'PDF', 'file_id': 'fb2fcbdf-cc6..."
503,2cf68894-168b-458b-af4f-53cad72989a8,TCGA-A7-A13F,Ductal and Lobular Neoplasms,Luminal B,"[{'data_format': 'BEDPE', 'file_id': '48327159-a47a-4c5f-8434-3a51edeac2cc', 'data_type': 'Structural Rearrangement', 'experimental_strategy': 'WGS'}, {'data_format': 'TXT', 'file_id': '3acfdb09-8..."


## Files

### Expression Quantification Files

In [13]:
# Explode the per-case file lists so that each row holds a single file
# dictionary; explode repeats the case index, so a fresh index is assigned
df_eq_files = df_api_cases.explode('files').reset_index(drop=True)

# Filter files related to miRNA-Seq or RNA-Seq experimental strategies.
# Entries that are not dictionaries (e.g. a case without files) and files
# without an 'experimental_strategy' key are discarded.
key = 'experimental_strategy'
is_sequencing_file = df_eq_files['files'].apply(
    lambda file: (
        isinstance(file, dict) and file.get(key) in ('miRNA-Seq', 'RNA-Seq')
    )
)
df_eq_files = df_eq_files[is_sequencing_file].reset_index(drop=True)

# Expand the file dictionaries into columns. A plain list is normalized so the
# result always carries a 0..n-1 index that lines up with the rows reset above.
df_file_metadata = pd.json_normalize(df_eq_files['files'].tolist())
df_eq_files = pd.concat(
    objs=[df_eq_files, df_file_metadata],
    axis='columns'
)

# Filter isoform (miRNA-Seq) and gene (RNA-Seq) expression quantification files
df_eq_files = df_eq_files \
    .query(
        '(data_type == "Isoform Expression Quantification") ' +
        'or (data_type == "Gene Expression Quantification")'
    ) \
    .reset_index(drop=True) \
    [['file_id', 'case_id', 'experimental_strategy', 'data_type', 'data_format']]

In [14]:
# Display the DataFrame of TCGA-BRCA expression quantification files
# (one row per file, with the case it belongs to)
df_eq_files

Unnamed: 0,file_id,case_id,experimental_strategy,data_type,data_format
0,e9fe2319-2624-4ac1-9c41-d0ac44608f02,de75d0b9-0f47-4732-8df5-05c350cfcd32,RNA-Seq,Gene Expression Quantification,TSV
1,0a8016e1-0321-4421-a5ad-db2fd9caf7ab,de75d0b9-0f47-4732-8df5-05c350cfcd32,miRNA-Seq,Isoform Expression Quantification,TXT
2,8437bf97-2354-40cf-8dc4-631c5d7ae63a,1fef9454-b3e8-4d9d-a9aa-aa1f1a32b80c,RNA-Seq,Gene Expression Quantification,TSV
3,7a4675e7-4174-4561-ae9c-b7636fbea718,1fef9454-b3e8-4d9d-a9aa-aa1f1a32b80c,miRNA-Seq,Isoform Expression Quantification,TXT
4,ea1fadc2-1cdc-4658-9619-eeb26ae09da8,2021ed1f-dc75-4701-b8b8-1386466e4802,RNA-Seq,Gene Expression Quantification,TSV
...,...,...,...,...,...
1130,7c275803-bb6d-4fbc-b851-5fcb393f8496,2cf68894-168b-458b-af4f-53cad72989a8,miRNA-Seq,Isoform Expression Quantification,TXT
1131,9c87ef50-5854-4f69-820b-9e0b4d46c982,2cf68894-168b-458b-af4f-53cad72989a8,RNA-Seq,Gene Expression Quantification,TSV
1132,8ca0e4c1-92ab-4653-87e0-cdcc630d8db6,2cf68894-168b-458b-af4f-53cad72989a8,miRNA-Seq,Isoform Expression Quantification,TXT
1133,d5d23d6a-c141-442f-ac4f-714e46b8d9ef,36d77496-99f8-4911-ba7c-89fae32662aa,RNA-Seq,Gene Expression Quantification,TSV


### Samples Related to Files

In [15]:
# UUIDs of the expression quantification files selected above
file_ids = df_eq_files['file_id'].tolist()

# Sample metadata requested for each file
fields = ['cases.samples.tissue_type', 'cases.samples.sample_type']

# Accepted values per filtered field; every entry becomes one 'in' clause
file_filter_values = {
    'cases.samples.sample_type': GDC_API_REQUESTS_CONSTANTS['sample_types'],
    'file_id': file_ids,
}

# A file must satisfy all of the 'in' clauses
filters = {
    'op': 'and',
    'content': [
        {'op': 'in', 'content': {'field': field, 'value': values}}
        for field, values in file_filter_values.items()
    ]
}

# Query the files endpoint, allowing one result per file UUID
files_response = gdc_api_request(
    GDC_API_ENDPOINTS['files'],
    fields,
    filters=filters,
    size=len(file_ids),
)

In [16]:
# Pair each returned file UUID with the first sample of its first case.
# Reading both values from the same hit keeps ids and sample metadata aligned
# even if a file were annotated with more than one case or sample.
response_file_ids = [hit['id'] for hit in files_response]
first_samples = [hit['cases'][0]['samples'][0] for hit in files_response]

# Expand the sample dictionaries into columns (tissue_type, sample_type)
df_samples = pd.json_normalize(first_samples)

# Create the DataFrame of samples related to files
df_files_samples = df_samples \
    .assign(file_id=response_file_ids) \
    [['file_id', 'tissue_type', 'sample_type']]

In [17]:
# Display the DataFrame of samples related to files
# (one row per file UUID with the tissue and sample type of its sample)
df_files_samples

Unnamed: 0,file_id,tissue_type,sample_type
0,dd0e32d6-4704-4abd-82e4-2041fbc19a79,Tumor,Primary Tumor
1,09a491b5-1bfb-47d9-963f-69f6c7e6c1e5,Tumor,Primary Tumor
2,f8e5691a-e24d-4dba-873e-9ffa3ca12bb6,Tumor,Primary Tumor
3,65654739-bed4-4ea7-b75c-ca1f278adcc9,Tumor,Primary Tumor
4,4af55932-ab7a-41e5-aa7f-fb16ab18c629,Tumor,Primary Tumor
...,...,...,...
1120,ed716c0b-4b09-4902-86b8-6955488d60c3,Tumor,Primary Tumor
1121,0bc59ba8-158b-4309-b4d5-bb541f892fbf,Tumor,Primary Tumor
1122,bc23716d-70e8-486e-b512-8c45654d5d83,Tumor,Primary Tumor
1123,8ecef704-ed0f-4700-a34e-46a174033332,Tumor,Primary Tumor


### Files of Interest DataFrame

In [18]:
# Columns kept in the files DataFrame, in display order
file_columns = [
    'file_id',
    'case_id',
    'tissue_type',
    'sample_type',
    'experimental_strategy',
    'data_type',
    'data_format',
]

# Attach the sample metadata to each expression quantification file; files
# without a matching sample row are dropped by the inner join
df_files = pd.merge(
    df_eq_files, df_files_samples, on='file_id', how='inner'
)
df_files = df_files[file_columns]

In [19]:
# Display the DataFrame of TCGA-BRCA files of interest with associated samples
# (file metadata joined with the tissue and sample type of each file)
df_files

Unnamed: 0,file_id,case_id,tissue_type,sample_type,experimental_strategy,data_type,data_format
0,e9fe2319-2624-4ac1-9c41-d0ac44608f02,de75d0b9-0f47-4732-8df5-05c350cfcd32,Tumor,Primary Tumor,RNA-Seq,Gene Expression Quantification,TSV
1,0a8016e1-0321-4421-a5ad-db2fd9caf7ab,de75d0b9-0f47-4732-8df5-05c350cfcd32,Tumor,Primary Tumor,miRNA-Seq,Isoform Expression Quantification,TXT
2,8437bf97-2354-40cf-8dc4-631c5d7ae63a,1fef9454-b3e8-4d9d-a9aa-aa1f1a32b80c,Tumor,Primary Tumor,RNA-Seq,Gene Expression Quantification,TSV
3,7a4675e7-4174-4561-ae9c-b7636fbea718,1fef9454-b3e8-4d9d-a9aa-aa1f1a32b80c,Tumor,Primary Tumor,miRNA-Seq,Isoform Expression Quantification,TXT
4,ea1fadc2-1cdc-4658-9619-eeb26ae09da8,2021ed1f-dc75-4701-b8b8-1386466e4802,Normal,Solid Tissue Normal,RNA-Seq,Gene Expression Quantification,TSV
...,...,...,...,...,...,...,...
1120,7c275803-bb6d-4fbc-b851-5fcb393f8496,2cf68894-168b-458b-af4f-53cad72989a8,Tumor,Primary Tumor,miRNA-Seq,Isoform Expression Quantification,TXT
1121,9c87ef50-5854-4f69-820b-9e0b4d46c982,2cf68894-168b-458b-af4f-53cad72989a8,Tumor,Primary Tumor,RNA-Seq,Gene Expression Quantification,TSV
1122,8ca0e4c1-92ab-4653-87e0-cdcc630d8db6,2cf68894-168b-458b-af4f-53cad72989a8,Normal,Solid Tissue Normal,miRNA-Seq,Isoform Expression Quantification,TXT
1123,d5d23d6a-c141-442f-ac4f-714e46b8d9ef,36d77496-99f8-4911-ba7c-89fae32662aa,Tumor,Primary Tumor,RNA-Seq,Gene Expression Quantification,TSV


## Filtering Cases and Files

### Counting Files per Case

In [20]:
# Label every file as '<experimental strategy>_<tissue type>'
# (the column is added to df_files itself)
df_files['file_type'] = (
    df_files['experimental_strategy'] + '_' + df_files['tissue_type']
)

# Work on a copy holding only the columns needed for counting
df_file_count = df_files[['case_id', 'file_id', 'file_type']].copy()

# Flag column -> file type label it marks
# (TT = tumor tissue, NT = normal tissue, miR = miRNA-Seq, mRNA = RNA-Seq)
file_type_by_flag = {
    'TTmiR': 'miRNA-Seq_Tumor',
    'TTmRNA': 'RNA-Seq_Tumor',
    'NTmiR': 'miRNA-Seq_Normal',
    'NTmRNA': 'RNA-Seq_Normal',
}

# Add one 0/1 flag column per file type
for flag, file_type in file_type_by_flag.items():
    df_file_count[flag] = np.where(
        df_file_count['file_type'] == file_type, 1, 0
    )

# A file counts once when it matches any of the four file types
flag_columns = list(file_type_by_flag)
df_file_count['file_count'] = df_file_count[flag_columns].sum(axis='columns')

# Total the flags per case and list the cases with the most files first
df_file_count_agg = df_file_count \
    .groupby('case_id') \
    .agg(
        TTmiR_count=('TTmiR', 'sum'),
        TTmRNA_count=('TTmRNA', 'sum'),
        NTmiR_count=('NTmiR', 'sum'),
        NTmRNA_count=('NTmRNA', 'sum'),
        file_count=('file_count', 'sum'),
    ) \
    .sort_values(
        by=['file_count', 'TTmiR_count', 'TTmRNA_count', 'NTmiR_count', 'NTmRNA_count'],
        ascending=False
    )

In [21]:
# Display the DataFrame that aggregates the file types per case
# (reset_index turns the case_id group index into a regular column for display)
df_file_count_agg.reset_index()

Unnamed: 0,case_id,TTmiR_count,TTmRNA_count,NTmiR_count,NTmRNA_count,file_count
0,8c7e74e0-71ef-49b8-9217-94b8ef740ef9,3,3,1,1,8
1,f130f376-5801-40f9-975d-a7e2f7b5670d,3,3,1,1,8
2,8785012f-f73e-4d68-87cf-1d804af32782,3,3,0,0,6
3,0bc5744c-5fa3-45bb-87d0-70a02068b392,1,1,1,1,4
4,14267783-5624-4fe5-ba81-9d67f1017474,1,1,1,1,4
...,...,...,...,...,...,...
499,cc348a26-ee11-47a4-8b51-de922967e175,0,1,0,0,1
500,dfe6db17-cf45-488e-bd3c-b8433d7343ca,0,1,0,0,1
501,e3c336f5-c32f-4c5d-81fb-e2408ae145b2,0,1,0,0,1
502,ec6930e5-2d83-435a-9a57-a2c5c22caf02,0,1,0,0,1


### Cases of Interest DataFrames

#### Cases with Tumor Tissue Analysis

In [22]:
# Keep the cases having exactly one miRNA-Seq and one RNA-Seq tumor tissue file
has_tumor_file_pair = (
    (df_file_count_agg['TTmiR_count'] == 1)
    & (df_file_count_agg['TTmRNA_count'] == 1)
)
df_tumor_cases = df_file_count_agg[has_tumor_file_pair].reset_index()

# Bring back the case metadata for the selected tumor tissue cases
df_tumor_cases = pd.merge(
    df_tumor_cases, df_api_cases, on='case_id', how='inner'
)
df_tumor_cases = df_tumor_cases[
    ['case_id', 'submitter_id', 'disease_type', 'pam50_mrna', 'files']
]

# Persist the tumor tissue cases as an interim CSV file (row index omitted)
file_name = 'brca-api-cases-tumor-analysis.csv'
df_tumor_cases.to_csv(f'{BRCA_INTERIM_DATA_DIR}/{file_name}', index=False)

In [23]:
# Display the DataFrame of cases of interest with tumor tissue analysis
# (cases with exactly one miRNA-Seq and one RNA-Seq tumor tissue file)
df_tumor_cases

Unnamed: 0,case_id,submitter_id,disease_type,pam50_mrna,files
0,0bc5744c-5fa3-45bb-87d0-70a02068b392,TCGA-BH-A0DG,Ductal and Lobular Neoplasms,Luminal A,"[{'data_format': 'BEDPE', 'file_id': 'f1c01ec0-cf13-4009-a4a1-23665580ab6e', 'data_type': 'Structural Rearrangement', 'experimental_strategy': 'WGS'}, {'data_format': 'BEDPE', 'file_id': '50e7aa21..."
1,14267783-5624-4fe5-ba81-9d67f1017474,TCGA-BH-A0DP,Ductal and Lobular Neoplasms,Luminal A,"[{'data_format': 'BCR Biotab', 'file_id': '0a40467f-9495-4c5b-b56e-3347a3ee0572', 'data_type': 'Clinical Supplement'}, {'data_format': 'BAM', 'file_id': '15a12710-1f2b-41aa-8cf7-054b67f29fbe', 'da..."
2,17baef7c-d97d-4b98-ab53-503ef856523d,TCGA-A7-A0D9,Ductal and Lobular Neoplasms,Luminal A,"[{'data_format': 'BAM', 'file_id': '8c151dbe-9b21-4fdc-8664-a610ef423606', 'data_type': 'Aligned Reads', 'experimental_strategy': 'WGS'}, {'data_format': 'BEDPE', 'file_id': 'cc14571e-62b5-4f0a-84..."
3,17c1d42c-cb84-4655-a4cd-b54bae17ecaf,TCGA-E2-A15K,Ductal and Lobular Neoplasms,Luminal B,"[{'data_format': 'TXT', 'file_id': 'db86d78c-d6e7-4c62-8942-91b839270b2b', 'data_type': 'Copy Number Segment', 'experimental_strategy': 'Genotyping Array'}, {'data_format': 'SVS', 'file_id': '96ce..."
4,18eb4dfc-556f-4bf3-a411-4780209ed1e0,TCGA-BH-A0B3,Ductal and Lobular Neoplasms,Basal-like,"[{'data_format': 'VCF', 'file_id': '9c9e9481-617b-4ecb-8416-07d62453fa8b', 'data_type': 'Structural Rearrangement', 'experimental_strategy': 'WGS'}, {'data_format': 'TXT', 'file_id': '19259452-1a7..."
...,...,...,...,...,...
481,fb356957-f620-4d99-b675-1355a1d9f932,TCGA-E2-A14O,Ductal and Lobular Neoplasms,Luminal B,"[{'data_format': 'SVS', 'file_id': '7846cf1d-aab1-4e14-9f18-f5f7ca4a52bc', 'data_type': 'Slide Image', 'experimental_strategy': 'Tissue Slide'}, {'data_format': 'TXT', 'file_id': '6b9417c3-399d-48..."
482,fb57c598-b45b-478a-9d34-ca1f050d5b7c,TCGA-BH-A0DS,Ductal and Lobular Neoplasms,Luminal A,"[{'data_format': 'SVS', 'file_id': 'f8f16824-4f10-4b71-a8fc-0a2c3176c16a', 'data_type': 'Slide Image', 'experimental_strategy': 'Tissue Slide'}, {'data_format': 'TSV', 'file_id': '0dc31470-2ad5-4e..."
483,fc18d029-9be2-4fa0-9aef-6d647dc55f0b,TCGA-D8-A147,Ductal and Lobular Neoplasms,Basal-like,"[{'data_format': 'TXT', 'file_id': '7da02dc3-caf0-4809-99f0-8f5cd0ebff0b', 'data_type': 'Allele-specific Copy Number Segment', 'experimental_strategy': 'Genotyping Array'}, {'data_format': 'TSV', ..."
484,fcef8cb5-fb2c-4bfb-82cd-6b9f3145182c,TCGA-BH-A0HP,Ductal and Lobular Neoplasms,Luminal A,"[{'data_format': 'VCF', 'file_id': '64c1118e-5ac4-46e1-9475-7499d19a894b', 'data_type': 'Annotated Somatic Mutation', 'experimental_strategy': 'WXS'}, {'data_format': 'SVS', 'file_id': '3fd44a43-5..."


#### Cases with Normal Tissue Analysis

In [24]:
# Keep the cases having exactly one miRNA-Seq and one RNA-Seq normal tissue file
has_normal_file_pair = (
    (df_file_count_agg['NTmiR_count'] == 1)
    & (df_file_count_agg['NTmRNA_count'] == 1)
)
df_normal_cases = df_file_count_agg[has_normal_file_pair].reset_index()

# Bring back the case metadata for the selected normal tissue cases
df_normal_cases = pd.merge(
    df_normal_cases, df_api_cases, on='case_id', how='inner'
)
df_normal_cases = df_normal_cases[
    ['case_id', 'submitter_id', 'disease_type', 'pam50_mrna', 'files']
]

# Persist the normal tissue cases as an interim CSV file (row index omitted)
file_name = 'brca-api-cases-normal-analysis.csv'
df_normal_cases.to_csv(f'{BRCA_INTERIM_DATA_DIR}/{file_name}', index=False)

In [25]:
# Display the DataFrame of cases of interest with normal tissue analysis
# (cases with exactly one miRNA-Seq and one RNA-Seq normal tissue file)
df_normal_cases

Unnamed: 0,case_id,submitter_id,disease_type,pam50_mrna,files
0,8c7e74e0-71ef-49b8-9217-94b8ef740ef9,TCGA-A7-A13E,Ductal and Lobular Neoplasms,Basal-like,"[{'data_format': 'TSV', 'file_id': '767bba1d-e80f-49fa-82b1-075a882d0a10', 'data_type': 'Simple Germline Variation', 'experimental_strategy': 'Genotyping Array'}, {'data_format': 'MAF', 'file_id':..."
1,f130f376-5801-40f9-975d-a7e2f7b5670d,TCGA-A7-A0DB,Ductal and Lobular Neoplasms,Luminal A,"[{'data_format': 'VCF', 'file_id': '886bfc02-b727-47dd-8806-5a34714f8857', 'data_type': 'Annotated Somatic Mutation', 'experimental_strategy': 'WXS'}, {'data_format': 'VCF', 'file_id': 'ef67d03e-b..."
2,0bc5744c-5fa3-45bb-87d0-70a02068b392,TCGA-BH-A0DG,Ductal and Lobular Neoplasms,Luminal A,"[{'data_format': 'BEDPE', 'file_id': 'f1c01ec0-cf13-4009-a4a1-23665580ab6e', 'data_type': 'Structural Rearrangement', 'experimental_strategy': 'WGS'}, {'data_format': 'BEDPE', 'file_id': '50e7aa21..."
3,14267783-5624-4fe5-ba81-9d67f1017474,TCGA-BH-A0DP,Ductal and Lobular Neoplasms,Luminal A,"[{'data_format': 'BCR Biotab', 'file_id': '0a40467f-9495-4c5b-b56e-3347a3ee0572', 'data_type': 'Clinical Supplement'}, {'data_format': 'BAM', 'file_id': '15a12710-1f2b-41aa-8cf7-054b67f29fbe', 'da..."
4,17baef7c-d97d-4b98-ab53-503ef856523d,TCGA-A7-A0D9,Ductal and Lobular Neoplasms,Luminal A,"[{'data_format': 'BAM', 'file_id': '8c151dbe-9b21-4fdc-8664-a610ef423606', 'data_type': 'Aligned Reads', 'experimental_strategy': 'WGS'}, {'data_format': 'BEDPE', 'file_id': 'cc14571e-62b5-4f0a-84..."
5,17c1d42c-cb84-4655-a4cd-b54bae17ecaf,TCGA-E2-A15K,Ductal and Lobular Neoplasms,Luminal B,"[{'data_format': 'TXT', 'file_id': 'db86d78c-d6e7-4c62-8942-91b839270b2b', 'data_type': 'Copy Number Segment', 'experimental_strategy': 'Genotyping Array'}, {'data_format': 'SVS', 'file_id': '96ce..."
6,18eb4dfc-556f-4bf3-a411-4780209ed1e0,TCGA-BH-A0B3,Ductal and Lobular Neoplasms,Basal-like,"[{'data_format': 'VCF', 'file_id': '9c9e9481-617b-4ecb-8416-07d62453fa8b', 'data_type': 'Structural Rearrangement', 'experimental_strategy': 'WGS'}, {'data_format': 'TXT', 'file_id': '19259452-1a7..."
7,2021ed1f-dc75-4701-b8b8-1386466e4802,TCGA-BH-A0H7,Ductal and Lobular Neoplasms,Luminal A,"[{'data_format': 'BAM', 'file_id': 'aba03be3-9b5a-44b3-97a7-d89c37c8b721', 'data_type': 'Aligned Reads', 'experimental_strategy': 'RNA-Seq'}, {'data_format': 'TXT', 'file_id': '70a0fb72-e7ca-4fe5-..."
8,214a4507-d974-4b3e-8525-7408fccc6a0f,TCGA-BH-A0B7,Ductal and Lobular Neoplasms,HER2-enriched,"[{'data_format': 'BEDPE', 'file_id': 'a1a6f518-7460-42cb-b32e-350b0398cfdc', 'data_type': 'Structural Rearrangement', 'experimental_strategy': 'WGS'}, {'data_format': 'TSV', 'file_id': 'f50fdcd8-7..."
9,21ef1730-e5a7-47ce-b419-d000bb59ae15,TCGA-BH-A1F0,Ductal and Lobular Neoplasms,Basal-like,"[{'data_format': 'SVS', 'file_id': 'db7e4dc3-e9e1-47bd-ab03-7b48e0b1beaa', 'data_type': 'Slide Image', 'experimental_strategy': 'Tissue Slide'}, {'data_format': 'BCR Biotab', 'file_id': '0a40467f-..."


### Files of Interest DataFrames

#### Files Associated with Tumor Tissue Analysis

In [26]:
# UUIDs of the cases selected for the tumor tissue analysis
tumor_cases = df_tumor_cases['case_id'].tolist()

# Select the tumor tissue files that belong to those cases
is_tumor_file = (
    (df_files['tissue_type'] == 'Tumor')
    & df_files['case_id'].isin(tumor_cases)
)

# Drop the helper 'file_type' column and renumber the remaining rows
df_tumor_files = df_files[is_tumor_file]
df_tumor_files = df_tumor_files.drop(columns='file_type')
df_tumor_files = df_tumor_files.reset_index(drop=True)

# Persist the tumor tissue files as an interim CSV file (row index omitted)
file_name = 'brca-api-files-tumor-analysis.csv'
df_tumor_files.to_csv(f'{BRCA_INTERIM_DATA_DIR}/{file_name}', index=False)

In [27]:
# Display the DataFrame of files of interest associated with tumor tissue analysis
# (tumor tissue files of the cases selected for the tumor analysis)
df_tumor_files

Unnamed: 0,file_id,case_id,tissue_type,sample_type,experimental_strategy,data_type,data_format
0,e9fe2319-2624-4ac1-9c41-d0ac44608f02,de75d0b9-0f47-4732-8df5-05c350cfcd32,Tumor,Primary Tumor,RNA-Seq,Gene Expression Quantification,TSV
1,0a8016e1-0321-4421-a5ad-db2fd9caf7ab,de75d0b9-0f47-4732-8df5-05c350cfcd32,Tumor,Primary Tumor,miRNA-Seq,Isoform Expression Quantification,TXT
2,8437bf97-2354-40cf-8dc4-631c5d7ae63a,1fef9454-b3e8-4d9d-a9aa-aa1f1a32b80c,Tumor,Primary Tumor,RNA-Seq,Gene Expression Quantification,TSV
3,7a4675e7-4174-4561-ae9c-b7636fbea718,1fef9454-b3e8-4d9d-a9aa-aa1f1a32b80c,Tumor,Primary Tumor,miRNA-Seq,Isoform Expression Quantification,TXT
4,15f5f60a-13f0-41a2-8711-171d19c2903a,2021ed1f-dc75-4701-b8b8-1386466e4802,Tumor,Primary Tumor,RNA-Seq,Gene Expression Quantification,TSV
...,...,...,...,...,...,...,...
967,8c884351-1c8b-47ff-8bef-1a4f25f645fb,360d1d6d-cf3e-4c78-b36f-ef5e3160aa6b,Tumor,Primary Tumor,RNA-Seq,Gene Expression Quantification,TSV
968,7c275803-bb6d-4fbc-b851-5fcb393f8496,2cf68894-168b-458b-af4f-53cad72989a8,Tumor,Primary Tumor,miRNA-Seq,Isoform Expression Quantification,TXT
969,9c87ef50-5854-4f69-820b-9e0b4d46c982,2cf68894-168b-458b-af4f-53cad72989a8,Tumor,Primary Tumor,RNA-Seq,Gene Expression Quantification,TSV
970,d5d23d6a-c141-442f-ac4f-714e46b8d9ef,36d77496-99f8-4911-ba7c-89fae32662aa,Tumor,Primary Tumor,RNA-Seq,Gene Expression Quantification,TSV


#### Files Associated with Normal Tissue Analysis

In [28]:
# UUIDs of the cases selected for the normal tissue analysis
normal_cases = df_normal_cases['case_id'].tolist()

# Select the normal tissue files that belong to those cases
is_normal_file = (
    (df_files['tissue_type'] == 'Normal')
    & df_files['case_id'].isin(normal_cases)
)

# Drop the helper 'file_type' column and renumber the remaining rows
df_normal_files = df_files[is_normal_file]
df_normal_files = df_normal_files.drop(columns='file_type')
df_normal_files = df_normal_files.reset_index(drop=True)

# Persist the normal tissue files as an interim CSV file (row index omitted)
file_name = 'brca-api-files-normal-analysis.csv'
df_normal_files.to_csv(f'{BRCA_INTERIM_DATA_DIR}/{file_name}', index=False)

In [29]:
# Display the DataFrame of files of interest associated with normal tissue analysis
# (normal tissue files of the cases selected for the normal analysis)
df_normal_files

Unnamed: 0,file_id,case_id,tissue_type,sample_type,experimental_strategy,data_type,data_format
0,ea1fadc2-1cdc-4658-9619-eeb26ae09da8,2021ed1f-dc75-4701-b8b8-1386466e4802,Normal,Solid Tissue Normal,RNA-Seq,Gene Expression Quantification,TSV
1,ef076f15-36a4-4e8a-a624-8cdeb9c4fa2d,2021ed1f-dc75-4701-b8b8-1386466e4802,Normal,Solid Tissue Normal,miRNA-Seq,Isoform Expression Quantification,TXT
2,3f5ad0e5-ab95-4c1e-8dfe-09cab1d1500c,214a4507-d974-4b3e-8525-7408fccc6a0f,Normal,Solid Tissue Normal,miRNA-Seq,Isoform Expression Quantification,TXT
3,3071e512-94ea-4820-9573-668235188e34,214a4507-d974-4b3e-8525-7408fccc6a0f,Normal,Solid Tissue Normal,RNA-Seq,Gene Expression Quantification,TSV
4,8790fe0b-8d3a-4f2c-8201-1231da243697,21ef1730-e5a7-47ce-b419-d000bb59ae15,Normal,Solid Tissue Normal,miRNA-Seq,Isoform Expression Quantification,TXT
...,...,...,...,...,...,...,...
107,d4a3af62-0505-49f6-9f34-168bec5cb0d0,a851ca14-7bbf-455b-a054-ec03979a4247,Normal,Solid Tissue Normal,miRNA-Seq,Isoform Expression Quantification,TXT
108,cd746011-8fd7-473d-84e7-f39ec60b6717,360d1d6d-cf3e-4c78-b36f-ef5e3160aa6b,Normal,Solid Tissue Normal,miRNA-Seq,Isoform Expression Quantification,TXT
109,f24d6e88-634e-4d86-b993-76a40f1190d0,360d1d6d-cf3e-4c78-b36f-ef5e3160aa6b,Normal,Solid Tissue Normal,RNA-Seq,Gene Expression Quantification,TSV
110,4c2bb8e3-2527-466b-a3bb-0f36c3b2a3e7,2cf68894-168b-458b-af4f-53cad72989a8,Normal,Solid Tissue Normal,RNA-Seq,Gene Expression Quantification,TSV
