# **Preprocessing Metadata from TCGA-BRCA Project**
TCGA: The Cancer Genome Atlas  
BRCA: Breast Invasive Carcinoma

# Importing Libraries and Configurations

In [1]:
import json
import os
import sys

import numpy as np
import pandas as pd
import requests

# Add project root to Python's path
sys.path.append(os.path.abspath(os.path.join('..', '..')))

from config import (
    BRCA_INTERIM_FILE_PATHS,
    BRCA_PAPER_FILE_PATH,
    BRCA_PREPROCESSING_PARAMETERS,
    GDC_API_ENDPOINTS,
)

# Function

In [2]:
def gdc_api_request(endpoint, fields, filters=None, size=1, timeout=None):
    """
    Send a POST request to a Genomic Data Commons (GDC) API endpoint and
    return the hits contained in its JSON response.

    Parameters:
    -----------
    endpoint : str
        The URL of the endpoint to send the request to.
    fields : list of str
        A list of fields to include in the response. They are joined into a
        single comma-separated string before being sent.
    filters : dict, optional
        A dictionary of filters to apply to the request. Default is None,
        which sends an empty dictionary (same payload as before).
    size : int, optional
        The number of results to retrieve. Default is 1.
    timeout : float or tuple, optional
        Timeout forwarded to ``requests.post``. Default is None, i.e., wait
        indefinitely for the server (the previous behavior).

    Returns:
    --------
    list
        A list of hits (data items) retrieved from the endpoint response.

    Raises:
    -------
    requests.HTTPError
        If the endpoint answers with an HTTP error status.
    KeyError
        If the JSON response has no ``data.hits`` entry.
    """
    # Use a fresh dictionary per call instead of a shared mutable default
    if filters is None:
        filters = {}

    # Parameters used in the endpoint request
    params = {
        'fields': ','.join(fields),
        'filters': filters,
        'size': str(size)
    }

    # Request the objects of interest to the endpoint
    response = requests.post(
        url=endpoint,
        headers={'Content-Type': 'application/json'},
        json=params,
        timeout=timeout
    )

    # Fail loudly on HTTP errors instead of parsing an error payload
    response.raise_for_status()

    # Decode the JSON body and keep only the list of hits
    return response.json()['data']['hits']

# TCGA-BRCA Paper Metadata
The Cancer Genome Atlas Network. Comprehensive molecular portraits of human breast tumours. Nature 490, 61–70 (2012). https://doi.org/10.1038/nature11412.

## Loading and Flagging Data

In [3]:
# Load 'Supplementary Table 1' of the TCGA-BRCA paper, skipping the first
# row of the sheet
df_paper_cases = pd.read_excel(
    BRCA_PAPER_FILE_PATH, sheet_name='SuppTable1', skiprows=1
)

# Map every column name to its standardized form: spaces become
# underscores and all letters are lowercased
columns = {
    name: name.replace(' ', '_').lower()
    for name in df_paper_cases.columns
}
df_paper_cases = df_paper_cases.rename(columns=columns)

# Flag (1/0) the cases whose PAM50 classification is one of the configured
# molecular subtypes of breast cancer
molecular_subtypes = BRCA_PREPROCESSING_PARAMETERS['molecular_subtypes']
has_subtype_of_interest = df_paper_cases['pam50_mrna'].isin(molecular_subtypes)
df_paper_cases['is_case_of_interest'] = np.where(has_subtype_of_interest, 1, 0)

## Flagged Paper Cases DataFrame

In [4]:
# Persist the flagged paper cases as a CSV file (row index not written)
paper_csv_path = BRCA_INTERIM_FILE_PATHS['paper']
df_paper_cases.to_csv(paper_csv_path, index=False)

In [5]:
# Display the DataFrame of flagged cases from the TCGA-BRCA paper
# (the bare trailing expression is rendered as the cell output)
df_paper_cases

Unnamed: 0,complete_tcga_id,gender,age_at_initial_pathologic_diagnosis,er_status,pr_status,her2_final_status,tumor,tumor--t1_coded,node,node-coded,...,sigclust_unsupervised_mrna,sigclust_intrinsic_mrna,mirna_clusters,methylation_clusters,rppa_clusters,cn_clusters,integrated_clusters_(with_pam50),integrated_clusters_(no_exp),integrated_clusters_(unsup_exp),is_case_of_interest
0,TCGA-A2-A0T2,FEMALE,66.0,Negative,Negative,Negative,T3,T_Other,N3,Positive,...,0.0,-13.0,3.0,5.0,Basal,3.0,2.0,2.0,2.0,1
1,TCGA-A2-A04P,FEMALE,36.0,Negative,Negative,Negative,T2,T_Other,N3,Positive,...,0.0,-13.0,5.0,5.0,Basal,1.0,2.0,2.0,2.0,1
2,TCGA-A1-A0SK,FEMALE,54.0,Negative,Negative,Negative,T2,T_Other,N0,Negative,...,-6.0,-13.0,5.0,5.0,Basal,1.0,2.0,2.0,2.0,1
3,TCGA-A2-A0CM,FEMALE,40.0,Negative,Negative,Negative,T2,T_Other,N0,Negative,...,-12.0,-13.0,4.0,4.0,Basal,4.0,2.0,1.0,1.0,1
4,TCGA-AR-A1AR,FEMALE,50.0,Negative,Negative,Negative,T1,T1,N2,Positive,...,0.0,-13.0,5.0,5.0,,1.0,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
820,TCGA-AC-A2FF,,,,,,,,,,...,,,,1.0,,1.0,,,,0
821,TCGA-AC-A2FB,,,,,,,,,,...,,,,1.0,,3.0,,,,0
822,TCGA-AC-A2FG,,,,,,,,,,...,,,,1.0,,2.0,,,,0
823,TCGA-GI-A2C8,,,,,,,,,,...,,,,3.0,,3.0,,,,0


# TCGA-BRCA Metadata from GDC API
GDC: Genomic Data Commons

## Status

In [6]:
# Request the API version and status from the status endpoint
response = requests.get(GDC_API_ENDPOINTS['status'])
status_body = response.content.decode('utf-8')

# Pretty-print the body when it is valid JSON; otherwise show it verbatim
try:
    parsed = json.loads(status_body)
except json.JSONDecodeError:
    print('Response is not valid JSON:')
    print(status_body)
else:
    print(json.dumps(parsed, indent=4))

{
    "commit": "4bb408881e6dc67eca93ff9fd913629a8f2d11c2",
    "data_release": "Data Release 42.0 - January 30, 2025",
    "data_release_version": {
        "major": 42,
        "minor": 0,
        "release_date": "2025-01-30"
    },
    "status": "OK",
    "tag": "7.8.5",
    "version": 1
}


## Projects

### Endpoint Response

In [7]:
# Project identifiers configured for this preprocessing
project_ids = BRCA_PREPROCESSING_PARAMETERS['project_ids']

# Fields requested from the projects endpoint
fields = [
    'disease_type', 'name', 'primary_site',
    'summary.case_count',
    'summary.data_categories.case_count',
    'summary.data_categories.data_category',
    'summary.data_categories.file_count',
    'summary.experimental_strategies.case_count',
    'summary.experimental_strategies.experimental_strategy',
    'summary.experimental_strategies.file_count',
    'summary.file_count',
]

# Keep only the projects whose identifier is in the configured list
filters = dict(
    op='in',
    content=dict(field='projects.project_id', value=project_ids),
)

# Query the projects endpoint, asking for one hit per configured project
projects_response = gdc_api_request(
    GDC_API_ENDPOINTS['projects'],
    fields,
    filters=filters,
    size=len(project_ids),
)

### Project DataFrame

In [8]:
# New names for the columns produced by flattening the endpoint response
project_column_names = {
    'id': 'project_id',
    'name': 'project_name',
    'summary.case_count': 'case_count',
    'summary.data_categories': 'data_categories',
    'summary.experimental_strategies': 'experimental_strategies',
    'summary.file_count': 'file_count',
}

# Columns kept in the project metadata, in their final order
project_column_order = [
    'project_id',
    'project_name',
    'primary_site',
    'disease_type',
    'experimental_strategies',
    'data_categories',
    'case_count',
    'file_count',
]

# Build the TCGA-BRCA project metadata DataFrame
df_project = (
    pd.json_normalize(projects_response)
    .rename(columns=project_column_names)
    .loc[:, project_column_order]
)

# Persist the project metadata as a CSV file (row index not written)
project_csv_path = BRCA_INTERIM_FILE_PATHS['project']
df_project.to_csv(project_csv_path, index=False)

In [9]:
# Display the TCGA-BRCA project metadata DataFrame.
# The maximum displayed column width is raised to 900 characters first; note
# that set_option changes this display setting for the following cells too
pd.set_option('display.max_colwidth', 900)
df_project

Unnamed: 0,project_id,project_name,primary_site,disease_type,experimental_strategies,data_categories,case_count,file_count
0,TCGA-BRCA,Breast Invasive Carcinoma,[Breast],"[Epithelial Neoplasms, NOS, Adnexal and Skin Appendage Neoplasms, Squamous Cell Neoplasms, Adenomas and Adenocarcinomas, Complex Epithelial Neoplasms, Fibroepithelial Neoplasms, Cystic, Mucinous and Serous Neoplasms, Basal Cell Neoplasms, Ductal and Lobular Neoplasms]","[{'file_count': 11079, 'case_count': 1095, 'experimental_strategy': 'RNA-Seq'}, {'file_count': 17049, 'case_count': 1072, 'experimental_strategy': 'WXS'}, {'file_count': 3621, 'case_count': 1079, 'experimental_strategy': 'miRNA-Seq'}, {'file_count': 10572, 'case_count': 952, 'experimental_strategy': 'WGS'}, {'file_count': 75, 'case_count': 74, 'experimental_strategy': 'ATAC-Seq'}, {'file_count': 14329, 'case_count': 1098, 'experimental_strategy': 'Genotyping Array'}, {'file_count': 3714, 'case_count': 1097, 'experimental_strategy': 'Methylation Array'}, {'file_count': 919, 'case_count': 881, 'experimental_strategy': 'Reverse Phase Protein Array'}, {'file_count': 1133, 'case_count': 1062, 'experimental_strategy': 'Diagnostic Slide'}, {'file_count': 1978, 'case_count': 1093, 'experimental_strategy': 'Tissue Slide'}]","[{'file_count': 19753, 'case_count': 1098, 'data_category': 'Simple Nucleotide Variation'}, {'file_count': 9282, 'case_count': 1098, 'data_category': 'Sequencing Reads'}, {'file_count': 5316, 'case_count': 1098, 'data_category': 'Biospecimen'}, {'file_count': 2288, 'case_count': 1098, 'data_category': 'Clinical'}, {'file_count': 14346, 'case_count': 1098, 'data_category': 'Copy Number Variation'}, {'file_count': 4876, 'case_count': 1097, 'data_category': 'Transcriptome Profiling'}, {'file_count': 3714, 'case_count': 1097, 'data_category': 'DNA Methylation'}, {'file_count': 919, 'case_count': 881, 'data_category': 'Proteome Profiling'}, {'file_count': 2696, 'case_count': 784, 'data_category': 'Somatic Structural Variation'}, {'file_count': 5772, 'case_count': 1098, 'data_category': 'Structural Variation'}]",1098,68962


## Cases

### Endpoint Response

In [10]:
# Project identifiers configured for this preprocessing
project_ids = BRCA_PREPROCESSING_PARAMETERS['project_ids']

# Fields requested from the cases endpoint
fields = ['disease_type', 'submitter_id']

# Keep only the cases that belong to the configured projects
filters = dict(
    op='in',
    content=dict(field='project.project_id', value=project_ids),
)

# Ask for as many hits as the total number of cases reported by the projects
total_case_count = sum(df_project['case_count'])

# Query the cases endpoint
cases_response = gdc_api_request(
    GDC_API_ENDPOINTS['cases'],
    fields,
    filters=filters,
    size=total_case_count,
)

### Cases DataFrame

In [11]:
# Build the TCGA-BRCA cases metadata DataFrame: flatten the endpoint hits,
# rename the identifier column and keep the columns of interest
df_api_cases = (
    pd.json_normalize(cases_response)
    .rename(columns={'id': 'case_id'})
    .loc[:, ['case_id', 'submitter_id', 'disease_type']]
)

In [12]:
# Display the TCGA-BRCA cases metadata DataFrame retrieved from the GDC API
df_api_cases

Unnamed: 0,case_id,submitter_id,disease_type
0,2d29a4ac-98e7-4663-9dd6-5681bc32ac2e,TCGA-A2-A3KC,Ductal and Lobular Neoplasms
1,de531604-bd1a-49df-96c2-6de3ae703f1d,TCGA-D8-A1XG,Ductal and Lobular Neoplasms
2,de646f6a-6d50-4be2-b3cd-f17554b234df,TCGA-AR-A24L,Ductal and Lobular Neoplasms
3,de75d0b9-0f47-4732-8df5-05c350cfcd32,TCGA-AN-A0FY,Ductal and Lobular Neoplasms
4,deba32e4-0e68-4711-941b-3b63bd965afb,TCGA-A2-A3XY,Ductal and Lobular Neoplasms
...,...,...,...
1093,35bd694d-1dd2-466f-ab27-03320614b40e,TCGA-OL-A66I,Ductal and Lobular Neoplasms
1094,2c90d5e3-ac8c-4c86-a06f-42ccc064eb49,TCGA-D8-A1XZ,Ductal and Lobular Neoplasms
1095,360d1d6d-cf3e-4c78-b36f-ef5e3160aa6b,TCGA-BH-A0AU,Ductal and Lobular Neoplasms
1096,2cf68894-168b-458b-af4f-53cad72989a8,TCGA-A7-A13F,Ductal and Lobular Neoplasms


## Files

### Endpoint Response

In [13]:
# Project identifiers configured for this preprocessing
project_ids = BRCA_PREPROCESSING_PARAMETERS['project_ids']

# Fields requested from the files endpoint
fields = [
    'cases.case_id',
    'cases.samples.sample_type',
    'cases.samples.tissue_type',
    'data_format', 'data_type', 'experimental_strategy',
]

# Keep only the files whose cases belong to the configured projects
filters = dict(
    op='in',
    content=dict(field='cases.project.project_id', value=project_ids),
)

# Ask for as many hits as the total number of files reported by the projects
total_file_count = sum(df_project['file_count'])

# Query the files endpoint
files_response = gdc_api_request(
    GDC_API_ENDPOINTS['files'],
    fields,
    filters=filters,
    size=total_file_count,
)

### Files DataFrame

In [14]:
# Create the TCGA-BRCA files metadata DataFrame with one row per
# (file, case) pair: each element of the 'cases' list gets its own row.
# The index is reset right after exploding, because explode repeats the
# original row label for every element of the list
df_files = pd.json_normalize(files_response) \
    .rename(columns={'id': 'file_id'}) \
    .explode('cases') \
    .reset_index(drop=True)

# Expand the dictionaries with cases metadata into columns. A plain list is
# passed so the result always gets a fresh 0..n-1 index that lines up with
# the reset index above; this avoids depending on how json_normalize treats
# the index of a Series input (newer pandas releases appear to keep it,
# which would break the column-wise concat on duplicated labels)
df_file_cases = pd.json_normalize(df_files['cases'].tolist())

# Put the expanded case columns side by side with the file columns
df_files = pd.concat(
    objs=[df_files, df_file_cases],
    axis='columns'
)

# Reorganize the DataFrame columns
df_files = df_files \
    .drop(columns=['cases']) \
    [[
        'file_id',
        'case_id',
        'experimental_strategy',
        'data_type',
        'data_format',
        'samples',
    ]]

In [15]:
# Display the TCGA-BRCA files metadata DataFrame.
# The maximum displayed column width is set to 300 characters first; note
# that set_option changes this display setting for the following cells too
pd.set_option('display.max_colwidth', 300)
df_files

Unnamed: 0,file_id,case_id,experimental_strategy,data_type,data_format,samples
0,27016c1c-f696-4a54-8c4a-6f9ba8a1c9e3,3b1f7e48-216d-47de-ba79-db350f52bff0,WXS,Annotated Somatic Mutation,VCF,"[{'sample_type': 'Blood Derived Normal', 'tissue_type': 'Normal'}, {'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]"
1,776f51f8-6f23-42d5-a96b-d7c934dcaa12,3b1f7e48-216d-47de-ba79-db350f52bff0,WXS,Aligned Reads,BAM,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]"
2,b1680590-4e97-4a3d-b41b-d9e84290de17,3b1f7e48-216d-47de-ba79-db350f52bff0,RNA-Seq,Aligned Reads,BAM,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]"
3,65cab127-616b-45c7-98b4-fdf8e01881fb,3b1f7e48-216d-47de-ba79-db350f52bff0,WGS,Aligned Reads,BAM,"[{'sample_type': 'Blood Derived Normal', 'tissue_type': 'Normal'}]"
4,a0409565-1baf-46f4-bae9-85a1dbe34160,3b1f7e48-216d-47de-ba79-db350f52bff0,,Biospecimen Supplement,BCR XML,
...,...,...,...,...,...,...
89800,cc6ca81e-5bb7-4c87-9602-ec5919b443a8,00807dae-9f4a-4fd1-aac2-82eb11bf2afb,WGS,Aligned Reads,BAM,"[{'sample_type': 'Blood Derived Normal', 'tissue_type': 'Normal'}]"
89801,82e26986-3016-4302-8929-e8a10078bd1a,0130d616-885e-4a6c-9d03-2f17dd692a05,WGS,Structural Rearrangement,VCF,"[{'sample_type': 'Blood Derived Normal', 'tissue_type': 'Normal'}, {'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]"
89802,6f4842e1-5b24-45ec-b03e-b81b63050111,0130d616-885e-4a6c-9d03-2f17dd692a05,Methylation Array,Masked Intensities,IDAT,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]"
89803,c88d52bf-0d3e-4e13-87c6-c33b29417bc6,00a2d166-78c9-4687-a195-3d6315c27574,WGS,Annotated Somatic Mutation,VCF,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}, {'sample_type': 'Blood Derived Normal', 'tissue_type': 'Normal'}]"


# Cases and Files of Interest

## Potential Cases of Interest

In [16]:
# Paper cases flagged as of interest, reduced to their TCGA identifier and
# PAM50 molecular subtype
df_paper_cases_of_interest = df_paper_cases.loc[
    df_paper_cases['is_case_of_interest'] == 1,
    ['complete_tcga_id', 'pam50_mrna']
]

# Attach the molecular subtype to every API case; cases without a match in
# the paper keep a missing subtype (left join)
df_cases = (
    df_api_cases
    .merge(
        right=df_paper_cases_of_interest,
        left_on='submitter_id',
        right_on='complete_tcga_id',
        how='left',
    )
    .loc[:, ['case_id', 'submitter_id', 'disease_type', 'pam50_mrna']]
)

# Flag (1/0) the cases of interest, i.e., those that have a molecular
# subtype and whose disease type is one of the configured ones
disease_types = BRCA_PREPROCESSING_PARAMETERS['disease_types']
has_subtype = df_cases['pam50_mrna'].notna()
has_disease_of_interest = df_cases['disease_type'].isin(disease_types)
df_cases['is_case_of_interest'] = np.where(
    has_disease_of_interest & has_subtype, 1, 0
)

In [17]:
# Display the cases DataFrame with the molecular subtype and the
# preliminary 'is_case_of_interest' flag
df_cases

Unnamed: 0,case_id,submitter_id,disease_type,pam50_mrna,is_case_of_interest
0,2d29a4ac-98e7-4663-9dd6-5681bc32ac2e,TCGA-A2-A3KC,Ductal and Lobular Neoplasms,,0
1,de531604-bd1a-49df-96c2-6de3ae703f1d,TCGA-D8-A1XG,Ductal and Lobular Neoplasms,,0
2,de646f6a-6d50-4be2-b3cd-f17554b234df,TCGA-AR-A24L,Ductal and Lobular Neoplasms,,0
3,de75d0b9-0f47-4732-8df5-05c350cfcd32,TCGA-AN-A0FY,Ductal and Lobular Neoplasms,Luminal B,1
4,deba32e4-0e68-4711-941b-3b63bd965afb,TCGA-A2-A3XY,Ductal and Lobular Neoplasms,,0
...,...,...,...,...,...
1093,35bd694d-1dd2-466f-ab27-03320614b40e,TCGA-OL-A66I,Ductal and Lobular Neoplasms,,0
1094,2c90d5e3-ac8c-4c86-a06f-42ccc064eb49,TCGA-D8-A1XZ,Ductal and Lobular Neoplasms,,0
1095,360d1d6d-cf3e-4c78-b36f-ef5e3160aa6b,TCGA-BH-A0AU,Ductal and Lobular Neoplasms,Luminal B,1
1096,2cf68894-168b-458b-af4f-53cad72989a8,TCGA-A7-A13F,Ductal and Lobular Neoplasms,Luminal B,1


## Potential Files of Interest

In [18]:
# Retrieve the flag of interest of the cases
df_files = df_files \
    .merge(
        right=df_cases,
        left_on='case_id',
        right_on='case_id',
        how='inner'
    ) \
    .drop(columns=['submitter_id', 'disease_type', 'pam50_mrna'])

# Filter the files based on data type and case, then create one row per
# (file, sample) pair. The index is reset right after exploding, because
# explode repeats the original row label for every element of the list
data_types = BRCA_PREPROCESSING_PARAMETERS['data_types']
df_files_of_interest = df_files \
    .query('(data_type == @data_types) and (is_case_of_interest == 1)') \
    .explode('samples') \
    .reset_index(drop=True)

# Expand the dictionaries with samples metadata into columns. A plain list
# is passed so the result always gets a fresh 0..n-1 index that lines up
# with the reset index above; this avoids depending on how json_normalize
# treats the index of a Series input
df_file_samples = pd.json_normalize(df_files_of_interest['samples'].tolist())
df_files_of_interest = pd.concat(
    objs=[df_files_of_interest, df_file_samples],
    axis='columns'
)

# Rearrange the DataFrame columns
df_files_of_interest = df_files_of_interest \
    .drop(columns=[
        'experimental_strategy',
        'data_type',
        'data_format',
        'samples',
        'is_case_of_interest',
    ])

# Flag the files of interest, i.e., those related to a case
# of interest, specific data type, and specific sample type
sample_types = BRCA_PREPROCESSING_PARAMETERS['sample_types']
df_files_of_interest['is_file_of_interest'] = np.where(
    df_files_of_interest['sample_type'].isin(sample_types), 1, 0
)

# Extend the flag column to all files; files that were filtered out above
# have no match in the left join and receive the flag 0.
# NOTE(review): a file linked to several samples has several rows in
# df_files_of_interest and is therefore repeated in df_files after this
# merge -- confirm this is intended
df_files = df_files \
    .merge(
        right=df_files_of_interest,
        left_on=['file_id', 'case_id'],
        right_on=['file_id', 'case_id'],
        how='left'
    ) \
    .drop(columns=['is_case_of_interest']) \
    .fillna(value={'is_file_of_interest': 0}) \
    .astype({'is_file_of_interest': 'int'})

In [19]:
# Display the files DataFrame with the sample columns and the preliminary
# 'is_file_of_interest' flag
df_files

Unnamed: 0,file_id,case_id,experimental_strategy,data_type,data_format,samples,sample_type,tissue_type,is_file_of_interest
0,27016c1c-f696-4a54-8c4a-6f9ba8a1c9e3,3b1f7e48-216d-47de-ba79-db350f52bff0,WXS,Annotated Somatic Mutation,VCF,"[{'sample_type': 'Blood Derived Normal', 'tissue_type': 'Normal'}, {'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",,,0
1,776f51f8-6f23-42d5-a96b-d7c934dcaa12,3b1f7e48-216d-47de-ba79-db350f52bff0,WXS,Aligned Reads,BAM,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",,,0
2,b1680590-4e97-4a3d-b41b-d9e84290de17,3b1f7e48-216d-47de-ba79-db350f52bff0,RNA-Seq,Aligned Reads,BAM,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",,,0
3,65cab127-616b-45c7-98b4-fdf8e01881fb,3b1f7e48-216d-47de-ba79-db350f52bff0,WGS,Aligned Reads,BAM,"[{'sample_type': 'Blood Derived Normal', 'tissue_type': 'Normal'}]",,,0
4,a0409565-1baf-46f4-bae9-85a1dbe34160,3b1f7e48-216d-47de-ba79-db350f52bff0,,Biospecimen Supplement,BCR XML,,,,0
...,...,...,...,...,...,...,...,...,...
89800,94468afc-faea-4091-af0f-d2df8aec37da,00807dae-9f4a-4fd1-aac2-82eb11bf2afb,RNA-Seq,Gene Expression Quantification,TSV,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",,,0
89801,9bc60601-f424-492f-acbc-38751264d066,00807dae-9f4a-4fd1-aac2-82eb11bf2afb,WGS,Raw Simple Somatic Mutation,VCF,"[{'sample_type': 'Blood Derived Normal', 'tissue_type': 'Normal'}, {'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",,,0
89802,84b47afd-7fb1-4a52-8328-c257a7676176,00807dae-9f4a-4fd1-aac2-82eb11bf2afb,WXS,Annotated Somatic Mutation,VCF,"[{'sample_type': 'Blood Derived Normal', 'tissue_type': 'Normal'}, {'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",,,0
89803,6d870c48-0176-4137-96b9-7f5bb02100d2,00807dae-9f4a-4fd1-aac2-82eb11bf2afb,miRNA-Seq,Aligned Reads,BAM,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",,,0


## Counting Files per Case

In [20]:
# Keep the potential files of interest and label each one with its file
# type, i.e., experimental strategy and tissue type joined by an underscore
df_files_of_interest = (
    df_files
    .query('is_file_of_interest == 1')
    .loc[:, ['file_id', 'case_id', 'experimental_strategy', 'tissue_type']]
    .assign(
        file_type=lambda frame: (
            frame['experimental_strategy'] + '_' + frame['tissue_type']
        )
    )
)

# Work on an independent copy holding only the columns needed for counting
df_file_count = df_files_of_interest[['case_id', 'file_id', 'file_type']].copy()

# Create one 1/0 flag column per file type
flag_column_by_file_type = {
    'tt_mir': 'miRNA-Seq_Tumor',
    'tt_mrna': 'RNA-Seq_Tumor',
    'nt_mir': 'miRNA-Seq_Normal',
    'nt_mrna': 'RNA-Seq_Normal',
}
for flag_column, file_type_label in flag_column_by_file_type.items():
    df_file_count[flag_column] = np.where(
        df_file_count['file_type'] == file_type_label, 1, 0
    )

# Every row counts as one file
df_file_count['total_files'] = 1

# Sum the flags per case to get the number of files of each type
df_file_count_agg = (
    df_file_count
    .groupby('case_id')
    .agg(
        tumor_mir_files=('tt_mir', 'sum'),
        tumor_mrna_files=('tt_mrna', 'sum'),
        normal_mir_files=('nt_mir', 'sum'),
        normal_mrna_files=('nt_mrna', 'sum'),
        total_files=('total_files', 'sum'),
    )
    .sort_values(by=['total_files'], ascending=False)
    .reset_index()
)

# Add the file counts to the cases DataFrame; cases without any counted
# file keep missing values in the count columns (left join)
df_cases = df_cases.merge(
    right=df_file_count_agg,
    left_on='case_id',
    right_on='case_id',
    how='left',
)

In [21]:
# Display the DataFrame of flagged cases with the per-case file counts
df_cases

Unnamed: 0,case_id,submitter_id,disease_type,pam50_mrna,is_case_of_interest,tumor_mir_files,tumor_mrna_files,normal_mir_files,normal_mrna_files,total_files
0,2d29a4ac-98e7-4663-9dd6-5681bc32ac2e,TCGA-A2-A3KC,Ductal and Lobular Neoplasms,,0,,,,,
1,de531604-bd1a-49df-96c2-6de3ae703f1d,TCGA-D8-A1XG,Ductal and Lobular Neoplasms,,0,,,,,
2,de646f6a-6d50-4be2-b3cd-f17554b234df,TCGA-AR-A24L,Ductal and Lobular Neoplasms,,0,,,,,
3,de75d0b9-0f47-4732-8df5-05c350cfcd32,TCGA-AN-A0FY,Ductal and Lobular Neoplasms,Luminal B,1,1.0,1.0,0.0,0.0,2.0
4,deba32e4-0e68-4711-941b-3b63bd965afb,TCGA-A2-A3XY,Ductal and Lobular Neoplasms,,0,,,,,
...,...,...,...,...,...,...,...,...,...,...
1093,35bd694d-1dd2-466f-ab27-03320614b40e,TCGA-OL-A66I,Ductal and Lobular Neoplasms,,0,,,,,
1094,2c90d5e3-ac8c-4c86-a06f-42ccc064eb49,TCGA-D8-A1XZ,Ductal and Lobular Neoplasms,,0,,,,,
1095,360d1d6d-cf3e-4c78-b36f-ef5e3160aa6b,TCGA-BH-A0AU,Ductal and Lobular Neoplasms,Luminal B,1,1.0,1.0,1.0,1.0,4.0
1096,2cf68894-168b-458b-af4f-53cad72989a8,TCGA-A7-A13F,Ductal and Lobular Neoplasms,Luminal B,1,1.0,1.0,1.0,1.0,4.0


## Defining the Cases of Interest

### Flagged Cases DataFrame

In [22]:
# Conditions reused by the flags below. A comparison with a missing count
# evaluates to False, so cases without counted files are never flagged
was_case_of_interest = df_cases['is_case_of_interest'].eq(1)
has_tumor_pair = (
    df_cases['tumor_mir_files'].eq(1) & df_cases['tumor_mrna_files'].eq(1)
)
has_normal_pair = (
    df_cases['normal_mir_files'].eq(1) & df_cases['normal_mrna_files'].eq(1)
)

# Flag cases with exactly one file of each kind for tumor tissue analysis
df_cases['has_tumor_files_of_interest'] = np.where(
    was_case_of_interest & has_tumor_pair, 1, 0
)

# Flag cases with exactly one file of each kind for normal tissue analysis
df_cases['has_normal_files_of_interest'] = np.where(
    was_case_of_interest & has_normal_pair, 1, 0
)

# Remove the count columns and the preliminary flag, which is rebuilt below
superseded_columns = [
    'tumor_mir_files',
    'tumor_mrna_files',
    'normal_mir_files',
    'normal_mrna_files',
    'total_files',
    'is_case_of_interest',
]
df_cases = df_cases.drop(columns=superseded_columns)

# Rebuild the flag: a case is of interest when it has the tumor files,
# the normal files, or both
has_any_files_of_interest = (
    df_cases[['has_tumor_files_of_interest', 'has_normal_files_of_interest']]
    .eq(1)
    .any(axis='columns')
)
df_cases['is_case_of_interest'] = np.where(has_any_files_of_interest, 1, 0)

# Persist the cases metadata as a CSV file (row index not written)
cases_csv_path = BRCA_INTERIM_FILE_PATHS['cases']
df_cases.to_csv(cases_csv_path, index=False)

In [23]:
# Display the cases metadata DataFrame with its final flags
df_cases

Unnamed: 0,case_id,submitter_id,disease_type,pam50_mrna,has_tumor_files_of_interest,has_normal_files_of_interest,is_case_of_interest
0,2d29a4ac-98e7-4663-9dd6-5681bc32ac2e,TCGA-A2-A3KC,Ductal and Lobular Neoplasms,,0,0,0
1,de531604-bd1a-49df-96c2-6de3ae703f1d,TCGA-D8-A1XG,Ductal and Lobular Neoplasms,,0,0,0
2,de646f6a-6d50-4be2-b3cd-f17554b234df,TCGA-AR-A24L,Ductal and Lobular Neoplasms,,0,0,0
3,de75d0b9-0f47-4732-8df5-05c350cfcd32,TCGA-AN-A0FY,Ductal and Lobular Neoplasms,Luminal B,1,0,1
4,deba32e4-0e68-4711-941b-3b63bd965afb,TCGA-A2-A3XY,Ductal and Lobular Neoplasms,,0,0,0
...,...,...,...,...,...,...,...
1093,35bd694d-1dd2-466f-ab27-03320614b40e,TCGA-OL-A66I,Ductal and Lobular Neoplasms,,0,0,0
1094,2c90d5e3-ac8c-4c86-a06f-42ccc064eb49,TCGA-D8-A1XZ,Ductal and Lobular Neoplasms,,0,0,0
1095,360d1d6d-cf3e-4c78-b36f-ef5e3160aa6b,TCGA-BH-A0AU,Ductal and Lobular Neoplasms,Luminal B,1,1,1
1096,2cf68894-168b-458b-af4f-53cad72989a8,TCGA-A7-A13F,Ductal and Lobular Neoplasms,Luminal B,1,1,1


### Cases of Interest DataFrame

In [24]:
# Build the cases of interest metadata DataFrame: keep the flagged rows,
# drop the now-constant flag column and renumber the rows from zero
is_flagged_case = df_cases['is_case_of_interest'] == 1
df_cases_of_interest = (
    df_cases[is_flagged_case]
    .drop(columns=['is_case_of_interest'])
    .reset_index(drop=True)
)

# Persist the cases of interest as a CSV file (row index not written)
cases_of_interest_csv_path = BRCA_INTERIM_FILE_PATHS['cases-of-interest']
df_cases_of_interest.to_csv(cases_of_interest_csv_path, index=False)

In [25]:
# Display the cases of interest metadata DataFrame
df_cases_of_interest

Unnamed: 0,case_id,submitter_id,disease_type,pam50_mrna,has_tumor_files_of_interest,has_normal_files_of_interest
0,de75d0b9-0f47-4732-8df5-05c350cfcd32,TCGA-AN-A0FY,Ductal and Lobular Neoplasms,Luminal B,1,0
1,1fef9454-b3e8-4d9d-a9aa-aa1f1a32b80c,TCGA-E2-A15J,Ductal and Lobular Neoplasms,Luminal A,1,0
2,2021ed1f-dc75-4701-b8b8-1386466e4802,TCGA-BH-A0H7,Ductal and Lobular Neoplasms,Luminal A,1,1
3,20e8106b-1290-4735-abe4-7621e08e3dc8,TCGA-BH-A0E0,Ductal and Lobular Neoplasms,Basal-like,1,0
4,214a4507-d974-4b3e-8525-7408fccc6a0f,TCGA-BH-A0B7,Ductal and Lobular Neoplasms,HER2-enriched,1,1
...,...,...,...,...,...,...
483,33919e92-4ea9-47e5-a6f4-36e51845d50f,TCGA-A8-A09N,Ductal and Lobular Neoplasms,Luminal B,1,0
484,34ab0cb8-bf19-4cbb-b4eb-3c45ee5aa60b,TCGA-B6-A0IG,Ductal and Lobular Neoplasms,Luminal A,1,0
485,360d1d6d-cf3e-4c78-b36f-ef5e3160aa6b,TCGA-BH-A0AU,Ductal and Lobular Neoplasms,Luminal B,1,1
486,2cf68894-168b-458b-af4f-53cad72989a8,TCGA-A7-A13F,Ductal and Lobular Neoplasms,Luminal B,1,1


## Defining the Files of Interest

### Flagged Files DataFrame

In [26]:
# Bring the final case flags into the files DataFrame (inner join on the
# case identifier) and discard the descriptive case columns
df_files = (
    df_files
    .merge(
        right=df_cases,
        left_on='case_id',
        right_on='case_id',
        how='inner',
    )
    .drop(columns=['submitter_id', 'disease_type', 'pam50_mrna'])
)

# Condition shared by both flags: the file was preliminarily flagged
was_file_of_interest = df_files['is_file_of_interest'].eq(1)

# Flag the files related to tumor tissue analysis: tumor tissue files of
# cases that have the tumor files of interest
df_files['is_tumor_file_of_interest'] = np.where(
    was_file_of_interest
    & df_files['has_tumor_files_of_interest'].eq(1)
    & df_files['tissue_type'].eq('Tumor'),
    1, 0
)

# Flag the files related to normal tissue analysis: normal tissue files of
# cases that have the normal files of interest
df_files['is_normal_file_of_interest'] = np.where(
    was_file_of_interest
    & df_files['has_normal_files_of_interest'].eq(1)
    & df_files['tissue_type'].eq('Normal'),
    1, 0
)

# Remove the helper columns and the preliminary flag, which is rebuilt below
superseded_columns = [
    'sample_type',
    'tissue_type',
    'is_file_of_interest',
    'has_tumor_files_of_interest',
    'has_normal_files_of_interest',
    'is_case_of_interest',
]
df_files = df_files.drop(columns=superseded_columns)

# Rebuild the flag: a file is of interest when it is a tumor file of
# interest or a normal file of interest
is_any_file_of_interest = (
    df_files[['is_tumor_file_of_interest', 'is_normal_file_of_interest']]
    .eq(1)
    .any(axis='columns')
)
df_files['is_file_of_interest'] = np.where(is_any_file_of_interest, 1, 0)

# Persist the files metadata as a CSV file (row index not written)
files_csv_path = BRCA_INTERIM_FILE_PATHS['files']
df_files.to_csv(files_csv_path, index=False)

In [27]:
# Display the files metadata DataFrame with its final flags
df_files

Unnamed: 0,file_id,case_id,experimental_strategy,data_type,data_format,samples,is_tumor_file_of_interest,is_normal_file_of_interest,is_file_of_interest
0,27016c1c-f696-4a54-8c4a-6f9ba8a1c9e3,3b1f7e48-216d-47de-ba79-db350f52bff0,WXS,Annotated Somatic Mutation,VCF,"[{'sample_type': 'Blood Derived Normal', 'tissue_type': 'Normal'}, {'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",0,0,0
1,776f51f8-6f23-42d5-a96b-d7c934dcaa12,3b1f7e48-216d-47de-ba79-db350f52bff0,WXS,Aligned Reads,BAM,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",0,0,0
2,b1680590-4e97-4a3d-b41b-d9e84290de17,3b1f7e48-216d-47de-ba79-db350f52bff0,RNA-Seq,Aligned Reads,BAM,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",0,0,0
3,65cab127-616b-45c7-98b4-fdf8e01881fb,3b1f7e48-216d-47de-ba79-db350f52bff0,WGS,Aligned Reads,BAM,"[{'sample_type': 'Blood Derived Normal', 'tissue_type': 'Normal'}]",0,0,0
4,a0409565-1baf-46f4-bae9-85a1dbe34160,3b1f7e48-216d-47de-ba79-db350f52bff0,,Biospecimen Supplement,BCR XML,,0,0,0
...,...,...,...,...,...,...,...,...,...
89800,94468afc-faea-4091-af0f-d2df8aec37da,00807dae-9f4a-4fd1-aac2-82eb11bf2afb,RNA-Seq,Gene Expression Quantification,TSV,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",0,0,0
89801,9bc60601-f424-492f-acbc-38751264d066,00807dae-9f4a-4fd1-aac2-82eb11bf2afb,WGS,Raw Simple Somatic Mutation,VCF,"[{'sample_type': 'Blood Derived Normal', 'tissue_type': 'Normal'}, {'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",0,0,0
89802,84b47afd-7fb1-4a52-8328-c257a7676176,00807dae-9f4a-4fd1-aac2-82eb11bf2afb,WXS,Annotated Somatic Mutation,VCF,"[{'sample_type': 'Blood Derived Normal', 'tissue_type': 'Normal'}, {'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",0,0,0
89803,6d870c48-0176-4137-96b9-7f5bb02100d2,00807dae-9f4a-4fd1-aac2-82eb11bf2afb,miRNA-Seq,Aligned Reads,BAM,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",0,0,0


### Files of Interest DataFrame

In [28]:
# Build the files of interest metadata DataFrame: keep the flagged rows,
# drop the now-constant flag column and renumber the rows from zero
is_flagged_file = df_files['is_file_of_interest'] == 1
df_files_of_interest = (
    df_files[is_flagged_file]
    .drop(columns=['is_file_of_interest'])
    .reset_index(drop=True)
)

# Persist the files of interest as a CSV file (row index not written)
files_of_interest_csv_path = BRCA_INTERIM_FILE_PATHS['files-of-interest']
df_files_of_interest.to_csv(files_of_interest_csv_path, index=False)

In [29]:
# Display the files of interest metadata DataFrame
df_files_of_interest

Unnamed: 0,file_id,case_id,experimental_strategy,data_type,data_format,samples,is_tumor_file_of_interest,is_normal_file_of_interest
0,dd0e32d6-4704-4abd-82e4-2041fbc19a79,3c08aabd-d5b5-4bbe-857c-38a7527b2163,miRNA-Seq,Isoform Expression Quantification,TXT,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",1,0
1,8644d07c-979f-4ec6-928a-afe9229fb9f1,3c08aabd-d5b5-4bbe-857c-38a7527b2163,RNA-Seq,Gene Expression Quantification,TSV,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",1,0
2,09a491b5-1bfb-47d9-963f-69f6c7e6c1e5,abdc76db-f85e-4337-a57e-6d098789da03,RNA-Seq,Gene Expression Quantification,TSV,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",1,0
3,02fc7dd7-7e17-4930-8424-52173c33b29b,abdc76db-f85e-4337-a57e-6d098789da03,miRNA-Seq,Isoform Expression Quantification,TXT,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",1,0
4,f8e5691a-e24d-4dba-873e-9ffa3ca12bb6,3c612e12-6de8-44fa-a095-805c45474821,RNA-Seq,Gene Expression Quantification,TSV,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",1,0
...,...,...,...,...,...,...,...,...
1079,863c79a7-57d7-4a02-89ab-4abe5db7e4ff,6cdc0d53-f813-4101-81e5-9bee68270536,miRNA-Seq,Isoform Expression Quantification,TXT,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",1,0
1080,24d0d1a9-ce92-4db6-8224-e3dd67733712,10c9be1e-2cc0-45e4-8d2c-c013ea63ef05,miRNA-Seq,Isoform Expression Quantification,TXT,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",1,0
1081,15d7e135-acb7-46ef-a8b1-f424e40a9e4d,10c9be1e-2cc0-45e4-8d2c-c013ea63ef05,RNA-Seq,Gene Expression Quantification,TSV,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",1,0
1082,8ecef704-ed0f-4700-a34e-46a174033332,1174f6e4-ffbe-4e59-a000-8d861c968369,miRNA-Seq,Isoform Expression Quantification,TXT,"[{'sample_type': 'Primary Tumor', 'tissue_type': 'Tumor'}]",1,0
