# **Exploratory Metadata Analysis of the TCGA-BRCA Project**
TCGA: The Cancer Genome Atlas  
BRCA: Breast Invasive Carcinoma

# Importing Libraries and Configurations

In [1]:
import os
import sys

import pandas as pd

# Add the directory two levels above the current working directory to the
# module search path. Presumably this is the project root that holds the
# `config` package imported below -- confirm against the repository layout.
sys.path.append(os.path.abspath(os.path.join('..', '..')))

from config.paths import BRCA_INTERIM_DATA_DIR

# Loading and Preparing Data

In [2]:
# Cases of interest that have a tumor tissue analysis
df_tumor_cases = pd.read_csv(
    filepath_or_buffer=f'{BRCA_INTERIM_DATA_DIR}/brca-api-cases-tumor-analysis.csv',
)

# Cases of interest that have a normal tissue analysis
df_normal_cases = pd.read_csv(
    filepath_or_buffer=f'{BRCA_INTERIM_DATA_DIR}/brca-api-cases-normal-analysis.csv',
)

# Files of interest linked to a tumor tissue analysis
df_tumor_files = pd.read_csv(
    filepath_or_buffer=f'{BRCA_INTERIM_DATA_DIR}/brca-api-files-tumor-analysis.csv',
)

# Files of interest linked to a normal tissue analysis
df_normal_files = pd.read_csv(
    filepath_or_buffer=f'{BRCA_INTERIM_DATA_DIR}/brca-api-files-normal-analysis.csv',
)

In [3]:
# Tumor cases joined with their files on `case_id`. The inner join keeps only
# rows whose `case_id` is present in both tables; the `files`, `tissue_type`
# and `sample_type` columns are then removed from the joined table.
df_tumor_data = (
    df_tumor_cases
    .merge(right=df_tumor_files, on='case_id', how='inner')
    .drop(columns=['files', 'tissue_type', 'sample_type'])
)

# Normal cases joined with their files, built exactly like the tumor table.
df_normal_data = (
    df_normal_cases
    .merge(right=df_normal_files, on='case_id', how='inner')
    .drop(columns=['files', 'tissue_type', 'sample_type'])
)

# All distinct BRCA cases. A case listed in both the tumor and the normal
# table is kept once (the first occurrence, i.e. the tumor row).
df_cases = (
    pd.concat([df_tumor_cases, df_normal_cases])
    .drop_duplicates('case_id')
    .reset_index(drop=True)
)

# All distinct BRCA files. De-duplicate on `file_id`, not `case_id`: a case
# has more than one file (e.g. RNA-Seq and miRNA-Seq), so de-duplicating on
# `case_id` would keep a single file per case and silently discard the rest.
df_files = (
    pd.concat([df_tumor_files, df_normal_files])
    .drop_duplicates('file_id')
    .reset_index(drop=True)
)

# Tumor Tissue

In [4]:
# Report how many tumor-related cases and files were loaded
print(
    f'Total number of cases: {len(df_tumor_cases)}\n'
    f'Total number of files: {len(df_tumor_files)}'
)

Total number of cases: 486
Total number of files: 972


## Cases

In [5]:
# Number of distinct tumor cases per disease type, largest group first
(
    df_tumor_cases
    .groupby('disease_type')
    .agg(case_count=('case_id', 'nunique'))
    .sort_values(by='case_count', ascending=False)
)

Unnamed: 0_level_0,case_count
disease_type,Unnamed: 1_level_1
Ductal and Lobular Neoplasms,486


In [6]:
# Number of distinct tumor cases per PAM50 molecular subtype, largest first
(
    df_tumor_cases
    .groupby('pam50_mrna')
    .agg(case_count=('case_id', 'nunique'))
    .sort_values(by='case_count', ascending=False)
)

Unnamed: 0_level_0,case_count
pam50_mrna,Unnamed: 1_level_1
Luminal A,223
Luminal B,120
Basal-like,87
HER2-enriched,56


In [7]:
# Number of distinct tumor cases per (molecular subtype, disease type) pair,
# largest group first
(
    df_tumor_cases
    .groupby(['pam50_mrna', 'disease_type'])
    .agg(case_count=('case_id', 'nunique'))
    .sort_values(by='case_count', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,case_count
pam50_mrna,disease_type,Unnamed: 2_level_1
Luminal A,Ductal and Lobular Neoplasms,223
Luminal B,Ductal and Lobular Neoplasms,120
Basal-like,Ductal and Lobular Neoplasms,87
HER2-enriched,Ductal and Lobular Neoplasms,56


## Files

In [8]:
# Number of distinct tumor files per (experimental strategy, data type,
# data format) combination, largest group first
(
    df_tumor_files
    .groupby(['experimental_strategy', 'data_type', 'data_format'])
    .agg(file_count=('file_id', 'nunique'))
    .sort_values(by='file_count', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,file_count
experimental_strategy,data_type,data_format,Unnamed: 3_level_1
RNA-Seq,Gene Expression Quantification,TSV,486
miRNA-Seq,Isoform Expression Quantification,TXT,486


# Normal Tissue

## Cases

In [9]:
# Number of distinct normal-tissue cases per disease type, largest group first
(
    df_normal_cases
    .groupby('disease_type')
    .agg(case_count=('case_id', 'nunique'))
    .sort_values(by='case_count', ascending=False)
)

Unnamed: 0_level_0,case_count
disease_type,Unnamed: 1_level_1
Ductal and Lobular Neoplasms,56


In [10]:
# Number of distinct normal-tissue cases per PAM50 molecular subtype,
# largest group first
(
    df_normal_cases
    .groupby('pam50_mrna')
    .agg(case_count=('case_id', 'nunique'))
    .sort_values(by='case_count', ascending=False)
)

Unnamed: 0_level_0,case_count
pam50_mrna,Unnamed: 1_level_1
Luminal A,29
Luminal B,14
Basal-like,8
HER2-enriched,5


In [11]:
# Number of distinct normal-tissue cases per (molecular subtype, disease type)
# pair, largest group first
(
    df_normal_cases
    .groupby(['pam50_mrna', 'disease_type'])
    .agg(case_count=('case_id', 'nunique'))
    .sort_values(by='case_count', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,case_count
pam50_mrna,disease_type,Unnamed: 2_level_1
Luminal A,Ductal and Lobular Neoplasms,29
Luminal B,Ductal and Lobular Neoplasms,14
Basal-like,Ductal and Lobular Neoplasms,8
HER2-enriched,Ductal and Lobular Neoplasms,5


## Files

In [12]:
# Number of distinct normal-tissue files per (experimental strategy, data type,
# data format) combination, largest group first
(
    df_normal_files
    .groupby(['experimental_strategy', 'data_type', 'data_format'])
    .agg(file_count=('file_id', 'nunique'))
    .sort_values(by='file_count', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,file_count
experimental_strategy,data_type,data_format,Unnamed: 3_level_1
RNA-Seq,Gene Expression Quantification,TSV,56
miRNA-Seq,Isoform Expression Quantification,TXT,56
