# **Exploratory Metadata Analysis of TCGA-BRCA**
TCGA: The Cancer Genome Atlas  
BRCA: Breast Invasive Carcinoma

# Importing Libraries and Configurations

In [1]:
import os
import sys

import pandas as pd

# Add project root to Python's path
sys.path.append(os.path.abspath(os.path.join('..', '..')))

from config import BRCA_PROCESSED_FILES_PATHS

# Loading and Preparing Data

In [None]:
# Dataframe with cases metadata
df_cases = pd.read_csv(BRCA_PROCESSED_FILES_PATHS['cases'])

# Dataframe with cases of interest metadata
df_cases_of_interest = df_cases.query('is_case_of_interest == 1')

# Dataframe with files metadata
df_files = pd.read_csv(BRCA_PROCESSED_FILES_PATHS['files'])

# Dataframe with files of interest metadata
df_files_of_interest = df_files.query('is_file_of_interest == 1')

# DataFrame with project metadata
df_project = pd.read_csv(BRCA_PROCESSED_FILES_PATHS['project'])

# DataFrame with files and cases
df_files_and_cases = df_files \
    .merge(
        right=df_cases,
        left_on='case_id',
        right_on='case_id',
        how='inner'
    )

# DataFrame with files and cases of interest
df_files_and_cases_of_interest = df_files_of_interest \
    .merge(
        right=df_cases_of_interest,
        left_on='case_id',
        right_on='case_id',
        how='inner'
    )

# Project Exploration

In [3]:
# Print the TCGA-BRCA project metadata DataFrame
pd.set_option('display.max_colwidth', 900)
df_project

Unnamed: 0,project_id,project_name,primary_site,disease_type,experimental_strategies,data_categories,case_count,file_count
0,TCGA-BRCA,Breast Invasive Carcinoma,['Breast'],"['Epithelial Neoplasms, NOS', 'Adnexal and Skin Appendage Neoplasms', 'Squamous Cell Neoplasms', 'Adenomas and Adenocarcinomas', 'Complex Epithelial Neoplasms', 'Fibroepithelial Neoplasms', 'Cystic, Mucinous and Serous Neoplasms', 'Basal Cell Neoplasms', 'Ductal and Lobular Neoplasms']","[{'file_count': 11079, 'case_count': 1095, 'experimental_strategy': 'RNA-Seq'}, {'file_count': 17049, 'case_count': 1072, 'experimental_strategy': 'WXS'}, {'file_count': 3621, 'case_count': 1079, 'experimental_strategy': 'miRNA-Seq'}, {'file_count': 10572, 'case_count': 952, 'experimental_strategy': 'WGS'}, {'file_count': 75, 'case_count': 74, 'experimental_strategy': 'ATAC-Seq'}, {'file_count': 14329, 'case_count': 1098, 'experimental_strategy': 'Genotyping Array'}, {'file_count': 3714, 'case_count': 1097, 'experimental_strategy': 'Methylation Array'}, {'file_count': 919, 'case_count': 881, 'experimental_strategy': 'Reverse Phase Protein Array'}, {'file_count': 1133, 'case_count': 1062, 'experimental_strategy': 'Diagnostic Slide'}, {'file_count': 1978, 'case_count': 1093, 'experimental_strategy': 'Tissue Slide'}]","[{'file_count': 19753, 'case_count': 1098, 'data_category': 'Simple Nucleotide Variation'}, {'file_count': 9282, 'case_count': 1098, 'data_category': 'Sequencing Reads'}, {'file_count': 5316, 'case_count': 1098, 'data_category': 'Biospecimen'}, {'file_count': 2288, 'case_count': 1098, 'data_category': 'Clinical'}, {'file_count': 14346, 'case_count': 1098, 'data_category': 'Copy Number Variation'}, {'file_count': 4876, 'case_count': 1097, 'data_category': 'Transcriptome Profiling'}, {'file_count': 3714, 'case_count': 1097, 'data_category': 'DNA Methylation'}, {'file_count': 919, 'case_count': 881, 'data_category': 'Proteome Profiling'}, {'file_count': 2696, 'case_count': 784, 'data_category': 'Somatic Structural Variation'}, {'file_count': 5772, 'case_count': 1098, 'data_category': 'Structural Variation'}]",1098,68962


# Cases Exploration

## All Cases

In [4]:
# Count the number of distinct cases
total_cases = df_cases['case_id'].nunique()
print(f'Number of distinct cases: {total_cases}')

Number of distinct cases: 1098


In [5]:
# Count the number of cases associated with each disease type
df_cases \
    .groupby('disease_type') \
    .agg(cases = pd.NamedAgg(column='case_id', aggfunc='nunique')) \
    .sort_values(by='cases', ascending=False)

Unnamed: 0_level_0,cases
disease_type,Unnamed: 1_level_1
Ductal and Lobular Neoplasms,1054
"Cystic, Mucinous and Serous Neoplasms",16
Complex Epithelial Neoplasms,14
"Epithelial Neoplasms, NOS",5
Adenomas and Adenocarcinomas,3
Fibroepithelial Neoplasms,2
Squamous Cell Neoplasms,2
Adnexal and Skin Appendage Neoplasms,1
Basal Cell Neoplasms,1


In [6]:
# Count the number of cases associated with each molecular subtype
df_cases \
    .groupby('pam50_mrna') \
    .agg(cases = pd.NamedAgg(column='case_id', aggfunc='nunique')) \
    .sort_values(by='cases', ascending=False)

Unnamed: 0_level_0,cases
pam50_mrna,Unnamed: 1_level_1
Luminal A,231
Luminal B,127
Basal-like,98
HER2-enriched,58


In [7]:
# Count the number of cases associated with each subtype and disease
df_cases \
    .groupby(['pam50_mrna', 'disease_type']) \
    .agg(cases = pd.NamedAgg(column='case_id', aggfunc='nunique')) \
    .sort_values(by=['pam50_mrna', 'cases'], ascending=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,cases
pam50_mrna,disease_type,Unnamed: 2_level_1
Basal-like,"Epithelial Neoplasms, NOS",1
Basal-like,Complex Epithelial Neoplasms,2
Basal-like,Ductal and Lobular Neoplasms,95
HER2-enriched,Adnexal and Skin Appendage Neoplasms,1
HER2-enriched,Ductal and Lobular Neoplasms,57
Luminal A,Adenomas and Adenocarcinomas,1
Luminal A,"Epithelial Neoplasms, NOS",1
Luminal A,Squamous Cell Neoplasms,1
Luminal A,Ductal and Lobular Neoplasms,228
Luminal B,"Cystic, Mucinous and Serous Neoplasms",2


## Cases of Interest

In [8]:
# Count the number of distinct cases
total_cases = df_cases_of_interest['case_id'].nunique()
print(f'Number of distinct cases of interest: {total_cases}')

Number of distinct cases of interest: 488


In [9]:
# Count the number of cases associated with each disease type
df_cases_of_interest \
    .groupby('disease_type') \
    .agg(cases = pd.NamedAgg(column='case_id', aggfunc='nunique')) \
    .sort_values(by='cases', ascending=False)

Unnamed: 0_level_0,cases
disease_type,Unnamed: 1_level_1
Ductal and Lobular Neoplasms,488


In [10]:
# Count the number of cases associated with each molecular subtype
df_cases_of_interest \
    .groupby('pam50_mrna') \
    .agg(
        cases = pd.NamedAgg(column='case_id', aggfunc='nunique'),
        tumor_cases = pd.NamedAgg(
            column='has_tumor_files_of_interest', aggfunc='sum'
        ),
        normal_cases = pd.NamedAgg(
            column='has_normal_files_of_interest', aggfunc='sum'
        ),
    ) \
    .sort_values(by='cases', ascending=False)

Unnamed: 0_level_0,cases,tumor_cases,normal_cases
pam50_mrna,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Luminal A,224,223,29
Luminal B,120,120,14
Basal-like,88,87,8
HER2-enriched,56,56,5


In [11]:
# Count the number of cases associated with tumor tissue analysis
total_cases = df_cases_of_interest['has_tumor_files_of_interest'].sum()
print(f'Number of cases associated with tumor tissue analysis: {total_cases}')

Number of cases associated with tumor tissue analysis: 486


In [12]:
# Count the number of cases associated with normal tissue analysis
total_cases = df_cases_of_interest['has_normal_files_of_interest'].sum()
print(f'Number of cases associated with paired normal tissue analysis: {total_cases}')

Number of cases associated with paired normal tissue analysis: 56


# Files Exploration

## All Files

In [13]:
# Count the number of distinct files
total_files = df_files['file_id'].nunique()
print(f'Number of distinct files: {total_files}')

Number of distinct files: 68962


In [14]:
# Count the number of files associated with each experimental strategy
df_files \
    .groupby('experimental_strategy') \
    .agg(files = pd.NamedAgg(column='file_id', aggfunc='nunique')) \
    .sort_values(by='files', ascending=False)

Unnamed: 0_level_0,files
experimental_strategy,Unnamed: 1_level_1
WXS,17049
Genotyping Array,14329
RNA-Seq,11079
WGS,10572
Methylation Array,3714
miRNA-Seq,3621
Tissue Slide,1978
Diagnostic Slide,1133
Reverse Phase Protein Array,919
ATAC-Seq,75


In [15]:
# Count the number of data types associated with files
df_files \
    .groupby([
        'experimental_strategy', 'access', 'data_type', 'data_format',
    ]) \
    .agg(files = pd.NamedAgg(column='file_id', aggfunc='nunique')) \
    .sort_values(by=['experimental_strategy', 'access', 'files'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,files
experimental_strategy,access,data_type,data_format,Unnamed: 4_level_1
ATAC-Seq,controlled,Aligned Reads,BAM,75
Diagnostic Slide,open,Slide Image,SVS,1133
Genotyping Array,controlled,Raw Intensities,CEL,2263
Genotyping Array,controlled,Simple Germline Variation,TSV,2263
Genotyping Array,open,Allele-specific Copy Number Segment,TXT,2144
Genotyping Array,open,Copy Number Segment,TXT,2229
Genotyping Array,open,Masked Copy Number Segment,TXT,2229
Genotyping Array,open,Gene Level Copy Number,TSV,3201
Methylation Array,open,Methylation Beta Value,TXT,1238
Methylation Array,open,Masked Intensities,IDAT,2476


## Files of Interest

In [16]:
# Count the number of distinct files
total_files = df_files_of_interest['file_id'].nunique()
print(f'Number of distinct files: {total_files}')

Number of distinct files: 1084


In [17]:
# Count the number of files associated with each experimental strategy
df_files_of_interest \
    .groupby('experimental_strategy') \
    .agg(
        files = pd.NamedAgg(column='file_id', aggfunc='nunique'),
        tumor_files = pd.NamedAgg(
            column='is_tumor_file_of_interest', aggfunc='sum'
        ),
        normal_files = pd.NamedAgg(
            column='is_normal_file_of_interest', aggfunc='sum'
        ),
    ) \
    .sort_values(by='files', ascending=False)

Unnamed: 0_level_0,files,tumor_files,normal_files
experimental_strategy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RNA-Seq,542,486,56
miRNA-Seq,542,486,56


In [18]:
# Count the number of files associated with tumor tissue analysis
total_files = df_files_of_interest['is_tumor_file_of_interest'].sum()
print(f'Number of files associated with tumor tissue analysis: {total_files}')

Number of files associated with tumor tissue analysis: 972


In [19]:
# Count the number of files associated with normal tissue analysis
total_files = df_files_of_interest['is_normal_file_of_interest'].sum()
print(f'Number of files associated with paired normal tissue analysis: {total_files}')

Number of files associated with paired normal tissue analysis: 112


In [20]:
# Count the number of data types associated with files
df_files_of_interest \
    .groupby([
        'experimental_strategy', 'access', 'data_type', 'data_format',
    ]) \
    .agg(files = pd.NamedAgg(column='file_id', aggfunc='nunique')) \
    .sort_values(by=['experimental_strategy', 'access', 'files'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,files
experimental_strategy,access,data_type,data_format,Unnamed: 4_level_1
RNA-Seq,open,Gene Expression Quantification,TSV,542
miRNA-Seq,open,Isoform Expression Quantification,TXT,542
