# **Exploratory Data Analysis of TCGA-BRCA Project**
TCGA: The Cancer Genome Atlas

# Importing Libraries

In [1]:
import pandas as pd

# Constants and Paths

In [2]:
# Relative path to the folder holding the interim TCGA-BRCA data; it is used
# as the prefix of the CSV paths read in the data-loading cell
INTERIM_DATA_PATH = '../../data/interim/tcga-brca'

# Loading and Preparing Data

In [3]:
# Read the four interim CSV files in one pass. The targets on the left are
# listed in the same order as the file names on the right:
#   df_tumor_cases  -> cases of interest with tumor tissue analysis
#   df_normal_cases -> cases of interest with normal tissue analysis
#   df_tumor_files  -> files of interest tied to tumor tissue analysis
#   df_normal_files -> files of interest tied to normal tissue analysis
(
    df_tumor_cases,
    df_normal_cases,
    df_tumor_files,
    df_normal_files,
) = (
    pd.read_csv(f'{INTERIM_DATA_PATH}/{csv_name}')
    for csv_name in (
        'brca-api-cases-tumor-analysis.csv',
        'brca-api-cases-normal-analysis.csv',
        'brca-api-files-tumor-analysis.csv',
        'brca-api-files-normal-analysis.csv',
    )
)

In [4]:
# Tumor tissue: each case row paired with every file row sharing its
# `case_id`. The inner join discards cases without files and files without
# cases. The dropped columns are not needed in the joined table
df_tumor_data = (
    df_tumor_cases
    .merge(right=df_tumor_files, on='case_id', how='inner')
    .drop(columns=['files', 'tissue_type', 'sample_type'])
)

# Normal tissue: same join and column clean-up as for the tumor data
df_normal_data = (
    df_normal_cases
    .merge(right=df_normal_files, on='case_id', how='inner')
    .drop(columns=['files', 'tissue_type', 'sample_type'])
)

# All BRCA cases. A case may appear in both the tumor and the normal table,
# so only the first occurrence of each `case_id` is kept
df_cases = (
    pd.concat([df_tumor_cases, df_normal_cases])
    .drop_duplicates('case_id')
    .reset_index(drop=True)
)

# All BRCA files. A single case owns several files (e.g. one per
# experimental strategy, and separate tumor/normal files), so uniqueness
# must be enforced on `file_id`; deduplicating on `case_id` would silently
# discard every file of a case except the first one
df_files = (
    pd.concat([df_tumor_files, df_normal_files])
    .drop_duplicates('file_id')
    .reset_index(drop=True)
)

# Tumor Tissue

In [5]:
# Report how many rows the tumor case and tumor file tables contain,
# one figure per line
print(
    f'Total number of cases: {len(df_tumor_cases)}',
    f'Total number of files: {len(df_tumor_files)}',
    sep='\n'
)

Total number of cases: 495
Total number of files: 990


## Cases

In [6]:
# Number of distinct tumor cases per disease type, largest group first
(
    df_tumor_cases
    .groupby('disease_type')
    .agg(case_count=('case_id', 'nunique'))
    .sort_values(by='case_count', ascending=False)
)

Unnamed: 0_level_0,case_count
disease_type,Unnamed: 1_level_1
Ductal and Lobular Neoplasms,486
Complex Epithelial Neoplasms,2
"Cystic, Mucinous and Serous Neoplasms",2
"Epithelial Neoplasms, NOS",2
Adenomas and Adenocarcinomas,1
Adnexal and Skin Appendage Neoplasms,1
Squamous Cell Neoplasms,1


In [7]:
# Number of distinct tumor cases per PAM50 molecular subtype,
# largest group first
(
    df_tumor_cases
    .groupby('pam50_mrna')
    .agg(case_count=('case_id', 'nunique'))
    .sort_values(by='case_count', ascending=False)
)

Unnamed: 0_level_0,case_count
pam50_mrna,Unnamed: 1_level_1
Luminal A,226
Luminal B,122
Basal-like,90
HER2-enriched,57


In [8]:
# Number of distinct tumor cases per (molecular subtype, disease type)
# combination, largest group first
(
    df_tumor_cases
    .groupby(['pam50_mrna', 'disease_type'])
    .agg(case_count=('case_id', 'nunique'))
    .sort_values(by='case_count', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,case_count
pam50_mrna,disease_type,Unnamed: 2_level_1
Luminal A,Ductal and Lobular Neoplasms,223
Luminal B,Ductal and Lobular Neoplasms,120
Basal-like,Ductal and Lobular Neoplasms,87
HER2-enriched,Ductal and Lobular Neoplasms,56
Basal-like,Complex Epithelial Neoplasms,2
Luminal B,"Cystic, Mucinous and Serous Neoplasms",2
Basal-like,"Epithelial Neoplasms, NOS",1
HER2-enriched,Adnexal and Skin Appendage Neoplasms,1
Luminal A,Adenomas and Adenocarcinomas,1
Luminal A,"Epithelial Neoplasms, NOS",1


## Files

In [9]:
# Number of distinct tumor files per (experimental strategy, data type,
# data format) combination, largest group first
(
    df_tumor_files
    .groupby(['experimental_strategy', 'data_type', 'data_format'])
    .agg(file_count=('file_id', 'nunique'))
    .sort_values(by='file_count', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,file_count
experimental_strategy,data_type,data_format,Unnamed: 3_level_1
RNA-Seq,Gene Expression Quantification,TSV,495
miRNA-Seq,miRNA Expression Quantification,TXT,495


# Normal Tissue

## Cases

In [10]:
# Number of distinct normal-tissue cases per disease type,
# largest group first
(
    df_normal_cases
    .groupby('disease_type')
    .agg(case_count=('case_id', 'nunique'))
    .sort_values(by='case_count', ascending=False)
)

Unnamed: 0_level_0,case_count
disease_type,Unnamed: 1_level_1
Ductal and Lobular Neoplasms,56
Complex Epithelial Neoplasms,1


In [11]:
# Number of distinct normal-tissue cases per PAM50 molecular subtype,
# largest group first
(
    df_normal_cases
    .groupby('pam50_mrna')
    .agg(case_count=('case_id', 'nunique'))
    .sort_values(by='case_count', ascending=False)
)

Unnamed: 0_level_0,case_count
pam50_mrna,Unnamed: 1_level_1
Luminal A,29
Luminal B,14
Basal-like,9
HER2-enriched,5


In [12]:
# Number of distinct normal-tissue cases per (molecular subtype,
# disease type) combination, largest group first
(
    df_normal_cases
    .groupby(['pam50_mrna', 'disease_type'])
    .agg(case_count=('case_id', 'nunique'))
    .sort_values(by='case_count', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,case_count
pam50_mrna,disease_type,Unnamed: 2_level_1
Luminal A,Ductal and Lobular Neoplasms,29
Luminal B,Ductal and Lobular Neoplasms,14
Basal-like,Ductal and Lobular Neoplasms,8
HER2-enriched,Ductal and Lobular Neoplasms,5
Basal-like,Complex Epithelial Neoplasms,1


## Files

In [13]:
# Number of distinct normal-tissue files per (experimental strategy,
# data type, data format) combination, largest group first
(
    df_normal_files
    .groupby(['experimental_strategy', 'data_type', 'data_format'])
    .agg(file_count=('file_id', 'nunique'))
    .sort_values(by='file_count', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,file_count
experimental_strategy,data_type,data_format,Unnamed: 3_level_1
RNA-Seq,Gene Expression Quantification,TSV,57
miRNA-Seq,miRNA Expression Quantification,TXT,57
