# **Processing Files of Interest from TCGA-BRCA Project**
TCGA: The Cancer Genome Atlas  
BRCA: Breast Invasive Carcinoma

# Importing Libraries and Configurations

In [None]:
import os
import sys

import numpy as np
import pandas as pd

# Get the project root (two levels above)
sys.path.append(os.path.abspath(os.path.join('..', '..')))

from config import (
    PROCESSED_DATA_DIR,
    BRCA_RAW_FILES_DIRS,  
)

# Functions

In [None]:
def mirna_seq_files_processing(path):
    # List the miRNA-Seq files contained in the folder
    files = [f for f in os.listdir(path) if f.startswith('mirna-seq_')]

    # Retrieve the file id of the first file
    file_id = files[0].replace('mirna-seq_', '')
    file_id = file_id.replace('.txt', '')

    # Initialize DataFrames with raw and normalized reads of the first file
    df_mir_reads = pd.read_csv(f'{path}/{files[0]}', sep='\t')
    
    # columns = ['genome_assembly', 'chromosome', 'genomic_coordinates', 'strand']
    # df_mir_reads[columns] =  \
    #     df_mir_reads['isoform_coords'].str.split(':', 3, expand=True)
    
    df_mir_reads[['miRNA_region_name', 'miRNA_region_id']] = \
        df_mir_reads['miRNA_region'].str.split(',', 1, expand=True)
        
    df_mir_reads = df_mir_reads \
        .groupby('miRNA_region_id') \
        .agg(
            raw_reads = pd.NamedAgg(column='read_count', aggfunc='sum'),
            norm_reads = pd.NamedAgg(column='reads_per_million_miRNA_mapped', aggfunc='sum')
        ) \
        .reset_index()
        
    df_mir_raw_reads = df_mir_reads[['miRNA_region_id', 'raw_reads']] \
        .rename(columns={'raw_reads': file_id})
        
    df_mir_norm_reads = df_mir_reads[['miRNA_region_id', 'norm_reads']] \
        .rename(columns={'norm_reads': file_id})
    
    # df_mir_raw_reads = df_mir_raw_reads \
    #     [['miRNA_ID', 'read_count']] \
    #     .rename(columns={'read_count': file_id})
        
    # Transform each file read count into a new column of the DataFrame
    for file in files[1:]:
        # Retrieve the file id
        file_id = file.replace('mirna-seq_', '')
        file_id = file_id.replace('.txt', '')

        # Read the file and rename the read count column
        df_temp = pd.read_csv(f'{path}/{file}', sep='\t')
        df_temp = df_temp \
            [['miRNA_ID', 'read_count']] \
            .rename(columns={'read_count': file_id})

        # Add the file read count as a new DataFrame column
        df_mir_raw_reads = df_mir_raw_reads \
            .merge(
                right=df_temp,
                left_on='miRNA_ID',
                right_on='miRNA_ID',
                how='outer'
            )
    
    return df_mir_raw_reads, df_mir_norm_reads

In [3]:
def rna_seq_files_processing(path):
    # List the RNA-Seq files contained in the folder
    files = [f for f in os.listdir(path) if f.startswith('rna-seq_')]

    # Retrieve the file id of the first file
    file_id = files[0].replace('rna-seq_', '')
    file_id = file_id.replace('.tsv', '')

    # Initialize DataFrames with raw and normalized reads of the first file
    df_mrna_raw_reads = pd.read_csv(f'{path}/{files[0]}', sep='\t', skiprows=1)
    df_mrna_raw_reads = df_mrna_raw_reads \
        .query('gene_type == "protein_coding"') \
        .reset_index(drop=True)
    
    df_mrna_norm_reads = df_mrna_raw_reads \
        [['gene_id', 'gene_name', 'tpm_unstranded']] \
        .rename(columns={'tpm_unstranded': file_id})
    
    df_mrna_raw_reads = df_mrna_raw_reads \
        [['gene_id', 'gene_name', 'unstranded']] \
        .rename(columns={'unstranded': file_id})
    
    # Transform each file read count into a new column of the DataFrame
    for file in files[1:]:
        # Retrieve the file id
        file_id = file.replace('rna-seq_', '')
        file_id = file_id.replace('.tsv', '')

        # Read the file and rename the read count column
        df_temp_raw_reads = pd.read_csv(f'{path}/{file}', sep='\t', skiprows=1)
        df_temp_raw_reads = df_temp_raw_reads \
            .query('gene_type == "protein_coding"')
            
        df_temp_norm_reads = df_temp_raw_reads \
            [['gene_id', 'tpm_unstranded']] \
            .rename(columns={'tpm_unstranded': file_id})
    
        df_temp_raw_reads = df_temp_raw_reads \
            [['gene_id', 'unstranded']] \
            .rename(columns={'unstranded': file_id})

        # Add the raw read count as a new DataFrame column
        df_mrna_raw_reads = df_mrna_raw_reads \
            .merge(
                right=df_temp_raw_reads,
                left_on='gene_id',
                right_on='gene_id',
                how='inner'
            )
            
        # Add the file read count as a new DataFrame column
        df_mrna_norm_reads = df_mrna_norm_reads \
            .merge(
                right=df_temp_norm_reads,
                left_on='gene_id',
                right_on='gene_id',
                how='inner'
            )
        
    return df_mrna_raw_reads, df_mrna_norm_reads

In [4]:
def files_processing(files_path, files_prefix):
    # Process the miRNA-Seq files related to the cohort of interest
    df_mir_reads = mirna_seq_files_processing(files_path)

    # Store the DataFrames of the processed miRNA-Seq files into CSV files
    file_name = f'{files_prefix}-mirna-reads.csv'
    df_mir_reads.to_csv(f'{PROCESSED_DATA_DIR}/{file_name}', index=False)
    
    # Process the RNA-Seq files related to the cohort of interest
    df_rna_reads = rna_seq_files_processing(files_path)

    # Store the DataFrames of the processed RNA-Seq files into CSV files
    file_name = f'{files_prefix}-rna-reads.csv'
    df_rna_reads.to_csv(f'{PROCESSED_DATA_DIR}/{file_name}', index=False)
    
    return df_mir_reads, df_rna_reads

# Tumor Tissue Analysis Files

## Basal-like

In [10]:
df_raw = mirna_seq_files_processing(BRCA_RAW_FILES_DIRS['basal'])

In [11]:
df_raw

Unnamed: 0,miRNA_region_id,raw_reads,norm_reads
0,MIMAT0000062,15691,15553.368251
1,MIMAT0000063,23004,22802.223128
2,MIMAT0000064,2375,2354.167968
3,MIMAT0000065,242,239.877328
4,MIMAT0000066,798,791.000438
...,...,...,...
636,MIMAT0031177,3,2.973686
637,MIMAT0031890,1,0.991229
638,MIMAT0031893,3,2.973687
639,MIMAT0032026,1,0.991229


In [None]:
df_raw, df_norm = rna_seq_files_processing(BRCA_RAW_FILES_DIRS['basal'])

In [None]:
df_raw

In [None]:
df_norm

In [None]:
# Process Basal-like related miRNA-Seq and RNA-Seq files
df_mir_basal_reads, df_rna_basal_reads = files_processing(
    files_path=BRCA_RAW_FILES_DIRS['basal'], files_prefix='basal-like'
)

In [None]:
# Print the DataFrame of processed Basal-like microRNA reads
df_mir_basal_reads

In [None]:
# Print the DataFrame of processed Basal-like gene reads
df_rna_basal_reads

## HER2-enriched

In [None]:
# Process HER2-enriched related miRNA-Seq and RNA-Seq files
df_mir_her2_reads, df_rna_her2_reads = files_processing(
    files_path=BRCA_RAW_FILES_DIRS['her2'], files_prefix='her2-enriched'
)

In [None]:
# Print the DataFrame of processed HER2-enriched microRNA reads
df_mir_her2_reads

In [None]:
# Print the DataFrame of processed HER2-enriched gene reads
df_rna_her2_reads

## Luminal A

In [None]:
# Process Luminal A related miRNA-Seq and RNA-Seq files
df_mir_luma_reads, df_rna_luma_reads = files_processing(
    files_path=BRCA_RAW_FILES_DIRS['lum_a'], files_prefix='luminal-a'
)

In [None]:
# Print the DataFrame of processed Luminal A microRNA reads
df_mir_luma_reads

In [None]:
# Print the DataFrame of processed Luminal A gene reads
df_rna_luma_reads

## Luminal B

In [None]:
# Process Luminal B related miRNA-Seq and RNA-Seq files
df_mir_lumb_reads, df_rna_lumb_reads = files_processing(
    files_path=BRCA_RAW_FILES_DIRS['lum_b'], files_prefix='luminal-b'
)

In [None]:
# Print the DataFrame of processed Luminal A microRNA reads
df_mir_luma_reads

In [None]:
# Print the DataFrame of processed Luminal B gene reads
df_rna_lumb_reads

# Normal Tissue Analysis Files

In [None]:
# Process normal tissue related miRNA-Seq and RNA-Seq files
df_mir_normal_reads, df_rna_normal_reads = files_processing(
    files_path=BRCA_RAW_FILES_DIRS['normal'], files_prefix='normal-tissue'
)

In [None]:
# Print the DataFrame of processed normal tissue microRNA reads
df_mir_normal_reads

In [None]:
# Print the DataFrame of processed normal tissue gene reads
df_rna_normal_reads