# **Retrieval and Aggregation of Expressions from the TCGA-BRCA Files of Interest**

This notebook produces the AT_EF family of artifacts 

- TCGA: The Cancer Genome Atlas  
- BRCA: Breast Invasive Carcinoma

# Import Libraries and Configurations

In [1]:
import logging
import os
import sys
from time import sleep

import numpy as np
import pandas as pd
import requests

# Configure basic logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s')

# Add project root to Python's path
sys.path.append(os.path.abspath(os.path.join('..')))

from config import (
    EXPRESSION_FILES,
    EXPRESSION_RETRIEVAL_AND_AGGREGATION_SETUP,
    GDC_API_ENDPOINTS,
    TCGA_DATA_DIRS,
    TCGA_FILES,
)

# Expression Retrieval

## Function

In [2]:
def gdc_api_file_download(df_files, group, retries=5, delay=5):
    """
    Download files related to a specified group from the GDC (Genomic Data Commons) 
    API with retry logic.

    This function filters a DataFrame of file metadata for a given group, constructs
    appropriate file names and paths, and downloads each file from the GDC API data
    endpoint. It includes retry logic to handle transient download or write errors.

    Parameters
    ----------
    df_files : pandas.DataFrame
        DataFrame containing file metadata with columns:
        'file_id', 'experimental_strategy', 'data_format', and 'group'.
    group : str
        The group name to filter files by (e.g., 'Luminal B').
    retries : int, optional
        Number of attempts to retry downloading a file on failure. Default is 5.
    delay : int or float, optional
        Delay in seconds between retry attempts. Default is 5.

    Returns
    -------
    None
        Files are saved directly to disk in the directory specified by
        `TCGA_DATA_DIRS['raw'][group_dir]`.

    Raises
    ------
    Logs errors related to download or file write failures but does not raise exceptions.

    Examples
    --------
    >>> gdc_api_file_download(df_files, group='Basal-like')
    # Downloads and saves files belonging to the 'Basal-like' group.
    """
    # Select all files related to the group
    files = df_files \
        .query(f'group == "{group}"') \
        .to_dict(orient='records')

    # Raw directory path to write the files
    group_dir = (group.lower()).replace(' ', '-')
    dir_path = TCGA_DATA_DIRS['raw'][group_dir]

    # Download each file of interest
    for file in files:
        # Define all file metadata
        file_id = file['file_id']
        file_type = file['experimental_strategy'].lower()
        file_format = file['data_format'].lower()
        file_name = f'{file_type}_{file_id}.{file_format}'
        file_path = os.path.join(dir_path, file_name)

        # Retry logic
        for attempt in range(retries):
            try:
                # Request the file donwload
                logging.info(f'Downloading file {file_id} [attempt {attempt + 1}]...')
                response = requests.get(
                    url=os.path.join(GDC_API_ENDPOINTS['data'], file_id),
                    headers={'Content-Type': 'application/json'},
                    timeout=30
                )
                response.raise_for_status()

                # Write the file in the group raw data directory
                with open(file_path, 'wb') as output_file:
                    output_file.write(response.content)

                # Exit retry loop on success
                logging.info(f'Downloaded and saved to {file_path}')
                break

            except requests.exceptions.RequestException as req_err:
                logging.error(f'Request failed: {req_err}')
            except OSError as os_err:
                logging.error(f'File write failed: {os_err}')
                break
            except Exception as e:
                logging.error(f'Unexpected error: {e}')

            if attempt < retries - 1:
                logging.info(f'Retrying in {delay} seconds...')
                sleep(delay)
            else:
                logging.warning(f'Failed to download the file after {retries} attempts')

## Load and Prepare Metadata

In [3]:
# Create a DataFrame with the cases of interest
dir_path = TCGA_DATA_DIRS['processed']['root']
file_name = TCGA_FILES['cases']
df_cases = pd.read_csv(os.path.join(dir_path, file_name)) \
    .query('is_case_of_interest == 1')
    
# Create a DataFrame with the files of interest
dir_path = TCGA_DATA_DIRS['processed']['root']
file_name = TCGA_FILES['files']
df_files = pd.read_csv(os.path.join(dir_path, file_name)) \
    .query('is_file_of_interest == 1')

# Create a DataFrame with files and cases of interest
df_files_with_group = df_files \
    .merge(
        right=df_cases,
        left_on='case_id',
        right_on='case_id',
        how='inner',
    )

# Characterizes the group to which the file belongs
df_files_with_group['group'] = np.where(
    df_files_with_group['is_tumor_file_of_interest'] == 1,
    df_files_with_group['pam50_mrna'],
    'Paired Normal'
)

# Drop unnecessary columns
df_files_with_group = df_files_with_group \
    [['file_id', 'experimental_strategy', 'data_format', 'group']]

In [4]:
# Print the DataFrame that associates each file with its group
df_files_with_group

Unnamed: 0,file_id,experimental_strategy,data_format,group
0,bc98d560-25d3-4e7c-8212-00d1f5148dde,miRNA-Seq,TXT,Luminal B
1,2ebd1a3f-a299-492c-96ba-6733293d1ba4,miRNA-Seq,TXT,Luminal A
2,d77f7eb5-1943-4147-8a02-c33289a8ed75,miRNA-Seq,TXT,Luminal B
3,fb32f0b1-4fb7-43d5-8091-dc13a1f6d9e8,RNA-Seq,TSV,Luminal B
4,c7637c63-cfdc-448d-b68e-bbf08c8382be,miRNA-Seq,TXT,Luminal A
...,...,...,...,...
1079,297b8ef7-71ca-4ea0-9315-da31f7f8914e,miRNA-Seq,TXT,Luminal A
1080,f1934efb-1c8a-4c72-ba94-d263ef0d0cd4,RNA-Seq,TSV,Luminal B
1081,fd94a0ed-37d8-49a5-af96-e2160a9e6096,RNA-Seq,TSV,Paired Normal
1082,6e7cf482-6c96-41ea-877e-27761eef46ac,miRNA-Seq,TXT,Luminal A


## Basal-like Files

In [5]:
# Download files related to the basal-like group
gdc_api_file_download(df_files_with_group, 'Basal-like')

INFO:Downloading file d8f4fee7-bb7b-48d1-8d15-7ee33cc5f339 [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/basal-like/mirna-seq_d8f4fee7-bb7b-48d1-8d15-7ee33cc5f339.txt
INFO:Downloading file 0ed77a2e-a2c9-4557-8ba3-2c8bade9b792 [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/basal-like/rna-seq_0ed77a2e-a2c9-4557-8ba3-2c8bade9b792.tsv
INFO:Downloading file 4d85e3a8-263a-4f3a-8b40-0aa1f20944d3 [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/basal-like/mirna-seq_4d85e3a8-263a-4f3a-8b40-0aa1f20944d3.txt
INFO:Downloading file d0f845cb-f147-4d1b-9759-b842c3894b95 [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/basal-like/mirna-seq_d0f845cb-f147-4d1b-9759-b842c3894b95.txt
INFO:Downloading file d9ea5bde-aa1a-4210-976f-b2f67c6d0191 [attempt 1]

## HER2-enriched Files

In [6]:
# Download files related to the HER2-enriched group
gdc_api_file_download(df_files_with_group, 'HER2-enriched')

INFO:Downloading file a58e2ae3-a236-4209-8292-70465216cb85 [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/her2-enriched/rna-seq_a58e2ae3-a236-4209-8292-70465216cb85.tsv
INFO:Downloading file 09a491b5-1bfb-47d9-963f-69f6c7e6c1e5 [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/her2-enriched/rna-seq_09a491b5-1bfb-47d9-963f-69f6c7e6c1e5.tsv
INFO:Downloading file be8e6554-07ae-4ec2-8be0-8598556467c9 [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/her2-enriched/mirna-seq_be8e6554-07ae-4ec2-8be0-8598556467c9.txt
INFO:Downloading file fb397586-1b5b-40e0-a885-8663f9c76f6d [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/her2-enriched/rna-seq_fb397586-1b5b-40e0-a885-8663f9c76f6d.tsv
INFO:Downloading file ed76d786-d744-4067-b288-5fe2dc06a310 [at

## Luminal A Files

In [7]:
# Download files related to the luminal A group
gdc_api_file_download(df_files_with_group, 'Luminal A')

INFO:Downloading file 2ebd1a3f-a299-492c-96ba-6733293d1ba4 [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-a/mirna-seq_2ebd1a3f-a299-492c-96ba-6733293d1ba4.txt
INFO:Downloading file c7637c63-cfdc-448d-b68e-bbf08c8382be [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-a/mirna-seq_c7637c63-cfdc-448d-b68e-bbf08c8382be.txt
INFO:Downloading file ecf9996e-0cba-4394-9f5a-51fceab1ac56 [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-a/rna-seq_ecf9996e-0cba-4394-9f5a-51fceab1ac56.tsv
INFO:Downloading file 36b1e97e-353e-4a4d-8a27-04f7ebabc78c [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-a/mirna-seq_36b1e97e-353e-4a4d-8a27-04f7ebabc78c.txt
INFO:Downloading file c4afcca9-94e4-4ecd-98d1-0aab86de8685 [attempt 1]...


## Luminal B Files

In [8]:
# Download files related to the luminal B group
gdc_api_file_download(df_files_with_group, 'Luminal B')

INFO:Downloading file bc98d560-25d3-4e7c-8212-00d1f5148dde [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-b/mirna-seq_bc98d560-25d3-4e7c-8212-00d1f5148dde.txt
INFO:Downloading file d77f7eb5-1943-4147-8a02-c33289a8ed75 [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-b/mirna-seq_d77f7eb5-1943-4147-8a02-c33289a8ed75.txt
INFO:Downloading file fb32f0b1-4fb7-43d5-8091-dc13a1f6d9e8 [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-b/rna-seq_fb32f0b1-4fb7-43d5-8091-dc13a1f6d9e8.tsv
INFO:Downloading file cf5ab327-47ee-40eb-a06c-3a8fef2a1ebb [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-b/mirna-seq_cf5ab327-47ee-40eb-a06c-3a8fef2a1ebb.txt
INFO:Downloading file 50c0b143-29b8-4060-a63e-d263ae52b029 [attempt 1]...


## Paired Normal Files

In [9]:
# Download files related to the paired normal group
gdc_api_file_download(df_files_with_group, 'Paired Normal')

INFO:Downloading file 456bc30b-59f8-4427-b798-5b113ca635a0 [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/paired-normal/rna-seq_456bc30b-59f8-4427-b798-5b113ca635a0.tsv
INFO:Downloading file 7e17b2bf-91b2-4ea2-acc1-59e1537913ce [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/paired-normal/mirna-seq_7e17b2bf-91b2-4ea2-acc1-59e1537913ce.txt
INFO:Downloading file c6a4afd8-8044-475f-b4fd-a1b4cb922976 [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/paired-normal/rna-seq_c6a4afd8-8044-475f-b4fd-a1b4cb922976.tsv
INFO:Downloading file 12c76136-cc83-42c0-886b-4b4ea167a0d7 [attempt 1]...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/paired-normal/mirna-seq_12c76136-cc83-42c0-886b-4b4ea167a0d7.txt
INFO:Downloading file 75182885-7501-49b1-bb0d-8a88da1080a3 [

# Expression Aggregation

## Functions

In [10]:
def process_mir_seq_files(group):
    """
    Process and aggregate TCGA miRNA-Seq files for a given sample group.

    This function reads raw miRNA-Seq files for the specified group, processes
    the data by extracting genomic and microRNA metadata, flags microRNAs of
    interest, and stores the processed files. It also aggregates raw and normalized
    read counts per accession ID across all samples in the group, storing the results
    as CSV files in the interim directory.

    Parameters
    ----------
    group : str
        The name of the sample group to process. The name is converted to
        lowercase and spaces are replaced with hyphens to match directory naming.

    Returns
    -------
    dict
        A dictionary containing:
        - 'sample_reads' : pandas.DataFrame
            The processed miRNA-Seq DataFrame for the last file processed.
        - 'agg_raw_reads' : pandas.DataFrame
            Aggregated raw read counts for microRNAs of interest, indexed by
            accession ID and with one column per sample.
        - 'agg_norm_reads' : pandas.DataFrame
            Aggregated normalized read counts (reads per million) for microRNAs
            of interest, indexed by accession ID and with one column per sample.

    Notes
    -----
    - The function assumes the existence of three directories for each group:
      raw, interim, and processed, as specified in the global
      ``TCGA_DATA_DIRS`` dictionary.
    - Only files starting with ``mirna-seq_`` in the raw directory are processed.
    - miRNAs of interest are defined as those with a non-null accession ID
      (MIMAT ID) in the ``miRNA_region`` field.
    - The aggregation step sums read counts and normalized counts across all
      isoforms for each accession ID per sample.

    Examples
    --------
    >>> results = process_mir_seq_files('HER2-enriched')
    >>> results['mir_reads'].head()
          mirna_id genome_assembly chromosome position_range strand  ...
    >>> results['agg_raw_reads'].shape
    (2000, 11)
    >>> results['agg_norm_reads'].iloc[:5, :5]
      accession_id  sample1  sample2  sample3  sample4
    MIMAT0000062       1200     1300     1250     1400
    """
    # Directory paths to read and write the files
    group_dir = (group.lower()).replace(' ', '-')
    raw_dir_path = TCGA_DATA_DIRS['raw'][group_dir]
    interim_dir_path = TCGA_DATA_DIRS['interim'][group_dir]
    processed_dir_path = TCGA_DATA_DIRS['processed'][group_dir]
    
    # List the miRNA-Seq files contained in the raw data directory
    file_names = [f for f in os.listdir(raw_dir_path) if f.startswith('mirna-seq_')]
    
    # Initialize the DataFrames of aggregated reads
    primary_key = 'accession_id'
    df_agg_raw_reads = pd.DataFrame(columns=[primary_key])
    df_agg_norm_reads = pd.DataFrame(columns=[primary_key])
    
    # Process miRNA-Seq files individually
    for file_name in file_names:
        # Retrieve the file id
        file_id = file_name.replace('mirna-seq_', '')
        file_id = file_id.replace('.txt', '')
        
        # Create a DataFrame for the file
        df_reads = pd.read_csv(
            os.path.join(raw_dir_path, file_name), sep='\t'
        )
        
        # Split the isoform coordinates column into four other columns
        df_reads[['genome_assembly', 'chromosome', 'position_range', 'strand']] = \
            df_reads['isoform_coords'].str.split(pat=':', n=-1, expand=True)

        # Split the microRNA region column into two other columns
        df_reads[['region_type', 'accession_id']] = \
            df_reads['miRNA_region'].str.split(pat=',', n=-1, expand=True)
        
        # Flag the microRNAs of interest, i.e. those with isoforms associated with
        # a specific accession identifier (MIMAT ID)
        df_reads['is_mirna_of_interest'] = np.where(
            df_reads['accession_id'].notna(), 1, 0
        )
        
        # Rearrange the DataFrame columns
        df_reads = df_reads \
            .rename(columns={
                'cross-mapped': 'cross_mapped',
                'miRNA_ID': 'mirna_id',
                'reads_per_million_miRNA_mapped': 'reads_per_million',
            }) \
            [[
                'mirna_id',
                'genome_assembly',
                'chromosome',
                'position_range',
                'strand',
                'cross_mapped',
                'region_type',
                'accession_id',
                'read_count',
                'reads_per_million',
                'is_mirna_of_interest',
            ]]
        
        # Store the DataFrame of the processed file into a CSV file
        df_reads.to_csv(os.path.join(processed_dir_path, file_name), index=False)
        
        # Aggregate reads according to the accession ID of the microRNAs of interest
        df_agg_reads = df_reads \
            .query('is_mirna_of_interest == 1') \
            .groupby(primary_key) \
            .agg(
                raw_reads = pd.NamedAgg(column='read_count', aggfunc='sum'),
                normalized_reads = pd.NamedAgg(column='reads_per_million', aggfunc='sum'),
            ) \
            .reset_index()
        
        # Merge the aggregated raw reads into the respective DataFrame
        df_agg_raw_reads = df_agg_reads \
            .drop(columns=['normalized_reads']) \
            .rename(columns={'raw_reads': file_id}) \
            .merge(
                right=df_agg_raw_reads,
                left_on=primary_key,
                right_on=primary_key,
                how='outer',
            )
            
        # Merge the aggregated normalized reads into the respective DataFrame
        df_agg_norm_reads = df_agg_reads \
            .drop(columns=['raw_reads']) \
            .rename(columns={'normalized_reads': file_id}) \
            .merge(
                right=df_agg_norm_reads,
                left_on=primary_key,
                right_on=primary_key,
                how='outer',
            )
    
    # Fill aggregated NaN reads with zero
    df_agg_raw_reads = df_agg_raw_reads.fillna(0)
    df_agg_norm_reads = df_agg_norm_reads.fillna(0)
    
    # Convert sample columns from aggregated raw reads to integers
    sample_cols = df_agg_raw_reads.select_dtypes(include=['number']).columns
    df_agg_raw_reads[sample_cols] = df_agg_raw_reads[sample_cols].astype(int)
    
    # Store the DataFrames of aggregated reads in CSV files
    df_agg_raw_reads.to_csv(
        os.path.join(interim_dir_path, EXPRESSION_FILES['agg-mirs-raw-reads']),
        index=False
    )
    df_agg_norm_reads.to_csv(
        os.path.join(interim_dir_path, EXPRESSION_FILES['agg-mirs-norm-reads']),
        index=False
    )
    
    # Return a sample of the processing and the aggregated reads DataFrames
    return {
        'sample_reads': df_reads,
        'agg_raw_reads': df_agg_raw_reads,
        'agg_norm_reads': df_agg_norm_reads,
    }

In [11]:
def process_rna_seq_files(group):
    """
    Process and aggregate TCGA RNA-Seq files for a given sample group.

    This function reads raw RNA-Seq files for the specified group, processes
    the data by filtering for protein-coding genes, and stores the processed
    files. It also aggregates raw and normalized read counts per gene across
    all samples in the group, storing the results as CSV files in the interim
    directory.

    Parameters
    ----------
    group : str
        The name of the sample group to process. The name is converted to
        lowercase and spaces are replaced with hyphens to match directory naming.

    Returns
    -------
    dict
        A dictionary containing:
        - 'sample_reads' : pandas.DataFrame
            The processed RNA-Seq DataFrame for the last file processed.
        - 'agg_raw_reads' : pandas.DataFrame
            Aggregated raw read counts for protein-coding genes, indexed by
            ``gene_id`` and ``gene_name``, with one column per sample.
        - 'agg_norm_reads' : pandas.DataFrame
            Aggregated normalized read counts for protein-coding genes, indexed by
            ``gene_id`` and ``gene_name``, with one column per sample.

    Notes
    -----
    - The function assumes the existence of three directories for each group:
      raw, interim, and processed, as specified in the global
      ``TCGA_DATA_DIRS`` dictionary.
    - Only files starting with ``rna-seq_`` in the raw directory are processed.
    - Protein-coding genes are identified using the ``gene_type`` column.
    - The raw and normalized read count columns are defined in the global
      ``EXPRESSION_RETRIEVAL_AND_AGGREGATION_SETUP`` dictionary.

    Examples
    --------
    >>> results = process_rna_seq_files('HER2-enriched')
    >>> results['sample_reads'].head()
          gene_id    gene_name unstranded  tpm
    >>> results['agg_raw_reads'].shape
    (25000, 11)
    >>> results['agg_norm_reads'].iloc[:5, :5]
         gene_id   gene_name   sample1   sample2   sample3
    ENSG000001  BRCA1         1500      1600      1580
    """
    # Directory paths to read and write the files
    group_dir = (group.lower()).replace(' ', '-')
    raw_dir_path = TCGA_DATA_DIRS['raw'][group_dir]
    interim_dir_path = TCGA_DATA_DIRS['interim'][group_dir]
    processed_dir_path = TCGA_DATA_DIRS['processed'][group_dir]

    # List the RNA-Seq files contained in the raw data directory
    file_names = [f for f in os.listdir(raw_dir_path) if f.startswith('rna-seq_')]

    # Initialize the DataFrames of aggregated reads
    primary_key = ['gene_id', 'gene_name']
    df_agg_raw_reads = pd.DataFrame(columns=primary_key)
    df_agg_norm_reads = pd.DataFrame(columns=primary_key)
    
    # Define the columns that will be used in the aggregation
    raw_reads_columns = \
        EXPRESSION_RETRIEVAL_AND_AGGREGATION_SETUP['rna-seq-raw-reads-column']
    norm_reads_columns = \
        EXPRESSION_RETRIEVAL_AND_AGGREGATION_SETUP['rna-seq-norm-reads-column']

    # Process RNA-Seq files individually 
    for file_name in file_names:
        # Retrieve the file id
        file_id = file_name.replace('rna-seq_', '')
        file_id = file_id.replace('.tsv', '')
        
        # Create a DataFrame for the file
        df_reads = pd.read_csv(
            os.path.join(raw_dir_path, file_name), sep='\t', skiprows=1
        )
        
        # Remove the initial count rows
        df_reads = df_reads.iloc[4:, :].reset_index(drop=True)
        
        # Flag the messenger RNAs of interest, i.e. protein coding genes
        df_reads['is_mrna_of_interest'] = np.where(
            df_reads['gene_type'] == 'protein_coding', 1, 0
        )
        
        # Store the DataFrame of the processed file into a CSV file
        df_reads.to_csv(os.path.join(processed_dir_path, file_name), index=False)
        
        # Filter the messenger RNA reads and their unstranded related counts
        df_mrna_reads = df_reads \
            .query('is_mrna_of_interest == 1') \
            [['gene_id', 'gene_name', raw_reads_columns, norm_reads_columns]]
        
        # Merge the aggregated raw reads into the respective DataFrame
        df_agg_raw_reads = df_mrna_reads \
            .drop(columns=[norm_reads_columns]) \
            .rename(columns={raw_reads_columns: file_id}) \
            .merge(
                right=df_agg_raw_reads,
                left_on=primary_key,
                right_on=primary_key,
                how='outer',
            )
        
        # Merge the aggregated normalized reads into the respective DataFrame
        df_agg_norm_reads = df_mrna_reads \
            .drop(columns=[raw_reads_columns]) \
            .rename(columns={norm_reads_columns: file_id}) \
            .merge(
                right=df_agg_norm_reads,
                left_on=primary_key,
                right_on=primary_key,
                how='outer',
            )
    
    # Store the DataFrames of aggregated reads in CSV files
    df_agg_raw_reads.to_csv(
        os.path.join(interim_dir_path, EXPRESSION_FILES['agg-mrnas-raw-reads']),
        index=False
    )
    df_agg_norm_reads.to_csv(
        os.path.join(interim_dir_path, EXPRESSION_FILES['agg-mrnas-norm-reads']),
        index=False
    )
    
    # Return a sample of the processing and the aggregated reads DataFrames
    return {
        'sample_reads': df_reads,
        'agg_raw_reads': df_agg_raw_reads,
        'agg_norm_reads': df_agg_norm_reads,
    }

## Basal-like Files

In [12]:
# Process the files related to the basal-like group
mir_seq_artifacts = process_mir_seq_files('Basal-like')
rna_seq_artifacts = process_rna_seq_files('Basal-like')

### miRNA-Seq Artifacts

In [13]:
# Print the sample DataFrame from the individual miRNA-Seq file processing
mir_seq_artifacts['sample_reads']

Unnamed: 0,mirna_id,genome_assembly,chromosome,position_range,strand,cross_mapped,region_type,accession_id,read_count,reads_per_million,is_mirna_of_interest
0,hsa-let-7a-1,hg38,chr9,94175938-94175962,+,N,precursor,,1,0.991229,0
1,hsa-let-7a-1,hg38,chr9,94175961-94175983,+,N,mature,MIMAT0000062,2,1.982457,1
2,hsa-let-7a-1,hg38,chr9,94175961-94175984,+,N,mature,MIMAT0000062,2,1.982457,1
3,hsa-let-7a-1,hg38,chr9,94175962-94175981,+,N,mature,MIMAT0000062,23,22.798258,1
4,hsa-let-7a-1,hg38,chr9,94175962-94175982,+,N,mature,MIMAT0000062,1056,1046.737421,1
...,...,...,...,...,...,...,...,...,...,...,...
3726,hsa-mir-99b,hg38,chr19,51692656-51692675,+,N,mature,MIMAT0004678,29,28.745630,1
3727,hsa-mir-99b,hg38,chr19,51692656-51692676,+,N,mature,MIMAT0004678,26,25.771944,1
3728,hsa-mir-99b,hg38,chr19,51692656-51692677,+,N,mature,MIMAT0004678,10,9.912286,1
3729,hsa-mir-99b,hg38,chr19,51692656-51692678,+,N,mature,MIMAT0004678,64,63.438632,1


In [14]:
# Print the DataFrame of aggregated microRNA raw reads for this group
mir_seq_artifacts['agg_raw_reads']

Unnamed: 0,accession_id,d2d4e49b-9070-4123-a0a1-26c1ae412f90,f9b78f39-e96f-4ca4-9149-5c19c68b0d5e,9829840b-4803-4e25-8325-a74a2ca16dc2,60f0ab3d-cce7-44c8-b56e-dc96f5c44a9b,4af362e9-6809-4f30-a59c-49627ab987c2,2a86ab10-4cd3-47d3-8f09-f92165a6da71,f084f7fb-d15c-43bb-bb99-f056b771fa62,44877412-0b2c-4dbf-a414-94ef673887f6,9042f38b-948f-4f8d-a1e1-8f7050550500,...,a3ee5904-e876-4ee4-a9b0-ff5d0b3d275f,a473ce1d-15e1-4ca2-bcba-fa36367a7f0d,ae89d22f-6bff-493e-8ec8-59987ccc1785,d883c427-a678-4ff1-b7de-49ba0d305b2f,a9a3d49c-6d6d-4989-b3ec-c0679684621a,4cb946b2-4aec-4300-9c26-0fc36d0b95d0,3f7848d9-ee82-4648-92d8-e0188275abf4,f83b6eb6-444b-4fe0-9f53-3f152537500d,95434759-f95d-4e7a-bc24-c8d8a2dc6d2c,0961f723-1c0b-4a2a-8a86-861bf407085b
0,MIMAT0000062,15691,14840,28616,60281,33654,33329,46997,14565,15939,...,19906,157699,16914,26001,18853,20870,23339,33465,44783,132234
1,MIMAT0000063,23004,25232,18767,47459,52811,40587,29576,34535,14432,...,33636,92294,9100,28963,22620,49806,39282,39633,34232,123894
2,MIMAT0000064,2375,1524,6701,2512,1328,4111,1410,2129,6607,...,693,40294,1442,3964,2942,1062,3716,4763,3762,685
3,MIMAT0000065,242,420,969,1394,547,366,312,249,428,...,694,776,381,1112,751,534,506,700,602,1269
4,MIMAT0000066,798,894,1103,1891,1140,1033,1570,501,715,...,890,6701,1276,1445,2158,1027,695,1682,2045,4021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1914,MIMAT0031890,1,10,2,1,16,0,0,3,4,...,0,3,0,4,0,2,2,0,1,15
1915,MIMAT0031893,3,2,1,8,5,2,4,13,12,...,5,16,5,6,18,1,2,29,4,4
1916,MIMAT0032026,1,6,3,39,2,1,4,5,7,...,3,21,1,17,11,3,7,3,0,2
1917,MIMAT0032029,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Print the DataFrame of aggregated microRNA normalized reads for this group
mir_seq_artifacts['agg_norm_reads']

Unnamed: 0,accession_id,d2d4e49b-9070-4123-a0a1-26c1ae412f90,f9b78f39-e96f-4ca4-9149-5c19c68b0d5e,9829840b-4803-4e25-8325-a74a2ca16dc2,60f0ab3d-cce7-44c8-b56e-dc96f5c44a9b,4af362e9-6809-4f30-a59c-49627ab987c2,2a86ab10-4cd3-47d3-8f09-f92165a6da71,f084f7fb-d15c-43bb-bb99-f056b771fa62,44877412-0b2c-4dbf-a414-94ef673887f6,9042f38b-948f-4f8d-a1e1-8f7050550500,...,a3ee5904-e876-4ee4-a9b0-ff5d0b3d275f,a473ce1d-15e1-4ca2-bcba-fa36367a7f0d,ae89d22f-6bff-493e-8ec8-59987ccc1785,d883c427-a678-4ff1-b7de-49ba0d305b2f,a9a3d49c-6d6d-4989-b3ec-c0679684621a,4cb946b2-4aec-4300-9c26-0fc36d0b95d0,3f7848d9-ee82-4648-92d8-e0188275abf4,f83b6eb6-444b-4fe0-9f53-3f152537500d,95434759-f95d-4e7a-bc24-c8d8a2dc6d2c,0961f723-1c0b-4a2a-8a86-861bf407085b
0,MIMAT0000062,15553.368251,16952.773275,31382.526640,30577.157094,18851.943836,37318.371199,36884.403185,13606.071302,15602.806323,...,24042.776002,19676.811614,20219.166489,11182.821784,16409.095007,23675.069602,19392.008713,20151.395900,39048.494316,47203.619106
1,MIMAT0000063,22802.223128,28824.284048,20581.348805,24073.278455,29583.110661,45445.129825,23211.973287,32261.288881,14127.592753,...,40626.083262,11515.936381,10878.231941,12456.754252,19687.780688,56500.264318,32638.797145,23865.539337,29848.559887,44226.486270
2,MIMAT0000064,2354.167968,1740.972135,7348.836701,1274.196160,743.905074,4603.073120,1106.602731,1988.831156,6467.641719,...,837.016166,5027.663124,1723.781369,1704.884641,2560.630003,1204.740006,3087.566067,2868.103948,3280.272327,244.524699
3,MIMAT0000065,239.877328,479.795470,1062.680610,707.097710,306.412704,409.808993,244.865286,232.606367,418.972398,...,838.223982,96.825000,455.451249,478.262292,653.648239,605.773222,420.427456,421.514333,524.913328,452.995390
4,MIMAT0000066,791.000438,1021.278929,1209.635410,959.197825,638.593211,1156.646688,1232.174670,468.015223,699.918846,...,1074.955826,836.113829,1525.343291,621.482923,1878.259534,1165.035769,577.464589,1012.838725,1783.135807,1435.377833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1914,MIMAT0031890,0.991229,11.423701,2.193356,0.507244,8.962710,0.000000,0.000000,2.802486,3.915630,...,0.000000,0.374322,0.000000,1.720368,0.000000,2.268814,1.661768,0.000000,0.871949,5.354555
1915,MIMAT0031893,2.973687,2.284740,1.096678,4.057950,2.800847,2.239394,3.139299,12.144108,11.746889,...,6.039079,1.996390,5.977050,2.580552,15.666667,1.134407,1.661768,17.462737,3.487796,1.427881
1916,MIMAT0032026,0.991229,6.854221,3.290033,19.782504,1.120339,1.119697,3.139299,4.670810,6.852352,...,3.623447,2.620264,1.195410,7.311564,9.574075,3.403221,5.816190,1.806490,0.000000,0.713940
1917,MIMAT0032029,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


### RNA-Seq Artifacts

In [16]:
# Print the sample DataFrame from the individual RNA-Seq file processing
rna_seq_artifacts['sample_reads']

Unnamed: 0,gene_id,gene_name,gene_type,unstranded,stranded_first,stranded_second,tpm_unstranded,fpkm_unstranded,fpkm_uq_unstranded,is_mrna_of_interest
0,ENSG00000000003.15,TSPAN6,protein_coding,2836,1446,1390,45.6189,17.2484,19.5427,1
1,ENSG00000000005.6,TNMD,protein_coding,33,20,13,1.6313,0.6168,0.6988,1
2,ENSG00000000419.13,DPM1,protein_coding,1886,947,939,114.0108,43.1073,48.8412,1
3,ENSG00000000457.14,SCYL3,protein_coding,1473,1364,1406,15.6148,5.9039,6.6892,1
4,ENSG00000000460.17,C1orf112,protein_coding,937,1179,1139,11.4519,4.3299,4.9059,1
...,...,...,...,...,...,...,...,...,...,...
60655,ENSG00000288669.1,AC008763.4,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60656,ENSG00000288670.1,AL592295.6,lncRNA,495,256,258,19.9875,7.5572,8.5625,0
60657,ENSG00000288671.1,AC006486.3,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60658,ENSG00000288674.1,AL391628.1,protein_coding,0,0,0,0.0000,0.0000,0.0000,1


In [17]:
# Print the DataFrame of aggregated messenger RNA raw reads for this group
rna_seq_artifacts['agg_raw_reads']

Unnamed: 0,gene_id,gene_name,7882ff91-e415-48ae-9151-26b55f1cfc57,95f4620f-f9b0-4533-bb13-f412d774244a,ff1f2f31-8607-4627-b487-c5e39e7c30f5,54600389-6de5-44ce-ada1-8a8508b50980,c95a37a1-d3cd-421c-bc5f-0f8dc448f64c,4dd3d7e2-8a60-41e8-8812-a06f413afe2b,ada5cdcf-f9e0-4209-aa92-ac3b921bbdf4,7d8935d4-33df-4a11-a352-e4e1acc563a4,...,71ce6854-2231-48ad-ba1b-54bb4ec7d713,58764dda-7c61-4705-a834-4d654c60ef2b,27f4c9d4-d6ba-4fea-9f0b-fb06066e69a8,37939d7d-074b-42b7-8d68-95a8afef4c38,a436c52a-e63f-4d61-bc60-eb497c196489,77f150b1-5f40-442d-91c5-6d3571513513,c6fd0f95-d74a-49e3-9a70-fec12e63ff1a,f2a4b38d-bfc0-4363-9a58-b5fa2af26ff3,6e8e0eef-1fc8-446c-9038-3a06657784ec,bc23716d-70e8-486e-b512-8c45654d5d83
0,ENSG00000000003.15,TSPAN6,2836,4671,2127,2665,3875,5195,7874,5589,...,2765,3298,8037,2480,648,6342,2581,4679,1882,7389
1,ENSG00000000005.6,TNMD,33,0,0,1,5,1,22,41,...,2,11,1,11,0,5,5,2,1,20
2,ENSG00000000419.13,DPM1,1886,1930,742,3191,870,3974,3951,1931,...,1393,4596,2814,1845,1475,1913,2766,2570,3138,1366
3,ENSG00000000457.14,SCYL3,1473,2584,911,1104,649,824,3720,2055,...,909,1898,1920,949,1412,640,2002,1342,628,650
4,ENSG00000000460.17,C1orf112,937,836,328,819,257,1518,1352,955,...,700,2984,795,1024,299,665,799,1121,455,1414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19958,ENSG00000288669.1,AC008763.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19959,ENSG00000288671.1,AC006486.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19960,ENSG00000288674.1,AL391628.1,0,5,1,1,6,0,18,7,...,6,6,6,8,7,2,5,14,9,14


In [18]:
# Print the DataFrame of aggregated messenger RNA normalized reads for this group
rna_seq_artifacts['agg_norm_reads']

Unnamed: 0,gene_id,gene_name,7882ff91-e415-48ae-9151-26b55f1cfc57,95f4620f-f9b0-4533-bb13-f412d774244a,ff1f2f31-8607-4627-b487-c5e39e7c30f5,54600389-6de5-44ce-ada1-8a8508b50980,c95a37a1-d3cd-421c-bc5f-0f8dc448f64c,4dd3d7e2-8a60-41e8-8812-a06f413afe2b,ada5cdcf-f9e0-4209-aa92-ac3b921bbdf4,7d8935d4-33df-4a11-a352-e4e1acc563a4,...,71ce6854-2231-48ad-ba1b-54bb4ec7d713,58764dda-7c61-4705-a834-4d654c60ef2b,27f4c9d4-d6ba-4fea-9f0b-fb06066e69a8,37939d7d-074b-42b7-8d68-95a8afef4c38,a436c52a-e63f-4d61-bc60-eb497c196489,77f150b1-5f40-442d-91c5-6d3571513513,c6fd0f95-d74a-49e3-9a70-fec12e63ff1a,f2a4b38d-bfc0-4363-9a58-b5fa2af26ff3,6e8e0eef-1fc8-446c-9038-3a06657784ec,bc23716d-70e8-486e-b512-8c45654d5d83
0,ENSG00000000003.15,TSPAN6,45.6189,61.4365,34.2301,29.8406,33.3467,53.4313,53.7850,72.2034,...,37.3545,55.3034,96.5758,28.8647,10.0611,79.1792,31.8307,60.8935,21.6964,77.3596
1,ENSG00000000005.6,TNMD,1.6313,0.0000,0.0000,0.0344,0.1322,0.0316,0.4618,1.6278,...,0.0830,0.5669,0.0369,0.3935,0.0000,0.1918,0.1895,0.0800,0.0354,0.6435
2,ENSG00000000419.13,DPM1,114.0108,95.3981,44.8756,134.2774,28.1363,153.6044,101.4235,93.7501,...,70.7237,289.6323,127.0762,80.7007,86.0655,89.7565,128.1966,125.6946,135.9525,53.7458
3,ENSG00000000457.14,SCYL3,15.6148,22.3977,9.6617,8.1466,3.6806,5.5851,16.7457,17.4957,...,8.0930,20.9746,15.2044,7.2791,14.4478,5.2658,16.2711,11.5097,4.7712,4.4847
4,ENSG00000000460.17,C1orf112,11.4519,8.3545,4.0106,6.9678,1.6804,11.8626,7.0168,9.3740,...,7.1853,38.0188,7.2584,9.0555,3.5273,6.3082,7.4869,11.0847,3.9855,11.2480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19958,ENSG00000288669.1,AC008763.4,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19959,ENSG00000288671.1,AC006486.3,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19960,ENSG00000288674.1,AL391628.1,0.0000,0.0310,0.0076,0.0053,0.0243,0.0000,0.0580,0.0426,...,0.0382,0.0474,0.0340,0.0439,0.0512,0.0118,0.0291,0.0859,0.0489,0.0691


## HER2-enriched Files

In [19]:
# Process the files related to the HER2-enriched group
mir_seq_artifacts = process_mir_seq_files('HER2-enriched')
rna_seq_artifacts = process_rna_seq_files('HER2-enriched')

### miRNA-Seq Artifacts

In [20]:
# Print the sample DataFrame from the individual miRNA-Seq file processing
mir_seq_artifacts['sample_reads']

Unnamed: 0,mirna_id,genome_assembly,chromosome,position_range,strand,cross_mapped,region_type,accession_id,read_count,reads_per_million,is_mirna_of_interest
0,hsa-let-7a-1,hg38,chr9,94175960-94175984,+,N,mature,MIMAT0000062,2,1.497467,1
1,hsa-let-7a-1,hg38,chr9,94175960-94175985,+,N,mature,MIMAT0000062,1,0.748733,1
2,hsa-let-7a-1,hg38,chr9,94175961-94175982,+,N,mature,MIMAT0000062,3,2.246200,1
3,hsa-let-7a-1,hg38,chr9,94175961-94175983,+,N,mature,MIMAT0000062,1,0.748733,1
4,hsa-let-7a-1,hg38,chr9,94175961-94175984,+,N,mature,MIMAT0000062,10,7.487333,1
...,...,...,...,...,...,...,...,...,...,...,...
2273,hsa-mir-99b,hg38,chr19,51692656-51692676,+,N,mature,MIMAT0004678,2,1.497467,1
2274,hsa-mir-99b,hg38,chr19,51692656-51692677,+,N,mature,MIMAT0004678,2,1.497467,1
2275,hsa-mir-99b,hg38,chr19,51692656-51692678,+,N,mature,MIMAT0004678,19,14.225933,1
2276,hsa-mir-99b,hg38,chr19,51692656-51692679,+,N,mature,MIMAT0004678,9,6.738600,1


In [21]:
# Print the DataFrame of aggregated microRNA raw reads for this group
mir_seq_artifacts['agg_raw_reads']

Unnamed: 0,accession_id,6dc4e6bf-a8f6-4280-af19-5ed501969f01,86e3408b-76a3-4834-b85e-4d2a07677304,a44df6fa-36a7-4500-90fd-d62baf48ef52,3abbb570-87ad-44c5-b66d-141f63b0ca8b,a4fc56a4-2493-451a-9b30-5aa38258bcce,be8e6554-07ae-4ec2-8be0-8598556467c9,d68e848a-5464-4471-88bd-3faf7e088d91,f3a7612e-35c3-43be-8f96-795874f8225c,0d003664-43ff-4eb5-b6c0-28ba7f26ee64,...,dc300e50-f729-4d0e-8a33-606e95d91c95,585f73db-7dc1-49d8-a601-f8d33b318a95,646eb67c-2e86-4f89-867f-78c31eff5fe0,b70a538a-a0d7-4a0f-ab48-fd7b4457bfd3,aafbf8fb-290d-408e-b104-241b7d492895,3d4b6861-2763-451d-ae0a-6a5adfc591ec,8ec31b5f-bb8d-4c61-aac5-3204b66e81e2,a576843f-a9f7-47cf-a52f-7f01066fa160,54fb48bb-ab47-4870-beb0-5713d15d8b58,ba7c9ae6-bb71-4bb4-8f3d-f2041f62896f
0,MIMAT0000062,25603,79791,11903,28998,35515,41206,94350,15868,20083,...,49811,24756,18136,27867,36868,54800,64370,339513,184624,54673
1,MIMAT0000063,32936,47659,14327,89908,60550,78330,107887,20715,38497,...,67531,20726,16079,12006,26816,63216,42832,274597,239469,58891
2,MIMAT0000064,955,6106,912,3128,1675,1121,2694,915,2147,...,6580,2728,2828,2833,3829,1070,9345,19367,11566,3635
3,MIMAT0000065,373,877,157,982,681,1065,340,192,121,...,512,1071,418,241,407,765,836,541,2001,680
4,MIMAT0000066,537,1906,1609,1527,3295,1236,2637,1364,919,...,725,1570,441,1689,1281,558,2441,13747,4370,3197
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1749,MIMAT0031892,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1750,MIMAT0031893,2,9,7,7,11,11,2,4,1,...,4,1,11,2,1,3,0,38,13,3
1751,MIMAT0032026,1,4,1,2,5,1,1,0,1,...,1,8,2,3,2,6,4,1,9,0
1752,MIMAT0032029,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [22]:
# Print the DataFrame of aggregated microRNA normalized reads for this group
mir_seq_artifacts['agg_norm_reads']

Unnamed: 0,accession_id,6dc4e6bf-a8f6-4280-af19-5ed501969f01,86e3408b-76a3-4834-b85e-4d2a07677304,a44df6fa-36a7-4500-90fd-d62baf48ef52,3abbb570-87ad-44c5-b66d-141f63b0ca8b,a4fc56a4-2493-451a-9b30-5aa38258bcce,be8e6554-07ae-4ec2-8be0-8598556467c9,d68e848a-5464-4471-88bd-3faf7e088d91,f3a7612e-35c3-43be-8f96-795874f8225c,0d003664-43ff-4eb5-b6c0-28ba7f26ee64,...,dc300e50-f729-4d0e-8a33-606e95d91c95,585f73db-7dc1-49d8-a601-f8d33b318a95,646eb67c-2e86-4f89-867f-78c31eff5fe0,b70a538a-a0d7-4a0f-ab48-fd7b4457bfd3,aafbf8fb-290d-408e-b104-241b7d492895,3d4b6861-2763-451d-ae0a-6a5adfc591ec,8ec31b5f-bb8d-4c61-aac5-3204b66e81e2,a576843f-a9f7-47cf-a52f-7f01066fa160,54fb48bb-ab47-4870-beb0-5713d15d8b58,ba7c9ae6-bb71-4bb4-8f3d-f2041f62896f
0,MIMAT0000062,19169.819454,24930.098790,8118.425479,16161.456276,14824.353426,21817.053575,58821.695754,25629.884947,24718.694540,...,16514.012091,15666.708438,16135.331803,22814.687460,19471.623333,22059.832868,20234.954324,49595.646620,33263.158276,27074.674356
1,MIMAT0000063,24660.280969,14890.696676,9771.711485,50108.428557,25274.239059,41472.839074,67261.221948,33458.726161,47383.139152,...,22388.784620,13116.343474,14305.249233,9829.301237,14162.717028,25447.707932,13464.402105,40112.796195,43144.419189,29163.474618
2,MIMAT0000064,715.040331,1907.773847,622.028400,1743.328340,699.163506,593.528055,1679.551121,1477.901734,2642.585131,...,2181.490022,1726.400897,2516.029904,2319.374513,2022.264449,430.730313,2937.636293,2829.107836,2083.811902,1800.092207
3,MIMAT0000065,279.277532,274.012060,107.081644,547.298091,284.256922,563.878126,211.970074,310.117084,148.930044,...,169.745117,677.776892,371.888437,197.306479,214.954722,307.952049,262.799781,79.028623,360.514230,336.743521
4,MIMAT0000066,402.069798,595.515387,1097.416332,851.042959,1375.369408,654.416304,1644.014963,2203.123459,1131.129825,...,240.361743,993.566499,392.351198,1382.782758,676.552823,224.623846,767.337634,2008.145061,787.329933,1583.189762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1749,MIMAT0031892,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1750,MIMAT0031893,1.497466,2.811981,4.774341,3.901310,4.591521,5.824093,1.246883,6.460772,1.230827,...,1.326134,0.632845,9.786537,1.637398,0.528144,1.207656,0.000000,5.550994,2.342171,1.485633
1751,MIMAT0032026,0.748733,1.249769,0.682049,1.114660,2.087055,0.529463,0.623441,0.000000,1.230827,...,0.331533,5.062759,1.779370,2.456097,1.056289,2.415310,1.257415,0.146079,1.621503,0.000000
1752,MIMAT0032029,0.000000,0.312442,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.632845,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


### RNA-Seq Artifacts

In [23]:
# Print the sample DataFrame from the individual RNA-Seq file processing
rna_seq_artifacts['sample_reads']

Unnamed: 0,gene_id,gene_name,gene_type,unstranded,stranded_first,stranded_second,tpm_unstranded,fpkm_unstranded,fpkm_uq_unstranded,is_mrna_of_interest
0,ENSG00000000003.15,TSPAN6,protein_coding,955,484,471,13.4312,3.5759,3.5180,1
1,ENSG00000000005.6,TNMD,protein_coding,3,2,1,0.1297,0.0345,0.0340,1
2,ENSG00000000419.13,DPM1,protein_coding,3231,1625,1606,170.7707,45.4661,44.7294,1
3,ENSG00000000457.14,SCYL3,protein_coding,1763,1471,1385,16.3402,4.3504,4.2799,1
4,ENSG00000000460.17,C1orf112,protein_coding,899,1023,1023,9.6066,2.5577,2.5162,1
...,...,...,...,...,...,...,...,...,...,...
60655,ENSG00000288669.1,AC008763.4,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60656,ENSG00000288670.1,AL592295.6,lncRNA,416,223,209,14.6865,3.9102,3.8468,0
60657,ENSG00000288671.1,AC006486.3,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60658,ENSG00000288674.1,AL391628.1,protein_coding,9,4,5,0.0597,0.0159,0.0156,1


In [24]:
# Print the DataFrame of aggregated messenger RNA raw reads for this group
rna_seq_artifacts['agg_raw_reads']

Unnamed: 0,gene_id,gene_name,79f97f73-b650-49cf-8654-53038a6c0de7,6b62d6dd-bb6f-4f39-9041-992c47b875f3,e4e099ea-7207-48b8-a578-d5e6d83054b0,7b188e2a-4a56-49f9-b527-cd61a646f6c7,ed1497db-7c6f-4bd2-83ed-ee05fa286f8b,1deadf31-dc7d-4361-8041-ab0a27edebb3,eb166054-ff70-4a86-883b-9c25a7d2b0e5,0c279a15-1250-4923-bcde-96cc3fd32d42,...,09f1c0ee-ee22-435f-88c9-d9f5c936c45c,4c8d0d1a-dbd3-4124-8458-dfb61f61f11c,c96ef17c-52c1-43d9-a7e5-730fd8f3641f,7dcadaff-5d95-4ba6-9ce7-50bb2e5f17aa,6d505305-dc5f-4dc7-8721-817f444ff7ee,391a3465-d22e-424b-af09-3e6342159bbb,0caa1750-ec80-484f-9097-1e90fbfa41c2,9c0043c4-b65a-42ca-9f02-fe1cef04beaf,d3badb09-df1e-488e-ade9-97f1925b5649,fd371070-c312-4eca-b3da-d41b1c8a86a7
0,ENSG00000000003.15,TSPAN6,955,1970,1344,8421,4079,1372,2825,2700,...,1435,1548,3146,2208,5909,1779,13854,3454,1412,11016
1,ENSG00000000005.6,TNMD,3,0,19,0,25,27,19,0,...,23,24,0,2,19,31,81,14,0,2
2,ENSG00000000419.13,DPM1,3231,2926,3549,3904,2689,2135,9021,2527,...,1027,3668,1256,2342,3288,4308,4568,2743,3381,3511
3,ENSG00000000457.14,SCYL3,1763,1822,1455,2519,911,2385,3057,1093,...,1239,1009,880,1011,1306,2155,3189,1427,1237,2828
4,ENSG00000000460.17,C1orf112,899,858,757,1186,714,1075,530,575,...,690,650,338,297,854,874,1794,1219,1087,3257
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19958,ENSG00000288669.1,AC008763.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
19959,ENSG00000288671.1,AC006486.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19960,ENSG00000288674.1,AL391628.1,9,12,6,7,13,7,10,3,...,4,12,1,10,15,4,13,7,6,2


In [25]:
# Print the DataFrame of aggregated messenger RNA normalized reads for this group
rna_seq_artifacts['agg_norm_reads']

Unnamed: 0,gene_id,gene_name,79f97f73-b650-49cf-8654-53038a6c0de7,6b62d6dd-bb6f-4f39-9041-992c47b875f3,e4e099ea-7207-48b8-a578-d5e6d83054b0,7b188e2a-4a56-49f9-b527-cd61a646f6c7,ed1497db-7c6f-4bd2-83ed-ee05fa286f8b,1deadf31-dc7d-4361-8041-ab0a27edebb3,eb166054-ff70-4a86-883b-9c25a7d2b0e5,0c279a15-1250-4923-bcde-96cc3fd32d42,...,09f1c0ee-ee22-435f-88c9-d9f5c936c45c,4c8d0d1a-dbd3-4124-8458-dfb61f61f11c,c96ef17c-52c1-43d9-a7e5-730fd8f3641f,7dcadaff-5d95-4ba6-9ce7-50bb2e5f17aa,6d505305-dc5f-4dc7-8721-817f444ff7ee,391a3465-d22e-424b-af09-3e6342159bbb,0caa1750-ec80-484f-9097-1e90fbfa41c2,9c0043c4-b65a-42ca-9f02-fe1cef04beaf,d3badb09-df1e-488e-ade9-97f1925b5649,fd371070-c312-4eca-b3da-d41b1c8a86a7
0,ENSG00000000003.15,TSPAN6,13.4312,19.7084,13.1846,79.1163,46.8572,17.6389,37.1006,58.5780,...,18.5563,22.3424,38.0414,26.9026,54.6471,20.8871,101.3542,33.7568,21.0121,127.9853
1,ENSG00000000005.6,TNMD,0.1297,0.0000,0.5728,0.0000,0.8826,1.0668,0.7668,0.0000,...,0.9140,1.0645,0.0000,0.0749,0.5400,1.1185,1.8211,0.4205,0.0000,0.0714
2,ENSG00000000419.13,DPM1,170.7707,110.0081,130.8396,137.8408,116.0859,103.1530,445.2281,206.0353,...,49.9086,198.9549,57.0759,107.2379,114.2748,190.0831,125.5909,100.7467,189.0799,153.2966
3,ENSG00000000457.14,SCYL3,16.3402,12.0124,9.4065,15.5964,6.8966,20.2070,26.4577,15.6274,...,10.5586,9.5972,7.0125,8.1179,7.9596,16.6742,15.3750,9.1909,12.1311,21.6526
4,ENSG00000000460.17,C1orf112,9.6066,6.5218,5.6424,8.4661,6.2319,10.5009,5.2886,9.4784,...,6.7793,7.1281,3.1054,2.7495,6.0008,7.7967,9.9721,9.0519,12.2903,28.7510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19958,ENSG00000288669.1,AC008763.4,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0083,0.0000,0.0000,0.0000
19959,ENSG00000288671.1,AC006486.3,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19960,ENSG00000288674.1,AL391628.1,0.0597,0.0566,0.0278,0.0310,0.0704,0.0424,0.0619,0.0307,...,0.0244,0.0817,0.0057,0.0575,0.0654,0.0221,0.0448,0.0323,0.0421,0.0110


## Luminal A Files

In [26]:
# Process the files related to the luminal A group
mir_seq_artifacts = process_mir_seq_files('Luminal A')
rna_seq_artifacts = process_rna_seq_files('Luminal A')

### miRNA-Seq Artifacts

In [27]:
# Print the sample DataFrame from the individual miRNA-Seq file processing
mir_seq_artifacts['sample_reads']

Unnamed: 0,mirna_id,genome_assembly,chromosome,position_range,strand,cross_mapped,region_type,accession_id,read_count,reads_per_million,is_mirna_of_interest
0,hsa-let-7a-1,hg38,chr9,94175960-94175983,+,N,mature,MIMAT0000062,2,0.879730,1
1,hsa-let-7a-1,hg38,chr9,94175960-94175984,+,N,mature,MIMAT0000062,3,1.319595,1
2,hsa-let-7a-1,hg38,chr9,94175961-94175982,+,N,mature,MIMAT0000062,3,1.319595,1
3,hsa-let-7a-1,hg38,chr9,94175961-94175983,+,N,mature,MIMAT0000062,6,2.639190,1
4,hsa-let-7a-1,hg38,chr9,94175961-94175984,+,N,mature,MIMAT0000062,18,7.917569,1
...,...,...,...,...,...,...,...,...,...,...,...
2838,hsa-mir-99b,hg38,chr19,51692656-51692676,+,N,mature,MIMAT0004678,1,0.439865,1
2839,hsa-mir-99b,hg38,chr19,51692656-51692677,+,N,mature,MIMAT0004678,4,1.759460,1
2840,hsa-mir-99b,hg38,chr19,51692656-51692678,+,N,mature,MIMAT0004678,39,17.154733,1
2841,hsa-mir-99b,hg38,chr19,51692656-51692679,+,N,mature,MIMAT0004678,22,9.677029,1


In [28]:
# Print the DataFrame of aggregated microRNA raw reads for this group
mir_seq_artifacts['agg_raw_reads']

Unnamed: 0,accession_id,de633b61-eccc-4557-85ca-5449e53be967,03b7f117-ca8a-406d-b19e-11c702cc5033,3a6a7205-8a41-4691-aba5-f1e252dece0e,e8e873d3-45cd-441c-adee-6958639baa43,56152d26-ac8d-4eca-a4ac-1d649ab915f1,0c817586-34b1-4e0d-9995-b961bf552600,b9fa5653-bfd9-46d8-8526-bd31b6264152,c0eda172-ffc4-4edd-888d-eb7e09d3d9e7,275f789a-9832-44a7-9100-4be00d91891e,...,546e3c4b-cf6b-4698-a37c-19bcc395bb11,3378bf17-69b3-4fd5-8050-51ed6ce8280e,6e4ce9ff-1185-4e93-8c05-2a641b1feb0e,ac7012c0-2175-4bea-b37c-f6dcb157e2a1,9a03da54-6e88-4478-bca6-e014fe37c91b,3dd84e30-80af-43a2-bb4a-d9361a0ef7a4,b2f856e0-fe5e-4b1c-9ca7-b1f6d10282dc,ac4ebfa9-06f4-4af1-b571-b5bad342efc0,c0182973-5b1e-40c7-a932-1ea0b8e3739b,2bf2aed5-04db-4b86-80b8-afac0cabf05a
0,MIMAT0000062,79706,53257,13481,30661,48963,31724,59384,56115,88257,...,64271,85241,133664,121004,197260,39971,25478,22663,28203,50204
1,MIMAT0000063,86929,15749,15372,43384,73466,30669,49229,77992,34569,...,107812,132923,78186,26541,200517,62542,24369,14807,27221,41134
2,MIMAT0000064,5881,4249,1767,1632,370,5503,8678,4104,8554,...,9294,9060,17562,6468,10515,3824,3282,6416,3744,2585
3,MIMAT0000065,642,233,251,247,270,460,216,717,344,...,285,350,536,329,857,158,573,452,989,391
4,MIMAT0000066,2247,1407,1470,1404,830,1134,869,1793,3888,...,1324,2455,4278,6683,5745,1030,987,2451,2235,4090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1988,MIMAT0031893,0,7,2,1,2,6,0,10,4,...,5,2,6,8,2,1,2,5,5,1
1989,MIMAT0032026,0,3,1,1,1,1,0,1,0,...,0,2,1,1,5,0,0,1,5,2
1990,MIMAT0032029,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1991,MIMAT0032110,7,5,11,70,21,21,2,3,13,...,0,4,11,7,5,8,11,3,0,3


In [29]:
# Print the DataFrame of aggregated microRNA normalized reads for this group
mir_seq_artifacts['agg_norm_reads']

Unnamed: 0,accession_id,de633b61-eccc-4557-85ca-5449e53be967,03b7f117-ca8a-406d-b19e-11c702cc5033,3a6a7205-8a41-4691-aba5-f1e252dece0e,e8e873d3-45cd-441c-adee-6958639baa43,56152d26-ac8d-4eca-a4ac-1d649ab915f1,0c817586-34b1-4e0d-9995-b961bf552600,b9fa5653-bfd9-46d8-8526-bd31b6264152,c0eda172-ffc4-4edd-888d-eb7e09d3d9e7,275f789a-9832-44a7-9100-4be00d91891e,...,546e3c4b-cf6b-4698-a37c-19bcc395bb11,3378bf17-69b3-4fd5-8050-51ed6ce8280e,6e4ce9ff-1185-4e93-8c05-2a641b1feb0e,ac7012c0-2175-4bea-b37c-f6dcb157e2a1,9a03da54-6e88-4478-bca6-e014fe37c91b,3dd84e30-80af-43a2-bb4a-d9361a0ef7a4,b2f856e0-fe5e-4b1c-9ca7-b1f6d10282dc,ac4ebfa9-06f4-4af1-b571-b5bad342efc0,c0182973-5b1e-40c7-a932-1ea0b8e3739b,2bf2aed5-04db-4b86-80b8-afac0cabf05a
0,MIMAT0000062,35059.876622,40970.673466,13764.296764,33688.556926,31177.965867,22106.300331,47066.096541,20827.031527,39209.370772,...,16714.692167,32910.579180,36232.104060,60373.555272,49932.856684,17423.785379,16061.882194,9449.480117,12976.645452,30690.587122
1,MIMAT0000063,38237.021235,12115.724438,15695.035218,47667.863198,46780.639264,21371.142510,39017.527729,28946.660305,15357.747690,...,28038.219279,51320.056264,21193.764127,13242.326954,50757.308242,27262.725106,15362.744608,6173.871601,12524.811744,25145.936788
2,MIMAT0000064,2586.845839,3268.760755,1804.132659,1793.148459,235.603362,3834.666835,6877.939946,1523.195889,3800.230664,...,2417.051999,3497.962806,4760.505535,3227.134276,2661.685022,1666.922401,2069.043777,2675.191477,1722.673495,1580.255911
3,MIMAT0000065,282.393305,179.247178,256.274645,271.389504,171.926779,320.542748,171.195555,266.113900,152.826671,...,74.118767,135.131015,145.292736,164.150774,216.934292,68.873887,361.231593,188.464238,455.054511,239.025169
4,MIMAT0000066,988.376568,1082.406775,1500.891348,1542.635072,528.515647,790.207557,688.745080,665.470329,1727.296799,...,344.327184,947.847537,1159.631174,3334.406054,1454.244455,448.987990,622.226146,1021.959838,1028.358778,2500.288851
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1988,MIMAT0031893,0.000000,5.385109,2.042030,1.098743,1.273532,4.180992,0.000000,3.711490,1.777055,...,1.300330,0.772177,1.626411,3.991510,0.506264,0.435911,1.260844,2.084781,2.300579,0.611318
1989,MIMAT0032026,0.000000,2.307903,1.021015,1.098743,0.636766,0.696832,0.000000,0.371149,0.000000,...,0.000000,0.772177,0.271069,0.498939,1.265661,0.000000,0.000000,0.416956,2.300579,1.222636
1990,MIMAT0032029,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.460116,0.000000
1991,MIMAT0032110,3.079055,3.846505,11.231162,76.912006,13.372083,14.633473,1.585144,1.113447,5.775428,...,0.000000,1.544355,2.981756,3.492571,1.265660,3.487285,6.934637,1.250869,0.000000,1.833954


### RNA-Seq Artifacts

In [30]:
# Print the sample DataFrame from the individual RNA-Seq file processing
rna_seq_artifacts['sample_reads']

Unnamed: 0,gene_id,gene_name,gene_type,unstranded,stranded_first,stranded_second,tpm_unstranded,fpkm_unstranded,fpkm_uq_unstranded,is_mrna_of_interest
0,ENSG00000000003.15,TSPAN6,protein_coding,17957,9140,8817,194.7184,54.0146,51.6803,1
1,ENSG00000000005.6,TNMD,protein_coding,22,13,10,0.7331,0.2034,0.1946,1
2,ENSG00000000419.13,DPM1,protein_coding,2637,1283,1354,107.4605,29.8094,28.5212,1
3,ENSG00000000457.14,SCYL3,protein_coding,1883,1507,1404,13.4561,3.7327,3.5714,1
4,ENSG00000000460.17,C1orf112,protein_coding,675,896,892,5.5613,1.5427,1.4760,1
...,...,...,...,...,...,...,...,...,...,...
60655,ENSG00000288669.1,AC008763.4,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60656,ENSG00000288670.1,AL592295.6,lncRNA,446,245,228,12.1401,3.3676,3.2221,0
60657,ENSG00000288671.1,AC006486.3,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60658,ENSG00000288674.1,AL391628.1,protein_coding,11,4,7,0.0562,0.0156,0.0149,1


In [31]:
# Print the DataFrame of aggregated messenger RNA raw reads for this group
rna_seq_artifacts['agg_raw_reads']

Unnamed: 0,gene_id,gene_name,d923e06d-c747-419d-bd40-b05eb3e13edb,870fb4d4-8366-4bfa-b4c1-04d55eded427,0022cd20-f64f-4773-b9ff-a3de0b71b259,2dc5eb24-fb84-455a-ae05-3b58ad2fc284,c1d62c39-f3ab-47d3-b654-8b6285cd97a3,aa2ab118-b3cc-4452-a29c-104f80f16f8d,0ca579b1-763e-4f6a-a071-c6514aaa2f11,57a43ef5-102a-4cd9-833d-00a2daf4e56a,...,e21262db-a694-4461-b1b0-e52c0891113a,40ad8a2a-abd2-4701-b984-4afa4c27c260,e5b0c2dc-c652-40a0-bb80-d7e87830b406,cc7ea1eb-4947-4030-ace5-673e9441a9a8,6bd03014-d3b8-4cac-929b-af970321ae1f,36e4ffb0-0bab-4035-88f1-a181b195fce1,196aed5b-6812-47c5-ad7b-4dd1743ed0b5,afedbcf7-d427-4371-9c2a-f349b03de522,15d7e135-acb7-46ef-a8b1-f424e40a9e4d,a6f96d96-c761-4e98-9120-25138aa43e67
0,ENSG00000000003.15,TSPAN6,17957,1505,2443,621,5053,2393,3451,6971,...,2056,40780,2898,4243,2590,3512,2177,2683,3531,502
1,ENSG00000000005.6,TNMD,22,21,144,0,27,4,15,9,...,9,18,27,85,11,69,33,96,58,12
2,ENSG00000000419.13,DPM1,2637,1741,2322,1214,1619,1354,7829,2826,...,1873,1518,1312,1590,4995,1025,1502,2728,2507,3234
3,ENSG00000000457.14,SCYL3,1883,1947,1466,1087,1008,1196,948,3381,...,1128,1441,2001,3535,2465,777,1027,1765,4480,3899
4,ENSG00000000460.17,C1orf112,675,884,409,187,346,328,534,1353,...,447,623,767,678,679,122,457,624,1003,1511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19958,ENSG00000288669.1,AC008763.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
19959,ENSG00000288671.1,AC006486.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19960,ENSG00000288674.1,AL391628.1,11,3,0,1,14,4,4,16,...,5,6,10,12,1,5,5,5,6,4


In [32]:
# Print the DataFrame of aggregated messenger RNA normalized reads for this group
rna_seq_artifacts['agg_norm_reads']

Unnamed: 0,gene_id,gene_name,d923e06d-c747-419d-bd40-b05eb3e13edb,870fb4d4-8366-4bfa-b4c1-04d55eded427,0022cd20-f64f-4773-b9ff-a3de0b71b259,2dc5eb24-fb84-455a-ae05-3b58ad2fc284,c1d62c39-f3ab-47d3-b654-8b6285cd97a3,aa2ab118-b3cc-4452-a29c-104f80f16f8d,0ca579b1-763e-4f6a-a071-c6514aaa2f11,57a43ef5-102a-4cd9-833d-00a2daf4e56a,...,e21262db-a694-4461-b1b0-e52c0891113a,40ad8a2a-abd2-4701-b984-4afa4c27c260,e5b0c2dc-c652-40a0-bb80-d7e87830b406,cc7ea1eb-4947-4030-ace5-673e9441a9a8,6bd03014-d3b8-4cac-929b-af970321ae1f,36e4ffb0-0bab-4035-88f1-a181b195fce1,196aed5b-6812-47c5-ad7b-4dd1743ed0b5,afedbcf7-d427-4371-9c2a-f349b03de522,15d7e135-acb7-46ef-a8b1-f424e40a9e4d,a6f96d96-c761-4e98-9120-25138aa43e67
0,ENSG00000000003.15,TSPAN6,194.7184,22.0943,28.5350,13.4142,82.0531,48.2822,58.3262,63.0532,...,35.9726,678.0519,50.2267,63.4055,34.7451,45.6298,37.7646,26.1556,37.8819,5.3684
1,ENSG00000000005.6,TNMD,0.7331,0.9474,5.1690,0.0000,1.3474,0.2480,0.7791,0.2502,...,0.4839,0.9198,1.4381,3.9035,0.4535,2.7551,1.7592,2.8761,1.9123,0.3944
2,ENSG00000000419.13,DPM1,107.4605,96.0525,101.9253,98.5499,98.8004,102.6666,497.2679,96.0617,...,123.1551,94.8535,85.4547,89.2929,251.8228,50.0477,97.9179,99.9435,101.0775,129.9716
3,ENSG00000000457.14,SCYL3,13.4561,18.8367,11.2845,15.4738,10.7870,15.9027,10.5590,20.1536,...,13.0063,15.7897,22.8549,34.8128,21.7924,6.6529,11.7406,11.3393,31.6744,27.4784
4,ENSG00000000460.17,C1orf112,5.5613,9.8604,3.6297,3.0691,4.2689,5.0283,6.8574,9.2984,...,5.9423,7.8705,10.1002,7.6981,6.9209,1.2044,6.0234,4.6220,8.1759,12.2774
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19958,ENSG00000288669.1,AC008763.4,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0244,0.0000
19959,ENSG00000288671.1,AC006486.3,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19960,ENSG00000288674.1,AL391628.1,0.0562,0.0208,0.0000,0.0102,0.1072,0.0381,0.0319,0.0682,...,0.0412,0.0470,0.0817,0.0846,0.0063,0.0306,0.0409,0.0230,0.0304,0.0202


## Luminal B Files

In [33]:
# Process the files related to the luminal B group
mir_seq_artifacts = process_mir_seq_files('Luminal B')
rna_seq_artifacts = process_rna_seq_files('Luminal B')

### miRNA-Seq Artifacts

In [34]:
# Print the sample DataFrame from the individual miRNA-Seq file processing
mir_seq_artifacts['sample_reads']

Unnamed: 0,mirna_id,genome_assembly,chromosome,position_range,strand,cross_mapped,region_type,accession_id,read_count,reads_per_million,is_mirna_of_interest
0,hsa-let-7a-1,hg38,chr9,94175942-94175961,+,N,precursor,,1,0.620599,0
1,hsa-let-7a-1,hg38,chr9,94175960-94175983,+,N,mature,MIMAT0000062,1,0.620599,1
2,hsa-let-7a-1,hg38,chr9,94175961-94175981,+,N,mature,MIMAT0000062,1,0.620599,1
3,hsa-let-7a-1,hg38,chr9,94175961-94175982,+,N,mature,MIMAT0000062,2,1.241198,1
4,hsa-let-7a-1,hg38,chr9,94175961-94175983,+,N,mature,MIMAT0000062,5,3.102994,1
...,...,...,...,...,...,...,...,...,...,...,...
3187,hsa-mir-99b,hg38,chr19,51692656-51692676,+,N,mature,MIMAT0004678,15,9.308982,1
3188,hsa-mir-99b,hg38,chr19,51692656-51692677,+,N,mature,MIMAT0004678,7,4.344192,1
3189,hsa-mir-99b,hg38,chr19,51692656-51692678,+,N,mature,MIMAT0004678,35,21.720958,1
3190,hsa-mir-99b,hg38,chr19,51692656-51692679,+,N,mature,MIMAT0004678,6,3.723593,1


In [35]:
# Print the DataFrame of aggregated microRNA raw reads for this group
mir_seq_artifacts['agg_raw_reads']

Unnamed: 0,accession_id,054fdd39-3fa4-4676-8341-6b66da8dc3f5,828a2b26-d2d0-4f4f-8ab3-22f3bf82b61a,bad24748-3268-4e31-8ae1-35a84c16acbf,91a97f93-53f2-4571-a4bc-233643a1da0b,83d0a04c-e990-4d4f-af6e-666d5e521d9b,b11194e9-d6e1-4f0e-b88c-251a369fba1e,42f12ac2-0c43-4fea-bf8f-edc15cd33d47,9d110436-8434-4974-b3ca-311c11a447e7,c138edb1-07ef-40ee-bb5e-591cffd94699,...,d3a5c8df-e9e4-414d-9584-1fc13845edb6,d06292bd-c8bb-47b2-9506-6f3c9e021d27,e12c0da3-79fe-4fef-aa84-195e53b32a44,04ab7272-e2e2-47a8-8bc1-6c10f4b0624b,af9371a0-7b65-4e61-ba72-572c056f6265,1e30fd9b-77dc-451e-9894-4740b51fe6b8,571f1dd9-cf46-472e-96b4-0835b2d711a8,404cd7fd-7dbb-4fc9-b3f1-e3b523f1b322,d98594fb-1051-44f6-b016-8c2d9284383b,15a656bc-0f5f-4f06-80ad-0584bf873805
0,MIMAT0000062,70477,42843,32347,81554,35970,26450,10070,54721,14694,...,143779,64517,23661,28808,33161,96681,21301,45269,55792,25753
1,MIMAT0000063,56118,125604,23150,109342,44180,18924,13291,37915,12511,...,249113,100674,29934,8308,24684,79526,23921,27480,32912,27100
2,MIMAT0000064,621,1775,4010,5503,3550,1760,490,4919,2560,...,4100,4057,763,1210,1739,3500,2234,399,465,1403
3,MIMAT0000065,141,714,242,261,361,198,96,371,282,...,749,1016,633,128,635,364,250,314,285,287
4,MIMAT0000066,1261,1258,2199,1605,1561,1300,1591,1496,1520,...,2287,3093,1820,739,1381,2290,1643,985,1836,1064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1863,MIMAT0031892,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1864,MIMAT0031893,5,14,3,1,4,2,4,3,1,...,7,6,7,1,0,0,3,3,2,1
1865,MIMAT0032026,0,7,0,0,0,1,1,1,6,...,0,0,2,6,2,0,1,1,5,0
1866,MIMAT0032029,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# Print the DataFrame of aggregated microRNA normalized reads for this group
mir_seq_artifacts['agg_norm_reads']

Unnamed: 0,accession_id,054fdd39-3fa4-4676-8341-6b66da8dc3f5,828a2b26-d2d0-4f4f-8ab3-22f3bf82b61a,bad24748-3268-4e31-8ae1-35a84c16acbf,91a97f93-53f2-4571-a4bc-233643a1da0b,83d0a04c-e990-4d4f-af6e-666d5e521d9b,b11194e9-d6e1-4f0e-b88c-251a369fba1e,42f12ac2-0c43-4fea-bf8f-edc15cd33d47,9d110436-8434-4974-b3ca-311c11a447e7,c138edb1-07ef-40ee-bb5e-591cffd94699,...,d3a5c8df-e9e4-414d-9584-1fc13845edb6,d06292bd-c8bb-47b2-9506-6f3c9e021d27,e12c0da3-79fe-4fef-aa84-195e53b32a44,04ab7272-e2e2-47a8-8bc1-6c10f4b0624b,af9371a0-7b65-4e61-ba72-572c056f6265,1e30fd9b-77dc-451e-9894-4740b51fe6b8,571f1dd9-cf46-472e-96b4-0835b2d711a8,404cd7fd-7dbb-4fc9-b3f1-e3b523f1b322,d98594fb-1051-44f6-b016-8c2d9284383b,15a656bc-0f5f-4f06-80ad-0584bf873805
0,MIMAT0000062,43737.940990,20727.311258,14596.119286,52517.902222,18659.416621,29639.374534,13442.672413,43455.921470,13000.203490,...,41188.982773,18652.310781,15699.406880,23470.732066,24401.407240,52046.917110,24870.515929,34893.490411,40261.695243,21406.478716
1,MIMAT0000063,34826.762951,60766.827787,10446.105093,70412.395035,22918.349357,21205.879915,17742.458689,30109.670193,11068.840740,...,71364.462584,29105.549466,19861.630769,6768.774019,18163.636085,42811.753395,27929.562532,21181.672148,23750.589940,22526.135719
2,MIMAT0000064,385.391851,858.739526,1809.454920,3543.738087,1841.560438,1972.223032,654.112162,3906.355470,2264.905469,...,1174.544469,1172.906752,506.261250,985.822891,1279.637142,1884.177969,2608.362639,307.550479,335.562236,1166.205478
3,MIMAT0000065,87.504430,345.430998,109.199026,168.074804,187.268540,221.875091,128.152588,294.624493,249.493494,...,214.569221,293.732628,420.004417,104.285397,467.262555,195.954508,291.893761,242.032208,205.667174,238.560920
4,MIMAT0000066,782.575077,608.616519,992.267175,1033.563445,809.767844,1456.755649,2123.862146,1188.027603,1344.787622,...,655.166635,894.207686,1207.595644,602.085221,1016.204078,1232.790728,1918.325791,759.241160,1324.929603,884.420975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1863,MIMAT0031892,0.000000,0.483797,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1864,MIMAT0031893,3.102994,6.773157,1.353707,0.643965,2.074997,2.241162,5.339691,2.382408,0.884729,...,2.005320,1.734642,4.644598,0.814730,0.000000,0.000000,3.502725,2.312409,1.443278,0.831223
1865,MIMAT0032026,0.000000,3.386579,0.000000,0.000000,0.000000,1.120581,1.334923,0.794136,5.308372,...,0.000000,0.000000,1.327028,4.888378,1.471693,0.000000,1.167575,0.770803,3.608196,0.000000
1866,MIMAT0032029,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


### RNA-Seq Artifacts

In [37]:
# Print the sample DataFrame from the individual RNA-Seq file processing
rna_seq_artifacts['sample_reads']

Unnamed: 0,gene_id,gene_name,gene_type,unstranded,stranded_first,stranded_second,tpm_unstranded,fpkm_unstranded,fpkm_uq_unstranded,is_mrna_of_interest
0,ENSG00000000003.15,TSPAN6,protein_coding,285,132,153,6.2506,2.3627,2.3285,1
1,ENSG00000000005.6,TNMD,protein_coding,5,2,3,0.3370,0.1274,0.1255,1
2,ENSG00000000419.13,DPM1,protein_coding,1579,759,820,130.1438,49.1945,48.4822,1
3,ENSG00000000457.14,SCYL3,protein_coding,473,489,398,6.8365,2.5842,2.5468,1
4,ENSG00000000460.17,C1orf112,protein_coding,291,332,408,4.8492,1.8330,1.8065,1
...,...,...,...,...,...,...,...,...,...,...
60655,ENSG00000288669.1,AC008763.4,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60656,ENSG00000288670.1,AL592295.6,lncRNA,331,173,170,18.2229,6.8883,6.7886,0
60657,ENSG00000288671.1,AC006486.3,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60658,ENSG00000288674.1,AL391628.1,protein_coding,1,0,1,0.0103,0.0039,0.0039,1


In [38]:
# Print the DataFrame of aggregated messenger RNA raw reads for this group
rna_seq_artifacts['agg_raw_reads']

Unnamed: 0,gene_id,gene_name,0d669e80-cb31-4afb-bd3f-07d69f10eb3f,1db8a1bb-fef2-4e06-9cd5-a1f6579573a5,df176b72-7266-4eeb-a3ed-8f09199c1963,7d5e390d-86f3-4988-9606-aeedabd31b52,ccc7b8e5-7ea4-4efc-81c1-ef6c15acd241,c003fb3b-179c-49b0-8d9a-656c4dc39901,9c87ef50-5854-4f69-820b-9e0b4d46c982,318c3a49-64bc-4704-8739-d4cf31cef51f,...,f5e5f153-f422-4c2a-ae54-fc11f416c637,89dfc6f3-9f78-4e03-a2e7-e9010c7814a2,cd47d9e9-8072-4681-93ac-6c1420f6624c,ad18b7e0-57ec-41c5-99a1-0c9d8740e0f1,fc203d80-6254-402c-8c8b-a5500bfa909e,56e43b33-6dc7-496b-bdee-75e5fed5d502,04024f75-7405-4ccc-8976-91d0a3035b3a,1a8d6cfd-095d-4461-875a-56d27816a0f5,27513ce4-3807-481f-abcb-c59774a90519,5d720a05-3673-45be-9163-08e0fc332f80
0,ENSG00000000003.15,TSPAN6,285,2060,1445,1713,3310,1061,2889,83,...,2599,1335,648,16555,154,522,2846,1578,2402,983
1,ENSG00000000005.6,TNMD,5,14,11,248,23,9,2,1,...,12,65,11,24,1,0,0,4,61,15
2,ENSG00000000419.13,DPM1,1579,5384,4161,1400,2302,2075,6050,2163,...,1662,1529,2982,1788,2276,3617,2074,2461,1666,2503
3,ENSG00000000457.14,SCYL3,473,2087,3274,1082,1065,2592,1014,626,...,1807,1281,1223,3761,908,1512,1700,1797,1200,1264
4,ENSG00000000460.17,C1orf112,291,1382,1114,526,669,758,1220,491,...,572,718,517,2675,569,632,692,1200,349,413
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19958,ENSG00000288669.1,AC008763.4,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19959,ENSG00000288671.1,AC006486.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19960,ENSG00000288674.1,AL391628.1,1,5,24,3,7,8,10,8,...,11,6,2,18,15,5,8,10,5,15


In [39]:
# Print the DataFrame of aggregated messenger RNA normalized reads for this group
rna_seq_artifacts['agg_norm_reads']

Unnamed: 0,gene_id,gene_name,0d669e80-cb31-4afb-bd3f-07d69f10eb3f,1db8a1bb-fef2-4e06-9cd5-a1f6579573a5,df176b72-7266-4eeb-a3ed-8f09199c1963,7d5e390d-86f3-4988-9606-aeedabd31b52,ccc7b8e5-7ea4-4efc-81c1-ef6c15acd241,c003fb3b-179c-49b0-8d9a-656c4dc39901,9c87ef50-5854-4f69-820b-9e0b4d46c982,318c3a49-64bc-4704-8739-d4cf31cef51f,...,f5e5f153-f422-4c2a-ae54-fc11f416c637,89dfc6f3-9f78-4e03-a2e7-e9010c7814a2,cd47d9e9-8072-4681-93ac-6c1420f6624c,ad18b7e0-57ec-41c5-99a1-0c9d8740e0f1,fc203d80-6254-402c-8c8b-a5500bfa909e,56e43b33-6dc7-496b-bdee-75e5fed5d502,04024f75-7405-4ccc-8976-91d0a3035b3a,1a8d6cfd-095d-4461-875a-56d27816a0f5,27513ce4-3807-481f-abcb-c59774a90519,5d720a05-3673-45be-9163-08e0fc332f80
0,ENSG00000000003.15,TSPAN6,6.2506,27.6964,17.1757,32.0488,64.8954,15.5696,44.3385,1.2382,...,38.1891,22.6398,14.1538,196.5149,1.9903,5.9777,46.8800,24.7948,28.7472,8.4043
1,ENSG00000000005.6,TNMD,0.3370,0.5785,0.4018,14.2591,1.3858,0.4059,0.0943,0.0458,...,0.5419,3.3876,0.7384,0.8755,0.0397,0.0000,0.0000,0.1932,2.2436,0.3941
2,ENSG00000000419.13,DPM1,130.1438,272.0362,185.8704,98.4347,169.6122,114.4318,348.9431,121.2661,...,91.7763,97.4462,244.7773,79.7627,110.5433,155.6594,128.3889,145.3217,74.9313,80.4221
3,ENSG00000000457.14,SCYL3,6.8365,18.4916,25.6461,13.3406,13.7604,25.0665,10.2557,6.1544,...,17.4979,14.3165,17.6043,29.4215,7.7335,11.4106,18.4543,18.6079,9.4645,7.1218
4,ENSG00000000460.17,C1orf112,4.8492,14.1177,10.0608,7.4772,9.9658,8.4514,14.2263,5.5654,...,6.3860,9.2516,8.5800,24.1262,5.5873,5.4989,8.6608,14.3263,3.1736,2.6829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19958,ENSG00000288669.1,AC008763.4,0.0000,0.0000,0.0000,0.0000,0.0223,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19959,ENSG00000288671.1,AC006486.3,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19960,ENSG00000288674.1,AL391628.1,0.0103,0.0317,0.1345,0.0265,0.0647,0.0554,0.0724,0.0563,...,0.0762,0.0480,0.0206,0.1007,0.0914,0.0270,0.0621,0.0741,0.0282,0.0605


## Paired Normal Files

In [40]:
# Process the files related to the paired normal group
mir_seq_artifacts = process_mir_seq_files('Paired Normal')
rna_seq_artifacts = process_rna_seq_files('Paired Normal')

### miRNA-Seq Artifacts

In [41]:
# Print the sample DataFrame from the individual miRNA-Seq file processing
mir_seq_artifacts['sample_reads']

Unnamed: 0,mirna_id,genome_assembly,chromosome,position_range,strand,cross_mapped,region_type,accession_id,read_count,reads_per_million,is_mirna_of_interest
0,hsa-let-7a-1,hg38,chr9,94175961-94175981,+,N,mature,MIMAT0000062,1,0.541832,1
1,hsa-let-7a-1,hg38,chr9,94175961-94175982,+,N,mature,MIMAT0000062,2,1.083665,1
2,hsa-let-7a-1,hg38,chr9,94175961-94175983,+,N,mature,MIMAT0000062,18,9.752984,1
3,hsa-let-7a-1,hg38,chr9,94175961-94175984,+,N,mature,MIMAT0000062,19,10.294816,1
4,hsa-let-7a-1,hg38,chr9,94175961-94175985,+,N,mature,MIMAT0000062,1,0.541832,1
...,...,...,...,...,...,...,...,...,...,...,...
3545,hsa-mir-99b,hg38,chr19,51692656-51692675,+,N,mature,MIMAT0004678,12,6.501989,1
3546,hsa-mir-99b,hg38,chr19,51692656-51692676,+,N,mature,MIMAT0004678,6,3.250995,1
3547,hsa-mir-99b,hg38,chr19,51692656-51692677,+,N,mature,MIMAT0004678,6,3.250995,1
3548,hsa-mir-99b,hg38,chr19,51692656-51692678,+,N,mature,MIMAT0004678,33,17.880471,1


In [42]:
# Print the DataFrame of aggregated microRNA raw reads for this group
mir_seq_artifacts['agg_raw_reads']

Unnamed: 0,accession_id,64efe402-841b-49bc-a370-02a24880dd19,4ebd02b1-6eb1-4923-ad2f-5f4f107e6622,03589e93-5465-4e5b-91aa-be97a386e097,84d88e83-f246-45c8-a27c-5f146cb5a4b4,308d2039-4905-49de-861a-c70f97669068,7e17b2bf-91b2-4ea2-acc1-59e1537913ce,f041b9ca-ace6-4425-a928-1ffbf2737243,29cea98e-c370-473d-af6d-393e87c1bd31,7c3c4540-a63e-4737-a23d-a9c999c53f39,...,f52a43ab-8cf4-4611-a13a-195778748214,c786cbe6-bedd-4ff1-9c19-8026aaabc738,8e5ae73e-8a65-4060-874e-9c0ab8311fcb,afd9ffe6-e019-4ea3-ab5f-41a57f9db4de,85fb34fe-c751-4806-87a9-419e9ed39dfc,15a6e18f-00b4-4369-8588-294605e0e952,3ef8d2e0-14fe-438a-bbcd-d0382a4085de,485f30ed-85f1-4402-ad3e-8e1ff3a75a36,cea36b7b-8225-4b98-8371-66ee50eb0641,9b71b456-248a-4ec9-8fd3-bc0c1ea2d852
0,MIMAT0000062,112490,64753,63380,73447,83952,201136,39350,31320,76427,...,118394,234952,257696,164020,77955,201236,75632,278788,81162,106563
1,MIMAT0000063,73134,112902,112408,54023,76506,135669,81791,39234,50010,...,224907,130297,188689,74862,162405,193054,113115,164428,64849,93543
2,MIMAT0000064,17224,18826,23698,11915,18203,34636,11505,7395,14022,...,36602,28261,32688,21436,12999,34759,25432,47455,2142,24173
3,MIMAT0000065,293,394,272,482,413,551,369,116,148,...,580,410,434,838,339,648,267,439,297,366
4,MIMAT0000066,2222,1984,1908,2441,3570,3050,1523,719,1261,...,2110,5969,5633,5247,1842,3462,1817,6675,1880,3005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1626,MIMAT0031893,1,5,2,2,1,6,0,0,1,...,10,3,3,5,3,4,0,0,0,0
1627,MIMAT0032026,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,2,0
1628,MIMAT0032029,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1629,MIMAT0032110,4,3,2,7,9,11,2,4,4,...,13,0,1,5,2,6,0,2,15,22


In [43]:
# Print the DataFrame of aggregated microRNA normalized reads for this group
mir_seq_artifacts['agg_norm_reads']

Unnamed: 0,accession_id,64efe402-841b-49bc-a370-02a24880dd19,4ebd02b1-6eb1-4923-ad2f-5f4f107e6622,03589e93-5465-4e5b-91aa-be97a386e097,84d88e83-f246-45c8-a27c-5f146cb5a4b4,308d2039-4905-49de-861a-c70f97669068,7e17b2bf-91b2-4ea2-acc1-59e1537913ce,f041b9ca-ace6-4425-a928-1ffbf2737243,29cea98e-c370-473d-af6d-393e87c1bd31,7c3c4540-a63e-4737-a23d-a9c999c53f39,...,f52a43ab-8cf4-4611-a13a-195778748214,c786cbe6-bedd-4ff1-9c19-8026aaabc738,8e5ae73e-8a65-4060-874e-9c0ab8311fcb,afd9ffe6-e019-4ea3-ab5f-41a57f9db4de,85fb34fe-c751-4806-87a9-419e9ed39dfc,15a6e18f-00b4-4369-8588-294605e0e952,3ef8d2e0-14fe-438a-bbcd-d0382a4085de,485f30ed-85f1-4402-ad3e-8e1ff3a75a36,cea36b7b-8225-4b98-8371-66ee50eb0641,9b71b456-248a-4ec9-8fd3-bc0c1ea2d852
0,MIMAT0000062,60950.731706,30125.815394,27341.392529,28001.197110,46630.518467,56803.204359,20024.823655,35655.250612,42774.191206,...,36372.495345,68632.648873,75542.717038,57772.052943,29467.186554,69883.414205,45380.824626,69339.468378,55732.567177,38876.296528
1,MIMAT0000063,39626.374017,52526.752577,48491.499709,20595.921839,42494.692747,38314.543058,41622.626471,44664.690374,27989.287844,...,69094.960988,38061.511504,55313.546721,26368.317444,61389.499469,67042.043400,67871.429795,40896.129336,44530.707090,34126.342221
2,MIMAT0000064,9332.522029,8758.645939,10223.040710,4542.517238,10110.721933,9781.619335,5854.780079,8418.600839,7847.746337,...,11244.708977,8255.419359,9582.377434,7550.309275,4913.654776,12070.790488,15259.746296,11802.891345,1470.875027,8818.789975
3,MIMAT0000065,158.756907,183.305348,117.337626,183.759406,229.397800,155.608970,187.780429,132.056484,82.831726,...,178.185105,119.766528,127.225643,295.165105,128.142856,225.031568,160.205734,109.187006,203.944856,133.524064
4,MIMAT0000066,1203.951691,923.040132,823.088938,930.615577,1982.930139,861.356363,775.039552,818.522516,705.748690,...,648.225121,1743.625426,1651.295034,1848.128043,696.280645,1202.251983,1090.239030,1660.189649,1290.964076,1096.283618
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1626,MIMAT0031893,0.541832,2.326210,0.862776,0.762488,0.555443,1.694472,0.000000,0.000000,0.559674,...,3.072157,0.876340,0.879440,1.761129,1.134008,1.389084,0.000000,0.000000,0.000000,0.000000
1627,MIMAT0032026,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.292113,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.373366,0.000000
1628,MIMAT0032029,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.307216,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1629,MIMAT0032110,2.167329,1.395726,0.862776,2.668706,4.998986,3.106531,1.017780,4.553672,2.238695,...,3.993805,0.000000,0.293147,1.761128,0.756005,2.083626,0.000000,0.497436,10.300245,8.026038


### RNA-Seq Artifacts

In [44]:
# Print the sample DataFrame from the individual RNA-Seq file processing
rna_seq_artifacts['sample_reads']

Unnamed: 0,gene_id,gene_name,gene_type,unstranded,stranded_first,stranded_second,tpm_unstranded,fpkm_unstranded,fpkm_uq_unstranded,is_mrna_of_interest
0,ENSG00000000003.15,TSPAN6,protein_coding,5005,2575,2430,84.4328,22.3634,21.0739,1
1,ENSG00000000005.6,TNMD,protein_coding,235,117,118,12.1832,3.2269,3.0409,1
2,ENSG00000000419.13,DPM1,protein_coding,1762,887,875,111.7066,29.5873,27.8813,1
3,ENSG00000000457.14,SCYL3,protein_coding,2052,1528,1465,22.8129,6.0424,5.6940,1
4,ENSG00000000460.17,C1orf112,protein_coding,361,666,686,4.6271,1.2256,1.1549,1
...,...,...,...,...,...,...,...,...,...,...
60655,ENSG00000288669.1,AC008763.4,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60656,ENSG00000288670.1,AL592295.6,lncRNA,426,219,225,18.0398,4.7781,4.5026,0
60657,ENSG00000288671.1,AC006486.3,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60658,ENSG00000288674.1,AL391628.1,protein_coding,5,3,2,0.0398,0.0105,0.0099,1


In [45]:
# Print the DataFrame of aggregated messenger RNA raw reads for this group
rna_seq_artifacts['agg_raw_reads']

Unnamed: 0,gene_id,gene_name,3aad7b0b-9f82-41d0-b3e5-5614afcac6a8,0e89f9f0-419f-46e7-981d-781a1302e4be,8a84b9a5-d453-416b-b481-f15402c2eb54,38854c85-fc09-4a51-93a7-257762517583,baec6a46-7c48-41ed-a8a9-eef52d32cba3,b70b68a1-28c4-4ed3-a04c-b622e583f10b,23bf74db-bb4e-44c5-8473-e651b818e460,a9f49b86-3801-420d-915c-0e09a15c6e59,...,934935ae-037b-451d-a696-0679b07bd401,da84f488-af86-4472-bf87-063a9137e1db,2bf56d2d-8c5e-4579-847b-03fd0ba46143,b01d7dfd-b577-48b8-8bf2-3c74bb617601,542856ab-164d-4edd-833a-925da96d1eeb,1479c033-ebe7-423d-8460-bbe84fd5ffb6,76cff555-d8a3-46d1-ac93-12d856adb445,d8ac1c2a-1820-4174-99e3-524b334df490,3071e512-94ea-4820-9573-668235188e34,68881256-49b8-4a19-87a9-afac4f1841d2
0,ENSG00000000003.15,TSPAN6,5005,3603,2829,8064,3607,3919,5383,5045,...,7996,5117,5175,3838,4706,9352,3180,10613,3820,4158
1,ENSG00000000005.6,TNMD,235,760,240,1408,3612,46,319,154,...,236,108,85,1005,722,176,1319,1465,105,75
2,ENSG00000000419.13,DPM1,1762,1322,1067,2647,1246,1242,2465,1574,...,2052,1900,2406,1839,1766,1873,1090,4030,1705,1456
3,ENSG00000000457.14,SCYL3,2052,947,928,2368,670,982,1191,1651,...,1973,1200,2222,844,1607,2095,542,3392,1678,1446
4,ENSG00000000460.17,C1orf112,361,231,255,590,202,222,272,287,...,399,294,476,178,383,440,121,1082,292,299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19958,ENSG00000288669.1,AC008763.4,0,0,0,2,2,0,6,1,...,0,0,0,0,2,1,0,1,0,0
19959,ENSG00000288671.1,AC006486.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19960,ENSG00000288674.1,AL391628.1,5,9,12,10,18,4,5,9,...,11,6,8,12,11,4,8,27,3,6


In [46]:
# Print the DataFrame of aggregated messenger RNA normalized reads for this group
rna_seq_artifacts['agg_norm_reads']

Unnamed: 0,gene_id,gene_name,3aad7b0b-9f82-41d0-b3e5-5614afcac6a8,0e89f9f0-419f-46e7-981d-781a1302e4be,8a84b9a5-d453-416b-b481-f15402c2eb54,38854c85-fc09-4a51-93a7-257762517583,baec6a46-7c48-41ed-a8a9-eef52d32cba3,b70b68a1-28c4-4ed3-a04c-b622e583f10b,23bf74db-bb4e-44c5-8473-e651b818e460,a9f49b86-3801-420d-915c-0e09a15c6e59,...,934935ae-037b-451d-a696-0679b07bd401,da84f488-af86-4472-bf87-063a9137e1db,2bf56d2d-8c5e-4579-847b-03fd0ba46143,b01d7dfd-b577-48b8-8bf2-3c74bb617601,542856ab-164d-4edd-833a-925da96d1eeb,1479c033-ebe7-423d-8460-bbe84fd5ffb6,76cff555-d8a3-46d1-ac93-12d856adb445,d8ac1c2a-1820-4174-99e3-524b334df490,3071e512-94ea-4820-9573-668235188e34,68881256-49b8-4a19-87a9-afac4f1841d2
0,ENSG00000000003.15,TSPAN6,84.4328,64.4346,65.8003,86.9116,45.4672,63.3806,62.8648,88.7889,...,111.2644,88.5399,58.4397,37.8217,60.9318,131.3761,36.5183,99.3106,70.0346,72.5826
1,ENSG00000000005.6,TNMD,12.1832,41.7691,17.1551,46.6355,139.9220,2.2863,11.4488,8.3292,...,10.0921,5.7429,2.9499,30.4360,28.7287,7.5982,46.5495,42.1290,5.9160,4.0234
2,ENSG00000000419.13,DPM1,111.7066,88.8489,93.2663,107.2129,59.0249,75.4864,108.1847,104.1042,...,107.3066,123.5501,102.1078,68.1056,85.9308,98.8816,47.0410,141.7190,117.4734,95.5158
3,ENSG00000000457.14,SCYL3,22.8129,11.1609,14.2245,16.8192,5.5657,10.4662,9.1662,19.1487,...,18.0928,13.6836,16.5362,5.4812,13.7121,19.3951,4.1018,20.9174,20.2739,16.6346
4,ENSG00000000460.17,C1orf112,4.6271,3.1388,4.5064,4.8315,1.9346,2.7279,2.4135,3.8378,...,4.2185,3.8652,4.0842,1.3328,3.7678,4.6964,1.0558,7.6928,4.0675,3.9657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19958,ENSG00000288669.1,AC008763.4,0.0000,0.0000,0.0000,0.0245,0.0287,0.0000,0.0797,0.0200,...,0.0000,0.0000,0.0000,0.0000,0.0295,0.0160,0.0000,0.0106,0.0000,0.0000
19959,ENSG00000288671.1,AC006486.3,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19960,ENSG00000288674.1,AL391628.1,0.0398,0.0759,0.1316,0.0508,0.1070,0.0305,0.0275,0.0747,...,0.0722,0.0490,0.0426,0.0558,0.0672,0.0265,0.0433,0.1191,0.0259,0.0494
