# **Retrieval and Aggregation of Expressions from the TCGA-BRCA Files of Interest**

This notebook produces the AT_FD family of data artifacts 

- TCGA: The Cancer Genome Atlas  
- BRCA: Breast Invasive Carcinoma

# Import Libraries and Configurations

In [1]:
import logging
import os
import sys
from time import sleep

import numpy as np
import pandas as pd
import requests

# Configure basic logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s')

# Add project root to Python's path
sys.path.append(os.path.abspath(os.path.join('..')))

from config import (
    AGGREGATED_READS_FILES,
    BRCA_INTERIM_FILES_DIRS,
    BRCA_RAW_FILES_DIRS,
    BRCA_PROCESSED_FILES_DIRS,
    BRCA_PROCESSED_FILES_PATHS,
    GDC_API_ENDPOINTS,
)

# Expression Retrieval

## Functions

In [2]:
def gdc_api_files_download(files, dir_path, retries=5, delay=5):
    """
    Download files from the Genomic Data Commons (GDC) API and save them locally.

    Parameters:
    -----------
    files : list of dict
        A list of dictionaries where each dictionary contains metadata for a file. 
        Expected keys in each dictionary:
        - 'file_id' (str): unique identifier for the file.
        - 'experimental_strategy' (str): description of the experiment type.
        - 'data_format' (str): file format (e.g., 'TXT', 'CSV').
    dir_path : str
        The local directory where downloaded files will be saved.
    retries : int
        Number of times to retry a download on failure (default is 5).
    delay : int
        Seconds to wait between download retries (default is 5).
    """
    # Download each file contained in the list
    for file in files:
        # Define all file metadata
        file_id = file['file_id']
        file_type = file['experimental_strategy'].lower()
        file_format = file['data_format'].lower()
        file_name = f'{file_type}_{file_id}.{file_format}'
        file_path = os.path.join(dir_path, file_name)

        # Retry logic
        for attempt in range(retries):
            try:
                # Request the file donwload
                logging.info(f'Downloading file {file_id} [attempt {attempt + 1}]...')
                response = requests.get(
                    url=os.path.join(GDC_API_ENDPOINTS['data'], file_id),
                    headers={'Content-Type': 'application/json'},
                    timeout=30
                )
                response.raise_for_status()

                # Write the file in the group raw data directory
                with open(file_path, 'wb') as output_file:
                    output_file.write(response.content)

                # Exit retry loop on success
                logging.info(f'Downloaded and saved to {file_path}')
                break

            except requests.exceptions.RequestException as req_err:
                logging.error(f'Request failed: {req_err}')
            except OSError as os_err:
                logging.error(f'File write failed: {os_err}')
                break
            except Exception as e:
                logging.error(f'Unexpected error: {e}')

            if attempt < retries - 1:
                logging.info(f'Retrying in {delay} seconds...')
                sleep(delay)
            else:
                logging.warning(f'Failed to download the file after {retries} attempts')

In [3]:
def download_group_files(df_files, group):
    """
    Download all files associated with a specific group.

    Parameters:
    -----------
    df_files : DataFrame
        DataFrame containing file metadata, including 'group'.
    group : str
        Name of the group to filter files by.
    """
    # Select all files related to the group
    files = df_files \
        .query(f'group == "{group}"') \
        .to_dict(orient='records')

    # Raw directory path to write the files
    dir_base_name = (group.lower()).replace(' ', '-')
    path = BRCA_RAW_FILES_DIRS[dir_base_name]

    # Download all files of interest related to the group
    gdc_api_files_download(files, path)

## Load and Prepare the Metadata

In [4]:
# Create a DataFrame with the cases of interest
df_cases = pd.read_csv(BRCA_PROCESSED_FILES_PATHS['cases']) \
    .query('is_case_of_interest == 1')
    
# Create DataFrame with the files of interest
df_files = pd.read_csv(BRCA_PROCESSED_FILES_PATHS['files']) \
    .query('is_file_of_interest == 1')

# Create a DataFrame with files and cases of interest
df_files_with_group = df_files \
    .merge(
        right=df_cases,
        left_on='case_id',
        right_on='case_id',
        how='inner'
    )

# Characterizes the group to which the file belongs
df_files_with_group['group'] = np.where(
    df_files_with_group['is_tumor_file_of_interest'] == 1,
    df_files_with_group['pam50_mrna'],
    'Paired Normal'
)

# Drop the unnecessary columns
df_files_with_group = df_files_with_group \
    [['file_id', 'experimental_strategy', 'data_format', 'group']]

In [5]:
# Print the DataFrame that associates each file with its group
df_files_with_group

Unnamed: 0,file_id,experimental_strategy,data_format,group
0,bc98d560-25d3-4e7c-8212-00d1f5148dde,miRNA-Seq,TXT,Luminal B
1,2ebd1a3f-a299-492c-96ba-6733293d1ba4,miRNA-Seq,TXT,Luminal A
2,d77f7eb5-1943-4147-8a02-c33289a8ed75,miRNA-Seq,TXT,Luminal B
3,fb32f0b1-4fb7-43d5-8091-dc13a1f6d9e8,RNA-Seq,TSV,Luminal B
4,c7637c63-cfdc-448d-b68e-bbf08c8382be,miRNA-Seq,TXT,Luminal A
...,...,...,...,...
1079,297b8ef7-71ca-4ea0-9315-da31f7f8914e,miRNA-Seq,TXT,Luminal A
1080,f1934efb-1c8a-4c72-ba94-d263ef0d0cd4,RNA-Seq,TSV,Luminal B
1081,fd94a0ed-37d8-49a5-af96-e2160a9e6096,RNA-Seq,TSV,Paired Normal
1082,6e7cf482-6c96-41ea-877e-27761eef46ac,miRNA-Seq,TXT,Luminal A


## Basal-like Files

In [6]:
# Download the files related to Basal-like tumor tissue
download_group_files(df_files_with_group, 'Basal-like')

INFO:Downloading file d8f4fee7-bb7b-48d1-8d15-7ee33cc5f339 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/basal-like-files/mirna-seq_d8f4fee7-bb7b-48d1-8d15-7ee33cc5f339.txt
INFO:Downloading file 0ed77a2e-a2c9-4557-8ba3-2c8bade9b792 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/basal-like-files/rna-seq_0ed77a2e-a2c9-4557-8ba3-2c8bade9b792.tsv
INFO:Downloading file 4d85e3a8-263a-4f3a-8b40-0aa1f20944d3 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/basal-like-files/mirna-seq_4d85e3a8-263a-4f3a-8b40-0aa1f20944d3.txt
INFO:Downloading file d0f845cb-f147-4d1b-9759-b842c3894b95 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/basal-like-files/mirna-seq_d0f845cb-f147-4d1b-9759-b842c3894b95.txt
INFO:Downloading file d9ea5bde-aa1a-4210-976f-b2f67c6d0191 [at

## HER2-enriched Files

In [7]:
# Download the files related to HER2-enriched tumor tissue
download_group_files(df_files_with_group, 'HER2-enriched')

INFO:Downloading file a58e2ae3-a236-4209-8292-70465216cb85 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/her2-enriched-files/rna-seq_a58e2ae3-a236-4209-8292-70465216cb85.tsv
INFO:Downloading file 09a491b5-1bfb-47d9-963f-69f6c7e6c1e5 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/her2-enriched-files/rna-seq_09a491b5-1bfb-47d9-963f-69f6c7e6c1e5.tsv
INFO:Downloading file be8e6554-07ae-4ec2-8be0-8598556467c9 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/her2-enriched-files/mirna-seq_be8e6554-07ae-4ec2-8be0-8598556467c9.txt
INFO:Downloading file fb397586-1b5b-40e0-a885-8663f9c76f6d [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/her2-enriched-files/rna-seq_fb397586-1b5b-40e0-a885-8663f9c76f6d.tsv
INFO:Downloading file ed76d786-d744-4067-b288-5fe2dc06

## Luminal A Files

In [8]:
# Download the files related to Luminal A tumor tissue
download_group_files(df_files_with_group, 'Luminal A')

INFO:Downloading file 2ebd1a3f-a299-492c-96ba-6733293d1ba4 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-a-files/mirna-seq_2ebd1a3f-a299-492c-96ba-6733293d1ba4.txt
INFO:Downloading file c7637c63-cfdc-448d-b68e-bbf08c8382be [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-a-files/mirna-seq_c7637c63-cfdc-448d-b68e-bbf08c8382be.txt
INFO:Downloading file ecf9996e-0cba-4394-9f5a-51fceab1ac56 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-a-files/rna-seq_ecf9996e-0cba-4394-9f5a-51fceab1ac56.tsv
INFO:Downloading file 36b1e97e-353e-4a4d-8a27-04f7ebabc78c [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-a-files/mirna-seq_36b1e97e-353e-4a4d-8a27-04f7ebabc78c.txt
INFO:Downloading file c4afcca9-94e4-4ecd-98d1-0aab86de8685 [attemp

## Luminal B Files

In [9]:
# Download the files related to Luminal B tumor tissue
download_group_files(df_files_with_group, 'Luminal B')

INFO:Downloading file bc98d560-25d3-4e7c-8212-00d1f5148dde [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-b-files/mirna-seq_bc98d560-25d3-4e7c-8212-00d1f5148dde.txt
INFO:Downloading file d77f7eb5-1943-4147-8a02-c33289a8ed75 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-b-files/mirna-seq_d77f7eb5-1943-4147-8a02-c33289a8ed75.txt
INFO:Downloading file fb32f0b1-4fb7-43d5-8091-dc13a1f6d9e8 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-b-files/rna-seq_fb32f0b1-4fb7-43d5-8091-dc13a1f6d9e8.tsv
INFO:Downloading file cf5ab327-47ee-40eb-a06c-3a8fef2a1ebb [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-b-files/mirna-seq_cf5ab327-47ee-40eb-a06c-3a8fef2a1ebb.txt
INFO:Downloading file 50c0b143-29b8-4060-a63e-d263ae52b029 [attemp

## Paired Normal Files

In [10]:
# Download the files related to Paired Normal tissue
download_group_files(df_files_with_group, 'Paired Normal')

INFO:Downloading file 456bc30b-59f8-4427-b798-5b113ca635a0 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/paired-normal-files/rna-seq_456bc30b-59f8-4427-b798-5b113ca635a0.tsv
INFO:Downloading file 7e17b2bf-91b2-4ea2-acc1-59e1537913ce [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/paired-normal-files/mirna-seq_7e17b2bf-91b2-4ea2-acc1-59e1537913ce.txt
INFO:Downloading file c6a4afd8-8044-475f-b4fd-a1b4cb922976 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/paired-normal-files/rna-seq_c6a4afd8-8044-475f-b4fd-a1b4cb922976.tsv
INFO:Downloading file 12c76136-cc83-42c0-886b-4b4ea167a0d7 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/paired-normal-files/mirna-seq_12c76136-cc83-42c0-886b-4b4ea167a0d7.txt
INFO:Downloading file 75182885-7501-49b1-bb0d-8a88da

# Expression Aggregation

## Functions

In [11]:
def process_mir_seq_files(raw_data_dir, interim_data_dir, processed_data_dir):
    """
    Process miRNA-Seq files and aggregate expression data for microRNAs with accession ID.

    Parameters:
    ----------
    raw_data_dir : str
        Path to the directory containing raw miRNA-Seq files.
    interim_data_dir : str
        Path to the directory where aggregated outputs will be stored.
    processed_data_dir : str
        Path to the directory where processed individual files will be stored.

    Returns:
    -------
    dict
        A dictionary with keys:
        - 'mir_reads' : DataFrame of the last processed miRNA-Seq file.
        - 'mir_agg_raw_reads' : DataFrame with aggregated raw read counts.
        - 'mir_agg_normalized_reads' : DataFrame with aggregated normalized read counts.
    """
    # List the miRNA-Seq files contained in the raw data directory
    files = [f for f in os.listdir(raw_data_dir) if f.startswith('mirna-seq_')]
    
    # Initialize the DataFrames of aggregated reads
    primary_key = 'accession_id'
    df_agg_raw_mir_reads = pd.DataFrame(columns=[primary_key])
    df_agg_normalized_mir_reads = pd.DataFrame(columns=[primary_key])
    
    # Process the miRNA-Seq files individually
    for file in files:
        # Retrieve the file id
        file_id = file.replace('mirna-seq_', '')
        file_id = file_id.replace('.txt', '')
        
        # Create a DataFrame for the file
        df_mir_reads = pd.read_csv(f'{raw_data_dir}/{file}', sep='\t')
        
        # Split the isoform coordinates column in other four columns
        df_mir_reads[['genome_assembly', 'chromosome', 'position_range', 'strand']] = \
            df_mir_reads['isoform_coords'].str.split(pat=':', n=-1, expand=True)

        # Split the microRNA region column in other two columns
        df_mir_reads[['region_type', 'accession_id']] = \
            df_mir_reads['miRNA_region'].str.split(pat=',', n=-1, expand=True)
        
        # Flag the microRNAs of interest, i.e., those with isoforms associated with
        # a specific accession identifier (MIMAT ID)
        df_mir_reads['is_mirna_of_interest'] = np.where(
            df_mir_reads['accession_id'].notna(), 1, 0
        )
        
        # Rearrange the DataFrame columns
        df_mir_reads = df_mir_reads \
            .rename(columns={
                'cross-mapped': 'cross_mapped',
                'miRNA_ID': 'mirna_id',
                'reads_per_million_miRNA_mapped': 'reads_per_million',
            }) \
            [[
                'mirna_id',
                'genome_assembly',
                'chromosome',
                'position_range',
                'strand',
                'cross_mapped',
                'region_type',
                'accession_id',
                'read_count',
                'reads_per_million',
                'is_mirna_of_interest',
            ]]
        
        # Store the DataFrame of the processed file in a CSV file
        df_mir_reads.to_csv(f'{processed_data_dir}/{file}', index=False)
        
        # Aggregate reads according to microRNAs of interest accession ID
        df_agg_mir_reads = df_mir_reads \
            .query('is_mirna_of_interest == 1') \
            .groupby(primary_key) \
            .agg(
                raw_reads = pd.NamedAgg(column='read_count', aggfunc='sum'),
                normalized_reads = pd.NamedAgg(column='reads_per_million', aggfunc='sum'),
            ) \
            .reset_index()
        
        # Merge the aggregated raw reads into the respective DataFrame
        df_agg_raw_mir_reads = df_agg_mir_reads \
            .drop(columns=['normalized_reads']) \
            .rename(columns={'raw_reads': file_id}) \
            .merge(
                right=df_agg_raw_mir_reads,
                left_on=primary_key,
                right_on=primary_key,
                how='outer',
            )
            
        # Merge the aggregated normalized reads into the respective DataFrame
        df_agg_normalized_mir_reads = df_agg_mir_reads \
            .drop(columns=['raw_reads']) \
            .rename(columns={'normalized_reads': file_id}) \
            .merge(
                right=df_agg_normalized_mir_reads,
                left_on=primary_key,
                right_on=primary_key,
                how='outer',
            )
    
    # Fill aggregated NaN reads with zero
    df_agg_raw_mir_reads = df_agg_raw_mir_reads.fillna(0)
    df_agg_normalized_mir_reads = df_agg_normalized_mir_reads.fillna(0)
    
    # Convert sample columns from aggregated raw reads to integers
    sample_cols = df_agg_raw_mir_reads.select_dtypes(include=['number']).columns
    df_agg_raw_mir_reads[sample_cols] = df_agg_raw_mir_reads[sample_cols].astype(int)
    
    # Store the DataFrames of aggregated reads in a CSV file
    df_agg_raw_mir_reads.to_csv(
        os.path.join(interim_data_dir, AGGREGATED_READS_FILES['mir-raw']),
        index=False
    )
    df_agg_normalized_mir_reads.to_csv(
        os.path.join(interim_data_dir, AGGREGATED_READS_FILES['mir-normalized']),
        index=False
    )
    
    # Return a sample of the processing and the aggregated reads DataFrames
    return {
        'mir_reads': df_mir_reads,
        'mir_agg_raw_reads': df_agg_raw_mir_reads,
        'mir_agg_normalized_reads': df_agg_normalized_mir_reads,
    }

In [12]:
def process_rna_seq_files(raw_data_dir, interim_data_dir, processed_data_dir):
    """
    Process RNA-Seq files and aggregate expression data for protein-coding genes.

    Parameters:
    ----------
    raw_data_dir : str
        Path to the directory containing raw RNA-Seq files.
    interim_data_dir : str
        Path to the directory where aggregated outputs will be stored.
    processed_data_dir : str
        Path to the directory where processed individual files will be stored.

    Returns:
    -------
    dict
        A dictionary with keys:
        - 'rna_reads' : DataFrame of the last processed RNA-Seq file.
        - 'rna_agg_raw_reads' : DataFrame with aggregated raw read counts (`unstranded`).
        - 'rna_agg_normalized_reads' : DataFrame with aggregated normalized read counts (`tpm_unstranded`).
    """
    # List the RNA-Seq files contained in the raw data directory
    files = [f for f in os.listdir(raw_data_dir) if f.startswith('rna-seq_')]

    # Initialize the DataFrames of aggregated reads
    primary_key = ['gene_id', 'gene_name']
    df_agg_raw_rna_reads = pd.DataFrame(columns=primary_key)
    df_agg_normalized_rna_reads = pd.DataFrame(columns=primary_key)

    # Process the RNA-Seq files individually 
    for file in files:
        # Retrieve the file id
        file_id = file.replace('rna-seq_', '')
        file_id = file_id.replace('.tsv', '')
        
        # Create a DataFrame for the file
        df_rna_reads = pd.read_csv(f'{raw_data_dir}/{file}', sep='\t', skiprows=1)
        
        # Remove the initial count rows
        df_rna_reads = df_rna_reads.iloc[4:, :].reset_index(drop=True)
        
        # Flag the messenger RNAs of interest, i.e., protein coding genes
        df_rna_reads['is_mrna_of_interest'] = np.where(
            df_rna_reads['gene_type'] == 'protein_coding', 1, 0
        )
        
        # Store the DataFrame of the processed file in a CSV file
        df_rna_reads.to_csv(f'{processed_data_dir}/{file}', index=False)
        
        # Filter the messenger RNA reads and their unstranded related counts
        df_mrna_reads = df_rna_reads \
            .query('is_mrna_of_interest == 1') \
            [['gene_id', 'gene_name', 'unstranded', 'tpm_unstranded']]
        
        # Merge the aggregated raw reads into the respective DataFrame
        df_agg_raw_rna_reads = df_mrna_reads \
            .drop(columns=['tpm_unstranded']) \
            .rename(columns={'unstranded': file_id}) \
            .merge(
                right=df_agg_raw_rna_reads,
                left_on=primary_key,
                right_on=primary_key,
                how='outer',
            )
        
        # Merge the aggregated normalized reads into the respective DataFrame
        df_agg_normalized_rna_reads = df_mrna_reads \
            .drop(columns=['unstranded']) \
            .rename(columns={'tpm_unstranded': file_id}) \
            .merge(
                right=df_agg_normalized_rna_reads,
                left_on=primary_key,
                right_on=primary_key,
                how='outer',
            )
    
    # Store the DataFrames of aggregated reads in a CSV file
    df_agg_raw_rna_reads.to_csv(
        os.path.join(interim_data_dir, AGGREGATED_READS_FILES['rna-raw']),
        index=False
    )
    df_agg_normalized_rna_reads.to_csv(
        os.path.join(interim_data_dir, AGGREGATED_READS_FILES['rna-normalized']),
        index=False
    )
    
    # Return a sample of the processing and the aggregated reads DataFrames
    return {
        'rna_reads': df_rna_reads,
        'rna_agg_raw_reads': df_agg_raw_rna_reads,
        'rna_agg_normalized_reads': df_agg_normalized_rna_reads,
    }

In [13]:
def process_cohort_files(cohort):
    """
    Process all RNA-Seq and miRNA-Seq files for a given cohort.

    Parameters:
    ----------
    cohort : str
        Name of the cohort used to locate raw and processed data directories.

    Returns:
    -------
    dict
        A dictionary with keys:
        - 'mir_seq_files' : List of DataFrames related to miRNA-Seq processing.
        - 'rna_seq_files' : List of DataFrames related to RNA-Seq processing.
    """
    # Directory paths to read and write the files
    dir_base_name = (cohort.lower()).replace(' ', '-')
    raw_data_dir = BRCA_RAW_FILES_DIRS[dir_base_name]
    interim_data_dir = BRCA_INTERIM_FILES_DIRS[dir_base_name]
    processed_data_dir = BRCA_PROCESSED_FILES_DIRS[dir_base_name]

    # Store samples of the file processing
    processing_results = dict()
    
    # Process the cohort-related miRNA-Seq files
    processing_results['mir_seq_files'] = \
        process_mir_seq_files(raw_data_dir, interim_data_dir, processed_data_dir)
    
    # Process the cohort-related RNA-Seq files
    processing_results['rna_seq_files'] = \
        process_rna_seq_files(raw_data_dir, interim_data_dir, processed_data_dir)
    
    return processing_results

## Basal-like Files

In [14]:
# Process the files related to Basal-like tumor tissue
processing_results = process_cohort_files('Basal-like')
mir_seq_related_results = processing_results['mir_seq_files']
rna_seq_related_results = processing_results['rna_seq_files']

### miRNA-Seq DataFrames

In [15]:
# Print the sample DataFrame from the individual miRNA-Seq file processing
mir_seq_related_results['mir_reads']

Unnamed: 0,mirna_id,genome_assembly,chromosome,position_range,strand,cross_mapped,region_type,accession_id,read_count,reads_per_million,is_mirna_of_interest
0,hsa-let-7a-1,hg38,chr9,94175942-94175962,+,N,precursor,,1,0.356970,0
1,hsa-let-7a-1,hg38,chr9,94175960-94175982,+,N,mature,MIMAT0000062,3,1.070911,1
2,hsa-let-7a-1,hg38,chr9,94175960-94175983,+,N,mature,MIMAT0000062,3,1.070911,1
3,hsa-let-7a-1,hg38,chr9,94175960-94175984,+,N,mature,MIMAT0000062,3,1.070911,1
4,hsa-let-7a-1,hg38,chr9,94175961-94175982,+,N,mature,MIMAT0000062,5,1.784852,1
...,...,...,...,...,...,...,...,...,...,...,...
5716,hsa-mir-99b,hg38,chr19,51692656-51692675,+,N,mature,MIMAT0004678,47,16.777607,1
5717,hsa-mir-99b,hg38,chr19,51692656-51692676,+,N,mature,MIMAT0004678,43,15.349726,1
5718,hsa-mir-99b,hg38,chr19,51692656-51692677,+,N,mature,MIMAT0004678,38,13.564874,1
5719,hsa-mir-99b,hg38,chr19,51692656-51692678,+,N,mature,MIMAT0004678,55,19.633370,1


In [16]:
# Print the DataFrame of aggregated microRNA raw reads for this cohort
mir_seq_related_results['mir_agg_raw_reads']

Unnamed: 0,accession_id,0961f723-1c0b-4a2a-8a86-861bf407085b,5359d1b9-dfba-4f3e-b0c4-7fbf51dfb72c,f83b6eb6-444b-4fe0-9f53-3f152537500d,95434759-f95d-4e7a-bc24-c8d8a2dc6d2c,e6a87bcc-c44d-4a20-8f52-e9c4f2923246,6ede932d-8018-4760-8001-179f8bfd3861,5cc5f8c9-a224-4d70-b406-89f0d35cd711,f9b78f39-e96f-4ca4-9149-5c19c68b0d5e,22a58527-ff76-4eb6-a212-a7600e6d8963,...,0bc59ba8-158b-4309-b4d5-bb541f892fbf,3f7848d9-ee82-4648-92d8-e0188275abf4,5fab925b-4cc8-4126-8fb9-8f7655f7bde1,54f6fcc7-e68d-42dc-b1c9-90fa90b083b6,8a37c9f4-a187-4536-9c29-41a3aecba27d,8dbb8398-24b8-4204-bd68-97b22c3353a0,5a79be00-de6f-44ab-a0ac-29ec7bbd2087,118c6c30-50e6-47e2-8a1e-2f0af7b992ef,740e84ea-f42a-4e1c-99bd-39f35627be12,d2d4e49b-9070-4123-a0a1-26c1ae412f90
0,MIMAT0000062,132234,47073,33465,44783,21602,21668,32636,14840,36788,...,47314,23339,205658,29895,38464,12688,19698,28108,83339,15691
1,MIMAT0000063,123894,26174,39633,34232,28656,36921,20699,25232,33829,...,54624,39282,86614,18329,26069,9058,12509,70561,62819,23004
2,MIMAT0000064,685,5601,4763,3762,9756,2013,7859,1524,5926,...,10168,3716,5474,1095,4397,2031,4188,2662,12266,2375
3,MIMAT0000065,1269,569,700,602,502,366,1170,420,754,...,737,506,1615,200,680,194,643,409,650,242
4,MIMAT0000066,4021,1783,1682,2045,1241,297,2031,894,2217,...,1925,695,12396,2254,10772,524,1855,1149,3212,798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1914,MIMAT0031890,15,1,0,1,8,0,0,10,0,...,0,2,11,0,1,0,0,0,2,1
1915,MIMAT0031893,4,8,29,4,1,8,5,2,7,...,6,2,15,9,6,0,2,6,13,3
1916,MIMAT0032026,2,0,3,0,8,10,0,6,3,...,1,7,13,1,4,1,1,2,6,1
1917,MIMAT0032029,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Print the DataFrame of aggregated microRNA normalized reads for this cohort
mir_seq_related_results['mir_agg_normalized_reads']

Unnamed: 0,accession_id,0961f723-1c0b-4a2a-8a86-861bf407085b,5359d1b9-dfba-4f3e-b0c4-7fbf51dfb72c,f83b6eb6-444b-4fe0-9f53-3f152537500d,95434759-f95d-4e7a-bc24-c8d8a2dc6d2c,e6a87bcc-c44d-4a20-8f52-e9c4f2923246,6ede932d-8018-4760-8001-179f8bfd3861,5cc5f8c9-a224-4d70-b406-89f0d35cd711,f9b78f39-e96f-4ca4-9149-5c19c68b0d5e,22a58527-ff76-4eb6-a212-a7600e6d8963,...,0bc59ba8-158b-4309-b4d5-bb541f892fbf,3f7848d9-ee82-4648-92d8-e0188275abf4,5fab925b-4cc8-4126-8fb9-8f7655f7bde1,54f6fcc7-e68d-42dc-b1c9-90fa90b083b6,8a37c9f4-a187-4536-9c29-41a3aecba27d,8dbb8398-24b8-4204-bd68-97b22c3353a0,5a79be00-de6f-44ab-a0ac-29ec7bbd2087,118c6c30-50e6-47e2-8a1e-2f0af7b992ef,740e84ea-f42a-4e1c-99bd-39f35627be12,d2d4e49b-9070-4123-a0a1-26c1ae412f90
0,MIMAT0000062,47203.619106,34113.267388,20151.395900,39048.494316,13744.344501,19929.142465,26947.180685,16952.773275,16926.217651,...,26489.421077,19392.008713,33565.565703,26376.180175,14471.782204,15647.255996,17824.986102,22048.672127,28051.844350,15553.368251
1,MIMAT0000063,44226.486270,18967.999928,23865.539337,29848.559887,18232.475514,33958.088836,17090.933109,28824.284048,15564.777016,...,30582.029360,32638.797145,14136.322964,16171.567367,9808.259422,11170.621439,11319.562962,55349.948544,21144.827883,22802.223128
2,MIMAT0000064,244.524699,4058.980960,2868.103948,3280.272327,6207.287520,1851.456702,6489.088522,1740.972135,2726.562082,...,5692.700543,3087.566067,893.414832,966.111968,1654.337208,2504.695533,3789.777733,2088.144487,4128.726318,2354.167968
3,MIMAT0000065,452.995390,412.347825,421.514333,524.913328,319.399172,336.628491,966.055933,479.795470,346.916606,...,412.620012,420.427456,263.585122,176.458806,255.844733,239.247137,581.859378,320.830616,218.789506,239.877328
4,MIMAT0000066,1435.377833,1292.119809,1012.838725,1783.135807,789.590387,273.165742,1676.974015,1021.278929,1020.045245,...,1077.738841,577.464589,2023.158605,1988.690755,4052.881602,646.213915,1678.614540,901.306542,1081.156769,791.000438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1914,MIMAT0031890,5.354555,0.724689,0.000000,0.871949,5.090026,0.000000,0.000000,11.423701,0.000000,...,0.000000,1.661768,1.795316,0.000000,0.376242,0.000000,0.000000,0.000000,0.673198,0.991229
1915,MIMAT0031893,1.427881,5.797511,17.462737,3.487796,0.636253,7.358000,4.128444,2.284740,3.220712,...,3.359185,1.661768,2.448158,7.940646,2.257454,0.000000,1.809827,4.706562,4.375789,2.973687
1916,MIMAT0032026,0.713940,0.000000,1.806490,0.000000,5.090027,9.197500,0.000000,6.854221,1.380305,...,0.559864,5.816190,2.121738,0.882294,1.504969,1.233233,0.904913,1.568854,2.019595,0.991229
1917,MIMAT0032029,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


### RNA-Seq DataFrames

In [18]:
# Print the sample DataFrame from the individual RNA-Seq file processing
rna_seq_related_results['rna_reads']

Unnamed: 0,gene_id,gene_name,gene_type,unstranded,stranded_first,stranded_second,tpm_unstranded,fpkm_unstranded,fpkm_uq_unstranded,is_mrna_of_interest
0,ENSG00000000003.15,TSPAN6,protein_coding,2127,1041,1086,34.2301,12.8014,15.0257,1
1,ENSG00000000005.6,TNMD,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
2,ENSG00000000419.13,DPM1,protein_coding,742,336,406,44.8756,16.7826,19.6987,1
3,ENSG00000000457.14,SCYL3,protein_coding,911,903,825,9.6617,3.6133,4.2411,1
4,ENSG00000000460.17,C1orf112,protein_coding,328,604,599,4.0106,1.4999,1.7605,1
...,...,...,...,...,...,...,...,...,...,...
60655,ENSG00000288669.1,AC008763.4,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60656,ENSG00000288670.1,AL592295.6,lncRNA,270,154,129,10.9073,4.0791,4.7879,0
60657,ENSG00000288671.1,AC006486.3,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60658,ENSG00000288674.1,AL391628.1,protein_coding,1,0,1,0.0076,0.0028,0.0033,1


In [19]:
# Print the DataFrame of aggregated messenger RNA raw reads for this cohort
rna_seq_related_results['rna_agg_raw_reads']

Unnamed: 0,gene_id,gene_name,ff1f2f31-8607-4627-b487-c5e39e7c30f5,fe7dfda5-6846-4238-9ccc-472978eb78a1,1d15f05d-2ac2-4b83-987e-6f4b157b0b74,8a7fe670-bcef-4fcb-9759-9892ff5f4f61,6e112ec6-1791-4764-8a62-7ad4dfea2d3b,511ca25f-0b7f-4912-9b79-ed551721a420,c6fd0f95-d74a-49e3-9a70-fec12e63ff1a,c95a37a1-d3cd-421c-bc5f-0f8dc448f64c,...,37939d7d-074b-42b7-8d68-95a8afef4c38,f2a4b38d-bfc0-4363-9a58-b5fa2af26ff3,1f5bbb4e-9b0e-4953-a360-83fd7b6a2267,4f464ced-080f-41f1-9a00-955db0d5fda6,08b0fe4c-ee3c-4510-b75a-6d240cb038cc,59858555-bc6a-4286-8280-0f8341123cac,77f150b1-5f40-442d-91c5-6d3571513513,2cd221ee-4d29-4f40-ab4c-780cc9045c5a,5fbe4a11-de4d-418e-9bc3-75b768d4a665,c7aeba7e-b78e-4586-a4dd-8a04cc440737
0,ENSG00000000003.15,TSPAN6,2127,5469,3345,6421,2162,2300,2581,3875,...,2480,4679,3209,8658,1494,9201,6342,1065,4506,3600
1,ENSG00000000005.6,TNMD,0,9,23,140,29,7,5,5,...,11,2,69,13,78,9,5,254,1,77
2,ENSG00000000419.13,DPM1,742,2153,1792,2061,3375,1418,2766,870,...,1845,2570,2785,6747,3955,4186,1913,1757,3746,2078
3,ENSG00000000457.14,SCYL3,911,985,955,1620,2340,674,2002,649,...,949,1342,1361,2791,812,2054,640,1472,1111,2576
4,ENSG00000000460.17,C1orf112,328,1452,576,1324,1633,691,799,257,...,1024,1121,1113,2377,404,4333,665,262,1404,948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19958,ENSG00000288669.1,AC008763.4,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19959,ENSG00000288671.1,AC006486.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19960,ENSG00000288674.1,AL391628.1,1,6,7,5,13,8,5,6,...,8,14,6,12,8,11,2,9,7,17


In [20]:
# Print the DataFrame of aggregated messenger RNA normalized reads for this cohort
rna_seq_related_results['rna_agg_normalized_reads']

Unnamed: 0,gene_id,gene_name,ff1f2f31-8607-4627-b487-c5e39e7c30f5,fe7dfda5-6846-4238-9ccc-472978eb78a1,1d15f05d-2ac2-4b83-987e-6f4b157b0b74,8a7fe670-bcef-4fcb-9759-9892ff5f4f61,6e112ec6-1791-4764-8a62-7ad4dfea2d3b,511ca25f-0b7f-4912-9b79-ed551721a420,c6fd0f95-d74a-49e3-9a70-fec12e63ff1a,c95a37a1-d3cd-421c-bc5f-0f8dc448f64c,...,37939d7d-074b-42b7-8d68-95a8afef4c38,f2a4b38d-bfc0-4363-9a58-b5fa2af26ff3,1f5bbb4e-9b0e-4953-a360-83fd7b6a2267,4f464ced-080f-41f1-9a00-955db0d5fda6,08b0fe4c-ee3c-4510-b75a-6d240cb038cc,59858555-bc6a-4286-8280-0f8341123cac,77f150b1-5f40-442d-91c5-6d3571513513,2cd221ee-4d29-4f40-ab4c-780cc9045c5a,5fbe4a11-de4d-418e-9bc3-75b768d4a665,c7aeba7e-b78e-4586-a4dd-8a04cc440737
0,ENSG00000000003.15,TSPAN6,34.2301,75.6881,38.2406,79.6883,24.8666,61.8915,31.8307,33.3467,...,28.8647,60.8935,46.5228,73.3539,16.8126,120.6754,79.1792,11.9260,61.3405,65.0686
1,ENSG00000000005.6,TNMD,0.0000,0.3828,0.8081,5.3396,1.0251,0.5789,0.1895,0.1322,...,0.3935,0.0800,3.0742,0.3385,2.6975,0.3628,0.1918,8.7411,0.0418,4.2771
2,ENSG00000000419.13,DPM1,44.8756,111.9771,76.9897,96.1248,145.8815,143.3986,128.1966,28.1363,...,80.7007,125.6946,151.7356,214.8237,167.2614,206.3235,89.7565,73.9406,191.6415,141.1497
3,ENSG00000000457.14,SCYL3,9.6617,8.9836,7.1949,13.2496,17.7367,11.9525,16.2711,3.6806,...,7.2791,11.5097,13.0032,15.5833,6.0219,17.7533,5.2658,10.8630,9.9670,30.6838
4,ENSG00000000460.17,C1orf112,4.0106,15.2681,5.0032,12.4847,14.2707,14.1280,7.4869,1.6804,...,9.0555,11.0847,12.2600,15.3015,3.4543,43.1789,6.3082,2.2292,14.5218,13.0189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19958,ENSG00000288669.1,AC008763.4,0.0000,0.0157,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19959,ENSG00000288671.1,AC006486.3,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19960,ENSG00000288674.1,AL391628.1,0.0076,0.0392,0.0377,0.0293,0.0705,0.1015,0.0291,0.0243,...,0.0439,0.0859,0.0410,0.0479,0.0424,0.0680,0.0118,0.0475,0.0449,0.1449


## HER2-enriched Files

In [21]:
# Process the files related to HER2-enriched tumor tissue
processing_results = process_cohort_files('HER2-enriched')
mir_seq_related_results = processing_results['mir_seq_files']
rna_seq_related_results = processing_results['rna_seq_files']

### miRNA-Seq DataFrames

In [22]:
# Print the sample DataFrame from the individual miRNA-Seq file processing
mir_seq_related_results['mir_reads']

Unnamed: 0,mirna_id,genome_assembly,chromosome,position_range,strand,cross_mapped,region_type,accession_id,read_count,reads_per_million,is_mirna_of_interest
0,hsa-let-7a-1,hg38,chr9,94175961-94175984,+,N,mature,MIMAT0000062,6,9.691159,1
1,hsa-let-7a-1,hg38,chr9,94175962-94175981,+,N,mature,MIMAT0000062,24,38.764636,1
2,hsa-let-7a-1,hg38,chr9,94175962-94175982,+,N,mature,MIMAT0000062,568,917.429711,1
3,hsa-let-7a-1,hg38,chr9,94175962-94175983,+,N,mature,MIMAT0000062,775,1251.774693,1
4,hsa-let-7a-1,hg38,chr9,94175962-94175984,+,N,mature,MIMAT0000062,3653,5900.300587,1
...,...,...,...,...,...,...,...,...,...,...,...
3047,hsa-mir-99b,hg38,chr19,51692656-51692675,+,N,mature,MIMAT0004678,7,11.306352,1
3048,hsa-mir-99b,hg38,chr19,51692656-51692676,+,N,mature,MIMAT0004678,10,16.151932,1
3049,hsa-mir-99b,hg38,chr19,51692656-51692677,+,N,mature,MIMAT0004678,1,1.615193,1
3050,hsa-mir-99b,hg38,chr19,51692656-51692678,+,N,mature,MIMAT0004678,47,75.914078,1


In [23]:
# Print the DataFrame of aggregated microRNA raw reads for this cohort
mir_seq_related_results['mir_agg_raw_reads']

Unnamed: 0,accession_id,f3a7612e-35c3-43be-8f96-795874f8225c,3abbb570-87ad-44c5-b66d-141f63b0ca8b,dc300e50-f729-4d0e-8a33-606e95d91c95,a00a04b4-8de5-4f18-9dc2-531e21aa5167,0d003664-43ff-4eb5-b6c0-28ba7f26ee64,557f6cb1-95e3-4c8c-b253-538a415b0d07,ed76d786-d744-4067-b288-5fe2dc06a310,e4b1e8c9-5921-436d-8f6a-9af5b5baea8e,37fb952c-2a89-4903-98b9-6f6d36743218,...,3be37b85-90eb-4f47-909a-8be730172d92,3d4b6861-2763-451d-ae0a-6a5adfc591ec,475e779f-c6b7-4f7c-9616-f2c47b9b97a5,eaf6af76-2b28-4128-9371-a3d928d72370,d0982681-8799-412d-a868-81078e4a6560,a44df6fa-36a7-4500-90fd-d62baf48ef52,646eb67c-2e86-4f89-867f-78c31eff5fe0,91b36078-362b-4909-ac90-1b2d1ea4a545,aafbf8fb-290d-408e-b104-241b7d492895,ff79f645-81f1-4a3b-ae72-adc169f30143
0,MIMAT0000062,15868,28998,49811,10962,20083,81120,21828,76258,25178,...,130734,54800,15681,25558,81617,11903,18136,45392,36868,31019
1,MIMAT0000063,20715,89908,67531,6183,38497,50804,47014,32421,49145,...,95620,63216,17932,19561,60530,14327,16079,40233,26816,40579
2,MIMAT0000064,915,3128,6580,601,2147,5503,2754,4303,2739,...,3829,1070,3705,1883,7084,912,2828,4304,3829,5011
3,MIMAT0000065,192,982,512,138,121,1335,561,737,557,...,717,765,205,194,669,157,418,1269,407,593
4,MIMAT0000066,1364,1527,725,1017,919,2217,1151,3200,1074,...,3627,558,1013,881,2897,1609,441,782,1281,1689
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1749,MIMAT0031892,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1750,MIMAT0031893,4,7,4,6,1,12,8,33,2,...,14,3,3,2,14,7,11,0,1,4
1751,MIMAT0032026,0,2,1,5,1,5,3,2,2,...,1,6,2,2,4,1,2,3,2,6
1752,MIMAT0032029,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Print the DataFrame of aggregated microRNA normalized reads for this cohort
mir_seq_related_results['mir_agg_normalized_reads']

Unnamed: 0,accession_id,f3a7612e-35c3-43be-8f96-795874f8225c,3abbb570-87ad-44c5-b66d-141f63b0ca8b,dc300e50-f729-4d0e-8a33-606e95d91c95,a00a04b4-8de5-4f18-9dc2-531e21aa5167,0d003664-43ff-4eb5-b6c0-28ba7f26ee64,557f6cb1-95e3-4c8c-b253-538a415b0d07,ed76d786-d744-4067-b288-5fe2dc06a310,e4b1e8c9-5921-436d-8f6a-9af5b5baea8e,37fb952c-2a89-4903-98b9-6f6d36743218,...,3be37b85-90eb-4f47-909a-8be730172d92,3d4b6861-2763-451d-ae0a-6a5adfc591ec,475e779f-c6b7-4f7c-9616-f2c47b9b97a5,eaf6af76-2b28-4128-9371-a3d928d72370,d0982681-8799-412d-a868-81078e4a6560,a44df6fa-36a7-4500-90fd-d62baf48ef52,646eb67c-2e86-4f89-867f-78c31eff5fe0,91b36078-362b-4909-ac90-1b2d1ea4a545,aafbf8fb-290d-408e-b104-241b7d492895,ff79f645-81f1-4a3b-ae72-adc169f30143
0,MIMAT0000062,25629.884947,16161.456276,16514.012091,10485.607394,24718.694540,31004.066221,15904.149594,22754.779044,17622.409534,...,70230.723396,22059.832868,15431.883104,22471.949627,25632.674849,8118.425479,16135.331803,15115.208847,19471.623333,16840.250251
1,MIMAT0000063,33458.726161,50108.428557,22388.784620,5914.295798,47383.139152,19417.290191,34254.979335,9674.167844,34397.224425,...,51367.370171,25447.707932,17647.122499,17199.069044,19010.081339,9771.711485,14305.249233,13397.299030,14162.717028,22030.385091
2,MIMAT0000064,1477.901734,1743.328340,2181.490022,574.881413,2642.585131,2103.246751,2006.598312,1283.980883,1917.061709,...,2056.951058,430.730313,3646.140355,1655.633506,2224.804492,622.028400,2516.029904,1433.200981,2022.264449,2720.477578
3,MIMAT0000065,310.117084,547.298091,169.745117,132.002719,148.930044,510.237035,408.751507,219.914922,389.851541,...,385.174693,307.952049,201.743258,170.575093,210.106462,107.081644,371.888437,422.567854,214.954722,321.940370
4,MIMAT0000066,2203.123459,851.042959,240.361743,972.802654,1131.129825,847.337460,838.632773,954.854480,751.706562,...,1948.436013,224.623846,996.906930,774.621942,909.833232,1097.416332,392.351198,260.400365,676.552823,916.960014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1749,MIMAT0031892,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1750,MIMAT0031893,6.460772,3.901310,1.326134,5.739247,1.230827,4.586400,5.828898,9.846936,1.399826,...,7.520844,1.207656,2.952340,1.758506,4.396845,4.774341,9.786537,0.000000,0.528144,2.171604
1751,MIMAT0032026,0.000000,1.114660,0.331533,4.782707,1.230827,1.911000,2.185837,0.596784,1.399826,...,0.537203,2.415310,1.968226,1.758506,1.256242,0.682049,1.779370,0.998979,1.056289,3.257407
1752,MIMAT0032029,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


### RNA-Seq DataFrames

In [25]:
# Print the sample DataFrame from the individual RNA-Seq file processing
rna_seq_related_results['rna_reads']

Unnamed: 0,gene_id,gene_name,gene_type,unstranded,stranded_first,stranded_second,tpm_unstranded,fpkm_unstranded,fpkm_uq_unstranded,is_mrna_of_interest
0,ENSG00000000003.15,TSPAN6,protein_coding,1637,853,784,24.1689,6.4607,6.9242,1
1,ENSG00000000005.6,TNMD,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
2,ENSG00000000419.13,DPM1,protein_coding,2061,948,1113,114.3542,30.5685,32.7614,1
3,ENSG00000000457.14,SCYL3,protein_coding,1944,1516,1486,18.9147,5.0562,5.4189,1
4,ENSG00000000460.17,C1orf112,protein_coding,656,892,899,7.3589,1.9671,2.1082,1
...,...,...,...,...,...,...,...,...,...,...
60655,ENSG00000288669.1,AC008763.4,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60656,ENSG00000288670.1,AL592295.6,lncRNA,466,254,243,17.2707,4.6167,4.9479,0
60657,ENSG00000288671.1,AC006486.3,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60658,ENSG00000288674.1,AL391628.1,protein_coding,1,0,1,0.0070,0.0019,0.0020,1


In [26]:
# Print the DataFrame of aggregated messenger RNA raw reads for this cohort
rna_seq_related_results['rna_agg_raw_reads']

Unnamed: 0,gene_id,gene_name,0a7421d4-5722-427a-9643-da1074e5c25c,111b2865-e7dc-45c9-b206-20b5713714bc,09a491b5-1bfb-47d9-963f-69f6c7e6c1e5,f0a63361-78bb-4a7f-9d78-31f7b2980ba2,7b188e2a-4a56-49f9-b527-cd61a646f6c7,fd371070-c312-4eca-b3da-d41b1c8a86a7,a4589532-53e0-4025-b8d3-e11e79e1fc9c,0c279a15-1250-4923-bcde-96cc3fd32d42,...,4c8d0d1a-dbd3-4124-8458-dfb61f61f11c,be6f8b24-e668-41c6-89f7-f4cdc8533cae,5c0579f7-6184-4afa-805c-7aeec4a4c5d2,0710056c-2f04-4182-90ca-45492dd6444c,cee943a8-ea26-4b2f-b8a4-66300ccacfb5,1936440a-9cfc-4b04-af40-3fbc22fe87f6,0a74ea3f-dadc-4c9f-96ea-62a77c1e602c,788a4858-c6c6-4a8d-9aef-dbd32d74776b,d3badb09-df1e-488e-ade9-97f1925b5649,eb166054-ff70-4a86-883b-9c25a7d2b0e5
0,ENSG00000000003.15,TSPAN6,1637,735,3834,745,8421,11016,1075,2700,...,1548,585,1379,2951,1349,3606,6278,3926,1412,2825
1,ENSG00000000005.6,TNMD,0,0,0,0,0,2,0,0,...,24,6,0,5,0,2,18,8,0,19
2,ENSG00000000419.13,DPM1,2061,1329,3161,1537,3904,3511,2191,2527,...,3668,793,2176,4550,878,1816,2633,1212,3381,9021
3,ENSG00000000457.14,SCYL3,1944,755,1411,1798,2519,2828,2598,1093,...,1009,620,723,3296,378,2893,1425,856,1237,3057
4,ENSG00000000460.17,C1orf112,656,597,637,497,1186,3257,381,575,...,650,184,383,1247,243,447,480,477,1087,530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19958,ENSG00000288669.1,AC008763.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
19959,ENSG00000288671.1,AC006486.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19960,ENSG00000288674.1,AL391628.1,1,3,0,5,7,2,4,3,...,12,0,0,31,0,0,9,4,6,10


In [27]:
# Print the DataFrame of aggregated messenger RNA normalized reads for this cohort
rna_seq_related_results['rna_agg_normalized_reads']

Unnamed: 0,gene_id,gene_name,0a7421d4-5722-427a-9643-da1074e5c25c,111b2865-e7dc-45c9-b206-20b5713714bc,09a491b5-1bfb-47d9-963f-69f6c7e6c1e5,f0a63361-78bb-4a7f-9d78-31f7b2980ba2,7b188e2a-4a56-49f9-b527-cd61a646f6c7,fd371070-c312-4eca-b3da-d41b1c8a86a7,a4589532-53e0-4025-b8d3-e11e79e1fc9c,0c279a15-1250-4923-bcde-96cc3fd32d42,...,4c8d0d1a-dbd3-4124-8458-dfb61f61f11c,be6f8b24-e668-41c6-89f7-f4cdc8533cae,5c0579f7-6184-4afa-805c-7aeec4a4c5d2,0710056c-2f04-4182-90ca-45492dd6444c,cee943a8-ea26-4b2f-b8a4-66300ccacfb5,1936440a-9cfc-4b04-af40-3fbc22fe87f6,0a74ea3f-dadc-4c9f-96ea-62a77c1e602c,788a4858-c6c6-4a8d-9aef-dbd32d74776b,d3badb09-df1e-488e-ade9-97f1925b5649,eb166054-ff70-4a86-883b-9c25a7d2b0e5
0,ENSG00000000003.15,TSPAN6,24.1689,23.5594,52.3390,14.5023,79.1163,127.9853,14.4614,58.5780,...,22.3424,17.3579,11.2357,35.1888,37.8382,55.1557,90.3164,53.7060,21.0121,37.1006
1,ENSG00000000005.6,TNMD,0.0000,0.0000,0.0000,0.0000,0.0000,0.0714,0.0000,0.0000,...,1.0645,0.5471,0.0000,0.1832,0.0000,0.0940,0.7958,0.3363,0.0000,0.7668
2,ENSG00000000419.13,DPM1,114.3542,160.0912,162.1673,112.4400,137.8408,153.2966,110.7670,206.0353,...,198.9549,88.4263,66.6285,203.8975,92.5506,104.3869,142.3515,62.3076,189.0799,445.2281
3,ENSG00000000457.14,SCYL3,18.9147,15.9485,12.6939,23.0657,15.5964,21.6526,23.0323,15.6274,...,9.5972,12.1235,3.8821,25.9010,6.9872,29.1614,13.5100,7.7169,12.1311,26.4577
4,ENSG00000000460.17,C1orf112,7.3589,14.5395,6.6071,7.3508,8.4661,28.7510,3.8943,9.4784,...,7.1281,4.1482,2.3710,11.2980,5.1787,5.1948,5.2467,4.9578,12.2903,5.2886
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19958,ENSG00000288669.1,AC008763.4,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0164,0.0000,0.0000,0.0000
19959,ENSG00000288671.1,AC006486.3,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19960,ENSG00000288674.1,AL391628.1,0.0070,0.0453,0.0000,0.0459,0.0310,0.0110,0.0254,0.0307,...,0.0817,0.0000,0.0000,0.1743,0.0000,0.0000,0.0611,0.0258,0.0421,0.0619


## Luminal A Files

In [28]:
# Process the files related to Luminal A tumor tissue
processing_results = process_cohort_files('Luminal A')
mir_seq_related_results = processing_results['mir_seq_files']
rna_seq_related_results = processing_results['rna_seq_files']

### miRNA-Seq DataFrames

In [29]:
# Print the sample DataFrame from the individual miRNA-Seq file processing
mir_seq_related_results['mir_reads']

Unnamed: 0,mirna_id,genome_assembly,chromosome,position_range,strand,cross_mapped,region_type,accession_id,read_count,reads_per_million,is_mirna_of_interest
0,hsa-let-7a-1,hg38,chr9,94175960-94175982,+,N,mature,MIMAT0000062,1,0.792750,1
1,hsa-let-7a-1,hg38,chr9,94175960-94175984,+,N,mature,MIMAT0000062,1,0.792750,1
2,hsa-let-7a-1,hg38,chr9,94175961-94175980,+,N,mature,MIMAT0000062,1,0.792750,1
3,hsa-let-7a-1,hg38,chr9,94175961-94175982,+,N,mature,MIMAT0000062,1,0.792750,1
4,hsa-let-7a-1,hg38,chr9,94175961-94175983,+,N,mature,MIMAT0000062,1,0.792750,1
...,...,...,...,...,...,...,...,...,...,...,...
3243,hsa-mir-99b,hg38,chr19,51692656-51692675,+,N,mature,MIMAT0004678,13,10.305756,1
3244,hsa-mir-99b,hg38,chr19,51692656-51692676,+,N,mature,MIMAT0004678,9,7.134754,1
3245,hsa-mir-99b,hg38,chr19,51692656-51692677,+,N,mature,MIMAT0004678,8,6.342004,1
3246,hsa-mir-99b,hg38,chr19,51692656-51692678,+,N,mature,MIMAT0004678,26,20.611512,1


In [30]:
# Print the DataFrame of aggregated microRNA raw reads for this cohort
mir_seq_related_results['mir_agg_raw_reads']

Unnamed: 0,accession_id,cfe94e8f-e277-45cd-9318-877c37903437,87ece8e3-9cff-4ac2-812c-084dfa22717d,d19d100c-9a44-432c-9c16-5783aa4a07b1,4b1e2d17-a537-461c-bbcc-d33f30015938,aea5c686-fcb7-4ca4-9fba-3416782f09b9,7a4675e7-4174-4561-ae9c-b7636fbea718,3a6a7205-8a41-4691-aba5-f1e252dece0e,539908ab-5742-465f-91a1-819917da850e,f81bad1a-e3a0-4ef0-a8ca-0e800a28bf8e,...,96c28baa-65ae-4b13-bd2a-9085ca353f94,da324620-e93a-4cb4-a195-e15702ddddbf,d52be4fa-bd0b-44cf-95ad-30ac7c049afd,297b8ef7-71ca-4ea0-9315-da31f7f8914e,8b2a6497-fe35-4377-b3e4-ff8aa1544521,a1ef8fb3-63d5-45e0-acf5-5a4a52a6a28a,b8e351ec-ce57-4cc1-a87a-1168c9f20972,34ada12d-9f97-4bd1-9185-ddd7431c7074,c0eda172-ffc4-4edd-888d-eb7e09d3d9e7,33cba3fa-57ce-4e2a-89d8-767f1f74b644
0,MIMAT0000062,30350,280078,60148,68096,71727,101623,13481,33405,140687,...,46942,34432,50148,43475,34056,56036,30612,37563,56115,79583
1,MIMAT0000063,57654,374426,58401,67105,303174,82306,15372,65094,159348,...,16010,58362,89660,45452,69684,84267,68636,14201,77992,165926
2,MIMAT0000064,8914,2636,4870,500,7946,1900,1767,1782,7288,...,1361,3174,3309,6773,3175,5823,6209,1936,4104,2408
3,MIMAT0000065,126,837,255,639,248,368,251,302,718,...,757,763,425,225,434,213,290,202,717,434
4,MIMAT0000066,1089,3703,1123,1666,1107,2425,1470,557,4110,...,1725,1923,1887,875,1237,2897,977,1317,1793,1431
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1988,MIMAT0031893,3,6,1,3,4,3,2,4,8,...,3,3,1,4,5,0,9,0,10,1
1989,MIMAT0032026,0,0,0,0,0,2,1,0,2,...,1,2,0,0,1,0,0,1,1,1
1990,MIMAT0032029,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1991,MIMAT0032110,0,14,4,5,3,12,11,22,7,...,4,22,3,4,3,0,2,2,3,32


In [31]:
# Print the DataFrame of aggregated microRNA normalized reads for this cohort
mir_seq_related_results['mir_agg_normalized_reads']

Unnamed: 0,accession_id,cfe94e8f-e277-45cd-9318-877c37903437,87ece8e3-9cff-4ac2-812c-084dfa22717d,d19d100c-9a44-432c-9c16-5783aa4a07b1,4b1e2d17-a537-461c-bbcc-d33f30015938,aea5c686-fcb7-4ca4-9fba-3416782f09b9,7a4675e7-4174-4561-ae9c-b7636fbea718,3a6a7205-8a41-4691-aba5-f1e252dece0e,539908ab-5742-465f-91a1-819917da850e,f81bad1a-e3a0-4ef0-a8ca-0e800a28bf8e,...,96c28baa-65ae-4b13-bd2a-9085ca353f94,da324620-e93a-4cb4-a195-e15702ddddbf,d52be4fa-bd0b-44cf-95ad-30ac7c049afd,297b8ef7-71ca-4ea0-9315-da31f7f8914e,8b2a6497-fe35-4377-b3e4-ff8aa1544521,a1ef8fb3-63d5-45e0-acf5-5a4a52a6a28a,b8e351ec-ce57-4cc1-a87a-1168c9f20972,34ada12d-9f97-4bd1-9185-ddd7431c7074,c0eda172-ffc4-4edd-888d-eb7e09d3d9e7,33cba3fa-57ce-4e2a-89d8-767f1f74b644
0,MIMAT0000062,24059.976318,57000.303035,28573.681690,23706.195791,36194.882242,56591.784919,13764.296764,21361.058484,39324.759149,...,36350.626887,20491.749621,22189.076166,31573.657225,18674.676993,20722.054665,20391.971906,32126.545902,20827.031527,23455.337957
1,MIMAT0000063,45705.234766,76201.613351,27743.758473,23361.199901,152987.678692,45834.539907,15695.035218,41624.808885,44540.872456,...,12397.714977,34733.372771,39672.022199,33009.450675,38211.363389,31161.849174,45721.396313,12145.703972,28946.660305,48903.037144
2,MIMAT0000064,7066.577560,536.467697,2313.523805,174.064525,4009.710907,1058.071415,1804.132659,1139.512235,2037.138073,...,1053.921929,1888.964142,1464.139208,4918.881667,1741.017717,2153.339359,4136.082373,1655.804724,1523.195889,709.705010
3,MIMAT0000065,99.886557,170.342742,121.139336,222.454464,125.145770,204.931728,256.274645,193.115990,200.694992,...,586.200512,454.089366,188.050521,163.405933,237.984784,78.767180,193.181492,172.764748,266.113900,127.911950
4,MIMAT0000066,863.305245,753.619072,533.488137,579.982996,558.614393,1350.433252,1500.891348,356.177507,1148.825122,...,1335.793775,1144.448029,834.944300,635.467512,678.311470,1071.307593,650.821788,1126.391953,665.470329,421.755760
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1988,MIMAT0031893,2.378251,1.221095,0.475056,1.044387,2.018480,1.670640,2.042030,2.557828,2.236154,...,2.323119,1.785411,0.442472,2.904996,2.741760,0.000000,5.995287,0.000000,3.711490,0.294728
1989,MIMAT0032026,0.000000,0.000000,0.000000,0.000000,0.000000,1.113760,1.021015,0.000000,0.559038,...,0.774373,1.190274,0.000000,0.000000,0.548352,0.000000,0.000000,0.855271,0.371149,0.294728
1990,MIMAT0032029,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1991,MIMAT0032110,0.000000,2.849222,1.900225,1.740645,1.513860,6.682557,11.231162,14.068053,1.956636,...,3.097492,13.093010,1.327416,2.904995,1.645056,0.000000,1.332286,1.710542,1.113447,9.431296


### RNA-Seq DataFrames

In [32]:
# Print the sample DataFrame from the individual RNA-Seq file processing
rna_seq_related_results['rna_reads']

Unnamed: 0,gene_id,gene_name,gene_type,unstranded,stranded_first,stranded_second,tpm_unstranded,fpkm_unstranded,fpkm_uq_unstranded,is_mrna_of_interest
0,ENSG00000000003.15,TSPAN6,protein_coding,2610,1334,1276,41.5975,11.6202,12.6096,1
1,ENSG00000000005.6,TNMD,protein_coding,1,0,1,0.0490,0.0137,0.0148,1
2,ENSG00000000419.13,DPM1,protein_coding,1406,744,662,84.2128,23.5247,25.5278,1
3,ENSG00000000457.14,SCYL3,protein_coding,880,687,667,9.2428,2.5820,2.8018,1
4,ENSG00000000460.17,C1orf112,protein_coding,243,387,383,2.9426,0.8220,0.8920,1
...,...,...,...,...,...,...,...,...,...,...
60655,ENSG00000288669.1,AC008763.4,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60656,ENSG00000288670.1,AL592295.6,lncRNA,164,93,83,6.5612,1.8329,1.9889,0
60657,ENSG00000288671.1,AC006486.3,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60658,ENSG00000288674.1,AL391628.1,protein_coding,0,0,0,0.0000,0.0000,0.0000,1


In [33]:
# Print the DataFrame of aggregated messenger RNA raw reads for this cohort
rna_seq_related_results['rna_agg_raw_reads']

Unnamed: 0,gene_id,gene_name,8f79405a-c78e-4a9e-a673-969658a0f90b,1a4285b8-765b-4954-b9f2-dc8609572889,987d401a-3e68-414e-8067-afe277b02fad,196aed5b-6812-47c5-ad7b-4dd1743ed0b5,892928f7-8239-4d15-812b-046f9192d7de,c64a0adb-56a8-4966-9dd9-1acd789c1bb3,65bc2a34-2355-4193-b20e-439f2d7f6df2,03891509-3109-450d-8564-77b024a6128e,...,1dacccf9-58ae-490f-9f3f-31c82f93b3b8,5b6828f2-4da6-49a4-a19c-569e089ff0b0,4c75e72b-db9f-425b-9f14-946ee801071e,072cae2e-f4c1-4a3f-85e1-6f7e35a65108,d914654b-ec4f-4be5-a525-2997a4b39279,81f86e31-55db-4483-b6e0-55451ffbd1db,03d18286-1038-4c0c-9c05-d1269e280250,bc0a4326-63d2-4ec4-9af6-7b421aa8aa49,8d87fbd5-8ca7-4ff6-b8b1-79f098dbca9f,e5b0c2dc-c652-40a0-bb80-d7e87830b406
0,ENSG00000000003.15,TSPAN6,2610,2436,5269,2177,4278,1847,3324,655,...,1004,3100,787,3498,1134,1890,1825,1236,2969,2898
1,ENSG00000000005.6,TNMD,1,59,57,33,4,86,32,41,...,101,7,114,1,20,5,63,4,760,27
2,ENSG00000000419.13,DPM1,1406,1254,1896,1502,2609,1124,1658,2275,...,2133,1569,1989,2713,1312,6022,1737,2412,1581,1312
3,ENSG00000000457.14,SCYL3,880,2138,1262,1027,1005,718,961,3068,...,2901,2404,2530,3231,964,1725,1596,1480,1535,2001
4,ENSG00000000460.17,C1orf112,243,931,334,457,495,298,702,800,...,1361,410,764,905,467,728,483,328,423,767
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19958,ENSG00000288669.1,AC008763.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
19959,ENSG00000288671.1,AC006486.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19960,ENSG00000288674.1,AL391628.1,0,21,3,5,5,6,9,8,...,9,24,13,3,8,7,7,8,11,10


In [34]:
# Print the DataFrame of aggregated messenger RNA normalized reads for this cohort
rna_seq_related_results['rna_agg_normalized_reads']

Unnamed: 0,gene_id,gene_name,8f79405a-c78e-4a9e-a673-969658a0f90b,1a4285b8-765b-4954-b9f2-dc8609572889,987d401a-3e68-414e-8067-afe277b02fad,196aed5b-6812-47c5-ad7b-4dd1743ed0b5,892928f7-8239-4d15-812b-046f9192d7de,c64a0adb-56a8-4966-9dd9-1acd789c1bb3,65bc2a34-2355-4193-b20e-439f2d7f6df2,03891509-3109-450d-8564-77b024a6128e,...,1dacccf9-58ae-490f-9f3f-31c82f93b3b8,5b6828f2-4da6-49a4-a19c-569e089ff0b0,4c75e72b-db9f-425b-9f14-946ee801071e,072cae2e-f4c1-4a3f-85e1-6f7e35a65108,d914654b-ec4f-4be5-a525-2997a4b39279,81f86e31-55db-4483-b6e0-55451ffbd1db,03d18286-1038-4c0c-9c05-d1269e280250,bc0a4326-63d2-4ec4-9af6-7b421aa8aa49,8d87fbd5-8ca7-4ff6-b8b1-79f098dbca9f,e5b0c2dc-c652-40a0-bb80-d7e87830b406
0,ENSG00000000003.15,TSPAN6,41.5975,31.8464,79.8514,37.7646,59.6945,36.0883,51.1375,8.4592,...,10.9123,31.4603,9.2578,47.8861,19.7839,19.5323,22.9096,17.6235,45.6299,50.2267
1,ENSG00000000005.6,TNMD,0.0490,2.3704,2.6547,1.7592,0.1715,5.1640,1.5129,1.6273,...,3.3736,0.2183,4.1212,0.0421,1.0723,0.1588,2.4304,0.1753,35.8955,1.4381
2,ENSG00000000419.13,DPM1,84.2128,61.6094,107.9837,97.9179,136.8148,82.5337,95.8581,110.4165,...,87.1243,59.8398,87.9295,139.5742,86.0198,233.8833,81.9446,129.2460,91.3139,85.4547
3,ENSG00000000457.14,SCYL3,9.2428,18.4199,12.6040,11.7406,9.2418,9.2453,9.7431,26.1118,...,20.7791,16.0780,19.6133,29.1489,11.0833,11.7484,13.2033,13.9069,15.5469,22.8549
4,ENSG00000000460.17,C1orf112,2.9426,9.2477,3.8459,6.0234,5.2480,4.4240,8.2057,7.8501,...,11.2393,3.1614,6.8285,9.4132,6.1903,5.7164,4.6068,3.5534,4.9394,10.1002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19958,ENSG00000288669.1,AC008763.4,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0143,0.0000,0.0000,0.0000
19959,ENSG00000288671.1,AC006486.3,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19960,ENSG00000288674.1,AL391628.1,0.0000,0.1294,0.0214,0.0409,0.0329,0.0553,0.0653,0.0487,...,0.0461,0.1148,0.0721,0.0194,0.0658,0.0341,0.0414,0.0538,0.0797,0.0817


## Luminal B Files

In [35]:
# Process the files related to Luminal B tumor tissue
processing_results = process_cohort_files('Luminal B')
mir_seq_related_results = processing_results['mir_seq_files']
rna_seq_related_results = processing_results['rna_seq_files']

### miRNA-Seq DataFrames

In [36]:
# Print the sample DataFrame from the individual miRNA-Seq file processing
mir_seq_related_results['mir_reads']

Unnamed: 0,mirna_id,genome_assembly,chromosome,position_range,strand,cross_mapped,region_type,accession_id,read_count,reads_per_million,is_mirna_of_interest
0,hsa-let-7a-1,hg38,chr9,94175959-94175984,+,N,mature,MIMAT0000062,1,0.835607,1
1,hsa-let-7a-1,hg38,chr9,94175960-94175984,+,N,mature,MIMAT0000062,1,0.835607,1
2,hsa-let-7a-1,hg38,chr9,94175961-94175982,+,N,mature,MIMAT0000062,3,2.506821,1
3,hsa-let-7a-1,hg38,chr9,94175961-94175983,+,N,mature,MIMAT0000062,2,1.671214,1
4,hsa-let-7a-1,hg38,chr9,94175961-94175984,+,N,mature,MIMAT0000062,6,5.013641,1
...,...,...,...,...,...,...,...,...,...,...,...
4144,hsa-mir-99b,hg38,chr19,51692656-51692676,+,N,mature,MIMAT0004678,8,6.684855,1
4145,hsa-mir-99b,hg38,chr19,51692656-51692677,+,N,mature,MIMAT0004678,6,5.013641,1
4146,hsa-mir-99b,hg38,chr19,51692656-51692678,+,N,mature,MIMAT0004678,24,20.054565,1
4147,hsa-mir-99b,hg38,chr19,51692656-51692679,+,N,mature,MIMAT0004678,3,2.506821,1


In [37]:
# Print the DataFrame of aggregated microRNA raw reads for this cohort
mir_seq_related_results['mir_agg_raw_reads']

Unnamed: 0,accession_id,7262f244-82d5-4d3c-8762-782c5381a6e5,ed716c0b-4b09-4902-86b8-6955488d60c3,6a813732-f636-4369-8a31-04ef69340706,d7e60e41-5da6-49df-a8e4-c732699a6f61,ce2f555b-bfd9-4a8a-84b1-54feaf484cd2,f5b1694d-7bd1-43be-8ec6-2bcaad95d802,42f12ac2-0c43-4fea-bf8f-edc15cd33d47,0397db08-993e-4a47-802e-36f88161ec6f,7b9a5582-5ce8-45b7-a699-4b0d6e5559e2,...,571f1dd9-cf46-472e-96b4-0835b2d711a8,cf5ab327-47ee-40eb-a06c-3a8fef2a1ebb,87ca2702-d747-4c73-92ff-a8b7f30fb306,50c0b143-29b8-4060-a63e-d263ae52b029,cb0cbce4-ed6f-4819-a83d-00f7bfe400ac,f88848be-c988-4cef-805f-193477175d27,15a656bc-0f5f-4f06-80ad-0584bf873805,1c5f84e6-1571-4cb9-b7b9-bfa6b13e121e,4941a6e3-84e3-4afa-8ce4-a6bd1420a438,e841eb85-0fa6-43bc-8849-716e5fac921c
0,MIMAT0000062,35520,52391,12389,26478,30912,42844,10070,44201,121623,...,21301,35814,55125,49487,18098,41477,25753,36546,141376,23366
1,MIMAT0000063,77667,30743,11315,28568,25114,65036,13291,108900,104719,...,23921,66517,25669,15679,15973,55674,27100,58168,73359,29828
2,MIMAT0000064,1087,3026,1659,1546,626,1462,490,4832,10316,...,2234,7658,966,2129,517,5782,1403,839,2501,814
3,MIMAT0000065,322,246,435,304,252,507,96,213,545,...,250,183,291,147,186,328,287,251,635,370
4,MIMAT0000066,1654,2604,1680,2146,1628,645,1591,1332,3567,...,1643,1575,1549,1163,1129,3713,1064,950,5675,667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1863,MIMAT0031892,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1864,MIMAT0031893,4,1,10,5,1,4,4,1,6,...,3,5,2,0,0,2,1,3,0,2
1865,MIMAT0032026,3,1,2,4,0,2,1,2,0,...,1,0,1,0,0,0,0,0,5,8
1866,MIMAT0032029,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Print the DataFrame of aggregated microRNA normalized reads for this cohort
mir_seq_related_results['mir_agg_normalized_reads']

Unnamed: 0,accession_id,7262f244-82d5-4d3c-8762-782c5381a6e5,ed716c0b-4b09-4902-86b8-6955488d60c3,6a813732-f636-4369-8a31-04ef69340706,d7e60e41-5da6-49df-a8e4-c732699a6f61,ce2f555b-bfd9-4a8a-84b1-54feaf484cd2,f5b1694d-7bd1-43be-8ec6-2bcaad95d802,42f12ac2-0c43-4fea-bf8f-edc15cd33d47,0397db08-993e-4a47-802e-36f88161ec6f,7b9a5582-5ce8-45b7-a699-4b0d6e5559e2,...,571f1dd9-cf46-472e-96b4-0835b2d711a8,cf5ab327-47ee-40eb-a06c-3a8fef2a1ebb,87ca2702-d747-4c73-92ff-a8b7f30fb306,50c0b143-29b8-4060-a63e-d263ae52b029,cb0cbce4-ed6f-4819-a83d-00f7bfe400ac,f88848be-c988-4cef-805f-193477175d27,15a656bc-0f5f-4f06-80ad-0584bf873805,1c5f84e6-1571-4cb9-b7b9-bfa6b13e121e,4941a6e3-84e3-4afa-8ce4-a6bd1420a438,e841eb85-0fa6-43bc-8849-716e5fac921c
0,MIMAT0000062,29680.756395,36552.149485,9129.213494,21808.639436,24950.421218,34548.018792,13442.672413,24241.084739,30808.798409,...,24870.515929,23405.609655,28555.385023,29811.876795,15005.886935,24087.114784,21406.478716,25277.949547,80276.552538,17816.010013
1,MIMAT0000063,64899.079582,21448.774247,8337.803743,23530.070685,20270.602945,52442.931342,17742.458689,59723.855320,26526.779967,...,27929.562532,43471.015183,13296.837698,9445.317282,13243.951379,32331.799035,22526.135719,40233.343432,41654.931654,22743.128759
2,MIMAT0000064,908.304680,2111.179485,1222.484880,1273.364927,505.271860,1178.909614,654.112162,2650.006142,2613.186361,...,2608.362639,5004.751183,500.399129,1282.548661,428.668556,3357.805475,1166.205478,580.315210,1420.125466,620.655318
3,MIMAT0000065,269.065417,171.629265,320.543051,250.390000,203.400175,408.828436,128.152588,116.815250,138.056086,...,291.893761,119.596430,150.741356,88.555496,154.221183,190.480835,238.560920,173.610390,360.567640,282.116055
4,MIMAT0000066,1382.093783,1816.758551,1237.959373,1767.555713,1314.029689,520.107185,2123.862146,730.506659,903.570740,...,1918.325791,1029.313541,802.399846,700.612538,936.105998,2156.266296,884.420975,657.091120,3222.395849,508.571372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1863,MIMAT0031892,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1864,MIMAT0031893,3.342428,0.697680,7.368806,4.118256,0.807144,3.225471,5.339691,0.548428,1.519884,...,3.502725,3.267661,1.036024,0.000000,0.000000,1.161468,0.831223,2.075025,0.000000,1.524952
1865,MIMAT0032026,2.506821,0.697680,1.473762,3.294606,0.000000,1.612735,1.334923,1.096857,0.000000,...,1.167575,0.000000,0.518012,0.000000,0.000000,0.000000,0.000000,0.000000,2.839115,6.099807
1866,MIMAT0032029,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


### RNA-Seq DataFrames

In [39]:
# Print the sample DataFrame from the individual RNA-Seq file processing
rna_seq_related_results['rna_reads']

Unnamed: 0,gene_id,gene_name,gene_type,unstranded,stranded_first,stranded_second,tpm_unstranded,fpkm_unstranded,fpkm_uq_unstranded,is_mrna_of_interest
0,ENSG00000000003.15,TSPAN6,protein_coding,3092,1541,1551,51.3913,16.1524,15.3086,1
1,ENSG00000000005.6,TNMD,protein_coding,4,1,3,0.2043,0.0642,0.0609,1
2,ENSG00000000419.13,DPM1,protein_coding,3044,1555,1489,190.1345,59.7596,56.6377,1
3,ENSG00000000457.14,SCYL3,protein_coding,1284,1041,987,14.0641,4.4204,4.1894,1
4,ENSG00000000460.17,C1orf112,protein_coding,712,738,751,8.9914,2.8260,2.6784,1
...,...,...,...,...,...,...,...,...,...,...
60655,ENSG00000288669.1,AC008763.4,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60656,ENSG00000288670.1,AL592295.6,lncRNA,610,282,346,25.4504,7.9991,7.5812,0
60657,ENSG00000288671.1,AC006486.3,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60658,ENSG00000288674.1,AL391628.1,protein_coding,9,5,4,0.0705,0.0222,0.0210,1


In [40]:
# Print the DataFrame of aggregated messenger RNA raw reads for this cohort
rna_seq_related_results['rna_agg_raw_reads']

Unnamed: 0,gene_id,gene_name,3eadce70-ba3d-4806-b112-6ac83bc89dc4,4c60d168-b545-46a8-8c37-81972a537a83,0a688b9b-06ea-4b83-bfce-9cee0d866a0c,f502949f-ca10-4617-a7ee-b8f142f7ec93,d393af6b-de4b-4ca6-bc06-a29068dfe086,c1778b3c-5649-4e26-b937-fd00ffd58387,318c3a49-64bc-4704-8739-d4cf31cef51f,da333184-1ddd-4cd7-9c0d-e4526a03a1c9,...,16ffdede-3d07-431a-a4ed-318c7a66c096,84ab5edd-38bf-4987-b9af-fa4d1cbdef2c,a127fb1e-aa3b-4b6d-98b5-c141ffba9ae7,ee89e01e-d54e-4166-a3ae-db9357639523,a8ab2eb8-9da3-494f-b740-dc3f4185acb2,5a827399-307b-412b-a2ce-f6c81d2750a6,76267112-851f-4f5b-af40-f3c90af1b2ce,e4086c26-d200-4e42-8249-ed8cbeec0951,2c3000b7-4db9-4f00-a82a-ca6802806631,dc8b305a-6f43-410e-9ce6-96ac24827550
0,ENSG00000000003.15,TSPAN6,3092,3513,2519,1340,146,3427,83,1620,...,4013,2343,1775,3148,1721,2947,1355,4271,2304,3372
1,ENSG00000000005.6,TNMD,4,2,0,13,0,12,1,7,...,12,2,7,12,78,13,1,92,7,24
2,ENSG00000000419.13,DPM1,3044,7502,1233,1757,1360,3875,2163,2535,...,3186,1699,1819,2303,2338,3136,2185,2343,4979,2335
3,ENSG00000000457.14,SCYL3,1284,2786,1522,1132,921,2473,626,2585,...,1463,1092,1789,1860,2104,1295,1005,3467,1544,1675
4,ENSG00000000460.17,C1orf112,712,908,455,598,687,1156,491,1144,...,1641,664,911,637,693,860,559,2193,1723,1693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19958,ENSG00000288669.1,AC008763.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19959,ENSG00000288671.1,AC006486.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19960,ENSG00000288674.1,AL391628.1,9,8,0,8,1,6,8,23,...,1,8,9,1,11,7,11,15,7,8


In [41]:
# Print the DataFrame of aggregated messenger RNA normalized reads for this cohort
rna_seq_related_results['rna_agg_normalized_reads']

Unnamed: 0,gene_id,gene_name,3eadce70-ba3d-4806-b112-6ac83bc89dc4,4c60d168-b545-46a8-8c37-81972a537a83,0a688b9b-06ea-4b83-bfce-9cee0d866a0c,f502949f-ca10-4617-a7ee-b8f142f7ec93,d393af6b-de4b-4ca6-bc06-a29068dfe086,c1778b3c-5649-4e26-b937-fd00ffd58387,318c3a49-64bc-4704-8739-d4cf31cef51f,da333184-1ddd-4cd7-9c0d-e4526a03a1c9,...,16ffdede-3d07-431a-a4ed-318c7a66c096,84ab5edd-38bf-4987-b9af-fa4d1cbdef2c,a127fb1e-aa3b-4b6d-98b5-c141ffba9ae7,ee89e01e-d54e-4166-a3ae-db9357639523,a8ab2eb8-9da3-494f-b740-dc3f4185acb2,5a827399-307b-412b-a2ce-f6c81d2750a6,76267112-851f-4f5b-af40-f3c90af1b2ce,e4086c26-d200-4e42-8249-ed8cbeec0951,2c3000b7-4db9-4f00-a82a-ca6802806631,dc8b305a-6f43-410e-9ce6-96ac24827550
0,ENSG00000000003.15,TSPAN6,51.3913,38.5698,53.4133,14.0703,2.5807,33.0820,1.2382,19.6947,...,60.3483,45.8373,18.7416,35.6622,20.0214,34.1564,20.0523,44.9049,42.3897,55.0186
1,ENSG00000000005.6,TNMD,0.2043,0.0675,0.0000,0.4195,0.0000,0.3560,0.0458,0.2615,...,0.5546,0.1202,0.2271,0.4178,2.7887,0.4630,0.0455,2.9726,0.3958,1.2034
2,ENSG00000000419.13,DPM1,190.1345,309.5370,98.2539,69.3325,90.3423,140.5774,121.2661,115.8185,...,180.0561,124.9125,72.1784,98.0466,102.2170,136.5949,121.5182,92.5769,344.2597,143.1775
3,ENSG00000000457.14,SCYL3,14.0641,20.1579,21.2682,7.8332,10.7286,15.7325,6.1544,20.7105,...,14.4989,14.0788,12.4484,13.8861,16.1307,9.8914,9.8013,24.0222,18.7206,18.0108
4,ENSG00000000460.17,C1orf112,8.9914,7.5745,7.3305,4.7709,9.2266,8.4788,5.5654,10.5672,...,18.7501,9.8699,7.3085,5.4829,6.1255,7.5734,6.2854,17.5187,24.0859,20.9883
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19958,ENSG00000288669.1,AC008763.4,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19959,ENSG00000288671.1,AC006486.3,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19960,ENSG00000288674.1,AL391628.1,0.0705,0.0414,0.0000,0.0396,0.0083,0.0273,0.0563,0.1318,...,0.0071,0.0738,0.0448,0.0053,0.0603,0.0383,0.0768,0.0744,0.0607,0.0615


## Paired Normal Files

In [42]:
# Process the files related to Paired Normal tissue
processing_results = process_cohort_files('Paired Normal')
mir_seq_related_results = processing_results['mir_seq_files']
rna_seq_related_results = processing_results['rna_seq_files']

### miRNA-Seq DataFrames

In [43]:
# Print the sample DataFrame from the individual miRNA-Seq file processing
mir_seq_related_results['mir_reads']

Unnamed: 0,mirna_id,genome_assembly,chromosome,position_range,strand,cross_mapped,region_type,accession_id,read_count,reads_per_million,is_mirna_of_interest
0,hsa-let-7a-1,hg38,chr9,94175961-94175980,+,N,mature,MIMAT0000062,2,0.504087,1
1,hsa-let-7a-1,hg38,chr9,94175961-94175982,+,N,mature,MIMAT0000062,9,2.268390,1
2,hsa-let-7a-1,hg38,chr9,94175961-94175983,+,N,mature,MIMAT0000062,15,3.780650,1
3,hsa-let-7a-1,hg38,chr9,94175961-94175984,+,N,mature,MIMAT0000062,24,6.049040,1
4,hsa-let-7a-1,hg38,chr9,94175961-94175985,+,N,mature,MIMAT0000062,2,0.504087,1
...,...,...,...,...,...,...,...,...,...,...,...
4756,hsa-mir-99b,hg38,chr19,51692656-51692676,+,N,mature,MIMAT0004678,23,5.796996,1
4757,hsa-mir-99b,hg38,chr19,51692656-51692677,+,N,mature,MIMAT0004678,21,5.292910,1
4758,hsa-mir-99b,hg38,chr19,51692656-51692678,+,N,mature,MIMAT0004678,130,32.765631,1
4759,hsa-mir-99b,hg38,chr19,51692656-51692679,+,N,mature,MIMAT0004678,46,11.593992,1


In [44]:
# Print the DataFrame of aggregated microRNA raw reads for this cohort
mir_seq_related_results['mir_agg_raw_reads']

Unnamed: 0,accession_id,82bf32b4-0410-4426-9591-33210d995b98,7e17b2bf-91b2-4ea2-acc1-59e1537913ce,84d88e83-f246-45c8-a27c-5f146cb5a4b4,86723871-7176-4e6c-b9b2-dcf4acf99075,d4a3af62-0505-49f6-9f34-168bec5cb0d0,5ec7ec6c-20c8-48af-964c-54125c9313c2,803da4bd-a731-423b-b689-9c876dec117f,6c67482e-8a8a-4a5c-b8be-d573afb35d54,c5b7ba42-889d-4b7a-a810-298ae179e9c2,...,ea6c109d-9823-4fa7-987d-78f2ed377312,308d2039-4905-49de-861a-c70f97669068,12c76136-cc83-42c0-886b-4b4ea167a0d7,7c3c4540-a63e-4737-a23d-a9c999c53f39,cd746011-8fd7-473d-84e7-f39ec60b6717,600fc778-f0e3-4362-ad0c-cdb64fc76e7b,af1bb8e0-9d0f-4e54-9ca4-9b311129a412,3b8748ee-d541-4219-ba20-437311ae376a,acad335c-080a-442e-84d8-235fa3116bc6,8ca0e4c1-92ab-4653-87e0-cdcc630d8db6
0,MIMAT0000062,144753,201136,73447,235499,91981,79018,85993,78867,58133,...,104284,83952,54623,76427,112570,46473,66028,40047,47492,117350
1,MIMAT0000063,140564,135669,54023,143943,171047,63066,65761,150187,62035,...,52866,76506,135974,50010,85622,64125,47661,62972,78458,61340
2,MIMAT0000064,27161,34636,11915,34734,31835,14157,19605,33273,11976,...,9200,18203,17332,14022,21627,10892,10181,10381,12355,18166
3,MIMAT0000065,527,551,482,1052,342,253,282,402,307,...,569,413,188,148,405,558,252,123,180,615
4,MIMAT0000066,4604,3050,2441,6376,1970,1733,1973,1845,1616,...,3395,3570,1424,1261,2634,2226,2315,1073,1381,4148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1626,MIMAT0031893,3,6,2,16,5,2,5,3,4,...,3,1,2,1,1,6,1,1,1,2
1627,MIMAT0032026,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1628,MIMAT0032029,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1629,MIMAT0032110,20,11,7,11,8,5,6,4,12,...,3,9,3,4,3,3,0,2,1,3


In [45]:
# Print the DataFrame of aggregated microRNA normalized reads for this cohort
mir_seq_related_results['mir_agg_normalized_reads']

Unnamed: 0,accession_id,82bf32b4-0410-4426-9591-33210d995b98,7e17b2bf-91b2-4ea2-acc1-59e1537913ce,84d88e83-f246-45c8-a27c-5f146cb5a4b4,86723871-7176-4e6c-b9b2-dcf4acf99075,d4a3af62-0505-49f6-9f34-168bec5cb0d0,5ec7ec6c-20c8-48af-964c-54125c9313c2,803da4bd-a731-423b-b689-9c876dec117f,6c67482e-8a8a-4a5c-b8be-d573afb35d54,c5b7ba42-889d-4b7a-a810-298ae179e9c2,...,ea6c109d-9823-4fa7-987d-78f2ed377312,308d2039-4905-49de-861a-c70f97669068,12c76136-cc83-42c0-886b-4b4ea167a0d7,7c3c4540-a63e-4737-a23d-a9c999c53f39,cd746011-8fd7-473d-84e7-f39ec60b6717,600fc778-f0e3-4362-ad0c-cdb64fc76e7b,af1bb8e0-9d0f-4e54-9ca4-9b311129a412,3b8748ee-d541-4219-ba20-437311ae376a,acad335c-080a-442e-84d8-235fa3116bc6,8ca0e4c1-92ab-4653-87e0-cdcc630d8db6
0,MIMAT0000062,36484.025999,56803.204359,28001.197110,72982.234100,32673.221604,37344.459135,36894.231078,31224.191645,40309.758235,...,49237.504237,46630.518467,26241.464915,42774.191206,67300.753721,24971.735995,46617.071136,34075.216144,25842.161047,53887.870976
1,MIMAT0000063,35428.216552,38314.543058,20595.921839,44608.604379,60758.814709,29805.432429,28213.942181,59460.454579,43015.427585,...,24960.587425,42494.692747,65323.342720,27989.287844,51189.705383,34456.836668,33649.606640,53581.654337,42691.911726,28167.720543
2,MIMAT0000064,6845.748480,9781.619335,4542.517238,10764.227956,11308.335523,6690.697156,8411.282316,13173.095575,8304.227628,...,4343.763557,10110.721933,8326.475477,7847.746337,12929.851656,5852.691852,7187.986934,8832.991707,6722.814362,8341.943452
3,MIMAT0000065,132.826827,155.608970,183.759406,326.019688,121.484239,119.569568,120.988607,159.155601,212.875577,...,268.652332,229.397800,90.317184,82.831726,242.132054,299.834933,177.916975,104.658318,97.944684,282.411933
4,MIMAT0000066,1160.407424,861.356363,930.615577,1975.952021,699.777632,819.027915,846.491203,730.452959,1120.543740,...,1602.943183,1982.930139,684.104610,705.748690,1574.755134,1196.115687,1634.435688,912.994904,751.453391,1904.788146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1626,MIMAT0031893,0.756130,1.694472,0.762488,4.958474,1.776085,0.945214,2.145188,1.187728,2.773624,...,1.416444,0.555443,0.960821,0.559674,0.597857,3.224032,0.706020,0.850881,0.544137,0.918413
1627,MIMAT0032026,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.395909,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1628,MIMAT0032029,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1629,MIMAT0032110,5.040865,3.106531,2.668706,3.408951,2.841737,2.363035,2.574226,1.583637,8.320869,...,1.416444,4.998986,1.441233,2.238695,1.793571,1.612017,0.000000,1.701762,0.544137,1.377618


### RNA-Seq DataFrames

In [46]:
# Print the sample DataFrame from the individual RNA-Seq file processing
rna_seq_related_results['rna_reads']

Unnamed: 0,gene_id,gene_name,gene_type,unstranded,stranded_first,stranded_second,tpm_unstranded,fpkm_unstranded,fpkm_uq_unstranded,is_mrna_of_interest
0,ENSG00000000003.15,TSPAN6,protein_coding,4158,2130,2028,72.5826,20.1726,18.5527,1
1,ENSG00000000005.6,TNMD,protein_coding,75,38,37,4.0234,1.1182,1.0284,1
2,ENSG00000000419.13,DPM1,protein_coding,1456,706,758,95.5158,26.5463,24.4146,1
3,ENSG00000000457.14,SCYL3,protein_coding,1446,1011,953,16.6346,4.6232,4.2519,1
4,ENSG00000000460.17,C1orf112,protein_coding,299,414,440,3.9657,1.1022,1.0137,1
...,...,...,...,...,...,...,...,...,...,...
60655,ENSG00000288669.1,AC008763.4,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60656,ENSG00000288670.1,AL592295.6,lncRNA,341,138,206,14.9423,4.1528,3.8194,0
60657,ENSG00000288671.1,AC006486.3,protein_coding,0,0,0,0.0000,0.0000,0.0000,1
60658,ENSG00000288674.1,AL391628.1,protein_coding,6,6,0,0.0494,0.0137,0.0126,1


In [47]:
# Print the DataFrame of aggregated messenger RNA raw reads for this cohort
rna_seq_related_results['rna_agg_raw_reads']

Unnamed: 0,gene_id,gene_name,68881256-49b8-4a19-87a9-afac4f1841d2,1320db11-22a5-417f-8ec7-65c0bf4681a2,23bf74db-bb4e-44c5-8473-e651b818e460,3aad7b0b-9f82-41d0-b3e5-5614afcac6a8,15f0e499-8d53-4e97-a392-334875d25cf4,d7a48283-c113-4745-be6b-553966e6b457,736fca14-66fb-481a-985e-7253f75243a8,38854c85-fc09-4a51-93a7-257762517583,...,ccb87d7a-2729-4017-bc17-10e5a67bd5cc,8a84b9a5-d453-416b-b481-f15402c2eb54,3071e512-94ea-4820-9573-668235188e34,a37587aa-2e1f-42f5-a691-c4a41ae79ea9,4df2233b-3bbc-4d20-9abc-2a09b3f37383,b70b68a1-28c4-4ed3-a04c-b622e583f10b,8ebe0bf6-11fa-418d-918c-5c73f0e7e9ac,ea1fadc2-1cdc-4658-9619-eeb26ae09da8,baec6a46-7c48-41ed-a8a9-eef52d32cba3,cbfb8ffe-ae83-4a16-aaa6-f21ea893cb8d
0,ENSG00000000003.15,TSPAN6,4158,4435,5383,5005,5784,2407,6679,8064,...,3542,2829,3820,2849,5384,3919,6854,3791,3607,6447
1,ENSG00000000005.6,TNMD,75,653,319,235,209,303,569,1408,...,327,240,105,160,712,46,318,228,3612,3496
2,ENSG00000000419.13,DPM1,1456,1705,2465,1762,1861,1134,2240,2647,...,1504,1067,1705,1177,1500,1242,3435,2184,1246,3832
3,ENSG00000000457.14,SCYL3,1446,1094,1191,2052,2197,560,1980,2368,...,705,928,1678,1187,1636,982,2107,1515,670,1002
4,ENSG00000000460.17,C1orf112,299,282,272,361,411,216,458,590,...,226,255,292,244,338,222,698,294,202,370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19958,ENSG00000288669.1,AC008763.4,0,0,6,0,0,0,0,2,...,0,0,0,0,1,0,0,0,2,2
19959,ENSG00000288671.1,AC006486.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19960,ENSG00000288674.1,AL391628.1,6,4,5,5,4,26,8,10,...,4,12,3,9,13,4,9,19,18,12


In [48]:
# Print the DataFrame of aggregated messenger RNA normalized reads for this cohort
rna_seq_related_results['rna_agg_normalized_reads']

Unnamed: 0,gene_id,gene_name,68881256-49b8-4a19-87a9-afac4f1841d2,1320db11-22a5-417f-8ec7-65c0bf4681a2,23bf74db-bb4e-44c5-8473-e651b818e460,3aad7b0b-9f82-41d0-b3e5-5614afcac6a8,15f0e499-8d53-4e97-a392-334875d25cf4,d7a48283-c113-4745-be6b-553966e6b457,736fca14-66fb-481a-985e-7253f75243a8,38854c85-fc09-4a51-93a7-257762517583,...,ccb87d7a-2729-4017-bc17-10e5a67bd5cc,8a84b9a5-d453-416b-b481-f15402c2eb54,3071e512-94ea-4820-9573-668235188e34,a37587aa-2e1f-42f5-a691-c4a41ae79ea9,4df2233b-3bbc-4d20-9abc-2a09b3f37383,b70b68a1-28c4-4ed3-a04c-b622e583f10b,8ebe0bf6-11fa-418d-918c-5c73f0e7e9ac,ea1fadc2-1cdc-4658-9619-eeb26ae09da8,baec6a46-7c48-41ed-a8a9-eef52d32cba3,cbfb8ffe-ae83-4a16-aaa6-f21ea893cb8d
0,ENSG00000000003.15,TSPAN6,72.5826,68.3142,62.8648,84.4328,75.3017,36.7106,90.9645,86.9116,...,57.2074,65.8003,70.0346,51.1543,77.3533,63.3806,63.4952,51.3628,45.4672,44.7737
1,ENSG00000000005.6,TNMD,4.0234,30.9113,11.4488,12.1832,8.3620,14.2019,23.8155,46.6355,...,16.2307,17.1551,5.9160,8.8287,31.4370,2.2863,9.0534,9.4933,139.9220,74.6146
2,ENSG00000000419.13,DPM1,95.5158,98.6978,108.1847,111.7066,91.0519,64.9972,114.6501,107.2129,...,91.2887,93.2663,117.4734,79.4204,80.9899,75.4864,119.5885,111.2022,59.0249,100.0131
3,ENSG00000000457.14,SCYL3,16.6346,11.1053,9.1662,22.8129,18.8496,5.6286,17.7714,16.8192,...,7.5039,14.2245,20.2739,14.0455,15.4900,10.4662,12.8634,13.5270,5.5657,4.5859
4,ENSG00000000460.17,C1orf112,3.9657,3.3004,2.4135,4.6271,4.0655,2.5030,4.7394,4.8315,...,2.7734,4.5064,4.0675,3.3287,3.6897,2.7279,4.9130,3.0265,1.9346,1.9524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19958,ENSG00000288669.1,AC008763.4,0.0000,0.0000,0.0797,0.0000,0.0000,0.0000,0.0000,0.0245,...,0.0000,0.0000,0.0000,0.0000,0.0163,0.0000,0.0000,0.0000,0.0287,0.0158
19959,ENSG00000288671.1,AC006486.3,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
19960,ENSG00000288674.1,AL391628.1,0.0494,0.0291,0.0275,0.0398,0.0246,0.1870,0.0514,0.0508,...,0.0305,0.1316,0.0259,0.0762,0.0881,0.0305,0.0393,0.1214,0.1070,0.0393
