# **Defining the Expressed Molecules in TCGA-BRCA**
TCGA: The Cancer Genome Atlas  
BRCA: Breast Invasive Carcinoma

# Importing Libraries and Configurations

In [1]:
import os
import sys

import numpy as np
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr

# Add the directory two levels above the current working directory to
# Python's import path (presumably the project root that holds `config`,
# which is imported right below — verify against the repository layout)
sys.path.append(os.path.abspath(os.path.join('..', '..')))

from config import (
    AGGREGATED_READS_FILES,
    BRCA_INTERIM_FILES_DIRS,
    BRCA_PROCESSED_DATA_DIR,
    BRCA_PROCESSED_FILES_DIRS,
    EXPRESSED_MOLECULES_FILES,
    FILTER_BY_EXPR_PARAMETERS,
)



# Functions

In [2]:
def aggregate_all_groups_reads(experimental_strategy):
    """
    Aggregate read count data across all groups for a given experimental strategy.

    One aggregated-reads CSV is read per entry of `BRCA_INTERIM_FILES_DIRS` and
    the tables are outer-merged on the molecule key, so a molecule missing from
    a group ends up with 0 reads for that group's samples.

    Parameters:
    -----------
    experimental_strategy : str
        The experimental strategy for which the reads are to be aggregated.
        'miRNA-Seq' selects the microRNA files (keyed by 'accession_id'); any
        other value selects the RNA files (keyed by 'gene_id' and 'gene_name').

    Returns:
    --------
    tuple (pd.DataFrame, pd.DataFrame)
        A tuple containing:
        - df_all_reads: A DataFrame where rows are molecules and columns are sample IDs.
          All missing values are filled with 0, and data is cast to integers.
          For the RNA files the index holds the composite key 'gene_id;gene_name'.
        - df_all_samples: A DataFrame with sample IDs as the index and a single 'group' column
          indicating the corresponding group for each sample.
    """
    # Define the specific experimental strategy parameters
    if experimental_strategy == 'miRNA-Seq':
        primary_key = 'accession_id'
        drop_column = None
        file_name = AGGREGATED_READS_FILES['mir']
    else:
        primary_key = 'gene_id'
        drop_column = 'gene_name'
        file_name = AGGREGATED_READS_FILES['rna']

    # Seed of the iterative outer merge and accumulator of (sample, group) pairs
    df_all_reads = pd.DataFrame(columns=[primary_key])
    sample_records = []

    # Process the aggregated reads from each group
    for group, group_dir in BRCA_INTERIM_FILES_DIRS.items():
        # Create a DataFrame for the group aggregated reads
        df_group_reads = pd.read_csv(os.path.join(group_dir, file_name))

        # Fold the secondary column into the primary key ('gene_id;gene_name')
        # so that every non-key column of the table is a sample
        if drop_column is not None:
            df_group_reads[primary_key] = (
                df_group_reads[primary_key] + ';' + df_group_reads[drop_column]
            )
            df_group_reads = df_group_reads.drop(columns=[drop_column])

        # Merge the group reads into the all reads DataFrame
        df_all_reads = df_group_reads.merge(
            right=df_all_reads,
            on=primary_key,
            how='outer',
        )

        # Associate every sample ID (non-key column) with the group
        sample_records.extend(
            (sample_id, group)
            for sample_id in df_group_reads.columns
            if sample_id != primary_key
        )

    # Set the primary key as reads DataFrame index and fill NaN reads
    df_all_reads = df_all_reads \
        .set_index(keys=primary_key) \
        .fillna(value=0) \
        .astype(dtype=int)

    # Build the sample table once instead of concatenating inside the loop,
    # which was quadratic and concatenated onto an empty seed frame
    df_all_samples = pd.DataFrame(
        data=sample_records, columns=['sample_id', 'group']
    ).set_index(keys='sample_id')

    return df_all_reads, df_all_samples

In [3]:
def filter_by_expression(df_reads, df_samples):
    """
    Filter molecules based on expression levels across all groups using edgeR's `filterByExpr`.

    The thresholds are read from `FILTER_BY_EXPR_PARAMETERS` (keys 'min_count',
    'min_total_count', 'large_n' and 'min_prop') and forwarded to R under
    their dotted names so that edgeR actually applies them.

    Parameters:
    -----------
    df_reads : pd.DataFrame
        A DataFrame of raw read counts.
        Rows represent genes or accession IDs (as index), columns represent sample IDs.

    df_samples : pd.DataFrame
        A DataFrame mapping each sample ID (as index) to its experimental group.
        Every column of `df_reads` must be present in its index.

    Returns:
    --------
    pd.DataFrame
        A DataFrame with two columns:
        - The original index (gene_id or accession_id).
        - 'is_expressed': A binary flag (0/1) indicating whether the molecule is considered expressed.
    """
    # Import the edgeR library
    edgeR = importr('edgeR')

    # Convert Pandas data structures to R; the group factor follows the
    # column order of the count matrix
    with localconverter(ro.default_converter + pandas2ri.converter):
        r_counts = ro.conversion.py2rpy(df_reads)
        r_group = ro.FactorVector(df_samples.loc[df_reads.columns, 'group'].values)

    # Create a DGEList object
    dge = edgeR.DGEList(counts=r_counts, group=r_group)

    # `filterByExpr` is an S3 generic declared as `function(y, ...)`, so the
    # thresholds travel through `...`. rpy2 only translates underscores to
    # dots for arguments named in the visible signature; anything landing in
    # `...` must carry its real R name, otherwise R silently ignores it and
    # falls back to the edgeR defaults.
    r_thresholds = {
        'min.count': FILTER_BY_EXPR_PARAMETERS['min_count'],  # Min count required
        'min.total.count': FILTER_BY_EXPR_PARAMETERS['min_total_count'],  # Min total count required
        'large.n': FILTER_BY_EXPR_PARAMETERS['large_n'],  # Number of samples per group
        'min.prop': FILTER_BY_EXPR_PARAMETERS['min_prop'],  # Min proportion of samples in the smallest group that express it
    }

    # Define the expressed molecules based on the expression of all groups
    r_keep = edgeR.filterByExpr(
        y=dge,  # DGEList holding the counts/reads
        group=dge.rx2('samples').rx2('group'),  # Group membership
        **r_thresholds,
    )

    # Convert R logical vector to NumPy boolean array
    keep = np.array(r_keep, dtype=bool)

    # Create a DataFrame of flagged expressed molecules
    df_expression = pd.Series(data=keep, index=df_reads.index, name='is_expressed')
    df_expression = df_expression.to_frame().reset_index()
    df_expression['is_expressed'] = df_expression['is_expressed'].astype('int')

    return df_expression

In [4]:
def define_expressed_molecules(experimental_strategy):
    """
    Define and flag expressed molecules across experimental groups based on a
    specified experimental strategy.

    Besides returning the data, the function writes the table of flagged
    molecules to `BRCA_PROCESSED_DATA_DIR` and one flagged aggregated-reads
    CSV per group to the corresponding `BRCA_PROCESSED_FILES_DIRS` directory.

    Parameters:
    -----------
    experimental_strategy : str
        The experimental method used for quantifying expression.
        'miRNA-Seq' selects the microRNA files; any other value selects the
        RNA files.

    Returns:
    --------
    expression_data : dict
        A dictionary containing:
        - 'expressed-molecules': DataFrame of all flagged expressed molecules.
        - <group_name>: DataFrame of expressed molecules with read counts for
        each experimental group, keyed by group name.
    """
    # Aggregate the reads from all experimental groups
    df_all_reads, df_all_samples = aggregate_all_groups_reads(experimental_strategy)

    # Flag the expressed molecules of all experimental groups
    df_expression = filter_by_expression(df_all_reads, df_all_samples)

    # Define the specific experimental strategy parameters
    if experimental_strategy == 'miRNA-Seq':
        primary_key = ['accession_id']
        reads_file_name = AGGREGATED_READS_FILES['mir']
        expression_file_path = os.path.join(
            BRCA_PROCESSED_DATA_DIR, EXPRESSED_MOLECULES_FILES['mir']
        )
    else:
        primary_key = ['gene_id', 'gene_name']
        reads_file_name = AGGREGATED_READS_FILES['rna']
        expression_file_path = os.path.join(
            BRCA_PROCESSED_DATA_DIR, EXPRESSED_MOLECULES_FILES['rna']
        )

        # Expand the composite 'gene_id;gene_name' key back into two columns.
        # Split on the first ';' only (n=1): a gene name that itself contains
        # ';' would otherwise yield more than two columns and break the
        # assignment.
        composite_key = df_expression[df_expression.columns[0]]
        df_expression[primary_key] = composite_key.str.split(';', n=1, expand=True)
        df_expression = df_expression[primary_key + ['is_expressed']]

    # Store the DataFrame of flagged expressed molecules
    df_expression.to_csv(expression_file_path, index=False)

    # Initialize a dictionary for all groups expression data
    expression_data = {'expressed-molecules': df_expression}

    # Apply the expression flag to each aggregated reads file
    for group, interim_dir in BRCA_INTERIM_FILES_DIRS.items():
        # Create a DataFrame for the group aggregated reads
        df_agg_reads = pd.read_csv(os.path.join(interim_dir, reads_file_name))

        # Flag the molecules related to the aggregated reads; the inner join
        # keeps only the molecules present in this group's file
        df_agg_reads = df_expression.merge(
            right=df_agg_reads,
            on=primary_key,
            how='inner',
        )

        # Store the DataFrame in the expression data dictionary
        expression_data[group] = df_agg_reads

        # Store the DataFrame of flagged aggregated reads in a CSV file
        data_dir = BRCA_PROCESSED_FILES_DIRS[group]
        df_agg_reads.to_csv(
            os.path.join(data_dir, reads_file_name), index=False
        )

    return expression_data

# Expressed Messenger RNAs

In [5]:
# Identify the expressed messenger RNAs in the experimental groups.
# Any strategy other than 'miRNA-Seq' selects the RNA files; the call also
# writes the flagged tables to the processed-data directories.
rna_expression_data = define_expressed_molecules('RNA-Seq')

In [6]:
# Count the messenger RNAs flagged as expressed (sum of the 0/1 flag) and report it
total_expressed_rnas = (
    rna_expression_data['expressed-molecules']['is_expressed'].sum()
)
print('Total of expressed messenger RNAs:', total_expressed_rnas)

Total of expressed messenger RNAs: 16906


In [7]:
# Display the table of all messenger RNAs with their `is_expressed` flag
rna_expression_data['expressed-molecules']

Unnamed: 0,gene_id,gene_name,is_expressed
0,ENSG00000000003.15,TSPAN6,1
1,ENSG00000000005.6,TNMD,1
2,ENSG00000000419.13,DPM1,1
3,ENSG00000000457.14,SCYL3,1
4,ENSG00000000460.17,C1orf112,1
...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0
19958,ENSG00000288669.1,AC008763.4,0
19959,ENSG00000288671.1,AC006486.3,0
19960,ENSG00000288674.1,AL391628.1,1


In [8]:
# Display the Basal-like aggregated reads annotated with the expression flag
rna_expression_data['basal-like']

Unnamed: 0,gene_id,gene_name,is_expressed,ff1f2f31-8607-4627-b487-c5e39e7c30f5,fe7dfda5-6846-4238-9ccc-472978eb78a1,1d15f05d-2ac2-4b83-987e-6f4b157b0b74,8a7fe670-bcef-4fcb-9759-9892ff5f4f61,6e112ec6-1791-4764-8a62-7ad4dfea2d3b,511ca25f-0b7f-4912-9b79-ed551721a420,c6fd0f95-d74a-49e3-9a70-fec12e63ff1a,...,37939d7d-074b-42b7-8d68-95a8afef4c38,f2a4b38d-bfc0-4363-9a58-b5fa2af26ff3,1f5bbb4e-9b0e-4953-a360-83fd7b6a2267,4f464ced-080f-41f1-9a00-955db0d5fda6,08b0fe4c-ee3c-4510-b75a-6d240cb038cc,59858555-bc6a-4286-8280-0f8341123cac,77f150b1-5f40-442d-91c5-6d3571513513,2cd221ee-4d29-4f40-ab4c-780cc9045c5a,5fbe4a11-de4d-418e-9bc3-75b768d4a665,c7aeba7e-b78e-4586-a4dd-8a04cc440737
0,ENSG00000000003.15,TSPAN6,1,2127,5469,3345,6421,2162,2300,2581,...,2480,4679,3209,8658,1494,9201,6342,1065,4506,3600
1,ENSG00000000005.6,TNMD,1,0,9,23,140,29,7,5,...,11,2,69,13,78,9,5,254,1,77
2,ENSG00000000419.13,DPM1,1,742,2153,1792,2061,3375,1418,2766,...,1845,2570,2785,6747,3955,4186,1913,1757,3746,2078
3,ENSG00000000457.14,SCYL3,1,911,985,955,1620,2340,674,2002,...,949,1342,1361,2791,812,2054,640,1472,1111,2576
4,ENSG00000000460.17,C1orf112,1,328,1452,576,1324,1633,691,799,...,1024,1121,1113,2377,404,4333,665,262,1404,948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19958,ENSG00000288669.1,AC008763.4,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19959,ENSG00000288671.1,AC006486.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19960,ENSG00000288674.1,AL391628.1,1,1,6,7,5,13,8,5,...,8,14,6,12,8,11,2,9,7,17


In [9]:
# Display the HER2-enriched aggregated reads annotated with the expression flag
rna_expression_data['her2-enriched']

Unnamed: 0,gene_id,gene_name,is_expressed,0a7421d4-5722-427a-9643-da1074e5c25c,111b2865-e7dc-45c9-b206-20b5713714bc,09a491b5-1bfb-47d9-963f-69f6c7e6c1e5,f0a63361-78bb-4a7f-9d78-31f7b2980ba2,7b188e2a-4a56-49f9-b527-cd61a646f6c7,fd371070-c312-4eca-b3da-d41b1c8a86a7,a4589532-53e0-4025-b8d3-e11e79e1fc9c,...,4c8d0d1a-dbd3-4124-8458-dfb61f61f11c,be6f8b24-e668-41c6-89f7-f4cdc8533cae,5c0579f7-6184-4afa-805c-7aeec4a4c5d2,0710056c-2f04-4182-90ca-45492dd6444c,cee943a8-ea26-4b2f-b8a4-66300ccacfb5,1936440a-9cfc-4b04-af40-3fbc22fe87f6,0a74ea3f-dadc-4c9f-96ea-62a77c1e602c,788a4858-c6c6-4a8d-9aef-dbd32d74776b,d3badb09-df1e-488e-ade9-97f1925b5649,eb166054-ff70-4a86-883b-9c25a7d2b0e5
0,ENSG00000000003.15,TSPAN6,1,1637,735,3834,745,8421,11016,1075,...,1548,585,1379,2951,1349,3606,6278,3926,1412,2825
1,ENSG00000000005.6,TNMD,1,0,0,0,0,0,2,0,...,24,6,0,5,0,2,18,8,0,19
2,ENSG00000000419.13,DPM1,1,2061,1329,3161,1537,3904,3511,2191,...,3668,793,2176,4550,878,1816,2633,1212,3381,9021
3,ENSG00000000457.14,SCYL3,1,1944,755,1411,1798,2519,2828,2598,...,1009,620,723,3296,378,2893,1425,856,1237,3057
4,ENSG00000000460.17,C1orf112,1,656,597,637,497,1186,3257,381,...,650,184,383,1247,243,447,480,477,1087,530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19958,ENSG00000288669.1,AC008763.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
19959,ENSG00000288671.1,AC006486.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19960,ENSG00000288674.1,AL391628.1,1,1,3,0,5,7,2,4,...,12,0,0,31,0,0,9,4,6,10


In [10]:
# Display the Luminal A aggregated reads annotated with the expression flag
rna_expression_data['luminal-a']

Unnamed: 0,gene_id,gene_name,is_expressed,8f79405a-c78e-4a9e-a673-969658a0f90b,1a4285b8-765b-4954-b9f2-dc8609572889,987d401a-3e68-414e-8067-afe277b02fad,196aed5b-6812-47c5-ad7b-4dd1743ed0b5,892928f7-8239-4d15-812b-046f9192d7de,c64a0adb-56a8-4966-9dd9-1acd789c1bb3,65bc2a34-2355-4193-b20e-439f2d7f6df2,...,1dacccf9-58ae-490f-9f3f-31c82f93b3b8,5b6828f2-4da6-49a4-a19c-569e089ff0b0,4c75e72b-db9f-425b-9f14-946ee801071e,072cae2e-f4c1-4a3f-85e1-6f7e35a65108,d914654b-ec4f-4be5-a525-2997a4b39279,81f86e31-55db-4483-b6e0-55451ffbd1db,03d18286-1038-4c0c-9c05-d1269e280250,bc0a4326-63d2-4ec4-9af6-7b421aa8aa49,8d87fbd5-8ca7-4ff6-b8b1-79f098dbca9f,e5b0c2dc-c652-40a0-bb80-d7e87830b406
0,ENSG00000000003.15,TSPAN6,1,2610,2436,5269,2177,4278,1847,3324,...,1004,3100,787,3498,1134,1890,1825,1236,2969,2898
1,ENSG00000000005.6,TNMD,1,1,59,57,33,4,86,32,...,101,7,114,1,20,5,63,4,760,27
2,ENSG00000000419.13,DPM1,1,1406,1254,1896,1502,2609,1124,1658,...,2133,1569,1989,2713,1312,6022,1737,2412,1581,1312
3,ENSG00000000457.14,SCYL3,1,880,2138,1262,1027,1005,718,961,...,2901,2404,2530,3231,964,1725,1596,1480,1535,2001
4,ENSG00000000460.17,C1orf112,1,243,931,334,457,495,298,702,...,1361,410,764,905,467,728,483,328,423,767
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19958,ENSG00000288669.1,AC008763.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
19959,ENSG00000288671.1,AC006486.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19960,ENSG00000288674.1,AL391628.1,1,0,21,3,5,5,6,9,...,9,24,13,3,8,7,7,8,11,10


In [11]:
# Display the Luminal B aggregated reads annotated with the expression flag
rna_expression_data['luminal-b']

Unnamed: 0,gene_id,gene_name,is_expressed,3eadce70-ba3d-4806-b112-6ac83bc89dc4,4c60d168-b545-46a8-8c37-81972a537a83,0a688b9b-06ea-4b83-bfce-9cee0d866a0c,f502949f-ca10-4617-a7ee-b8f142f7ec93,d393af6b-de4b-4ca6-bc06-a29068dfe086,c1778b3c-5649-4e26-b937-fd00ffd58387,318c3a49-64bc-4704-8739-d4cf31cef51f,...,16ffdede-3d07-431a-a4ed-318c7a66c096,84ab5edd-38bf-4987-b9af-fa4d1cbdef2c,a127fb1e-aa3b-4b6d-98b5-c141ffba9ae7,ee89e01e-d54e-4166-a3ae-db9357639523,a8ab2eb8-9da3-494f-b740-dc3f4185acb2,5a827399-307b-412b-a2ce-f6c81d2750a6,76267112-851f-4f5b-af40-f3c90af1b2ce,e4086c26-d200-4e42-8249-ed8cbeec0951,2c3000b7-4db9-4f00-a82a-ca6802806631,dc8b305a-6f43-410e-9ce6-96ac24827550
0,ENSG00000000003.15,TSPAN6,1,3092,3513,2519,1340,146,3427,83,...,4013,2343,1775,3148,1721,2947,1355,4271,2304,3372
1,ENSG00000000005.6,TNMD,1,4,2,0,13,0,12,1,...,12,2,7,12,78,13,1,92,7,24
2,ENSG00000000419.13,DPM1,1,3044,7502,1233,1757,1360,3875,2163,...,3186,1699,1819,2303,2338,3136,2185,2343,4979,2335
3,ENSG00000000457.14,SCYL3,1,1284,2786,1522,1132,921,2473,626,...,1463,1092,1789,1860,2104,1295,1005,3467,1544,1675
4,ENSG00000000460.17,C1orf112,1,712,908,455,598,687,1156,491,...,1641,664,911,637,693,860,559,2193,1723,1693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19958,ENSG00000288669.1,AC008763.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19959,ENSG00000288671.1,AC006486.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19960,ENSG00000288674.1,AL391628.1,1,9,8,0,8,1,6,8,...,1,8,9,1,11,7,11,15,7,8


In [12]:
# Display the Paired Normal aggregated reads annotated with the expression flag
rna_expression_data['paired-normal']

Unnamed: 0,gene_id,gene_name,is_expressed,68881256-49b8-4a19-87a9-afac4f1841d2,1320db11-22a5-417f-8ec7-65c0bf4681a2,23bf74db-bb4e-44c5-8473-e651b818e460,3aad7b0b-9f82-41d0-b3e5-5614afcac6a8,15f0e499-8d53-4e97-a392-334875d25cf4,d7a48283-c113-4745-be6b-553966e6b457,736fca14-66fb-481a-985e-7253f75243a8,...,ccb87d7a-2729-4017-bc17-10e5a67bd5cc,8a84b9a5-d453-416b-b481-f15402c2eb54,3071e512-94ea-4820-9573-668235188e34,a37587aa-2e1f-42f5-a691-c4a41ae79ea9,4df2233b-3bbc-4d20-9abc-2a09b3f37383,b70b68a1-28c4-4ed3-a04c-b622e583f10b,8ebe0bf6-11fa-418d-918c-5c73f0e7e9ac,ea1fadc2-1cdc-4658-9619-eeb26ae09da8,baec6a46-7c48-41ed-a8a9-eef52d32cba3,cbfb8ffe-ae83-4a16-aaa6-f21ea893cb8d
0,ENSG00000000003.15,TSPAN6,1,4158,4435,5383,5005,5784,2407,6679,...,3542,2829,3820,2849,5384,3919,6854,3791,3607,6447
1,ENSG00000000005.6,TNMD,1,75,653,319,235,209,303,569,...,327,240,105,160,712,46,318,228,3612,3496
2,ENSG00000000419.13,DPM1,1,1456,1705,2465,1762,1861,1134,2240,...,1504,1067,1705,1177,1500,1242,3435,2184,1246,3832
3,ENSG00000000457.14,SCYL3,1,1446,1094,1191,2052,2197,560,1980,...,705,928,1678,1187,1636,982,2107,1515,670,1002
4,ENSG00000000460.17,C1orf112,1,299,282,272,361,411,216,458,...,226,255,292,244,338,222,698,294,202,370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,ENSG00000288661.1,AL451106.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19958,ENSG00000288669.1,AC008763.4,0,0,0,6,0,0,0,0,...,0,0,0,0,1,0,0,0,2,2
19959,ENSG00000288671.1,AC006486.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19960,ENSG00000288674.1,AL391628.1,1,6,4,5,5,4,26,8,...,4,12,3,9,13,4,9,19,18,12


# Expressed MicroRNAs

In [13]:
# Identify the expressed microRNAs in the experimental groups.
# 'miRNA-Seq' selects the microRNA files keyed by 'accession_id'; the call
# also writes the flagged tables to the processed-data directories.
mir_expression_data = define_expressed_molecules('miRNA-Seq')

In [14]:
# Count the microRNAs flagged as expressed (sum of the 0/1 flag) and report it
total_expressed_mirs = (
    mir_expression_data['expressed-molecules']['is_expressed'].sum()
)
print('Total of expressed microRNAs:', total_expressed_mirs)

Total of expressed microRNAs: 436


In [15]:
# Display the table of all microRNAs with their `is_expressed` flag
mir_expression_data['expressed-molecules']

Unnamed: 0,accession_id,is_expressed
0,MIMAT0000062,1
1,MIMAT0000063,1
2,MIMAT0000064,1
3,MIMAT0000065,1
4,MIMAT0000066,1
...,...,...
2123,MIMAT0031893,1
2124,MIMAT0032026,0
2125,MIMAT0032029,0
2126,MIMAT0032110,1


In [16]:
# Display the Basal-like aggregated reads annotated with the expression flag.
# Only the microRNAs present in this group's file are listed (inner join).
mir_expression_data['basal-like']

Unnamed: 0,accession_id,is_expressed,0961f723-1c0b-4a2a-8a86-861bf407085b,5359d1b9-dfba-4f3e-b0c4-7fbf51dfb72c,f83b6eb6-444b-4fe0-9f53-3f152537500d,95434759-f95d-4e7a-bc24-c8d8a2dc6d2c,e6a87bcc-c44d-4a20-8f52-e9c4f2923246,6ede932d-8018-4760-8001-179f8bfd3861,5cc5f8c9-a224-4d70-b406-89f0d35cd711,f9b78f39-e96f-4ca4-9149-5c19c68b0d5e,...,0bc59ba8-158b-4309-b4d5-bb541f892fbf,3f7848d9-ee82-4648-92d8-e0188275abf4,5fab925b-4cc8-4126-8fb9-8f7655f7bde1,54f6fcc7-e68d-42dc-b1c9-90fa90b083b6,8a37c9f4-a187-4536-9c29-41a3aecba27d,8dbb8398-24b8-4204-bd68-97b22c3353a0,5a79be00-de6f-44ab-a0ac-29ec7bbd2087,118c6c30-50e6-47e2-8a1e-2f0af7b992ef,740e84ea-f42a-4e1c-99bd-39f35627be12,d2d4e49b-9070-4123-a0a1-26c1ae412f90
0,MIMAT0000062,1,132234,47073,33465,44783,21602,21668,32636,14840,...,47314,23339,205658,29895,38464,12688,19698,28108,83339,15691
1,MIMAT0000063,1,123894,26174,39633,34232,28656,36921,20699,25232,...,54624,39282,86614,18329,26069,9058,12509,70561,62819,23004
2,MIMAT0000064,1,685,5601,4763,3762,9756,2013,7859,1524,...,10168,3716,5474,1095,4397,2031,4188,2662,12266,2375
3,MIMAT0000065,1,1269,569,700,602,502,366,1170,420,...,737,506,1615,200,680,194,643,409,650,242
4,MIMAT0000066,1,4021,1783,1682,2045,1241,297,2031,894,...,1925,695,12396,2254,10772,524,1855,1149,3212,798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1914,MIMAT0031890,0,15,1,0,1,8,0,0,10,...,0,2,11,0,1,0,0,0,2,1
1915,MIMAT0031893,1,4,8,29,4,1,8,5,2,...,6,2,15,9,6,0,2,6,13,3
1916,MIMAT0032026,0,2,0,3,0,8,10,0,6,...,1,7,13,1,4,1,1,2,6,1
1917,MIMAT0032029,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Display the HER2-enriched aggregated reads annotated with the expression flag.
# Only the microRNAs present in this group's file are listed (inner join).
mir_expression_data['her2-enriched']

Unnamed: 0,accession_id,is_expressed,f3a7612e-35c3-43be-8f96-795874f8225c,3abbb570-87ad-44c5-b66d-141f63b0ca8b,dc300e50-f729-4d0e-8a33-606e95d91c95,a00a04b4-8de5-4f18-9dc2-531e21aa5167,0d003664-43ff-4eb5-b6c0-28ba7f26ee64,557f6cb1-95e3-4c8c-b253-538a415b0d07,ed76d786-d744-4067-b288-5fe2dc06a310,e4b1e8c9-5921-436d-8f6a-9af5b5baea8e,...,3be37b85-90eb-4f47-909a-8be730172d92,3d4b6861-2763-451d-ae0a-6a5adfc591ec,475e779f-c6b7-4f7c-9616-f2c47b9b97a5,eaf6af76-2b28-4128-9371-a3d928d72370,d0982681-8799-412d-a868-81078e4a6560,a44df6fa-36a7-4500-90fd-d62baf48ef52,646eb67c-2e86-4f89-867f-78c31eff5fe0,91b36078-362b-4909-ac90-1b2d1ea4a545,aafbf8fb-290d-408e-b104-241b7d492895,ff79f645-81f1-4a3b-ae72-adc169f30143
0,MIMAT0000062,1,15868,28998,49811,10962,20083,81120,21828,76258,...,130734,54800,15681,25558,81617,11903,18136,45392,36868,31019
1,MIMAT0000063,1,20715,89908,67531,6183,38497,50804,47014,32421,...,95620,63216,17932,19561,60530,14327,16079,40233,26816,40579
2,MIMAT0000064,1,915,3128,6580,601,2147,5503,2754,4303,...,3829,1070,3705,1883,7084,912,2828,4304,3829,5011
3,MIMAT0000065,1,192,982,512,138,121,1335,561,737,...,717,765,205,194,669,157,418,1269,407,593
4,MIMAT0000066,1,1364,1527,725,1017,919,2217,1151,3200,...,3627,558,1013,881,2897,1609,441,782,1281,1689
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1749,MIMAT0031892,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1750,MIMAT0031893,1,4,7,4,6,1,12,8,33,...,14,3,3,2,14,7,11,0,1,4
1751,MIMAT0032026,0,0,2,1,5,1,5,3,2,...,1,6,2,2,4,1,2,3,2,6
1752,MIMAT0032029,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Display the Luminal A aggregated reads annotated with the expression flag.
# Only the microRNAs present in this group's file are listed (inner join).
mir_expression_data['luminal-a']

Unnamed: 0,accession_id,is_expressed,cfe94e8f-e277-45cd-9318-877c37903437,87ece8e3-9cff-4ac2-812c-084dfa22717d,d19d100c-9a44-432c-9c16-5783aa4a07b1,4b1e2d17-a537-461c-bbcc-d33f30015938,aea5c686-fcb7-4ca4-9fba-3416782f09b9,7a4675e7-4174-4561-ae9c-b7636fbea718,3a6a7205-8a41-4691-aba5-f1e252dece0e,539908ab-5742-465f-91a1-819917da850e,...,96c28baa-65ae-4b13-bd2a-9085ca353f94,da324620-e93a-4cb4-a195-e15702ddddbf,d52be4fa-bd0b-44cf-95ad-30ac7c049afd,297b8ef7-71ca-4ea0-9315-da31f7f8914e,8b2a6497-fe35-4377-b3e4-ff8aa1544521,a1ef8fb3-63d5-45e0-acf5-5a4a52a6a28a,b8e351ec-ce57-4cc1-a87a-1168c9f20972,34ada12d-9f97-4bd1-9185-ddd7431c7074,c0eda172-ffc4-4edd-888d-eb7e09d3d9e7,33cba3fa-57ce-4e2a-89d8-767f1f74b644
0,MIMAT0000062,1,30350,280078,60148,68096,71727,101623,13481,33405,...,46942,34432,50148,43475,34056,56036,30612,37563,56115,79583
1,MIMAT0000063,1,57654,374426,58401,67105,303174,82306,15372,65094,...,16010,58362,89660,45452,69684,84267,68636,14201,77992,165926
2,MIMAT0000064,1,8914,2636,4870,500,7946,1900,1767,1782,...,1361,3174,3309,6773,3175,5823,6209,1936,4104,2408
3,MIMAT0000065,1,126,837,255,639,248,368,251,302,...,757,763,425,225,434,213,290,202,717,434
4,MIMAT0000066,1,1089,3703,1123,1666,1107,2425,1470,557,...,1725,1923,1887,875,1237,2897,977,1317,1793,1431
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1988,MIMAT0031893,1,3,6,1,3,4,3,2,4,...,3,3,1,4,5,0,9,0,10,1
1989,MIMAT0032026,0,0,0,0,0,0,2,1,0,...,1,2,0,0,1,0,0,1,1,1
1990,MIMAT0032029,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1991,MIMAT0032110,1,0,14,4,5,3,12,11,22,...,4,22,3,4,3,0,2,2,3,32


In [19]:
# Display the Luminal B aggregated reads annotated with the expression flag.
# Only the microRNAs present in this group's file are listed (inner join).
mir_expression_data['luminal-b']

Unnamed: 0,accession_id,is_expressed,7262f244-82d5-4d3c-8762-782c5381a6e5,ed716c0b-4b09-4902-86b8-6955488d60c3,6a813732-f636-4369-8a31-04ef69340706,d7e60e41-5da6-49df-a8e4-c732699a6f61,ce2f555b-bfd9-4a8a-84b1-54feaf484cd2,f5b1694d-7bd1-43be-8ec6-2bcaad95d802,42f12ac2-0c43-4fea-bf8f-edc15cd33d47,0397db08-993e-4a47-802e-36f88161ec6f,...,571f1dd9-cf46-472e-96b4-0835b2d711a8,cf5ab327-47ee-40eb-a06c-3a8fef2a1ebb,87ca2702-d747-4c73-92ff-a8b7f30fb306,50c0b143-29b8-4060-a63e-d263ae52b029,cb0cbce4-ed6f-4819-a83d-00f7bfe400ac,f88848be-c988-4cef-805f-193477175d27,15a656bc-0f5f-4f06-80ad-0584bf873805,1c5f84e6-1571-4cb9-b7b9-bfa6b13e121e,4941a6e3-84e3-4afa-8ce4-a6bd1420a438,e841eb85-0fa6-43bc-8849-716e5fac921c
0,MIMAT0000062,1,35520,52391,12389,26478,30912,42844,10070,44201,...,21301,35814,55125,49487,18098,41477,25753,36546,141376,23366
1,MIMAT0000063,1,77667,30743,11315,28568,25114,65036,13291,108900,...,23921,66517,25669,15679,15973,55674,27100,58168,73359,29828
2,MIMAT0000064,1,1087,3026,1659,1546,626,1462,490,4832,...,2234,7658,966,2129,517,5782,1403,839,2501,814
3,MIMAT0000065,1,322,246,435,304,252,507,96,213,...,250,183,291,147,186,328,287,251,635,370
4,MIMAT0000066,1,1654,2604,1680,2146,1628,645,1591,1332,...,1643,1575,1549,1163,1129,3713,1064,950,5675,667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1863,MIMAT0031892,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1864,MIMAT0031893,1,4,1,10,5,1,4,4,1,...,3,5,2,0,0,2,1,3,0,2
1865,MIMAT0032026,0,3,1,2,4,0,2,1,2,...,1,0,1,0,0,0,0,0,5,8
1866,MIMAT0032029,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Display the Paired Normal aggregated reads annotated with the expression flag.
# Only the microRNAs present in this group's file are listed (inner join).
mir_expression_data['paired-normal']

Unnamed: 0,accession_id,is_expressed,82bf32b4-0410-4426-9591-33210d995b98,7e17b2bf-91b2-4ea2-acc1-59e1537913ce,84d88e83-f246-45c8-a27c-5f146cb5a4b4,86723871-7176-4e6c-b9b2-dcf4acf99075,d4a3af62-0505-49f6-9f34-168bec5cb0d0,5ec7ec6c-20c8-48af-964c-54125c9313c2,803da4bd-a731-423b-b689-9c876dec117f,6c67482e-8a8a-4a5c-b8be-d573afb35d54,...,ea6c109d-9823-4fa7-987d-78f2ed377312,308d2039-4905-49de-861a-c70f97669068,12c76136-cc83-42c0-886b-4b4ea167a0d7,7c3c4540-a63e-4737-a23d-a9c999c53f39,cd746011-8fd7-473d-84e7-f39ec60b6717,600fc778-f0e3-4362-ad0c-cdb64fc76e7b,af1bb8e0-9d0f-4e54-9ca4-9b311129a412,3b8748ee-d541-4219-ba20-437311ae376a,acad335c-080a-442e-84d8-235fa3116bc6,8ca0e4c1-92ab-4653-87e0-cdcc630d8db6
0,MIMAT0000062,1,144753,201136,73447,235499,91981,79018,85993,78867,...,104284,83952,54623,76427,112570,46473,66028,40047,47492,117350
1,MIMAT0000063,1,140564,135669,54023,143943,171047,63066,65761,150187,...,52866,76506,135974,50010,85622,64125,47661,62972,78458,61340
2,MIMAT0000064,1,27161,34636,11915,34734,31835,14157,19605,33273,...,9200,18203,17332,14022,21627,10892,10181,10381,12355,18166
3,MIMAT0000065,1,527,551,482,1052,342,253,282,402,...,569,413,188,148,405,558,252,123,180,615
4,MIMAT0000066,1,4604,3050,2441,6376,1970,1733,1973,1845,...,3395,3570,1424,1261,2634,2226,2315,1073,1381,4148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1626,MIMAT0031893,1,3,6,2,16,5,2,5,3,...,3,1,2,1,1,6,1,1,1,2
1627,MIMAT0032026,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1628,MIMAT0032029,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1629,MIMAT0032110,1,20,11,7,11,8,5,6,4,...,3,9,3,4,3,3,0,2,1,3
