# Inference of MicroRNA-Messenger RNA Interactions in TCGA-BRCA
TCGA: The Cancer Genome Atlas  
BRCA: Breast Invasive Carcinoma

# Importing Libraries and Configurations

In [1]:
import os
import sys

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from scipy.stats import spearmanr

# Add project root to Python's path
sys.path.append(os.path.abspath(os.path.join('..', '..')))

from config import (
    AGGREGATED_READS_FILES,
    BRCA_INTERIM_FILES_DIRS,
    BRCA_PROCESSED_FILES_DIRS,
    BRCA_PROCESSED_FILES_PATHS,
    INTERACTION_INFERENCE_PARAMETERS,
    MIRWALK_MIR_MAPPING_FILE_PATH,
    MIRWALK_PROCESSED_DATA_DIR,
)

# Functions

In [None]:
def preparing_data_for_spearman(processed_dir_path):
    """
    Build sample-by-molecule expression tables of a cohort for Spearman analysis.

    For each experimental strategy the aggregated reads file is loaded, only the
    rows flagged with ``is_expressed == 1`` are kept, and the table is transposed
    so that every row is a sample. The file IDs that label the samples are then
    replaced by their case IDs, looked up in the files metadata table, and the
    rows are sorted by case ID.

    Parameters
    ----------
    processed_dir_path : str
        Path to the directory containing the aggregated reads files of the cohort.

    Returns
    -------
    dict of pd.DataFrame
        A dictionary with keys:
        - 'mir' : expressed microRNAs, with accession IDs as columns.
        - 'rna' : expressed messenger RNAs, with gene names as columns.
        Both DataFrames are indexed by case ID and sorted by it in ascending order.
    """
    # Per-strategy settings: (identifier column, columns that are not expression data)
    strategy_settings = {
        'mir': ('accession_id', ['is_expressed']),
        'rna': ('gene_name', ['is_expressed', 'gene_id']),
    }

    # The files metadata provides the file ID -> case ID association
    files_metadata = pd.read_csv(BRCA_PROCESSED_FILES_PATHS['files'])

    prepared_tables = {}
    for strategy, (id_column, unused_columns) in strategy_settings.items():
        # Load the aggregated reads and keep only the expressed molecules
        reads_path = os.path.join(processed_dir_path, AGGREGATED_READS_FILES[strategy])
        expressed_reads = pd.read_csv(reads_path)
        expressed_reads = expressed_reads.query('is_expressed == 1')
        expressed_reads = expressed_reads.drop(columns=unused_columns)
        expressed_reads = expressed_reads.set_index(keys=id_column)

        # After the steps above every remaining column is a file ID; look up
        # the case ID of each one (left join: unknown files get a missing case ID)
        file_to_case = pd.DataFrame(list(expressed_reads.columns), columns=['file_id'])
        file_to_case = file_to_case.merge(files_metadata, on='file_id', how='left')
        file_to_case = file_to_case[['file_id', 'case_id']]

        # Turn samples into rows and swap the file ID label for the case ID
        by_sample = expressed_reads.transpose().reset_index()
        by_sample = by_sample.rename(columns={'index': 'file_id'})
        by_sample = by_sample.merge(file_to_case, on='file_id', how='inner')
        by_sample = by_sample.drop(columns=['file_id'])
        by_sample = by_sample.set_index(keys='case_id').sort_index(ascending=True)

        prepared_tables[strategy] = by_sample.copy()

    return prepared_tables

In [3]:
def computing_single_pair_spearman(df_mir_expression, df_rna_expression):
    """
    Run a one-sided Spearman correlation test for one microRNA-messenger RNA pair.

    Parameters
    ----------
    df_mir_expression : pd.Series
        Expression values of a single microRNA across samples. Its ``name`` is
        reported as the microRNA accession ID.
    df_rna_expression : pd.Series
        Expression values of a single messenger RNA across the same samples, in
        the same order. Its ``name`` is reported as the gene name.

    Returns
    -------
    dict
        A dictionary containing:
        - 'accession_id': name of the microRNA Series.
        - 'gene_name': name of the messenger RNA Series.
        - 'correlation': Spearman correlation coefficient.
        - 'pvalue': p-value of the one-sided test whose alternative hypothesis
          is a negative correlation.
    """
    # The alternative is 'less' because a microRNA is expected to be
    # anticorrelated with its targets; NaN inputs propagate to a NaN result.
    test_outcome = spearmanr(
        df_mir_expression,
        df_rna_expression,
        axis=0,  # Each row is an observation
        nan_policy='propagate',
        alternative='less',
    )
    coefficient, p_value = test_outcome

    return {
        'accession_id': df_mir_expression.name,
        'gene_name': df_rna_expression.name,
        'correlation': coefficient,
        'pvalue': p_value,
    }

In [4]:
def computing_parallel_spearman(cohort):
    """
    Calculate Spearman correlation coefficients between all pairs of expressed microRNAs and
    messenger RNAs for a given cohort using parallel processing.

    The results are also written as a CSV file to the interim directory of the cohort.

    Parameters
    ----------
    cohort : str
        Name of the cohort used to locate interim and processed data directories.
        It is lowercased and its spaces are replaced by hyphens to form the lookup key.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the Spearman correlation results with one row per microRNA-
        messenger RNA pair and their corresponding correlation values and p-values.

    Raises
    ------
    KeyError
        If the cohort has no entry in the interim or processed directories mapping.
    ValueError
        If the microRNA and messenger RNA expression tables are not indexed by the
        same cases in the same order.
    """
    # Define the cohort interim and processed directories path
    dir_base_name = cohort.lower().replace(' ', '-')
    interim_dir_path = BRCA_INTERIM_FILES_DIRS[dir_base_name]
    processed_dir_path = BRCA_PROCESSED_FILES_DIRS[dir_base_name]

    # Prepare the expression data of the expressed molecules
    aggregated_reads = preparing_data_for_spearman(processed_dir_path)
    df_mir_expression = aggregated_reads['mir']
    df_rna_expression = aggregated_reads['rna']

    # spearmanr pairs the observations by position, not by index label, so both
    # tables must list exactly the same cases in the same order. Otherwise the
    # coefficients would be computed across mismatched samples.
    if not df_mir_expression.index.equals(df_rna_expression.index):
        raise ValueError(
            f"The microRNA and messenger RNA samples of cohort '{cohort}' are not "
            f"aligned: {len(df_mir_expression.index)} microRNA rows versus "
            f"{len(df_rna_expression.index)} messenger RNA rows, or differing case IDs."
        )

    # Make sure the output directory exists before starting the long computation,
    # so that the results cannot be lost at the very last step
    os.makedirs(interim_dir_path, exist_ok=True)

    # Get the sets of expressed microRNAs and messenger RNAs
    expressed_mirs = list(df_mir_expression.columns)
    expressed_rnas = list(df_rna_expression.columns)

    # Compute in parallel the Spearman correlation coefficient for each pair
    results = Parallel(n_jobs=-1, prefer='processes')(
        delayed(computing_single_pair_spearman)
        (df_mir_expression[mir], df_rna_expression[rna])
        for mir in expressed_mirs
        for rna in expressed_rnas
    )

    # Create a DataFrame for the computing results
    df_results = pd.DataFrame(results)

    # Store the DataFrame of Spearman correlation analysis
    df_results.to_csv(
        os.path.join(interim_dir_path, INTERACTION_INFERENCE_PARAMETERS['interim-file']),
        index=False
    )

    return df_results

In [5]:
def inferring_interactions(cohort, df_mirwalk_interactions):
    """
    Flag the microRNA-messenger RNA pairs of a cohort that are interactions of interest.

    The Spearman results stored in the interim directory of the cohort are
    left-joined with the miRWalk interactions on accession ID and gene name. A pair
    is flagged when its correlation and p-value are both at or below the configured
    thresholds and it has a match in the miRWalk interactions. The flagged table is
    also written as a CSV file to the processed directory of the cohort.

    Parameters
    ----------
    cohort : str
        Name of the cohort. Used to resolve file paths.
    df_mirwalk_interactions : pd.DataFrame
        DataFrame containing microRNA-target interactions of interest from miRWalk.
        The columns 'accession_id', 'mirna_name' and 'gene_name' are used.

    Returns
    -------
    pd.DataFrame
        One row per correlated pair with the columns 'accession_id', 'mirna_name',
        'gene_name', 'correlation', 'pvalue' and the 0/1 flag
        'is_interaction_of_interest'.
    """
    # Resolve the cohort directories
    cohort_key = cohort.lower().replace(' ', '-')
    interim_dir = BRCA_INTERIM_FILES_DIRS[cohort_key]
    processed_dir = BRCA_PROCESSED_FILES_DIRS[cohort_key]

    # Load the Spearman correlation analysis results of the cohort
    spearman_path = os.path.join(interim_dir, INTERACTION_INFERENCE_PARAMETERS['interim-file'])
    spearman_pairs = pd.read_csv(spearman_path)

    # Attach the miRWalk information; pairs without a match keep a missing mirna_name
    annotated_pairs = spearman_pairs.merge(
        df_mirwalk_interactions,
        on=['accession_id', 'gene_name'],
        how='left',
    )
    kept_columns = ['accession_id', 'mirna_name', 'gene_name', 'correlation', 'pvalue']
    df_interactions = annotated_pairs[kept_columns]

    # A pair is of interest when it is statistically relevant in the cohort
    # expression data and it is a miRWalk interaction of interest
    max_correlation = INTERACTION_INFERENCE_PARAMETERS['correlation']
    max_pvalue = INTERACTION_INFERENCE_PARAMETERS['pvalue']
    is_anticorrelated = df_interactions['correlation'] <= max_correlation
    is_significant = df_interactions['pvalue'] <= max_pvalue
    is_in_mirwalk = df_interactions['mirna_name'].notna()
    df_interactions['is_interaction_of_interest'] = np.where(
        is_anticorrelated & is_significant & is_in_mirwalk, 1, 0
    )

    # Store the DataFrame of flagged inferred interactions
    output_path = os.path.join(
        processed_dir, INTERACTION_INFERENCE_PARAMETERS['processed-file']
    )
    df_interactions.to_csv(output_path, index=False)

    return df_interactions

# Spearman Correlation Analysis

## Basal-like Pairs

In [6]:
# Compute the Spearman correlation coefficients for Basal-like tumor tissue.
# Every expressed microRNA is tested against every expressed messenger RNA, and
# the resulting table is also written to the interim directory of this cohort.
df_results = computing_parallel_spearman('Basal-like')

In [7]:
# Print the DataFrame of Spearman results for this cohort
df_results

Unnamed: 0,accession_id,gene_name,correlation,pvalue
0,MIMAT0000062,TSPAN6,0.014164,0.552406
1,MIMAT0000062,DPM1,0.103227,0.832137
2,MIMAT0000062,SCYL3,0.174415,0.948946
3,MIMAT0000062,C1orf112,-0.046442,0.332808
4,MIMAT0000062,FGR,-0.046885,0.331310
...,...,...,...,...
3569745,MIMAT0026480,AC055839.2,0.174237,0.948769
3569746,MIMAT0026480,NOTCH2NLC,0.092978,0.806927
3569747,MIMAT0026480,AP003071.5,0.053665,0.691278
3569748,MIMAT0026480,DUS4L-BCAP29,0.054552,0.694184


## HER2-enriched Pairs

In [8]:
# Compute the Spearman correlation coefficients for HER2-enriched tumor tissue.
# Every expressed microRNA is tested against every expressed messenger RNA, and
# the resulting table is also written to the interim directory of this cohort.
df_results = computing_parallel_spearman('HER2-enriched')

In [9]:
# Print the DataFrame of Spearman results for this cohort
df_results

Unnamed: 0,accession_id,gene_name,correlation,pvalue
0,MIMAT0000062,TSPAN6,0.141776,0.855792
1,MIMAT0000062,DPM1,0.085669,0.738721
2,MIMAT0000062,SCYL3,0.071457,0.702996
3,MIMAT0000062,C1orf112,0.103817,0.780989
4,MIMAT0000062,FGR,0.031932,0.594042
...,...,...,...,...
3615177,MIMAT0000281,AC055839.2,0.001000,0.502973
3615178,MIMAT0000281,NOTCH2NLC,-0.019388,0.442570
3615179,MIMAT0000281,AP003071.5,0.147675,0.865692
3615180,MIMAT0000281,DUS4L-BCAP29,0.099720,0.771795


## Luminal A Pairs

In [10]:
# Compute the Spearman correlation coefficients for Luminal A tumor tissue.
# Every expressed microRNA is tested against every expressed messenger RNA, and
# the resulting table is also written to the interim directory of this cohort.
df_results = computing_parallel_spearman('Luminal A')

In [11]:
# Print the DataFrame of Spearman results for this cohort
df_results

Unnamed: 0,accession_id,gene_name,correlation,pvalue
0,MIMAT0000062,TSPAN6,0.068307,0.846155
1,MIMAT0000062,DPM1,-0.080750,0.113816
2,MIMAT0000062,SCYL3,0.151580,0.988523
3,MIMAT0000062,C1orf112,0.039645,0.722938
4,MIMAT0000062,FGR,0.042868,0.738827
...,...,...,...,...
3523599,MIMAT0026480,AC055839.2,0.127504,0.971915
3523600,MIMAT0026480,NOTCH2NLC,0.011821,0.569984
3523601,MIMAT0026480,AP003071.5,0.068363,0.846355
3523602,MIMAT0026480,DUS4L-BCAP29,0.149410,0.987495


## Luminal B Pairs

In [12]:
# Compute the Spearman correlation coefficients for Luminal B tumor tissue.
# Every expressed microRNA is tested against every expressed messenger RNA, and
# the resulting table is also written to the interim directory of this cohort.
df_results = computing_parallel_spearman('Luminal B')

In [13]:
# Print the DataFrame of Spearman results for this cohort
df_results

Unnamed: 0,accession_id,gene_name,correlation,pvalue
0,MIMAT0000062,TSPAN6,0.244025,0.996622
1,MIMAT0000062,DPM1,0.102089,0.868414
2,MIMAT0000062,SCYL3,0.252434,0.997485
3,MIMAT0000062,C1orf112,0.162236,0.962898
4,MIMAT0000062,FGR,-0.045219,0.310453
...,...,...,...,...
3462111,MIMAT0026480,AC055839.2,0.153995,0.954824
3462112,MIMAT0026480,NOTCH2NLC,0.162157,0.962826
3462113,MIMAT0026480,AP003071.5,-0.077601,0.197777
3462114,MIMAT0026480,DUS4L-BCAP29,0.171411,0.970474


## Paired Normal Pairs

In [14]:
# Compute the Spearman correlation coefficients for Paired Normal tissue.
# Every expressed microRNA is tested against every expressed messenger RNA, and
# the resulting table is also written to the interim directory of this cohort.
df_results = computing_parallel_spearman('Paired Normal')

In [15]:
# Print the DataFrame of Spearman results for this cohort
df_results

Unnamed: 0,accession_id,gene_name,correlation,pvalue
0,MIMAT0000062,TSPAN6,0.342090,0.995710
1,MIMAT0000062,TNMD,0.134271,0.842521
2,MIMAT0000062,DPM1,0.385684,0.998605
3,MIMAT0000062,SCYL3,0.353041,0.996719
4,MIMAT0000062,C1orf112,0.406189,0.999221
...,...,...,...,...
4140823,MIMAT0026738,AC022415.2,-0.026817,0.420809
4140824,MIMAT0026738,AC055839.2,-0.134667,0.156760
4140825,MIMAT0026738,NOTCH2NLC,0.002587,0.507687
4140826,MIMAT0026738,AP003071.5,-0.169235,0.102045


# miRWalk Interactions of Interest

In [16]:
# List the interaction files downloaded from miRWalk
files = [f for f in os.listdir(MIRWALK_PROCESSED_DATA_DIR) if f.startswith('MIMAT')]

# Collect the interactions of interest of each microRNA. The per-file DataFrames
# are gathered in a list and concatenated once afterwards: concatenating inside
# the loop would copy the growing DataFrame on every iteration.
interaction_frames = []
for file in files:
    # Create a DataFrame for the interactions of interest of this microRNA
    file_path = os.path.join(MIRWALK_PROCESSED_DATA_DIR, file)
    df_mir_interactions = pd.read_csv(file_path, low_memory=False) \
        .query('is_interaction_of_interest == 1') \
        [['mirna_name', 'gene_name']] \
        .drop_duplicates()
    interaction_frames.append(df_mir_interactions)

# Build the DataFrame of interactions of interest from miRWalk; pd.concat does
# not accept an empty list, so fall back to an empty DataFrame with the expected
# columns when no interaction file is found
if interaction_frames:
    df_mirwalk_interactions = pd.concat(interaction_frames, ignore_index=True)
else:
    df_mirwalk_interactions = pd.DataFrame(columns=['mirna_name', 'gene_name'])

# Add the microRNA accession IDs to the DataFrame
df_mir_mapping = pd.read_csv(MIRWALK_MIR_MAPPING_FILE_PATH)
df_mirwalk_interactions = df_mir_mapping \
    .merge(
        right=df_mirwalk_interactions,
        left_on='mirna_name',
        right_on='mirna_name',
        how='inner',
    )

In [17]:
# Print the DataFrame of interactions of interest from miRWalk
df_mirwalk_interactions

Unnamed: 0,accession_id,mirna_name,gene_name
0,MIMAT0003322,hsa-miR-652-3p,KCNN3
1,MIMAT0003322,hsa-miR-652-3p,HSD3B7
2,MIMAT0003322,hsa-miR-652-3p,UBE2I
3,MIMAT0003322,hsa-miR-652-3p,SLC35C2
4,MIMAT0003322,hsa-miR-652-3p,GGCX
...,...,...,...
51125,MIMAT0000736,hsa-miR-381-3p,SLC6A17
51126,MIMAT0000736,hsa-miR-381-3p,XKR7
51127,MIMAT0000736,hsa-miR-381-3p,RFFL
51128,MIMAT0000736,hsa-miR-381-3p,DNAJB14


# Interaction Inference
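Interactions are inferred by combining two lines of evidence: the anticorrelation analysis of the cohort expression data [`statistic`] and the miRWalk interactions of interest [`biology`].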

## Basal-like

In [18]:
# Infer the interactions of interest present in Basal-like tumor tissue.
# This reads the Spearman results stored in the interim directory of the cohort,
# so the Spearman correlation analysis of this cohort must have been run before;
# the flagged table is also written to the processed directory of the cohort.
df_interactions = inferring_interactions('Basal-like', df_mirwalk_interactions)

In [19]:
# Print the DataFrame of inferred interactions for this cohort
df_interactions

Unnamed: 0,accession_id,mirna_name,gene_name,correlation,pvalue,is_interaction_of_interest
0,MIMAT0000062,,TSPAN6,0.014164,0.552406,0
1,MIMAT0000062,,DPM1,0.103227,0.832137,0
2,MIMAT0000062,hsa-let-7a-5p,SCYL3,0.174415,0.948946,0
3,MIMAT0000062,,C1orf112,-0.046442,0.332808,0
4,MIMAT0000062,,FGR,-0.046885,0.331310,0
...,...,...,...,...,...,...
3569745,MIMAT0026480,,AC055839.2,0.174237,0.948769,0
3569746,MIMAT0026480,,NOTCH2NLC,0.092978,0.806927,0
3569747,MIMAT0026480,,AP003071.5,0.053665,0.691278,0
3569748,MIMAT0026480,,DUS4L-BCAP29,0.054552,0.694184,0


## HER2-enriched

In [20]:
# Infer the interactions of interest present in HER2-enriched tumor tissue.
# This reads the Spearman results stored in the interim directory of the cohort,
# so the Spearman correlation analysis of this cohort must have been run before;
# the flagged table is also written to the processed directory of the cohort.
df_interactions = inferring_interactions('HER2-enriched', df_mirwalk_interactions)

In [21]:
# Print the DataFrame of inferred interactions for this cohort
df_interactions

Unnamed: 0,accession_id,mirna_name,gene_name,correlation,pvalue,is_interaction_of_interest
0,MIMAT0000062,,TSPAN6,0.141776,0.855792,0
1,MIMAT0000062,,DPM1,0.085669,0.738721,0
2,MIMAT0000062,hsa-let-7a-5p,SCYL3,0.071457,0.702996,0
3,MIMAT0000062,,C1orf112,0.103817,0.780989,0
4,MIMAT0000062,,FGR,0.031932,0.594042,0
...,...,...,...,...,...,...
3615177,MIMAT0000281,,AC055839.2,0.001000,0.502973,0
3615178,MIMAT0000281,,NOTCH2NLC,-0.019388,0.442570,0
3615179,MIMAT0000281,,AP003071.5,0.147675,0.865692,0
3615180,MIMAT0000281,,DUS4L-BCAP29,0.099720,0.771795,0


## Luminal A

In [22]:
# Infer the interactions of interest present in Luminal A tumor tissue.
# This reads the Spearman results stored in the interim directory of the cohort,
# so the Spearman correlation analysis of this cohort must have been run before;
# the flagged table is also written to the processed directory of the cohort.
df_interactions = inferring_interactions('Luminal A', df_mirwalk_interactions)

In [23]:
# Print the DataFrame of inferred interactions for this cohort
df_interactions

Unnamed: 0,accession_id,mirna_name,gene_name,correlation,pvalue,is_interaction_of_interest
0,MIMAT0000062,,TSPAN6,0.068307,0.846155,0
1,MIMAT0000062,,DPM1,-0.080750,0.113816,0
2,MIMAT0000062,hsa-let-7a-5p,SCYL3,0.151580,0.988523,0
3,MIMAT0000062,,C1orf112,0.039645,0.722938,0
4,MIMAT0000062,,FGR,0.042868,0.738827,0
...,...,...,...,...,...,...
3523599,MIMAT0026480,,AC055839.2,0.127504,0.971915,0
3523600,MIMAT0026480,,NOTCH2NLC,0.011821,0.569984,0
3523601,MIMAT0026480,,AP003071.5,0.068363,0.846355,0
3523602,MIMAT0026480,,DUS4L-BCAP29,0.149410,0.987495,0


## Luminal B

In [24]:
# Infer the interactions of interest present in Luminal B tumor tissue.
# This reads the Spearman results stored in the interim directory of the cohort,
# so the Spearman correlation analysis of this cohort must have been run before;
# the flagged table is also written to the processed directory of the cohort.
df_interactions = inferring_interactions('Luminal B', df_mirwalk_interactions)

In [25]:
# Print the DataFrame of inferred interactions for this cohort
df_interactions

Unnamed: 0,accession_id,mirna_name,gene_name,correlation,pvalue,is_interaction_of_interest
0,MIMAT0000062,,TSPAN6,0.244025,0.996622,0
1,MIMAT0000062,,DPM1,0.102089,0.868414,0
2,MIMAT0000062,hsa-let-7a-5p,SCYL3,0.252434,0.997485,0
3,MIMAT0000062,,C1orf112,0.162236,0.962898,0
4,MIMAT0000062,,FGR,-0.045219,0.310453,0
...,...,...,...,...,...,...
3462111,MIMAT0026480,,AC055839.2,0.153995,0.954824,0
3462112,MIMAT0026480,,NOTCH2NLC,0.162157,0.962826,0
3462113,MIMAT0026480,,AP003071.5,-0.077601,0.197777,0
3462114,MIMAT0026480,,DUS4L-BCAP29,0.171411,0.970474,0


## Paired Normal

In [26]:
# Infer the interactions of interest present in Paired Normal tissue.
# This reads the Spearman results stored in the interim directory of the cohort,
# so the Spearman correlation analysis of this cohort must have been run before;
# the flagged table is also written to the processed directory of the cohort.
df_interactions = inferring_interactions('Paired Normal', df_mirwalk_interactions)

In [27]:
# Print the DataFrame of inferred interactions for this cohort
df_interactions

Unnamed: 0,accession_id,mirna_name,gene_name,correlation,pvalue,is_interaction_of_interest
0,MIMAT0000062,,TSPAN6,0.342090,0.995710,0
1,MIMAT0000062,,TNMD,0.134271,0.842521,0
2,MIMAT0000062,,DPM1,0.385684,0.998605,0
3,MIMAT0000062,hsa-let-7a-5p,SCYL3,0.353041,0.996719,0
4,MIMAT0000062,,C1orf112,0.406189,0.999221,0
...,...,...,...,...,...,...
4140823,MIMAT0026738,,AC022415.2,-0.026817,0.420809,0
4140824,MIMAT0026738,,AC055839.2,-0.134667,0.156760,0
4140825,MIMAT0026738,,NOTCH2NLC,0.002587,0.507687,0
4140826,MIMAT0026738,,AP003071.5,-0.169235,0.102045,0
