# Inference of MicroRNA-Messenger RNA Interactions in TCGA-BRCA
TCGA: The Cancer Genome Atlas  
BRCA: Breast Invasive Carcinoma

# Importing Libraries and Configurations

In [1]:
import os
import sys

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from scipy.stats import spearmanr

# Add project root to Python's path
sys.path.append(os.path.abspath(os.path.join('..', '..')))

from config import (
    AGGREGATED_READS_FILES,
    BRCA_INTERIM_FILES_DIRS,
    BRCA_PROCESSED_FILES_DIRS,
    BRCA_PROCESSED_FILES_PATHS,
    INTERACTION_INFERENCE_PARAMETERS,
    MIRWALK_MIR_MAPPING_FILE_PATH,
    MIRWALK_PROCESSED_DATA_DIR,
)

# Functions

In [2]:
def preparing_data_for_spearman(processed_dir_path):
    """
    Load the normalized expression tables of a cohort and reshape them for
    Spearman correlation analysis.

    Parameters
    ----------
    processed_dir_path : str
        Path to the directory containing processed expression data files.

    Returns
    -------
    dict of pd.DataFrame
        A dictionary with keys:
        - 'mir' : DataFrame built from the aggregated normalized microRNA reads file.
        - 'rna' : DataFrame built from the aggregated normalized messenger RNA reads file.
        Each DataFrame keeps only the expressed molecules, is indexed by case ID
        (sorted in ascending order) and has one column per molecule identifier
        (accession ID for microRNAs, gene name for messenger RNAs).
    """
    # Files metadata table, used to translate file IDs into case IDs
    files_metadata = pd.read_csv(BRCA_PROCESSED_FILES_PATHS['files'])

    # Per-strategy settings: (reads file key, identifier column, columns to discard)
    strategy_settings = {
        'mir': ('mir-norm', 'accession_id', ['is_expressed']),
        'rna': ('rna-norm', 'gene_name', ['is_expressed', 'gene_id']),
    }

    prepared = {}
    for strategy, (reads_key, id_column, discarded) in strategy_settings.items():
        # Load the reads table and keep only the expressed molecules
        reads_table = pd.read_csv(
            os.path.join(processed_dir_path, AGGREGATED_READS_FILES[reads_key])
        )
        expressed = reads_table[reads_table['is_expressed'] == 1]
        expressed = expressed.drop(columns=discarded).set_index(id_column)

        # Look up the case ID associated with each file column
        file_to_case = pd.DataFrame(list(expressed.columns), columns=['file_id'])
        file_to_case = file_to_case.merge(files_metadata, how='left', on='file_id')
        file_to_case = file_to_case[['file_id', 'case_id']]

        # Put the files on rows, then replace the file ID with the case ID
        by_file = expressed.transpose().reset_index()
        by_file = by_file.rename(columns={'index': 'file_id'})
        by_case = by_file.merge(file_to_case, how='inner', on='file_id')
        by_case = by_case.drop(columns=['file_id']).set_index('case_id')

        # Store the table ordered by case ID
        prepared[strategy] = by_case.sort_index()

    return prepared

In [3]:
def computing_single_pair_spearman(df_mir_expression, df_rna_expression):
    """
    Run a one-sided Spearman rank correlation test on a single microRNA and
    messenger RNA pair.

    Parameters
    ----------
    df_mir_expression : pd.Series
        Expression values of a single microRNA across samples.
    df_rna_expression : pd.Series
        Expression values of a single messenger RNA across the same samples.

    Returns
    -------
    dict
        A dictionary containing:
        - 'accession_id': name of the microRNA Series.
        - 'gene_name': name of the messenger RNA Series.
        - 'correlation': Spearman correlation coefficient.
        - 'pvalue': p-value of the one-sided test whose alternative hypothesis
          is a negative correlation.
    """
    # The alternative is 'less' because a microRNA-messenger RNA interaction is
    # expected to show up as a negative correlation; NaN inputs propagate to
    # the result instead of being dropped
    rho, p_less = spearmanr(
        df_mir_expression,
        df_rna_expression,
        axis=0,
        nan_policy='propagate',
        alternative='less',
    )

    # Identify the pair by the names carried by the two Series
    return {
        'accession_id': df_mir_expression.name,
        'gene_name': df_rna_expression.name,
        'correlation': rho,
        'pvalue': p_less,
    }

In [4]:
def computing_parallel_spearman(cohort):
    """
    Calculate Spearman correlation coefficients between all pairs of expressed microRNAs and 
    messenger RNAs for a given cohort using parallel processing.

    The results are also written as a CSV file to the cohort interim directory.

    Parameters
    ----------
    cohort : str
        Name of the cohort used to locate interim and processed data directories.

    Returns
    -------
    pd.DataFrame
        A DataFrame with one row per microRNA-messenger RNA pair and the columns
        'accession_id', 'gene_name', 'correlation' and 'pvalue'.

    Raises
    ------
    ValueError
        If the microRNA and messenger RNA expression tables are not indexed by
        the same case IDs in the same order.
    """
    # Define the cohort interim and processed directories path
    dir_base_name = (cohort.lower()).replace(' ', '-')
    interim_dir_path = BRCA_INTERIM_FILES_DIRS[dir_base_name]
    processed_dir_path = BRCA_PROCESSED_FILES_DIRS[dir_base_name]
    
    # Prepare the expression data of the expressed molecules
    aggregated_norm_reads = preparing_data_for_spearman(processed_dir_path)
    df_mir_expression = aggregated_norm_reads['mir']
    df_rna_expression = aggregated_norm_reads['rna']
    
    # The correlation pairs the observations by position, not by index label, 
    # so both tables must list the same cases in the same order; otherwise the 
    # coefficients would silently compare different cases
    if not df_mir_expression.index.equals(df_rna_expression.index):
        raise ValueError(
            f"The microRNA and messenger RNA expression data of the '{cohort}' "
            "cohort are not indexed by the same case IDs in the same order."
        )
    
    # Get the sets of expressed microRNAs and messenger RNAs
    expressed_mirs = list(df_mir_expression.columns)
    expressed_rnas = list(df_rna_expression.columns)
    
    # Compute in parallel the Spearman correlation coefficient for each pair
    results = Parallel(n_jobs=-1, prefer='processes')(
        delayed(computing_single_pair_spearman)
        (df_mir_expression[mir], df_rna_expression[rna])
        for mir in expressed_mirs
        for rna in expressed_rnas
    )
    
    # Create a DataFrame for the computing results; the columns are explicit so
    # that an empty result still produces a CSV file with a header
    df_results = pd.DataFrame(
        results,
        columns=['accession_id', 'gene_name', 'correlation', 'pvalue'],
    )
    
    # Store the DataFrame of Spearman correlation analysis
    df_results.to_csv(
        os.path.join(interim_dir_path, INTERACTION_INFERENCE_PARAMETERS['interim-file']),
        index=False
    )
    
    return df_results

In [None]:
def inferring_interactions(cohort, df_mirwalk_interactions):
    """
    Flag the microRNA-messenger RNA pairs of a cohort that are both listed in
    the miRWalk interactions of interest and anticorrelated in the cohort
    expression data.

    The stored Spearman results of the cohort are read from its interim
    directory, and the flagged table is written to its processed directory.

    Parameters
    ----------
    cohort : str
        Name of the cohort. Used to resolve file paths.
    df_mirwalk_interactions : pd.DataFrame
        DataFrame containing microRNA-target interactions of interest from miRWalk.
        It is joined on 'accession_id' and 'gene_name' and provides 'mirna_name'.

    Returns
    -------
    pd.DataFrame
        One row per merged pair with the columns 'accession_id', 'mirna_name',
        'gene_name', 'correlation', 'pvalue' and 'is_interaction_of_interest'
        (1 when the pair is flagged, 0 otherwise).
    """
    # Resolve the cohort directories from its normalized name
    cohort_key = cohort.lower().replace(' ', '-')
    interim_dir = BRCA_INTERIM_FILES_DIRS[cohort_key]
    processed_dir = BRCA_PROCESSED_FILES_DIRS[cohort_key]

    # Load the Spearman correlation results stored for this cohort
    spearman_file = os.path.join(
        interim_dir, INTERACTION_INFERENCE_PARAMETERS['interim-file']
    )
    spearman_pairs = pd.read_csv(spearman_file)

    # Attach the miRWalk microRNA name to every pair; pairs that are absent
    # from miRWalk are kept and get a missing 'mirna_name'
    output_columns = ['accession_id', 'mirna_name', 'gene_name', 'correlation', 'pvalue']
    df_interactions = (
        spearman_pairs
        .merge(df_mirwalk_interactions, how='left', on=['accession_id', 'gene_name'])
        [output_columns]
    )

    # A pair is of interest when it is anticorrelated enough, statistically
    # significant and known to miRWalk
    max_correlation = INTERACTION_INFERENCE_PARAMETERS['correlation']
    max_pvalue = INTERACTION_INFERENCE_PARAMETERS['pvalue']
    is_anticorrelated = df_interactions['correlation'] <= max_correlation
    is_significant = df_interactions['pvalue'] <= max_pvalue
    is_mirwalk_target = df_interactions['mirna_name'].notna()
    df_interactions['is_interaction_of_interest'] = np.where(
        is_anticorrelated & is_significant & is_mirwalk_target, 1, 0
    )

    # Persist the flagged table in the cohort processed directory
    output_file = os.path.join(
        processed_dir, INTERACTION_INFERENCE_PARAMETERS['processed-file']
    )
    df_interactions.to_csv(output_file, index=False)

    return df_interactions

# Spearman Correlation Analysis

## Basal-like

In [5]:
# Compute the Spearman correlation coefficients for Basal-like tumor tissue;
# the call also writes the results to the cohort interim directory as a CSV file
df_results = computing_parallel_spearman('Basal-like')

In [6]:
# Print the DataFrame of Spearman results for this cohort
df_results

Unnamed: 0,accession_id,gene_name,correlation,pvalue
0,MIMAT0000062,TSPAN6,-0.099821,0.178811
1,MIMAT0000062,DPM1,0.066141,0.728626
2,MIMAT0000062,SCYL3,0.176059,0.948574
3,MIMAT0000062,C1orf112,-0.026828,0.402586
4,MIMAT0000062,FGR,-0.114274,0.145960
...,...,...,...,...
3569745,MIMAT0026480,AC055839.2,0.209047,0.974003
3569746,MIMAT0026480,NOTCH2NLC,0.138569,0.899723
3569747,MIMAT0026480,AP003071.5,0.066159,0.728682
3569748,MIMAT0026480,DUS4L-BCAP29,0.158554,0.928785


## HER2-enriched

In [7]:
# Compute the Spearman correlation coefficients for HER2-enriched tumor tissue;
# the call also writes the results to the cohort interim directory as a CSV file
df_results = computing_parallel_spearman('HER2-enriched')

In [8]:
# Print the DataFrame of Spearman results for this cohort
df_results

Unnamed: 0,accession_id,gene_name,correlation,pvalue
0,MIMAT0000062,TSPAN6,0.055024,0.656445
1,MIMAT0000062,DPM1,0.019959,0.558041
2,MIMAT0000062,SCYL3,0.136295,0.841736
3,MIMAT0000062,C1orf112,0.128640,0.827638
4,MIMAT0000062,FGR,-0.126316,0.176791
...,...,...,...,...
3615177,MIMAT0000281,AC055839.2,-0.151128,0.133106
3615178,MIMAT0000281,NOTCH2NLC,-0.140260,0.151261
3615179,MIMAT0000281,AP003071.5,0.016473,0.547957
3615180,MIMAT0000281,DUS4L-BCAP29,-0.037662,0.391435


## Luminal A

In [9]:
# Compute the Spearman correlation coefficients for Luminal A tumor tissue;
# the call also writes the results to the cohort interim directory as a CSV file
df_results = computing_parallel_spearman('Luminal A')

In [10]:
# Print the DataFrame of Spearman results for this cohort
df_results

Unnamed: 0,accession_id,gene_name,correlation,pvalue
0,MIMAT0000062,TSPAN6,0.004186,0.524780
1,MIMAT0000062,DPM1,-0.006052,0.464194
2,MIMAT0000062,SCYL3,0.191154,0.997915
3,MIMAT0000062,C1orf112,0.085533,0.898390
4,MIMAT0000062,FGR,0.039186,0.719751
...,...,...,...,...
3523599,MIMAT0026480,AC055839.2,0.034058,0.693528
3523600,MIMAT0026480,NOTCH2NLC,-0.010078,0.440517
3523601,MIMAT0026480,AP003071.5,0.032441,0.685047
3523602,MIMAT0026480,DUS4L-BCAP29,0.168949,0.994247


## Luminal B

In [11]:
# Compute the Spearman correlation coefficients for Luminal B tumor tissue;
# the call also writes the results to the cohort interim directory as a CSV file
df_results = computing_parallel_spearman('Luminal B')

In [12]:
# Print the DataFrame of Spearman results for this cohort
df_results

Unnamed: 0,accession_id,gene_name,correlation,pvalue
0,MIMAT0000062,TSPAN6,0.121078,0.906133
1,MIMAT0000062,DPM1,0.004202,0.518164
2,MIMAT0000062,SCYL3,0.225023,0.993263
3,MIMAT0000062,C1orf112,0.062984,0.752825
4,MIMAT0000062,FGR,-0.086221,0.174542
...,...,...,...,...
3462111,MIMAT0026480,AC055839.2,0.137787,0.933290
3462112,MIMAT0026480,NOTCH2NLC,0.183395,0.977519
3462113,MIMAT0026480,AP003071.5,-0.085193,0.177441
3462114,MIMAT0026480,DUS4L-BCAP29,0.206896,0.988314


## Paired Normal

In [13]:
# Compute the Spearman correlation coefficients for Paired Normal tissue;
# the call also writes the results to the cohort interim directory as a CSV file
df_results = computing_parallel_spearman('Paired Normal')

In [14]:
# Print the DataFrame of Spearman results for this cohort
df_results

Unnamed: 0,accession_id,gene_name,correlation,pvalue
0,MIMAT0000062,TSPAN6,0.180451,0.908384
1,MIMAT0000062,TNMD,0.108749,0.787512
2,MIMAT0000062,DPM1,0.226521,0.953404
3,MIMAT0000062,SCYL3,0.260150,0.973588
4,MIMAT0000062,C1orf112,0.339508,0.994764
...,...,...,...,...
4140823,MIMAT0026738,AC022415.2,-0.105641,0.219204
4140824,MIMAT0026738,AC055839.2,0.072522,0.702348
4140825,MIMAT0026738,NOTCH2NLC,-0.018045,0.447491
4140826,MIMAT0026738,AP003071.5,-0.117567,0.194084


# miRWalk Interactions of Interest

In [3]:
# List the interaction files downloaded from miRWalk
files = [f for f in os.listdir(MIRWALK_PROCESSED_DATA_DIR) if f.startswith('MIMAT')]

# Collect the interactions of interest of each microRNA; the DataFrames are
# gathered in a list and concatenated once, instead of growing a DataFrame
# inside the loop, which copies the accumulated rows at every iteration
mir_interactions_frames = []
for file in files:
    # Create a DataFrame for the interactions of interest of this microRNA
    file_path = os.path.join(MIRWALK_PROCESSED_DATA_DIR, file)
    df_mir_interactions = pd.read_csv(file_path, low_memory=False) \
        .query('is_interaction_of_interest == 1') \
        [['mirna_name', 'gene_name']] \
        .drop_duplicates()
    mir_interactions_frames.append(df_mir_interactions)

# Build the DataFrame of interactions of interest from miRWalk; when no file
# is found, fall back to an empty DataFrame with the expected columns
if mir_interactions_frames:
    df_mirwalk_interactions = pd.concat(mir_interactions_frames, ignore_index=True)
else:
    df_mirwalk_interactions = pd.DataFrame(columns=['mirna_name', 'gene_name'])

# Add the microRNA accession IDs to the DataFrame; the inner merge keeps only
# the interactions whose microRNA name is present in the mapping file
df_mir_mapping = pd.read_csv(MIRWALK_MIR_MAPPING_FILE_PATH)
df_mirwalk_interactions = df_mir_mapping \
    .merge(
        right=df_mirwalk_interactions,
        left_on='mirna_name',
        right_on='mirna_name',
        how='inner',
    )

In [4]:
# Print the DataFrame of interactions of interest from miRWalk
df_mirwalk_interactions

Unnamed: 0,accession_id,mirna_name,gene_name
0,MIMAT0003322,hsa-miR-652-3p,KCNN3
1,MIMAT0003322,hsa-miR-652-3p,HSD3B7
2,MIMAT0003322,hsa-miR-652-3p,UBE2I
3,MIMAT0003322,hsa-miR-652-3p,SLC35C2
4,MIMAT0003322,hsa-miR-652-3p,GGCX
...,...,...,...
51125,MIMAT0000736,hsa-miR-381-3p,SLC6A17
51126,MIMAT0000736,hsa-miR-381-3p,XKR7
51127,MIMAT0000736,hsa-miR-381-3p,RFFL
51128,MIMAT0000736,hsa-miR-381-3p,DNAJB14


# Interaction Inference
Interactions are inferred from two sources of evidence: the anticorrelation analysis [`statistic`] and the miRWalk data [`biology`].

## Basal-like

In [None]:
# Infer the interactions of interest present in Basal-like tumor tissue; the
# call reads the stored Spearman results from the cohort interim directory and
# writes the flagged interactions to the cohort processed directory
df_interactions = inferring_interactions('Basal-like', df_mirwalk_interactions)

In [None]:
# Print the DataFrame of inferred interactions for this cohort
df_interactions

Unnamed: 0,accession_id,mirna_name,gene_name,correlation,pvalue,is_interaction_of_interest
0,MIMAT0000062,,TSPAN6,-0.099821,0.178811,0
1,MIMAT0000062,,DPM1,0.066141,0.728626,0
2,MIMAT0000062,hsa-let-7a-5p,SCYL3,0.176059,0.948574,0
3,MIMAT0000062,,C1orf112,-0.026828,0.402586,0
4,MIMAT0000062,,FGR,-0.114274,0.145960,0
...,...,...,...,...,...,...
3569745,MIMAT0026480,,AC055839.2,0.209047,0.974003,0
3569746,MIMAT0026480,,NOTCH2NLC,0.138569,0.899723,0
3569747,MIMAT0026480,,AP003071.5,0.066159,0.728682,0
3569748,MIMAT0026480,,DUS4L-BCAP29,0.158554,0.928785,0


## HER2-enriched

In [7]:
# Infer the interactions of interest present in HER2-enriched tumor tissue; the
# call reads the stored Spearman results from the cohort interim directory and
# writes the flagged interactions to the cohort processed directory
df_interactions = inferring_interactions('HER2-enriched', df_mirwalk_interactions)

In [None]:
# Print the DataFrame of inferred interactions for this cohort
df_interactions

Unnamed: 0,accession_id,mirna_name,gene_name,correlation,pvalue,is_interaction_of_interest
0,MIMAT0000062,,TSPAN6,0.055024,0.656445,0
1,MIMAT0000062,,DPM1,0.019959,0.558041,0
2,MIMAT0000062,hsa-let-7a-5p,SCYL3,0.136295,0.841736,0
3,MIMAT0000062,,C1orf112,0.128640,0.827638,0
4,MIMAT0000062,,FGR,-0.126316,0.176791,0
...,...,...,...,...,...,...
3615177,MIMAT0000281,,AC055839.2,-0.151128,0.133106,0
3615178,MIMAT0000281,,NOTCH2NLC,-0.140260,0.151261,0
3615179,MIMAT0000281,,AP003071.5,0.016473,0.547957,0
3615180,MIMAT0000281,,DUS4L-BCAP29,-0.037662,0.391435,0


## Luminal A

In [None]:
# Infer the interactions of interest present in Luminal A tumor tissue; the
# call reads the stored Spearman results from the cohort interim directory and
# writes the flagged interactions to the cohort processed directory
df_interactions = inferring_interactions('Luminal A', df_mirwalk_interactions)

In [None]:
# Print the DataFrame of inferred interactions for this cohort
df_interactions

Unnamed: 0,accession_id,mirna_name,gene_name,correlation,pvalue,is_interaction_of_interest
0,MIMAT0000062,,TSPAN6,0.004186,0.524780,0
1,MIMAT0000062,,DPM1,-0.006052,0.464194,0
2,MIMAT0000062,hsa-let-7a-5p,SCYL3,0.191154,0.997915,0
3,MIMAT0000062,,C1orf112,0.085533,0.898390,0
4,MIMAT0000062,,FGR,0.039186,0.719751,0
...,...,...,...,...,...,...
3523599,MIMAT0026480,,AC055839.2,0.034058,0.693528,0
3523600,MIMAT0026480,,NOTCH2NLC,-0.010078,0.440517,0
3523601,MIMAT0026480,,AP003071.5,0.032441,0.685047,0
3523602,MIMAT0026480,,DUS4L-BCAP29,0.168949,0.994247,0


## Luminal B

In [13]:
# Infer the interactions of interest present in Luminal B tumor tissue; the
# call reads the stored Spearman results from the cohort interim directory and
# writes the flagged interactions to the cohort processed directory
df_interactions = inferring_interactions('Luminal B', df_mirwalk_interactions)

In [14]:
# Print the DataFrame of inferred interactions for this cohort
df_interactions

Unnamed: 0,accession_id,mirna_name,gene_name,correlation,pvalue,is_interaction_of_interest
0,MIMAT0000062,,TSPAN6,0.121078,0.906133,0
1,MIMAT0000062,,DPM1,0.004202,0.518164,0
2,MIMAT0000062,hsa-let-7a-5p,SCYL3,0.225023,0.993263,0
3,MIMAT0000062,,C1orf112,0.062984,0.752825,0
4,MIMAT0000062,,FGR,-0.086221,0.174542,0
...,...,...,...,...,...,...
3462111,MIMAT0026480,,AC055839.2,0.137787,0.933290,0
3462112,MIMAT0026480,,NOTCH2NLC,0.183395,0.977519,0
3462113,MIMAT0026480,,AP003071.5,-0.085193,0.177441,0
3462114,MIMAT0026480,,DUS4L-BCAP29,0.206896,0.988314,0


## Paired Normal

In [15]:
# Infer the interactions of interest present in Paired Normal tissue; the
# call reads the stored Spearman results from the cohort interim directory and
# writes the flagged interactions to the cohort processed directory
df_interactions = inferring_interactions('Paired Normal', df_mirwalk_interactions)

In [16]:
# Print the DataFrame of inferred interactions for this cohort
df_interactions

Unnamed: 0,accession_id,mirna_name,gene_name,correlation,pvalue,is_interaction_of_interest
0,MIMAT0000062,,TSPAN6,0.180451,0.908384,0
1,MIMAT0000062,,TNMD,0.108749,0.787512,0
2,MIMAT0000062,,DPM1,0.226521,0.953404,0
3,MIMAT0000062,hsa-let-7a-5p,SCYL3,0.260150,0.973588,0
4,MIMAT0000062,,C1orf112,0.339508,0.994764,0
...,...,...,...,...,...,...
4140823,MIMAT0026738,,AC022415.2,-0.105641,0.219204,0
4140824,MIMAT0026738,,AC055839.2,0.072522,0.702348,0
4140825,MIMAT0026738,,NOTCH2NLC,-0.018045,0.447491,0
4140826,MIMAT0026738,,AP003071.5,-0.117567,0.194084,0
