# **Inference of MicroRNA-Messenger RNA Interactions for TCGA-BRCA**

This notebook produces the AT_II family of artifacts.

- TCGA: The Cancer Genome Atlas  
- BRCA: Breast Invasive Carcinoma

# Import Libraries and Configurations

In [1]:
import logging
import os
import sys
from time import sleep

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from scipy.stats import false_discovery_control, spearmanr
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    ElementNotInteractableException,
    NoSuchElementException,
)
from selenium.webdriver.support.ui import Select

# Configure basic logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s')

# Add project root to Python's path
sys.path.append(os.path.abspath(os.path.join('..')))

from config import (
    EXPRESSION_FILES,
    INTERACTION_INFERENCE_SETUP,
    MIRWALK_BASE_URL,
    MIRWALK_DATA_DIRS,
    MIRWALK_FILES,
    NETWORK_DATA_DIRS,
    NETWORK_FILES,
    TCGA_DATA_DIRS,
    TCGA_FILES,
)

# miRWalk Data Download and Processing

## Functions

In [2]:
def create_chrome_webdriver():
    """
    Build a headless Chrome WebDriver that downloads into the miRWalk folder.

    Returns
    -------
    selenium.webdriver.Chrome
        A started Chrome session configured with the options listed below.

    Notes
    -----
    - Chrome is launched with the ``--headless``, ``--no-sandbox``,
      ``--disable-dev-shm-usage``, ``--disable-gpu`` and
      ``--disable-extensions`` command-line flags.
    - Chrome's ``download.default_directory`` preference is pointed at
      ``MIRWALK_DATA_DIRS['external']``.
    - The caller owns the returned driver and is responsible for calling
      ``driver.quit()`` when done.

    Examples
    --------
    >>> driver = create_chrome_webdriver()
    >>> driver.get("https://example.com")
    >>> driver.quit()
    """
    # Command-line switches applied to the browser, in order
    chrome_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-extensions',
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    # Send every download to the external miRWalk data directory
    download_prefs = {
        'download.default_directory': MIRWALK_DATA_DIRS['external'],
    }
    chrome_options.add_experimental_option('prefs', download_prefs)

    # Start the browser with the assembled configuration
    return webdriver.Chrome(options=chrome_options)

In [3]:
def _run_page_step(element_label, action):
    """
    Run one page interaction and report whether it succeeded.

    Parameters
    ----------
    element_label : str
        Human-readable name of the page element, used as the prefix of the
        logged error message (``'<label> not found'`` or
        ``'<label> not interactable'``).
    action : callable
        Zero-argument callable that locates the element and acts on it.

    Returns
    -------
    bool
        ``True`` when ``action`` completed, ``False`` when it raised
        ``NoSuchElementException`` or ``ElementNotInteractableException``
        (the error is logged in that case). Any other exception propagates.
    """
    try:
        action()
    except NoSuchElementException:
        logging.error(f'{element_label} not found')
        return False
    except ElementNotInteractableException:
        logging.error(f'{element_label} not interactable')
        return False
    return True


def export_mirna_target_file(driver, accession_id, max_wait_time=300):
    """
    Download the miRWalk target interactions of one microRNA as a CSV file.

    The function opens ``MIRWALK_BASE_URL``, selects the species, types the
    accession ID, runs the search, clicks the "Export CSV" link, waits for the
    download to appear and finally renames it to ``<accession_id>.csv``.

    Parameters
    ----------
    driver : selenium.webdriver.Chrome
        A configured Selenium Chrome WebDriver instance (e.g., from
        :func:`create_chrome_webdriver`).
    accession_id : str
        The miRNA accession ID (e.g., ``"MIMAT0000062"``) to search in miRWalk.
    max_wait_time : int, optional
        Maximum time in seconds to wait for the CSV download to appear.
        The download directory is polled every 5 seconds. Default is 300.

    Returns
    -------
    None
        The function only performs side effects (download, rename, logging).
        Every failure is logged and the function returns without raising.

    Notes
    -----
    - The species is selected according to
      ``INTERACTION_INFERENCE_SETUP['species-selection']``.
    - The browser is expected to save the file as ``MIRWALK_FILES['download']``
      inside ``MIRWALK_DATA_DIRS['external']``; it is then moved to
      ``<accession_id>.csv`` in the same directory.
    - A file already present under the default download name (left over from
      an earlier run) is deleted before the export starts. Otherwise it would
      be mistaken for the new download and saved under the wrong accession ID.
    - The file is checked for once more after the last polling interval, so a
      download that finishes during the final wait is not reported as a
      timeout.
    - An existing ``<accession_id>.csv`` is overwritten (``os.replace``).

    Examples
    --------
    >>> driver = create_chrome_webdriver()
    >>> export_mirna_target_file(driver, 'MIMAT0000062', max_wait_time=120)
    >>> driver.quit()
    """
    logging.info(f'Downloading the interactions of microRNA {accession_id}...')

    def select_species():
        species_input = Select(driver.find_element(By.NAME, 'species'))
        species_input.select_by_visible_text(
            INTERACTION_INFERENCE_SETUP['species-selection']
        )

    def fill_mirna_identifier():
        driver.find_element(By.NAME, 'mirna').send_keys(accession_id)

    def click_search():
        driver.find_element(By.XPATH, '//button[text()="search"]').click()

    def click_export_csv():
        driver.find_element(By.LINK_TEXT, 'Export CSV').click()

    # Page interactions, executed in order; the label prefixes the error log
    page_steps = (
        ('Species selector', select_species),
        ('MicroRNA identifier field', fill_mirna_identifier),
        ('Search button', click_search),
        ('Export CSV button', click_export_csv),
    )

    try:
        dir_path = MIRWALK_DATA_DIRS['external']
        default_file_path = os.path.join(dir_path, MIRWALK_FILES['download'])

        # Drop a leftover download so it cannot be attributed to this miRNA
        if os.path.exists(default_file_path):
            logging.warning(
                f'Removing stale download found at {default_file_path}'
            )
            os.remove(default_file_path)

        # Access miRWalk website
        driver.get(MIRWALK_BASE_URL)

        # Stop at the first page element that is missing or not interactable
        for element_label, action in page_steps:
            if not _run_page_step(element_label, action):
                return

        # Wait until the download is complete; the existence check runs
        # before the timeout check so the last interval is not wasted
        wait_time = 0
        while not os.path.exists(default_file_path):
            if wait_time >= max_wait_time:
                logging.error('Download did not complete in the expected time')
                return
            sleep(5)
            wait_time += 5

        try:
            # Change the name of the downloaded file
            file_path = os.path.join(dir_path, f'{accession_id}.csv')
            os.replace(default_file_path, file_path)
            logging.info(f'Downloaded and saved to {file_path}')
        except OSError as e:
            logging.error(f'Error renaming the file: {e}')
            return

    except Exception as e:
        # Boundary handler: one failing miRNA must not abort the whole batch
        logging.exception(f'An unexpected error occurred: {e}')

In [4]:
def process_mirna_target_file(file_name):
    """
    Process a miRWalk-predicted miRNA–mRNA target interaction file.

    The file is read from ``MIRWALK_DATA_DIRS['external']``, its columns are
    renamed to the project's naming scheme, each interaction is flagged
    according to the configured criteria, and the result is written to
    ``MIRWALK_DATA_DIRS['processed']`` under the same file name.

    Parameters
    ----------
    file_name : str
        Name of the CSV file (located in ``MIRWALK_DATA_DIRS['external']``)
        containing miRWalk-predicted interactions. The code reads the columns
        ``mirnaid``, ``bindingp``, ``position`` and ``TargetScan``; the columns
        ``refseqid``, ``genesymbol``, ``validated`` and ``miRDB`` are renamed
        when present. Other columns are kept untouched.

    Returns
    -------
    str
        The value of the miRNA name column in the first row of the file.

    Raises
    ------
    ValueError
        If the file contains no interaction rows, so no miRNA name can be
        extracted.

    Notes
    -----
    - Columns are renamed as follows: ``mirnaid`` → ``mirna_name``,
      ``refseqid`` → ``refseq_id``, ``genesymbol`` → ``gene_name``,
      ``bindingp`` → ``binding_probability``,
      ``position`` → ``binding_position``, ``validated`` → ``mirtarbase``,
      ``TargetScan`` → ``targetscan``, ``miRDB`` → ``mirdb``.
    - The new column ``is_interaction_of_interest`` is 1 when a row meets
      *all* of the following, and 0 otherwise:
        1. Binding probability ≥
           ``INTERACTION_INFERENCE_SETUP['min-binding-probability']``.
        2. Binding position equals
           ``INTERACTION_INFERENCE_SETUP['binding-position']``.
        3. TargetScan value equals
           ``INTERACTION_INFERENCE_SETUP['targetscan-prediction']``.

    Examples
    --------
    >>> mirna_name = process_mirna_target_file('MIMAT0000062.csv')
    >>> mirna_name
    'hsa-let-7a-5p'
    """
    # Create a DataFrame for the file
    dir_path = MIRWALK_DATA_DIRS['external']
    df_interactions = pd.read_csv(
        os.path.join(dir_path, file_name), low_memory=False
    )

    # Rename some DataFrame columns
    df_interactions = df_interactions.rename(columns={
        'mirnaid': 'mirna_name',
        'refseqid': 'refseq_id',
        'genesymbol': 'gene_name',
        'bindingp': 'binding_probability',
        'position': 'binding_position',
        'validated': 'mirtarbase',
        'TargetScan': 'targetscan',
        'miRDB': 'mirdb',
    })

    # Get the microRNA complete name from the first row (positional access,
    # with an explicit error instead of an opaque ``KeyError: 0``)
    if df_interactions.empty:
        raise ValueError(f'No interactions found in {file_name}')
    mirna_name = df_interactions['mirna_name'].iloc[0]

    # Flag targets of interest, i.e. the microRNA-messenger RNA interactions
    # that respect our parameters
    binding_position = INTERACTION_INFERENCE_SETUP['binding-position']
    min_binding_probability = \
        INTERACTION_INFERENCE_SETUP['min-binding-probability']
    targetscan_prediction = INTERACTION_INFERENCE_SETUP['targetscan-prediction']
    is_of_interest = (
        (df_interactions['binding_probability'] >= min_binding_probability)
        & (df_interactions['binding_position'] == binding_position)
        & (df_interactions['targetscan'] == targetscan_prediction)
    )
    df_interactions['is_interaction_of_interest'] = np.where(
        is_of_interest, 1, 0
    )

    # Store the flagged targets file in a CSV file
    dir_path = MIRWALK_DATA_DIRS['processed']
    df_interactions.to_csv(os.path.join(dir_path, file_name), index=False)

    return mirna_name

## Data Download

In [5]:
# Load the table describing the microRNAs found in the expression data
dir_path = TCGA_DATA_DIRS['processed']['root']
file_name = EXPRESSION_FILES['expressed-mirs']
expressed_mirs_path = os.path.join(dir_path, file_name)
df_expressed_mirs = pd.read_csv(expressed_mirs_path)

# Keep the accession IDs of the rows flagged as expressed
expressed_mask = df_expressed_mirs['is_expressed'] == 1
expressed_mirs = df_expressed_mirs.loc[expressed_mask, 'accession_id'].tolist()

In [6]:
# Create the Chrome WebDriver
driver = create_chrome_webdriver()

try:
    # Download the file of interactions of each expressed microRNA
    for accession_id in expressed_mirs:
        export_mirna_target_file(driver, accession_id)
finally:
    # Close the Chrome WebDriver even when the loop is interrupted or fails,
    # so no browser process is left running
    driver.quit()

INFO:Downloading the interactions of microRNA MIMAT0000062...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/external/mirwalk/MIMAT0000062.csv
INFO:Downloading the interactions of microRNA MIMAT0000063...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/external/mirwalk/MIMAT0000063.csv
INFO:Downloading the interactions of microRNA MIMAT0000064...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/external/mirwalk/MIMAT0000064.csv
INFO:Downloading the interactions of microRNA MIMAT0000065...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/external/mirwalk/MIMAT0000065.csv
INFO:Downloading the interactions of microRNA MIMAT0000066...
INFO:Downloaded and saved to /home/mylena-roberta/Unicamp/pan-cancer-analysis/data/external/mirwalk/MIMAT0000066.csv
INFO:Downloading the interactions of microRNA MIMAT0000067...
INFO:Downloaded and saved to /home/mylena-r

## Data Processing

In [7]:
# Initialize a dictionary to map accession IDs to microRNA names; expressed
# microRNAs without a downloaded file keep the value None
mapping = dict.fromkeys(expressed_mirs)

# List the interaction files downloaded from miRWalk. Only regular
# `MIMAT*.csv` files are considered, so sub-directories and a leftover
# default-named download are not passed to the CSV reader
dir_path = MIRWALK_DATA_DIRS['external']
files = sorted(
    f for f in os.listdir(dir_path)
    if f.startswith('MIMAT')
    and f.endswith('.csv')
    and os.path.isfile(os.path.join(dir_path, f))
)

# Process the interaction files individually
for file in files:
    # Process the file and get the microRNA name
    mirna_name = process_mirna_target_file(file)

    # Map the accession ID (file name without extension) to the microRNA name
    accession_id = os.path.splitext(file)[0]
    mapping[accession_id] = mirna_name

# Create a DataFrame of the microRNA-related mapping (also valid when the
# mapping is empty)
df_mapping = pd.DataFrame(
    list(mapping.items()), columns=['accession_id', 'mirna_name']
)

# Store the DataFrame of the microRNA-related mapping in a CSV file
dir_path = MIRWALK_DATA_DIRS['processed']
file_name = MIRWALK_FILES['mapping']
df_mapping.to_csv(os.path.join(dir_path, file_name), index=False)

In [8]:
# Display the accession-ID-to-name mapping; as the last expression of the
# cell, the DataFrame is rendered by the notebook
df_mapping

Unnamed: 0,accession_id,mirna_name
0,MIMAT0000062,hsa-let-7a-5p
1,MIMAT0000063,hsa-let-7b-5p
2,MIMAT0000064,hsa-let-7c-5p
3,MIMAT0000065,hsa-let-7d-5p
4,MIMAT0000066,hsa-let-7e-5p
...,...,...
431,MIMAT0026738,hsa-miR-1287-3p
432,MIMAT0027684,hsa-miR-6892-5p
433,MIMAT0030017,hsa-miR-7702
434,MIMAT0031893,hsa-miR-181b-2-3p


# Interaction Inference

## Functions

In [9]:
def prepare_data_for_spearman(group):
    """
    Load a group's normalized reads as sample-by-molecule tables.

    For both miRNA-Seq and RNA-Seq, the aggregated normalized read file of the
    group is read, restricted to expressed molecules, transposed so that each
    row is a sample, and re-indexed by the case ID associated with each file.

    Parameters
    ----------
    group : str
        Group key used to look up the directory in
        ``TCGA_DATA_DIRS['processed']`` (e.g., ``"basal-like"``).

    Returns
    -------
    dict of {str: pandas.DataFrame}
        One entry per experimental strategy (``'miRNA-Seq'``, ``'RNA-Seq'``).
        In each DataFrame:
        - The index is ``case_id``, sorted in ascending order.
        - The columns are the molecule identifiers (``accession_id`` for
          miRNA-Seq, ``gene_name`` for RNA-Seq).
        - The values are the read values stored in the aggregated files.

    Notes
    -----
    - Only rows with ``is_expressed == 1`` are kept.
    - The ``file_id`` → ``case_id`` mapping comes from ``TCGA_FILES['files']``
      in ``TCGA_DATA_DIRS['processed']['root']``.
    - For RNA-Seq, the ``gene_id`` column is discarded and ``gene_name`` is
      used as the identifier.
    - Every column left after removing the identifier and flag columns is
      treated as a ``file_id``.

    Examples
    --------
    >>> agg_reads = prepare_data_for_spearman('basal-like')
    >>> sorted(agg_reads)
    ['RNA-Seq', 'miRNA-Seq']
    >>> agg_reads['miRNA-Seq'].index.name
    'case_id'
    """
    # Per strategy: (expression file key, identifier column, columns to drop)
    strategy_settings = {
        'miRNA-Seq': (
            'agg-mirs-norm-reads', 'accession_id', ['is_expressed'],
        ),
        'RNA-Seq': (
            'agg-mrnas-norm-reads', 'gene_name', ['is_expressed', 'gene_id'],
        ),
    }

    # Files' metadata, which links every file ID to a case ID
    metadata_path = os.path.join(
        TCGA_DATA_DIRS['processed']['root'], TCGA_FILES['files']
    )
    df_files = pd.read_csv(metadata_path)

    group_path = TCGA_DATA_DIRS['processed'][group]
    aggregated_reads = {}
    for strategy, settings in strategy_settings.items():
        file_key, id_column, unused_columns = settings

        # Expressed molecules only, indexed by their identifier
        reads_path = os.path.join(group_path, EXPRESSION_FILES[file_key])
        df_reads = pd.read_csv(reads_path)
        df_reads = df_reads.query('is_expressed == 1')
        df_reads = df_reads.drop(columns=unused_columns)
        df_reads = df_reads.set_index(keys=id_column)

        # Case ID of every file that has a column in the reads table
        df_file_to_case = pd.DataFrame(
            list(df_reads.columns), columns=['file_id']
        )
        df_file_to_case = df_file_to_case.merge(
            right=df_files, on='file_id', how='left'
        )
        df_file_to_case = df_file_to_case[['file_id', 'case_id']]

        # One row per sample, labelled first by file ID and then by case ID
        df_samples = df_reads.transpose().reset_index()
        df_samples = df_samples.rename(columns={'index': 'file_id'})
        df_samples = df_samples.merge(
            right=df_file_to_case, on='file_id', how='inner'
        )
        df_samples = df_samples.drop(columns=['file_id'])
        df_samples = df_samples.set_index(keys='case_id')
        df_samples = df_samples.sort_index(ascending=True)

        aggregated_reads[strategy] = df_samples

    return aggregated_reads

In [10]:
def compute_single_pair_spearman(ser_mirna_expression, ser_mrna_expression):
    """
    Run the Spearman rank correlation test on one miRNA–mRNA pair.

    Parameters
    ----------
    ser_mirna_expression : pandas.Series
        Expression values of one microRNA across samples. Its ``name`` is
        reported as the pair's ``accession_id``.
    ser_mrna_expression : pandas.Series
        Expression values of one mRNA across the same samples, in the same
        order. Its ``name`` is reported as the pair's ``gene_name``.

    Returns
    -------
    dict
        A dictionary with the following keys:
        - ``accession_id`` : ``ser_mirna_expression.name``.
        - ``gene_name`` : ``ser_mrna_expression.name``.
        - ``correlation`` : Spearman correlation coefficient.
        - ``pvalue`` : p-value of the test for the configured alternative
          hypothesis.

    Notes
    -----
    - The test is delegated to :func:`scipy.stats.spearmanr`.
    - ``INTERACTION_INFERENCE_SETUP['inference-alternative']`` is passed as
      ``alternative``, so the p-value is one- or two-sided depending on the
      configuration (e.g., ``"less"`` tests for a negative correlation).
    - ``INTERACTION_INFERENCE_SETUP['inference-axis']`` is passed as ``axis``.
    - The two series are compared position by position; their indexes are not
      used for alignment.

    Examples
    --------
    >>> mirna = pd.Series([5.1, 4.8, 5.5], name='MIMAT0000062')
    >>> mrna = pd.Series([12.0, 13.5, 11.8], name='BRCA1')
    >>> result = compute_single_pair_spearman(mirna, mrna)
    >>> sorted(result)
    ['accession_id', 'correlation', 'gene_name', 'pvalue']
    """
    # Test settings shared by every pair
    test_axis = INTERACTION_INFERENCE_SETUP['inference-axis']
    test_alternative = INTERACTION_INFERENCE_SETUP['inference-alternative']

    # Spearman test between the microRNA and the messenger RNA samples
    rho, p_value = spearmanr(
        ser_mirna_expression,
        ser_mrna_expression,
        axis=test_axis,
        alternative=test_alternative,
    )

    # Report the statistics together with the identifiers of the pair
    return {
        'accession_id': ser_mirna_expression.name,
        'gene_name': ser_mrna_expression.name,
        'correlation': rho,
        'pvalue': p_value,
    }

In [11]:
def infer_mirna_mrna_interactions(df_mirwalk_interactions, group):
    """
    Infer miRNA–mRNA interactions from expression correlations and miRWalk evidence.

    For every distinct miRWalk pair whose miRNA and mRNA are both expressed in
    the group, the Spearman correlation between their expression profiles is
    computed. The p-values are then adjusted to control the false discovery
    rate (FDR) and the statistics are attached to the miRWalk rows.

    Parameters
    ----------
    df_mirwalk_interactions : pandas.DataFrame
        DataFrame containing miRWalk-predicted miRNA–mRNA interactions.
        Must contain the columns ``accession_id`` (miRNA accession ID) and
        ``gene_name`` (mRNA gene name). Additional columns are carried over
        to the result unchanged.
    group : str
        The group identifier (e.g., ``'Luminal A'``). It is lowercased and its
        spaces are replaced with hyphens to obtain the directory key.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df_mirwalk_interactions`` whose pair is expressed in the
        group, with these columns appended:
        - ``correlation`` : Spearman correlation coefficient.
        - ``pvalue`` : p-value for the alternative hypothesis configured in
          ``INTERACTION_INFERENCE_SETUP['inference-alternative']``.
        - ``qvalue`` : FDR-adjusted p-value.

    Raises
    ------
    ValueError
        If the miRNA-Seq and RNA-Seq tables of the group do not list the same
        cases in the same order. The correlation pairs samples by position,
        so a mismatch would silently correlate unrelated samples.

    Notes
    -----
    - Each distinct (``accession_id``, ``gene_name``) pair is tested once,
      even if it appears on several miRWalk rows. The FDR adjustment therefore
      counts every hypothesis once, and the statistics are joined back
      many-to-one, so no rows are multiplied.
    - Correlations are computed in parallel with :class:`joblib.Parallel`
      (``n_jobs=-1``, ``prefer='processes'``).
    - The FDR adjustment is performed by
      :func:`scipy.stats.false_discovery_control` with the method given in
      ``INTERACTION_INFERENCE_SETUP['fdr-method']``.
    - When no pair is eligible, an empty table with the expected columns is
      returned and saved.
    - The result is saved to ``NETWORK_FILES['inferred-interactions']`` inside
      ``NETWORK_DATA_DIRS['interim'][<group_dir>]``.

    Examples
    --------
    >>> inferred = infer_mirna_mrna_interactions(mirwalk_df, 'Luminal A')
    >>> list(inferred.columns[-3:])
    ['correlation', 'pvalue', 'qvalue']
    """
    # Define the group directory name
    group_dir = group.lower().replace(' ', '-')

    # Prepare the expression data of the expressed molecules
    aggregated_reads = prepare_data_for_spearman(group_dir)
    df_mirna_expression = aggregated_reads['miRNA-Seq']
    df_mrna_expression = aggregated_reads['RNA-Seq']

    # The correlation pairs samples by position, so both tables must list the
    # same cases in the same order
    if not df_mirna_expression.index.equals(df_mrna_expression.index):
        raise ValueError(
            f'miRNA-Seq and RNA-Seq samples of group {group!r} are not '
            'aligned by case ID'
        )

    # Select the distinct miRWalk pairs in which both molecules are expressed
    # (filtering avoids materializing the full miRNA x mRNA cross product)
    primary_key = ['accession_id', 'gene_name']
    is_mirna_expressed = df_mirwalk_interactions['accession_id'] \
        .isin(df_mirna_expression.columns)
    is_mrna_expressed = df_mirwalk_interactions['gene_name'] \
        .isin(df_mrna_expression.columns)
    df_candidate_pairs = df_mirwalk_interactions \
        .loc[is_mirna_expressed & is_mrna_expressed, primary_key] \
        .drop_duplicates()
    candidate_pairs = zip(
        df_candidate_pairs['accession_id'],
        df_candidate_pairs['gene_name'],
    )

    # Compute in parallel the Spearman correlation coefficient for each pair
    results = Parallel(n_jobs=-1, prefer='processes')(
        delayed(compute_single_pair_spearman)
        (df_mirna_expression[mirna], df_mrna_expression[mrna])
        for mirna, mrna in candidate_pairs
    )

    # Create a DataFrame for the Spearman results; naming the columns keeps
    # the table well-formed when there are no results
    df_results = pd.DataFrame(
        results, columns=primary_key + ['correlation', 'pvalue']
    )

    # Adjust the p-values to control the false discovery rate (FDR)
    if df_results.empty:
        df_results['qvalue'] = pd.Series(dtype=float)
    else:
        axis = INTERACTION_INFERENCE_SETUP['inference-axis']
        method = INTERACTION_INFERENCE_SETUP['fdr-method']
        df_results['qvalue'] = false_discovery_control(
            ps=df_results['pvalue'],  # The p-values to adjust
            axis=axis,  # The axis along which to perform the adjustment
            method=method,  # FDR control procedure
        )

    # Add miRWalk data to the results (one result row per distinct pair)
    df_results = df_mirwalk_interactions.merge(
        right=df_results, on=primary_key, how='inner'
    )

    # Store the DataFrame of inferred interactions in a CSV file
    dir_path = NETWORK_DATA_DIRS['interim'][group_dir]
    file_name = NETWORK_FILES['inferred-interactions']
    df_results.to_csv(os.path.join(dir_path, file_name), index=False)

    return df_results

## miRWalk Interactions of Interest

In [12]:
# Columns kept from each processed miRWalk file
columns = ['mirna_name', 'gene_name', 'mirtarbase']

# List the processed interaction files (one per microRNA accession ID)
dir_path = MIRWALK_DATA_DIRS['processed']
file_names = sorted(f for f in os.listdir(dir_path) if f.startswith('MIMAT'))

# Collect the interactions of interest of each microRNA; the frames are
# concatenated once at the end instead of growing a DataFrame inside the loop
interaction_frames = []
for file_name in file_names:
    file_path = os.path.join(dir_path, file_name)
    df_interactions = pd.read_csv(file_path, low_memory=False) \
        .query('is_interaction_of_interest == 1') \
        [columns] \
        .drop_duplicates()

    # Files without interactions of interest contribute nothing
    if not df_interactions.empty:
        interaction_frames.append(df_interactions)

# Build the DataFrame of interactions of interest from miRWalk
if interaction_frames:
    df_mirwalk_interactions = pd.concat(interaction_frames, ignore_index=True)
else:
    df_mirwalk_interactions = pd.DataFrame(columns=columns)

# Add the microRNA accession IDs to the DataFrame
file_name = MIRWALK_FILES['mapping']
df_mir_mapping = pd.read_csv(os.path.join(dir_path, file_name))
df_mirwalk_interactions = df_mir_mapping \
    .merge(right=df_mirwalk_interactions, on='mirna_name', how='inner') \
    .fillna(value={'mirtarbase': ''})

# Create a DataFrame for the expressed messenger RNAs
dir_path = TCGA_DATA_DIRS['processed']['root']
file_name = EXPRESSION_FILES['expressed-mrnas']
df_expressed_mrnas = pd.read_csv(os.path.join(dir_path, file_name)) \
    .query('is_expressed == 1') \
    .drop(columns=['gene_id', 'is_expressed'])

# Filter the interactions between expressed molecules
df_mirwalk_interactions = df_mirwalk_interactions \
    .merge(right=df_expressed_mrnas, on='gene_name', how='inner')

In [13]:
# Display the miRWalk interactions of interest between expressed molecules;
# as the last expression of the cell, the DataFrame is rendered by the notebook
df_mirwalk_interactions

Unnamed: 0,accession_id,mirna_name,gene_name,mirtarbase
0,MIMAT0000062,hsa-let-7a-5p,PKIA,
1,MIMAT0000062,hsa-let-7a-5p,GJC1,
2,MIMAT0000062,hsa-let-7a-5p,PBX3,
3,MIMAT0000062,hsa-let-7a-5p,ITSN1,
4,MIMAT0000062,hsa-let-7a-5p,SRGAP1,
...,...,...,...,...
41164,MIMAT0026483,hsa-miR-370-5p,STUM,
41165,MIMAT0026483,hsa-miR-370-5p,CRISPLD2,
41166,MIMAT0026483,hsa-miR-370-5p,ADAM19,
41167,MIMAT0026557,hsa-miR-412-5p,FAM53C,


## Basal-like Interactions

In [14]:
# Run the correlation-based inference on the basal-like samples
inferred_interactions = infer_mirna_mrna_interactions(
    df_mirwalk_interactions=df_mirwalk_interactions,
    group='Basal-like',
)

In [15]:
# Display the interactions inferred for the basal-like group (rendered by the
# notebook as the last expression of the cell)
inferred_interactions

Unnamed: 0,accession_id,mirna_name,gene_name,mirtarbase,correlation,pvalue,qvalue
0,MIMAT0000062,hsa-let-7a-5p,PKIA,,-0.078607,0.234621,0.745015
1,MIMAT0000062,hsa-let-7a-5p,GJC1,,0.018153,0.566268,0.978192
2,MIMAT0000062,hsa-let-7a-5p,PBX3,,0.143526,0.907620,1.000000
3,MIMAT0000062,hsa-let-7a-5p,ITSN1,,-0.033079,0.380501,0.873011
4,MIMAT0000062,hsa-let-7a-5p,SRGAP1,,0.047532,0.669012,1.000000
...,...,...,...,...,...,...,...
41164,MIMAT0026483,hsa-miR-370-5p,STUM,,0.092500,0.802930,1.000000
41165,MIMAT0026483,hsa-miR-370-5p,CRISPLD2,,0.430324,0.999984,1.000000
41166,MIMAT0026483,hsa-miR-370-5p,ADAM19,,0.221807,0.980527,1.000000
41167,MIMAT0026557,hsa-miR-412-5p,FAM53C,,-0.016884,0.438326,0.911847


## HER2-enriched Interactions

In [16]:
# Run the correlation-based inference on the HER2-enriched samples
inferred_interactions = infer_mirna_mrna_interactions(
    df_mirwalk_interactions=df_mirwalk_interactions,
    group='HER2-enriched',
)

In [17]:
# Display the interactions inferred for the HER2-enriched group (rendered by
# the notebook as the last expression of the cell)
inferred_interactions

Unnamed: 0,accession_id,mirna_name,gene_name,mirtarbase,correlation,pvalue,qvalue
0,MIMAT0000062,hsa-let-7a-5p,PKIA,,-0.013944,0.459379,0.871133
1,MIMAT0000062,hsa-let-7a-5p,GJC1,,0.248189,0.967434,1.000000
2,MIMAT0000062,hsa-let-7a-5p,PBX3,,0.003896,0.511368,0.894937
3,MIMAT0000062,hsa-let-7a-5p,ITSN1,,0.102802,0.774561,0.982053
4,MIMAT0000062,hsa-let-7a-5p,SRGAP1,,-0.067943,0.309403,0.789843
...,...,...,...,...,...,...,...
41164,MIMAT0026483,hsa-miR-370-5p,STUM,,0.378153,0.997971,1.000000
41165,MIMAT0026483,hsa-miR-370-5p,CRISPLD2,,0.378153,0.997971,1.000000
41166,MIMAT0026483,hsa-miR-370-5p,ADAM19,,0.262629,0.974737,1.000000
41167,MIMAT0026557,hsa-miR-412-5p,FAM53C,,0.236705,0.960494,1.000000


## Luminal A Interactions

In [18]:
# Run the correlation-based inference on the luminal A samples
inferred_interactions = infer_mirna_mrna_interactions(
    df_mirwalk_interactions=df_mirwalk_interactions,
    group='Luminal A',
)

In [19]:
# Display the interactions inferred for the luminal A group (rendered by the
# notebook as the last expression of the cell)
inferred_interactions

Unnamed: 0,accession_id,mirna_name,gene_name,mirtarbase,correlation,pvalue,qvalue
0,MIMAT0000062,hsa-let-7a-5p,PKIA,,-0.010777,0.436428,0.841128
1,MIMAT0000062,hsa-let-7a-5p,GJC1,,-0.043499,0.259064,0.648037
2,MIMAT0000062,hsa-let-7a-5p,PBX3,,0.059124,0.810222,1.000000
3,MIMAT0000062,hsa-let-7a-5p,ITSN1,,0.061249,0.818681,1.000000
4,MIMAT0000062,hsa-let-7a-5p,SRGAP1,,-0.035360,0.299708,0.697952
...,...,...,...,...,...,...,...
41164,MIMAT0026483,hsa-miR-370-5p,STUM,,0.143357,0.983815,1.000000
41165,MIMAT0026483,hsa-miR-370-5p,CRISPLD2,,0.456634,1.000000,1.000000
41166,MIMAT0026483,hsa-miR-370-5p,ADAM19,,0.252987,0.999933,1.000000
41167,MIMAT0026557,hsa-miR-412-5p,FAM53C,,0.017501,0.602526,0.970287


## Luminal B Interactions

In [20]:
# Run the correlation-based inference on the luminal B samples
inferred_interactions = infer_mirna_mrna_interactions(
    df_mirwalk_interactions=df_mirwalk_interactions,
    group='Luminal B',
)

In [21]:
# Display the interactions inferred for the luminal B group (rendered by the
# notebook as the last expression of the cell)
inferred_interactions

Unnamed: 0,accession_id,mirna_name,gene_name,mirtarbase,correlation,pvalue,qvalue
0,MIMAT0000062,hsa-let-7a-5p,PKIA,,-0.162456,0.038132,0.262916
1,MIMAT0000062,hsa-let-7a-5p,GJC1,,0.096166,0.851953,1.000000
2,MIMAT0000062,hsa-let-7a-5p,PBX3,,0.064838,0.759150,0.995618
3,MIMAT0000062,hsa-let-7a-5p,ITSN1,,0.149260,0.948140,1.000000
4,MIMAT0000062,hsa-let-7a-5p,SRGAP1,,0.173825,0.971198,1.000000
...,...,...,...,...,...,...,...
41164,MIMAT0026483,hsa-miR-370-5p,STUM,,0.076256,0.796110,1.000000
41165,MIMAT0026483,hsa-miR-370-5p,CRISPLD2,,0.317266,0.999793,1.000000
41166,MIMAT0026483,hsa-miR-370-5p,ADAM19,,0.141708,0.938692,1.000000
41167,MIMAT0026557,hsa-miR-412-5p,FAM53C,,-0.070633,0.221659,0.598784


## Paired Normal Interactions

In [22]:
# Run the correlation-based inference on the paired normal samples
inferred_interactions = infer_mirna_mrna_interactions(
    df_mirwalk_interactions=df_mirwalk_interactions,
    group='Paired Normal',
)

In [23]:
# Display the interactions inferred for the paired normal group (rendered by
# the notebook as the last expression of the cell)
inferred_interactions

Unnamed: 0,accession_id,mirna_name,gene_name,mirtarbase,correlation,pvalue,qvalue
0,MIMAT0000062,hsa-let-7a-5p,PKIA,,0.237936,0.961289,1.000000
1,MIMAT0000062,hsa-let-7a-5p,GJC1,,-0.015379,0.455214,0.898533
2,MIMAT0000062,hsa-let-7a-5p,PBX3,,0.007314,0.521332,0.969059
3,MIMAT0000062,hsa-let-7a-5p,ITSN1,,-0.077102,0.286103,0.674410
4,MIMAT0000062,hsa-let-7a-5p,SRGAP1,,0.162748,0.884626,1.000000
...,...,...,...,...,...,...,...
41164,MIMAT0026483,hsa-miR-370-5p,STUM,,0.198770,0.929030,1.000000
41165,MIMAT0026483,hsa-miR-370-5p,CRISPLD2,,0.176897,0.903923,1.000000
41166,MIMAT0026483,hsa-miR-370-5p,ADAM19,,0.176555,0.903485,1.000000
41167,MIMAT0026557,hsa-miR-412-5p,FAM53C,,0.143814,0.854845,1.000000
