# **Processing Interaction Data from miRWalk**

# Importing Libraries and Configurations

In [1]:
import logging
import os
import sys
import time

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    ElementNotInteractableException,
    NoSuchElementException,
)
from selenium.webdriver.support.ui import Select

# Configure basic logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s')

# Add project root to Python's path
sys.path.append(os.path.abspath(os.path.join('..', '..')))

from config import (
    AGGREGATED_READS_FILES,
    BRCA_PROCESSED_FILES_DIRS,
    MIRWALK_DEFAULT_FILE_PATH,
    MIRWALK_DOWNLOAD_PARAMETERS,
    MIRWALK_EXTERNAL_DATA_DIR,
    MIRWALK_MIR_MAPPING_FILE_PATH,
    MIRWALK_PROCESSED_DATA_DIR,
)

# Functions

In [2]:
def create_chrome_webdriver():
    """
    Initialize and configure a headless Chrome WebDriver instance for Selenium.

    Returns:
    --------
    selenium.webDriver.Chrome
        An instance of the configured Chrome WebDriver.
    
    Configuration:
    --------
        - Headless Mode: The browser runs without a GUI (`--headless`).
        - No Sandbox: Disables the Chrome sandbox for compatibility (`--no-sandbox`).
        - Dev SHM Usage: Prevents issues with shared memory in containerized 
          environments (`--disable-dev-shm-usage`).
        - GPU: Disables GPU acceleration (`--disable-gpu`).
        - Extensions: Disables browser extensions (`--disable-extensions`).
        - Download Directory: Sets the default download directory.
    """
    # Configure the Selenium ChromeDriver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-extensions')

    # Define the download directory
    options.add_experimental_option(
        'prefs', {'download.default_directory': MIRWALK_EXTERNAL_DATA_DIR}
    )

    # Initiate the Chrome browser
    driver = webdriver.Chrome(options=options)

    return driver

In [3]:
def exporting_mir_targets_file(driver, accession_id, max_wait_time=60):
    """
    Automate the download of microRNA target interaction data from the miRWalk website.

    Parameters:
    -----------
    driver : selenium.webdriver.Chrome
        The WebDriver instance to control the browser.
    accession_id : str
        The accession ID of the microRNA to search for.
    max_wait_time : int
        Maximum time in seconds to wait for the download to complete (default is 60).
    """
    logging.info(f'Downloading the interactions of microRNA {accession_id}...')
    
    try:
        # Access miRWalk website
        driver.get(MIRWALK_DOWNLOAD_PARAMETERS['base-url'])

        try:
            # Find and fill the species selector
            species_input = Select(driver.find_element(By.NAME, 'species'))
            species_input.select_by_visible_text(
                MIRWALK_DOWNLOAD_PARAMETERS['species-selection']
            )
        except NoSuchElementException:
            logging.error('Species selector not found.')
            return
        except ElementNotInteractableException:
            logging.error('Species selector not interactable')
            return

        try:
            # Find and fill the microRNA identifier
            mir_input = driver.find_element(By.NAME, 'mirna')
            mir_input.send_keys(accession_id)
        except NoSuchElementException:
            logging.error('MicroRNA identifier field not found')
            return
        except ElementNotInteractableException:
            logging.error('MicroRNA identifier field not interactable')
            return

        try:
            # Find and click the search button
            search_btn = driver.find_element(By.XPATH, '//button[text()="search"]')
            search_btn.click()
        except NoSuchElementException:
            logging.error('Search button not found')
            return
        except ElementNotInteractableException:
            logging.error('Search button not interactable')
            return

        try:
            # Find and click the export CSV button
            export_link = driver.find_element(By.LINK_TEXT, 'Export CSV')
            export_link.click()
        except NoSuchElementException:
            logging.error('Export CSV button not found')
            return
        except ElementNotInteractableException:
            logging.error('Export CSV button not interactable')
            return

        # Wait until the download is complete
        wait_time = 0
        while not os.path.exists(MIRWALK_DEFAULT_FILE_PATH):
            time.sleep(5)
            wait_time += 5
            if wait_time >= max_wait_time:
                logging.error('Download did not complete in the expected time')
                return

        try:
            # Change the name of the downloaded file
            mir_file_path = os.path.join(MIRWALK_EXTERNAL_DATA_DIR, f'{accession_id}.csv')
            os.rename(MIRWALK_DEFAULT_FILE_PATH, mir_file_path)
            logging.info(f'Downloaded and saved to {mir_file_path}')
        except OSError as e:
            logging.error(f'Error renaming the file: {e}')
            return

    except Exception as e:
        logging.exception(f'An unexpected error occurred: {e}')

In [4]:
def processing_mir_targets_file(file_name):
    """
    Process a microRNA target interactions file and flag interactions of interest.

    Parameters:
    -----------
    file_name : str 
        The name of the input CSV file located in the `MIRWALK_EXTERNAL_DATA_DIR` directory.

    Returns:
    --------
    str
        The complete name of the microRNA from the first record in the file.
    """
    # Create a DataFrame for the file
    df_interactions = pd.read_csv(f'{MIRWALK_EXTERNAL_DATA_DIR}/{file_name}')
    
    # Rename some DataFrame columns
    df_interactions = df_interactions \
        .rename(columns={
            'mirnaid': 'mirna_name',
            'refseqid': 'refseq_id',
            'genesymbol': 'gene_name',
            'validated': 'mirtarbase',
            'TargetScan': 'targetscan',
            'miRDB': 'mirdb',
        })
        
    # Get the microRNA complete name
    mir_name = df_interactions['mirna_name'][0]

    # Flag targets of interest, i.e. the microRNA-messenger RNA interactions
    # that occur in the 3UTR position and are predicted by TargetScan
    df_interactions['is_interaction_of_interest'] = np.where(
        ((df_interactions['position'] == '3UTR')
         & (df_interactions['targetscan'] == 1)), 1, 0
    )
    
    # Store the flagged targets file in a CSV file
    df_interactions.to_csv(f'{MIRWALK_PROCESSED_DATA_DIR}/{file_name}', index=False)
    
    return mir_name

# TCGA-BRCA Expressed MicroRNAs

In [5]:
# Initialize the set of expressed microRNAs in TCGA-BRCA
expressed_mirs = set()

# Prepare parameters for iterations
file_name = AGGREGATED_READS_FILES['mir-raw']
cohorts = list(BRCA_PROCESSED_FILES_DIRS.keys())
cohorts.remove('project')

# Get the microRNAs expressed in each cohort
for cohort in cohorts:
    # Set the path to the cohort's flagged expressed microRNAs file
    file_path = os.path.join(BRCA_PROCESSED_FILES_DIRS[cohort], file_name)
    
    # Create a set with the microRNAs expressed in the cohort
    df_expressed_mirs = pd.read_csv(file_path).query('is_expressed == 1')
    cohort_expressed_mirs = set(df_expressed_mirs['accession_id'].to_list())
    
    # Unite the sets of expressed microRNAs, general and cohort specific
    expressed_mirs = expressed_mirs.union(cohort_expressed_mirs)

In [6]:
# Print the total number of expressed microRNAs in TCGA-BRCA
print(f'Number of expressed microRNAs: {len(expressed_mirs)}')

Number of expressed microRNAs: 304


# miRWalk Data Download

In [7]:
# Create the Chrome WebDriver
driver = create_chrome_webdriver()

# Download the interactions file of each expressed microRNA
for accession_id in expressed_mirs:
    exporting_mir_targets_file(driver, accession_id)

# Close the Chrome WebDriver
driver.quit()

INFO:Downloading the interactions of microRNA MIMAT0003322...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/external/mirwalk/MIMAT0003322.csv
INFO:Downloading the interactions of microRNA MIMAT0004509...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/external/mirwalk/MIMAT0004509.csv
INFO:Downloading the interactions of microRNA MIMAT0000081...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/external/mirwalk/MIMAT0000081.csv
INFO:Downloading the interactions of microRNA MIMAT0004495...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/external/mirwalk/MIMAT0004495.csv
INFO:Downloading the interactions of microRNA MIMAT0026477...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/external/mirwalk/MIMAT0026477.csv
INFO:Downloading the interactions of microRNA MIMAT0016888...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cance

# miRWalk Data Processing

In [8]:
# Initialize a dictionary to map accession IDs to microRNA names
mir_mapping = dict.fromkeys(list(expressed_mirs))

# List the interaction files downloaded from miRWalk
files = [f for f in os.listdir(MIRWALK_EXTERNAL_DATA_DIR)]

# Process the interaction files individually
for file in files:
    # Process the file and get the microRNA name
    mir_name = processing_mir_targets_file(file)
    
    # Map the accession ID to the microRNA name
    accession_id = file.replace('.csv', '')
    mir_mapping[accession_id] = mir_name

# Create a DataFrame of the microRNA-related mapping
df_mir_mapping = pd.DataFrame.from_dict(data=mir_mapping, orient='index')
df_mir_mapping.reset_index(inplace=True)
df_mir_mapping.columns = ['accession_id', 'mirna_name']

# Store the DataFrame of the microRNA-related mapping in a CSV file
df_mir_mapping.to_csv(MIRWALK_MIR_MAPPING_FILE_PATH, index=False)

  df_interactions = pd.read_csv(f'{MIRWALK_EXTERNAL_DATA_DIR}/{file_name}')


In [9]:
# Print the DataFrame of the microRNA-related mapping
df_mir_mapping

Unnamed: 0,accession_id,mirna_name
0,MIMAT0003322,hsa-miR-652-3p
1,MIMAT0004509,hsa-miR-93-3p
2,MIMAT0000081,hsa-miR-25-3p
3,MIMAT0004495,hsa-miR-22-5p
4,MIMAT0026477,hsa-miR-128-1-5p
...,...,...
299,MIMAT0009451,hsa-miR-1976
300,MIMAT0001621,hsa-miR-369-5p
301,MIMAT0026480,hsa-miR-153-5p
302,MIMAT0000227,hsa-miR-197-3p
