# Web Scraping Data from miRWalk (http://mirwalk.umm.uni-heidelberg.de/)

## Install and Imports

In [1]:
# Refresh the apt package index quietly (-qq)
!apt-get update -qq
# Install the Selenium Python bindings quietly (-qq); no version is pinned here
!pip install selenium -qq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m448.3/448.3 kB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import time

from pathlib import Path
import shutil
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

## Constants

### Article

In [3]:
# microRNAs used to build the regulatory network. Every entry is the exact
# text typed into the miRWalk search form by export_mirna_targets().
# The trailing notes give shorter names for two entries (presumably the names
# as written in the article -- TODO confirm).
MIRNAS = (
    'miR-221-3p', 'miR-146b-5p', 'miR-222-3p', 'miR-181b-5p',
    'miR-155-5p', 'miR-34a-5p', 'miR-26a-5p', 'miR-224-5p',
    'miR-138-5p', 'miR-187-3p', 'miR-31-5p', 'miR-125b-5p',
    'let-7c-5p',   # let-7c
    'miR-30a-5p',
    'miR-30d-5p',  # miR-30d
)

# Visible text of the option chosen in the miRWalk species selector
SPECIES = 'human'

### miRWalk Database

In [4]:
# miRWalk URL; export_mirna_targets() opens this page before every search
URL = 'http://mirwalk.umm.uni-heidelberg.de'

# Default file name: the name under which export_mirna_targets() expects a
# freshly exported CSV to appear, before renaming it to '<mirna>.csv'
DEFAULT_FILE_NAME = 'miRWalk_miRNA_Targets.csv'

### Paths

In [5]:
# Root of the project data tree
DATA_DIR_PATH = Path('/content/drive/MyDrive/MC030/_repo/data')

# Sub-directories for external (downloaded) and processed data
EXTERNAL_DATA_DIR_PATH = DATA_DIR_PATH / 'external'
PROCESSED_DATA_DIR_PATH = DATA_DIR_PATH / 'processed'

# Folder that receives the miRWalk downloads
MIRWALK_DATA_DIR_PATH = EXTERNAL_DATA_DIR_PATH / 'mirwalk'

# Location of a freshly exported miRWalk file, before it is renamed
MIRWALK_FILE_PATH = MIRWALK_DATA_DIR_PATH / DEFAULT_FILE_NAME

## Functions

In [6]:
def create_driver(download_directory):
    """Create a headless Chrome WebDriver that downloads into a given folder.

    Args:
        download_directory: Folder used as Chrome's default download
            directory. A string or any path-like object is accepted; the
            value is converted with ``str()`` before being stored in the
            browser preferences, so ``pathlib.Path`` objects can be passed
            directly.

    Returns:
        The ``webdriver.Chrome`` instance built with these options. The caller
        is responsible for closing it (see ``quit_driver``).
    """
    # ChromeDriver configuration: command-line switches handed to Chrome
    chrome_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-extensions',
    )

    options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)

    # Set the download directory. The preference must be plain text, hence
    # the explicit str() conversion of path-like inputs.
    options.add_experimental_option(
        'prefs', {'download.default_directory': str(download_directory)}
    )

    # Initiate the Chrome browser
    return webdriver.Chrome(options=options)

def quit_driver(driver):
    """End the given WebDriver session by calling its ``quit()`` method.

    Args:
        driver: Object exposing a ``quit()`` method, such as the driver
            returned by ``create_driver``.
    """
    driver.quit()

def export_mirna_targets(driver, mirna, timeout=600, poll_interval=5):
    """Search miRWalk for one microRNA and download its targets as a CSV file.

    The function opens ``URL``, selects ``SPECIES``, types the microRNA name,
    runs the search and clicks the 'Export CSV' link. It then waits for the
    file ``MIRWALK_FILE_PATH`` to appear and renames it to ``<mirna>.csv``
    inside ``MIRWALK_DATA_DIR_PATH``.

    Args:
        driver: Selenium WebDriver. Its download directory must be
            ``MIRWALK_DATA_DIR_PATH``, because that is where the function
            looks for the exported file.
        mirna: microRNA name typed into the search form (e.g. 'miR-221-3p').
        timeout: Maximum number of seconds to wait for the exported file.
        poll_interval: Seconds to sleep between two checks for the file.

    Returns:
        Path of the renamed file (``MIRWALK_DATA_DIR_PATH / '<mirna>.csv'``).

    Raises:
        TimeoutError: If the exported file does not appear within ``timeout``
            seconds (previously the function waited forever).
    """
    # Remove a leftover export (e.g. from an interrupted run) so it cannot be
    # mistaken for the file requested below
    if MIRWALK_FILE_PATH.exists():
        MIRWALK_FILE_PATH.unlink()

    # Access the provided URL
    driver.get(URL)

    # Find and fill in the species selector
    Select(driver.find_element(By.NAME, 'species')).select_by_visible_text(SPECIES)

    # Find and fill in the microRNA input
    driver.find_element(By.NAME, 'mirna').send_keys(mirna)

    # Find and click the search button
    driver.find_element(By.XPATH, '//button[text()="search"]').click()

    # Find and click the result export link
    driver.find_element(By.LINK_TEXT, 'Export CSV').click()

    # Wait until the download is complete, but never longer than `timeout`.
    # NOTE(review): file existence is used as the completion signal; this
    # presumably relies on the browser writing to a temporary name until the
    # download finishes -- confirm.
    deadline = time.monotonic() + timeout
    while not MIRWALK_FILE_PATH.exists():
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f'miRWalk export for {mirna} did not appear at '
                f'{MIRWALK_FILE_PATH} within {timeout} seconds'
            )
        time.sleep(poll_interval)

    # Change the microRNA predicted targets file name
    mirna_file_path = MIRWALK_DATA_DIR_PATH / f'{mirna}.csv'
    MIRWALK_FILE_PATH.rename(mirna_file_path)

    return mirna_file_path

def descriptive_analysis(df):
    """Print summary counts for a miRNA:mRNA interaction table.

    Reports, in this order: the distinct ``mirnaid`` values, the distinct
    ``genesymbol`` values, the total number of rows, and the number of
    distinct (``mirnaid``, ``genesymbol``) pairs.

    Args:
        df: DataFrame containing at least the columns ``mirnaid`` and
            ``genesymbol``.

    Returns:
        None. Everything is written to standard output.
    """
    # Distinct microRNAs
    unique_mirnas = df['mirnaid'].unique()
    print(f'Number of distinct miRNAs: {len(unique_mirnas)}\n {unique_mirnas}')

    # Distinct target genes (mRNAs)
    unique_genes = df['genesymbol'].unique()
    print(f'\nNumber of distinct mRNAs: {len(unique_genes)}\n {unique_genes}')

    # Every row is one predicted miRNA:mRNA interaction
    row_count = len(df.index)
    print(f'\nNumber of interactions: {row_count}')

    # Interactions counted once per (miRNA, gene) pair
    pair_count = len(df.drop_duplicates(subset=['mirnaid', 'genesymbol']))
    print(f'\nNumber of distinct interactions: {pair_count}')

## Web Scraping

### Download & Data Concatenation

In [7]:
# Start from an empty download folder: delete the miRWalk data directory if it
# exists, so files left by a previous run cannot be picked up by mistake
if MIRWALK_DATA_DIR_PATH.exists():
    shutil.rmtree(MIRWALK_DATA_DIR_PATH)

# Create the ChromeDriver
driver = create_driver(str(MIRWALK_DATA_DIR_PATH))

# Per-microRNA tables; they are concatenated once after the loop instead of
# growing a DataFrame on every iteration
mirna_frames = []

try:
    # Iterate over microRNAs
    for mirna in MIRNAS:
        # Download the microRNA predicted targets file
        mirna_file_path = export_mirna_targets(driver, mirna)

        # Read the microRNA predicted targets file
        df_mirna = pd.read_csv(mirna_file_path)
        mirna_frames.append(df_mirna)
finally:
    # Quit the ChromeDriver even if a download or read step fails, so no
    # browser process is left running
    quit_driver(driver)

# Concatenate all predicted targets into a single DataFrame
# (an empty DataFrame when there is nothing to concatenate)
df_targets = (
    pd.concat(mirna_frames, ignore_index=True)
    if mirna_frames
    else pd.DataFrame()
)

### Concatenated DataFrame Display

In [8]:
# Column types of the concatenated table, as inferred by pd.read_csv
df_targets.dtypes

mirnaid                          object
refseqid                         object
genesymbol                       object
duplex                           object
start                             int64
end                               int64
bindingp                        float64
energy                          float64
seed                              int64
accessibility                   float64
au                              float64
phylopstem                      float64
phylopflank                     float64
me                              float64
number_of_pairings                int64
binding_region_length             int64
longest_consecutive_pairings      int64
position                         object
validated                        object
TargetScan                        int64
miRDB                             int64
dtype: object

In [9]:
# Show the concatenated targets table (the notebook renders a truncated preview)
display(df_targets)

Unnamed: 0,mirnaid,refseqid,genesymbol,duplex,start,end,bindingp,energy,seed,accessibility,...,phylopstem,phylopflank,me,number_of_pairings,binding_region_length,longest_consecutive_pairings,position,validated,TargetScan,miRDB
0,hsa-miR-221-3p,NM_001330410,SFXN5,AGCTACATTGTCTGCTGGGTTTC#GGCCCAGTGAGAGTGAGGCC#....,3839,3859,0.846154,-21.5,0,0.000179,...,0.000000,0.000000,-10.443999,17,20,10,3UTR,,0,0
1,hsa-miR-221-3p,NM_001330410,SFXN5,AGCTACATTGTCTGCTGGGTTTC#AACCTAGGAGCAGTGGGCC#.(...,2041,2060,1.000000,-20.6,0,0.000632,...,0.000000,0.000000,-11.593976,17,19,10,3UTR,,0,0
2,hsa-miR-221-3p,NM_001330413,BCAS3,AGCTACATTGTCTGCTGGGTTTC#ACCCAGCAAATCGATGGGCGGA...,1778,1816,0.923077,-22.9,1,0.003619,...,0.125505,0.610787,-3.937574,20,38,8,CDS,,0,0
3,hsa-miR-221-3p,NM_001330414,BCAS3,AGCTACATTGTCTGCTGGGTTTC#ACCCAGCAAATCGATGGGCGGA...,1733,1771,1.000000,-22.9,1,0.003661,...,-0.217478,-0.440093,-3.937574,20,38,8,CDS,,0,0
4,hsa-miR-221-3p,NM_001330414,BCAS3,AGCTACATTGTCTGCTGGGTTTC#GACCCTGAGTGGACAGTCACAG...,3274,3298,0.846154,-19.9,0,0.000115,...,0.000000,0.000000,-9.813321,18,24,10,3UTR,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
575980,hsa-miR-30d-5p,NM_001042482,TPK1,TGTAAACATCCCCGACTGGAAG#CTTTCAGATTGGGGTGTTAGAAG...,1164,1192,0.923077,-21.1,1,0.000032,...,0.000000,0.000000,-4.993964,21,28,10,3UTR,,0,0
575981,hsa-miR-30d-5p,NM_001042486,DLGAP4,TGTAAACATCCCCGACTGGAAG#CCAGTCGGGCCTGAGCAA#(((....,197,215,0.923077,-20.4,0,0.000058,...,0.000000,0.000000,-15.288489,14,18,9,5UTR,,1,1
575982,hsa-miR-30d-5p,NM_001042492,NF1,TGTAAACATCCCCGACTGGAAG#CTTCCGGTGGGGTGTCATGGCGG...,1,24,0.846154,-21.6,0,0.000030,...,0.000000,0.000000,-13.001863,18,23,15,5UTR,,1,1
575983,hsa-miR-30d-5p,NM_001042492,NF1,TGTAAACATCCCCGACTGGAAG#TCCGGTGTGGGGTGG#......(...,2488,2503,1.000000,-20.3,0,0.000057,...,0.000000,0.000000,-17.712529,12,15,6,CDS,,1,1


## Descriptive Analysis

### All Interactions

In [10]:
descriptive_analysis(df_targets)

Number of distinct miRNAs: 15
 ['hsa-miR-221-3p' 'hsa-miR-146b-5p' 'hsa-miR-222-3p' 'hsa-miR-181b-5p'
 'hsa-miR-155-5p' 'hsa-miR-34a-5p' 'hsa-miR-26a-5p' 'hsa-miR-224-5p'
 'hsa-miR-138-5p' 'hsa-miR-187-3p' 'hsa-miR-31-5p' 'hsa-miR-125b-5p'
 'hsa-let-7c-5p' 'hsa-miR-30a-5p' 'hsa-miR-30d-5p']

Number of distinct mRNAs: 19313
 ['SFXN5' 'BCAS3' 'LARP4' ... 'PRRX2' 'TMEM271' 'GNGT1']

Number of interactions: 575985

Number of distinct interactions: 171607


### "Validated" Interactions

In [11]:
# Interactions flagged in the TargetScan column (value greater than zero)
df_targetscan = df_targets[df_targets['TargetScan'] > 0].reset_index(drop=True)

descriptive_analysis(df_targetscan)

Number of distinct miRNAs: 15
 ['hsa-miR-221-3p' 'hsa-miR-146b-5p' 'hsa-miR-222-3p' 'hsa-miR-181b-5p'
 'hsa-miR-155-5p' 'hsa-miR-34a-5p' 'hsa-miR-26a-5p' 'hsa-miR-224-5p'
 'hsa-miR-138-5p' 'hsa-miR-187-3p' 'hsa-miR-31-5p' 'hsa-miR-125b-5p'
 'hsa-let-7c-5p' 'hsa-miR-30a-5p' 'hsa-miR-30d-5p']

Number of distinct mRNAs: 4600
 ['ETS1' 'HMBOX1' 'TRPS1' ... 'ITGA4' 'DIO2' 'GOLGA8B']

Number of interactions: 32645

Number of distinct interactions: 8119


In [12]:
# Interactions flagged in the miRDB column (value greater than zero)
df_mirdb = df_targets[df_targets['miRDB'] > 0].reset_index(drop=True)

descriptive_analysis(df_mirdb)

Number of distinct miRNAs: 15
 ['hsa-miR-221-3p' 'hsa-miR-146b-5p' 'hsa-miR-222-3p' 'hsa-miR-181b-5p'
 'hsa-miR-155-5p' 'hsa-miR-34a-5p' 'hsa-miR-26a-5p' 'hsa-miR-224-5p'
 'hsa-miR-138-5p' 'hsa-miR-187-3p' 'hsa-miR-31-5p' 'hsa-miR-125b-5p'
 'hsa-let-7c-5p' 'hsa-miR-30a-5p' 'hsa-miR-30d-5p']

Number of distinct mRNAs: 5238
 ['SANBR' 'ETS1' 'SLC30A6' ... 'RHEBL1' 'ITGA4' 'PATL2']

Number of interactions: 33915

Number of distinct interactions: 8441


In [13]:
# miRTarBase interactions: rows whose 'validated' column is not null
df_mirtarbase = df_targets[df_targets['validated'].notna()].reset_index(drop=True)

descriptive_analysis(df_mirtarbase)

Number of distinct miRNAs: 15
 ['hsa-miR-221-3p' 'hsa-miR-146b-5p' 'hsa-miR-222-3p' 'hsa-miR-181b-5p'
 'hsa-miR-155-5p' 'hsa-miR-34a-5p' 'hsa-miR-26a-5p' 'hsa-miR-224-5p'
 'hsa-miR-138-5p' 'hsa-miR-187-3p' 'hsa-miR-31-5p' 'hsa-miR-125b-5p'
 'hsa-let-7c-5p' 'hsa-miR-30a-5p' 'hsa-miR-30d-5p']

Number of distinct mRNAs: 2966
 ['ETS1' 'CXorf38' 'ATL2' ... 'SLFN5' 'MTR' 'RPS17']

Number of interactions: 15234

Number of distinct interactions: 3914


## Storage of Processed Data

In [14]:
# Create the processed data directory, if necessary; parents=True also creates
# missing intermediate folders instead of failing with FileNotFoundError
PROCESSED_DATA_DIR_PATH.mkdir(parents=True, exist_ok=True)

# Save the complete microRNA targets table
complete_data_file_path = PROCESSED_DATA_DIR_PATH / 'mirna-mrna_mirwalk_data.csv'
df_targets.to_csv(complete_data_file_path, index=False)

# Save only the miRTarBase subset (rows with a non-null 'validated' value)
valid_data_file_path = (
    PROCESSED_DATA_DIR_PATH / 'mirna-mrna_mirwalk_mirtarbase_data.csv'
)
df_mirtarbase.to_csv(valid_data_file_path, index=False)