# Web Scraping do miRWalk
Página web: http://mirwalk.umm.uni-heidelberg.de/

# Importação das Bibliotecas

In [1]:
import time

from pathlib import Path
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

# Constantes

## API

In [2]:
# Base URL of the miRWalk website that is opened for every search
URL = 'http://mirwalk.umm.uni-heidelberg.de'

# Default name of the CSV file downloaded from miRWalk
DEFAULT_FILE_NAME = 'miRWalk_miRNA_Targets.csv'

## Pastas de Dados

In [3]:
# Root data folder: a 'data' directory in the parent of the current
# working directory
DATA_PATH = Path.cwd().parent / 'data'

# Folder holding the external data
EXTERNAL_DATA_PATH = DATA_PATH / 'external'

# Folder holding the intermediate data
INTERIM_DATA_PATH = DATA_PATH / 'interim'

# Make sure every data folder exists (missing parents are created and
# already-existing folders are left untouched)
for _data_folder in (DATA_PATH, EXTERNAL_DATA_PATH, INTERIM_DATA_PATH):
    _data_folder.mkdir(parents=True, exist_ok=True)

# Default location of the file downloaded from miRWalk
MIRWALK_FILE_PATH = EXTERNAL_DATA_PATH / DEFAULT_FILE_NAME

## Parâmetros

In [4]:
# Species selected in the miRWalk search form
SPECIES = 'human'

# microRNAs derived from the DLK1-DIO3 region that are associated with
# kidney cancer, i.e. differentially expressed in that cancer according to
# the data of the KIRP project of The Cancer Genome Atlas
MIRNAS = (
    'miR-495-3p',
    'miR-485-5p',
    'miR-665',
    'miR-539-5p',
    'miR-543',
    'miR-654-5p',
    'miR-370-3p',
    'miR-544a',
    'miR-541-3p',
    'miR-432-5p',
    'miR-381-3p',
    'miR-656-3p',
    'miR-377-3p',
    'miR-655-3p',
    'miR-410-3p',
    'miR-329-3p',
    'miR-494-3p',
    'miR-493-5p',
    'miR-134-5p',
    'miR-127-5p',
    'miR-382-5p',
    'miR-369-3p',
    'miR-758-3p',
    'miR-323a-3p',
    'miR-409-3p',
    'miR-136-5p',
    'miR-493-3p',
    'miR-299-3p',
    'miR-433-3p',
    'miR-380-3p',
    'miR-668-3p',
    'miR-1197',
    'miR-770-5p',
    'miR-431-5p',
    'miR-654-3p',
    'miR-376b-3p',
    'miR-376c-3p',
    'miR-154-5p',
    'miR-485-3p',
    'miR-487a-3p',
    'miR-889-3p',  # TODO: confirm this identifier (miR-8898-3p?)
    'miR-541-5p',
    'miR-376a-3p',
    'miR-496',
)

# Funções

In [5]:
def create_driver(download_directory):
    """Create a headless Chrome WebDriver that downloads into a given folder.

    Args:
        download_directory: Folder where Chrome should save downloaded
            files. May be a ``str`` or any path-like object; it is converted
            to an absolute path string before being handed to Chrome.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller is responsible
        for shutting it down (see ``quit_driver``).
    """
    # The preference value is sent to the browser as plain data, so a
    # ``Path`` object is normalized to a string here. Relative paths are
    # resolved against the current working directory instead of being left
    # for the browser to interpret.
    download_directory = str(Path(download_directory).absolute())

    # Selenium ChromeDriver configuration
    options = webdriver.ChromeOptions()
    for argument in (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-extensions',
    ):
        options.add_argument(argument)

    # Set the download directory according to the parameter
    options.add_experimental_option(
        'prefs', {'download.default_directory': download_directory}
    )

    # Start the Chrome browser
    return webdriver.Chrome(options=options)

def quit_driver(driver):
    """Shut down the browser session held by ``driver``.

    Args:
        driver: Object exposing a ``quit()`` method, such as the WebDriver
            returned by ``create_driver``.
    """
    driver.quit()

def export_mirna_targets(driver, mirna, timeout=600, poll_interval=5):
    """Search miRWalk for ``mirna`` and download its targets as a CSV file.

    The function drives the miRWalk search form, clicks the CSV export link,
    waits for the file to appear at ``MIRWALK_FILE_PATH`` and then moves it
    to ``EXTERNAL_DATA_PATH / '<mirna>.csv'``. It therefore expects
    ``driver`` to download into ``EXTERNAL_DATA_PATH``.

    Args:
        driver: Selenium WebDriver used to browse the site.
        mirna: microRNA identifier typed into the search form.
        timeout: Maximum number of seconds to wait for the downloaded file
            to appear before giving up.
        poll_interval: Number of seconds to sleep between two checks for
            the downloaded file.

    Returns:
        ``pathlib.Path`` of the renamed CSV file.

    Raises:
        TimeoutError: If the file has not appeared after ``timeout`` seconds.
    """
    # A leftover file from an earlier (interrupted) run would make the wait
    # loop below succeed immediately and be mistaken for this download.
    if MIRWALK_FILE_PATH.exists():
        MIRWALK_FILE_PATH.unlink()

    # Open the requested URL
    driver.get(URL)

    # Find and fill in the species selector
    species_input = Select(driver.find_element(By.NAME, 'species'))
    species_input.select_by_visible_text(SPECIES)

    # Find and fill in the microRNA identifier
    mirna_input = driver.find_element(By.NAME, 'mirna')
    mirna_input.send_keys(mirna)

    # Find and click the search button
    search_btn = driver.find_element(By.XPATH, '//button[text()="search"]')
    search_btn.click()

    # Find and click the link that exports the result as a CSV file
    export_link = driver.find_element(By.LINK_TEXT, 'Export CSV')
    export_link.click()

    # Wait until the download has finished, but never forever: a download
    # that does not start would otherwise hang the whole run.
    deadline = time.monotonic() + timeout
    while not MIRWALK_FILE_PATH.exists():
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f'Download of {mirna} targets did not finish within '
                f'{timeout} seconds'
            )
        time.sleep(poll_interval)

    # Rename the downloaded file; ``replace`` overwrites the result of a
    # previous run on every platform, whereas ``rename`` fails on Windows
    # when the destination already exists.
    mirna_file_path = EXTERNAL_DATA_PATH / f'{mirna}.csv'
    MIRWALK_FILE_PATH.replace(mirna_file_path)

    return mirna_file_path

# Download dos Arquivos

In [6]:
# Create the ChromeDriver that downloads into the external data folder
driver = create_driver(str(EXTERNAL_DATA_PATH))

# One DataFrame of targets per microRNA. They are concatenated once after
# the loop, which avoids re-copying the accumulated rows on every iteration.
mirna_target_frames = []

try:
    # Iterate over the tuple of microRNAs
    for mirna in MIRNAS:
        # Download the CSV file with the targets of the microRNA
        mirna_file_path = export_mirna_targets(driver, mirna)

        # Read the CSV file with the targets of the microRNA
        df_mirna_targets = pd.read_csv(mirna_file_path)
        mirna_target_frames.append(df_mirna_targets)
finally:
    # Shut the ChromeDriver down even when a download or read fails, so the
    # browser process is not left running
    quit_driver(driver)

# Combine the downloaded data into a single DataFrame; ``pd.concat`` rejects
# an empty list, so fall back to an empty DataFrame when nothing was read
if mirna_target_frames:
    df_interactions = pd.concat(objs=mirna_target_frames, ignore_index=True)
else:
    df_interactions = pd.DataFrame()

# Save a file with all mined interactions
df_interactions.to_csv(
    path_or_buf=INTERIM_DATA_PATH / 'miRWalk-interactions.csv', index=False
)

In [7]:
# Display the DataFrame with all mined interactions
df_interactions

Unnamed: 0,mirnaid,refseqid,genesymbol,duplex,start,end,bindingp,energy,seed,accessibility,...,phylopstem,phylopflank,me,number_of_pairings,binding_region_length,longest_consecutive_pairings,position,validated,TargetScan,miRDB
0,hsa-miR-495-3p,NM_001330416,ST8SIA2,AAACAAACATGGTGCACTTCTT#GTGCACCGTT#........((((...,3595,3605,0.846154,-15.3,0,0.029613,...,4.187171,5.161961,-19.423880,9,10,9,3UTR,,1,1
1,hsa-miR-495-3p,NM_001330420,LUC7L,AAACAAACATGGTGCACTTCTT#AGAAGTGTACAGAGTTGCTCCTG...,2462,2507,0.846154,-16.4,0,0.004972,...,4.368170,5.301501,-9.076038,18,45,10,3UTR,,0,0
2,hsa-miR-495-3p,NM_001330425,NAGK,AAACAAACATGGTGCACTTCTT#GTGACACTATGTGTTGTG#..((...,1699,1717,0.846154,-16.7,1,0.000437,...,5.845809,5.747344,-4.957001,15,18,8,3UTR,,0,0
3,hsa-miR-495-3p,NM_001330437,PTPN11,AAACAAACATGGTGCACTTCTT#AGGAGGTGCACCATA#..........,1495,1510,0.846154,-22.0,0,0.001344,...,5.191274,4.818038,-19.423880,14,15,14,CDS,,0,0
4,hsa-miR-495-3p,NM_001330438,DDX25,AAACAAACATGGTGCACTTCTT#GAGAAGTGTATCCAGATACTTTG...,3709,3735,0.884615,-18.7,1,0.000172,...,-0.342248,0.030155,-7.163349,19,26,10,3UTR,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1449871,hsa-miR-496,NM_001042364,PTPN20,TGAGTATTACATGGCCAATCTC#GAGAACTAGATCAGTTGGCTCAG...,309,352,1.000000,-22.5,1,0.119292,...,0.000000,0.000000,-5.180081,20,23,15,CDS,,0,0
1449872,hsa-miR-496,NM_001042440,CAST,TGAGTATTACATGGCCAATCTC#GGCCAGATGATGCTA#..(((((...,852,867,1.000000,-18.0,1,0.001333,...,0.000000,0.000000,-9.024282,12,15,7,CDS,,0,0
1449873,hsa-miR-496,NM_001042442,CAST,TGAGTATTACATGGCCAATCTC#GGCCAGATGATGCTA#..(((((...,909,924,1.000000,-18.0,1,0.001333,...,0.000000,0.000000,-9.024282,12,15,7,CDS,,0,0
1449874,hsa-miR-496,NM_001042443,CAST,TGAGTATTACATGGCCAATCTC#GGCCAGATGATGCTA#..(((((...,828,843,1.000000,-18.0,1,0.001333,...,0.000000,0.000000,-9.024282,12,15,7,CDS,,0,0
