# Wahapedia data card collector

## Importing libraries

Here are the libraries we will use to scrape the data from Wahapedia.

In [55]:
from selenium.webdriver import FirefoxOptions as Options, Firefox as Browser
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import os
import requests
import time

## Firefox webdriver

The webdriver is a tool that allows us to interact with a web browser. In this case, we will use the Firefox webdriver to interact with the Wahapedia website.

In [56]:
# Firefox launch options: request a 2560x1440 browser window.
opts = Options()
for _size_flag in ("--width=2560", "--height=1440"):
    opts.add_argument(_size_flag)

# Start the browser session that every later cell drives.
driver = Browser(options=opts)

## uBlock Origin

We start by adding the uBlock Origin extension to our browser. This will allow us to block ads and pop-ups that could interfere with our scraping.

In [57]:
# Where the extension is downloaded from, and where it is cached locally.
ublock_url = "https://addons.mozilla.org/firefox/downloads/latest/ublock-origin/addon-1318898-latest.xpi"
ublock_path = "../docs/assets/extensions/ublock_origin.xpi"

# Download the extension only once; later runs reuse the cached file.
if not os.path.exists(ublock_path):
    # The cache folder may not exist yet, in which case open() would fail.
    os.makedirs(os.path.dirname(ublock_path), exist_ok=True)

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(ublock_url, timeout=30)
    # Fail loudly on an HTTP error instead of caching the error page as the
    # .xpi file, which would then be reused on every later run.
    response.raise_for_status()

    with open(ublock_path, "wb") as file:
        file.write(response.content)

driver.install_addon(ublock_path)

'uBlock0@raymondhill.net'

## Output directory

We check if the output directory exists, and if it doesn't, we create it.

In [58]:
# Root folder for the screenshots; each faction gets its own sub-folder.
output_dir = "../out/factions/"

# Create the folder and any missing parents; does nothing if it already exists.
os.makedirs(name=output_dir, exist_ok=True)

## Select the sources

In [59]:
# Root of the Warhammer 40k 10th edition section; every other URL derives from it.
base_url = "https://wahapedia.ru/wh40k10ed/"
# Prefix of the faction pages; a faction (and optionally a unit) name is appended.
factions_url = base_url + "factions/"
# Page that get_factions() loads to read the faction links.
home_url = base_url + "the-rules/quick-start-guide/"

## Fetch the faction names

In [60]:
def get_names_from_html(html):
    """Return the last URL path segment of every link found under ``html``.

    Args:
        html: A Selenium element (anything exposing ``find_elements``) whose
            descendant ``<a>`` tags are inspected.

    Returns:
        list[str]: For each anchor with a non-empty ``href``, the text after
        the final ``/`` of that URL, in the order the anchors were found.
        Entries equal to ``"datasheets.html"`` are left out.
    """
    names = []
    for link in html.find_elements(By.TAG_NAME, "a"):
        href = link.get_attribute("href")
        # get_attribute() returns None when the anchor has no href; calling
        # .split() on it would raise AttributeError, so skip such anchors.
        if not href:
            continue

        name = href.split("/")[-1]
        # Skip datasheets.html as it is not a valid name.
        if name != "datasheets.html":
            names.append(name)
    return names

In [61]:
def get_factions():
    """Load ``home_url`` and return the names linked from one page container.

    The container is located with a fixed absolute XPath (presumably the
    faction menu -- verify against the live page) and waited on for up to
    10 seconds.

    Returns:
        list[str]: Whatever ``get_names_from_html`` extracts from the links
        inside that container.
    """
    driver.get(home_url)

    container_xpath = '/html/body/div[1]/div[1]/div[1]/div[1]/div[2]/div[5]/div[2]/div'
    container_present = EC.presence_of_element_located((By.XPATH, container_xpath))
    container = WebDriverWait(driver, 10).until(container_present)

    return get_names_from_html(container)

In [62]:
# Names extracted from the links that get_factions() finds on the home page.
factions = get_factions()

## Fetch the unit names

In [63]:
def fetch_units_name_from_faction(faction):
    """Return the unit names listed in a faction page's army-list tooltip.

    Opens ``factions_url + faction``, hovers over the ``#btnArmyList`` button
    and reads the links of the second element whose id is
    ``tooltip_contentArmyList``.

    Args:
        faction: Faction name as it appears in the URL, e.g. ``"t-au-empire"``.

    Returns:
        list[str]: Whatever ``get_names_from_html`` extracts from the tooltip.

    Raises:
        selenium.common.exceptions.TimeoutException: If the button, or a
            second tooltip element containing at least one link, does not
            show up within 10 seconds.
    """
    driver.get(factions_url + faction)

    # Wait for the button to be present
    button = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="btnArmyList"]'))
    )

    # Hover over the button so the page shows the tooltip
    ActionChains(driver).move_to_element(button).perform()

    def populated_tooltip(browser):
        # The original code read index 1 of the matches after a fixed one-second
        # sleep; presumably index 0 is a hidden template and index 1 the tooltip
        # shown on hover (verify against the live page). Polling for it avoids
        # both the arbitrary delay and an IndexError when the page is slow.
        matches = browser.find_elements(By.XPATH, '//*[@id="tooltip_contentArmyList"]')
        if len(matches) < 2:
            return False
        # Only accept the tooltip once it actually contains links.
        if not matches[1].find_elements(By.TAG_NAME, "a"):
            return False
        return matches[1]

    tooltip = WebDriverWait(driver, 10).until(populated_tooltip)

    # Extract the unit names from the tooltip popup
    return get_names_from_html(tooltip)

In [65]:
# Example run: list the unit names of one faction. The result gets its own
# variable so that the `factions` list fetched earlier is not overwritten
# with unit names.
units = fetch_units_name_from_faction("t-au-empire")

## Fetch the data card

We define the function that takes a screenshot of a unit's data card, then run it on a single example unit.

In [66]:
def fetch_clean_image(faction, unit):
    """Save a screenshot of one unit's data card as a PNG file.

    Opens ``factions_url + faction + "/" + unit``, removes the
    ``#btnArmyList`` element from the page if it is there, and screenshots
    the element matching ``//*[@id="wrapper"]/div[4]`` into
    ``<output_dir>/<faction>/<unit>.png``.

    Args:
        faction: Faction name as it appears in the URL, e.g. ``"aeldari"``.
        unit: Unit name as it appears in the URL; also used as the file name.

    Raises:
        selenium.common.exceptions.TimeoutException: If the card element is
            not present within 10 seconds.
    """
    # Gets the page
    driver.get(factions_url + faction + "/" + unit)

    # Wait for the card first so the DOM is known to be in place before it is
    # modified.
    data_card = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="wrapper"]/div[4]'))
    )

    # Remove the army-list button (presumably so it cannot overlap the card).
    # querySelector returns null when the element is missing, and calling
    # .remove() on null would abort the script with a JavaScript error.
    driver.execute_script(
        """
        var button = document.querySelector("#btnArmyList");
        if (button) {
            button.remove();
        }
        """
    )

    # Isolate the card and take a screenshot
    faction_dir = os.path.join(output_dir, faction)
    os.makedirs(faction_dir, exist_ok=True)
    data_card.screenshot(os.path.join(faction_dir, unit + ".png"))

In [67]:
# Example run: capture the data card of a single unit.
fetch_clean_image(faction="aeldari", unit="Hemlock-Wraithfighter")

## Finish job

In [68]:
# End the browser session started at the top of the notebook.
driver.quit()