# Wahapedia data card collector

## Importing libraries

Here are the libraries we will use to scrape the data from Wahapedia.

In [None]:
from selenium.webdriver import FirefoxOptions as Options, Firefox as Browser
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
import requests

## Firefox webdriver

The webdriver is a tool that allows us to interact with a web browser. In this case, we will use the Firefox webdriver to interact with the Wahapedia website.

In [None]:
# Firefox start-up options: request a 2560x1440 browser window.
opts = Options()
for window_flag in ("--width=2560", "--height=1440"):
    opts.add_argument(window_flag)

# Launch Firefox with the options configured above.
driver = Browser(options=opts)

## uBlock Origin

We start by adding the uBlock Origin extension to our browser. This allows us to block ads and pop-ups that could interfere with our scraping.

In [None]:
# Where the uBlock Origin add-on is downloaded from, and where it is cached
# locally so it is only fetched once.
ublock_url = "https://addons.mozilla.org/firefox/downloads/latest/ublock-origin/addon-1318898-latest.xpi"
ublock_path = "../docs/assets/extensions/ublock_origin.xpi"

if not os.path.exists(ublock_path):
    # The cache folder may not exist yet (e.g. on a fresh checkout); create it
    # so that opening the file for writing cannot fail on a missing directory.
    os.makedirs(os.path.dirname(ublock_path), exist_ok=True)

    # A timeout keeps the notebook from hanging forever on a stalled request.
    response = requests.get(ublock_url, timeout=60)
    # Fail loudly on an HTTP error instead of saving an error page as the
    # .xpi, which would then be reused on every later run.
    response.raise_for_status()

    with open(ublock_path, "wb") as file:
        file.write(response.content)

# Install the cached add-on into the running Firefox session.
driver.install_addon(ublock_path)

## Output directory

We check if the output directory exists, and if it doesn't, we create it.

In [None]:
# Folder that receives the generated screenshots.
output_dir = "../out"

# exist_ok=True makes this a no-op when the folder is already present.
os.makedirs(name=output_dir, exist_ok=True)

## Accessing the webpage & dismissing cookies

We will access the Wahapedia website and dismiss the cookies notification.

In [None]:
# Unit page whose data card is captured later in the notebook.
card_url = "https://wahapedia.ru/wh40k10ed/factions/t-au-empire/Ta-unar-Supremacy-Armour"
driver.get(card_url)

In [None]:
# Wait up to 10 seconds for the "manage settings" button of the cookie
# notification to become clickable, then click it.
manage_settings_locator = (By.XPATH, '//*[@id="ez-manage-settings"]')
manage_settings_wait = WebDriverWait(driver, 10)
cookies_button = manage_settings_wait.until(
    EC.element_to_be_clickable(manage_settings_locator)
)
cookies_button.click()

In [None]:
# Wait up to 10 seconds for the "save settings" button to become clickable,
# then click it.
save_settings_locator = (By.XPATH, '//*[@id="ez-save-settings"]')
save_settings_wait = WebDriverWait(driver, 10)
save_exit_button = save_settings_wait.until(
    EC.element_to_be_clickable(save_settings_locator)
)
save_exit_button.click()

## Cleaning the page

We will remove the useless parts from the card page.

In [None]:
# Remove the privacy center, army list button and back-to-top button from the
# page. Each selector is checked for a match first: querySelector returns null
# when nothing matches, and calling .remove() on null would throw and abort
# the script before the remaining elements are removed.
driver.execute_script(
    """
    for (const selector of ["#ezPrivacyCenter", "#btnArmyList", "#btnBackToTop"]) {
        const element = document.querySelector(selector);
        if (element !== null) {
            element.remove();
        }
    }
    """
)

## Scraping the data

We will scrape the data from the Wahapedia website. We will collect the data from the data cards of the units.

In [None]:
# Wait up to 10 seconds for the data card element to be present in the DOM.
card_locator = (By.XPATH, '//*[@id="wrapper"]/div[4]')
data_card = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located(card_locator)
)

# Save a screenshot of just that element into the output directory.
screenshot_path = os.path.join(output_dir, "data_card.png")
data_card.screenshot(screenshot_path)

In [None]:
# Shut down the browser and end the WebDriver session now that we are done.
driver.quit()