# Wahapedia data card collector

## Importing libraries

Here are the libraries we will use to scrape the data from Wahapedia.

In [176]:
from selenium.webdriver import FirefoxOptions as Options, Firefox as Browser
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import os
import requests
import time
import json

## Firefox webdriver

The webdriver is a tool that allows us to interact with a web browser. In this case, we will use the Firefox webdriver to interact with the Wahapedia website.

In [177]:
opts = Options()

opts.add_argument("--width=2560")
opts.add_argument("--height=1440")

driver = Browser(options=opts)

## Ublock Origin

We start by adding the Ublock Origin extension to our browser. This will allow us to block ads and pop-ups that could interfere with our scraping.

In [178]:
ublock_url = "https://addons.mozilla.org/firefox/downloads/latest/ublock-origin/addon-1318898-latest.xpi"
ublock_path = "../docs/assets/extensions/ublock_origin.xpi"

if not os.path.exists(ublock_path):
    response = requests.get(ublock_url)
    with open(ublock_path, "wb") as file:
        file.write(response.content)

driver.install_addon(ublock_path)

'uBlock0@raymondhill.net'

## Output directory

We check if the output directory exists, and if it doesn't, we create it.

In [179]:
output_dir = "../out/factions/"
source_dir = "../out/source/"
os.makedirs(output_dir, exist_ok=True)
os.makedirs(source_dir, exist_ok=True)

## Select the sources

In [180]:
base_url = "https://wahapedia.ru/wh40k10ed/"
factions_url = base_url + "factions/"
home_url = "https://wahapedia.ru/wh40k10ed/the-rules/quick-start-guide/"

## Fetch the factions names

In [181]:
def get_names_from_html(html):
    links = html.find_elements(By.TAG_NAME, "a")
    hrefs = [link.get_attribute('href') for link in links]
    names = [href.split("/")[-1] for href in hrefs]
    
    # Remove datasheets.html as it is not a valid name
    names = [name for name in names if name != "datasheets.html"]
    return names

In [182]:
def get_factions_names():
    driver.get(home_url)
    
    button = WebDriverWait(driver, 3).until(
        EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div[1]/div[2]/div[5]/div[2]/div'))
    )

    units_names = get_names_from_html(button)
    
    return units_names

In [183]:
factions = {name: None for name in get_factions_names()}

In [184]:
factions

{'adepta-sororitas': None,
 'adeptus-custodes': None,
 'adeptus-mechanicus': None,
 'adeptus-titanicus': None,
 'astra-militarum': None,
 'grey-knights': None,
 'imperial-agents': None,
 'imperial-knights': None,
 'space-marines': None,
 'chaos-daemons': None,
 'chaos-knights': None,
 'chaos-space-marines': None,
 'death-guard': None,
 'thousand-sons': None,
 'world-eaters': None,
 'aeldari': None,
 'drukhari': None,
 'genestealer-cults': None,
 'leagues-of-votann': None,
 'necrons': None,
 'orks': None,
 't-au-empire': None,
 'tyranids': None,
 'unaligned-forces': None}

## Fetch the units names

In [185]:
def fetch_units_name_from_faction(faction):
    driver.get(factions_url + faction)
    
    # Refuse cookies
    try:
        cookies_button = WebDriverWait(driver, 1).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="ez-manage-settings"]'))
        )
        cookies_button.click()
    except Exception as e:
        print("Cookies button not found ("+faction+")")
    
    try:
        save_exit_button = WebDriverWait(driver, 1).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="ez-save-settings"]'))
        )
        save_exit_button.click()
    except Exception as e:
        print("Save and exit button not found ("+faction+")")
    
    # Remove the annoying elements
    try:
        driver.execute_script(
            """
            document.querySelector("#ezPrivacyCenter").remove();
            document.querySelector("#btnBackToTop").remove();
            """
        )
    except Exception as e:
        print("Annoying elements not found ("+faction+")")
    
    # Wait for the button to be present
    button = WebDriverWait(driver, 1).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="btnArmyList"]'))
    )

    # Create an ActionChains object
    actions = ActionChains(driver)

    # Hover over the button
    actions.move_to_element(button).perform()

    # Get the tooltip content
    time.sleep(1)
    tooltip = driver.find_elements(By.XPATH, '//*[@id="tooltip_contentArmyList"]')

    # Extract the unit names from the tooltip popup
    units_names = get_names_from_html(tooltip[1])
    
    return units_names

In [186]:
for key in factions.keys():
    factions[key] = fetch_units_name_from_faction(key)

Cookies button not found (adeptus-custodes)
Save and exit button not found (adeptus-custodes)
Cookies button not found (adeptus-mechanicus)
Save and exit button not found (adeptus-mechanicus)
Cookies button not found (adeptus-titanicus)
Save and exit button not found (adeptus-titanicus)
Cookies button not found (astra-militarum)
Save and exit button not found (astra-militarum)
Cookies button not found (grey-knights)
Save and exit button not found (grey-knights)
Cookies button not found (imperial-agents)
Save and exit button not found (imperial-agents)
Cookies button not found (imperial-knights)
Save and exit button not found (imperial-knights)
Cookies button not found (space-marines)
Save and exit button not found (space-marines)
Cookies button not found (chaos-daemons)
Save and exit button not found (chaos-daemons)
Cookies button not found (chaos-knights)
Save and exit button not found (chaos-knights)
Cookies button not found (chaos-space-marines)
Save and exit button not found (chaos

In [187]:
factions

{'adepta-sororitas': ['Aestred-Thurga-And-Agathae-Dolan',
  'Canoness',
  'Canoness-with-Jump-Pack',
  'Daemonifuge',
  'Dialogus',
  'Dogmata',
  'Hospitaller',
  'Imagifier',
  'Junith-Eruita',
  'Ministorum-Priest',
  'Morvenn-Vahl',
  'Palatine',
  'Saint-Celestine',
  'Triumph-Of-Saint-Katherine',
  'Celestian-Sacresant-Aveline',
  'Battle-Sisters-Squad',
  'Immolator',
  'Sororitas-Rhino',
  'Repressor',
  'Battle-Sanctum',
  'Arco-flagellants',
  'Castigator',
  'Celestian-Sacresants',
  'Dominion-Squad',
  'Exorcist',
  'Mortifiers',
  'Paragon-Warsuits',
  'Penitent-Engines',
  'Repentia-Squad',
  'Retributor-Squad',
  'Seraphim-Squad',
  'Sisters-Novitiate-Squad',
  'Zephyrim-Squad',
  'Crusaders',
  'Death-Cult-Assassins'],
 'adeptus-custodes': ['Aleya',
  'Blade-Champion',
  'Knight-centura',
  'Shield-captain',
  'Shield-captain-In-Allarus-Terminator-Armour',
  'Shield-captain-On-Dawneagle-Jetbike',
  'Trajann-Valoris',
  'Valerian',
  'Custodian-Guard',
  'Anathema-Psykan

In [200]:
def save_dict_to_json(dictionary, path):
    with open(source_dir + path + ".json", "w") as file:
        json.dump(dictionary, file)

In [189]:
save_dict_to_json(factions, "source")

## Fetch the data card

We define the function we are going to use, then we fetch the data according to the defined sources.

In [190]:
def fetch_clean_image(faction, unit):
    # Gets the page
    driver.get(factions_url + faction + "/" + unit)
    
    # Refuse cookies
    try:
        cookies_button = WebDriverWait(driver, 1).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="ez-manage-settings"]'))
        )
        cookies_button.click()
    except Exception as e:
        print("Cookies button not found ("+faction+" - "+unit+")")
    
    try:
        save_exit_button = WebDriverWait(driver, 1).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="ez-save-settings"]'))
        )
        save_exit_button.click()
    except Exception as e:
        print("Save and exit button not found ("+faction+" - "+unit+")")
    
    # Remove the annoying elements
    try:
        driver.execute_script(
            """
            document.querySelector("#ezPrivacyCenter").remove();
            document.querySelector("#btnBackToTop").remove();
            """
        )
    except Exception as e:
        print("Annoying elements not found ("+faction+" - "+unit+")")
    
    driver.execute_script(
        """
        document.querySelector("#btnArmyList").remove();
        """
    )
    
    # Isolate the card and take a screenshot
    data_card = WebDriverWait(driver, 1).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="wrapper"]/div[4]'))
    )
    os.makedirs(output_dir + faction, exist_ok=True)
    data_card.screenshot(output_dir + faction + "/" + unit + ".png")

In [191]:
def find_temp_file():
    for file in os.listdir(source_dir):
        if file.startswith("temp"):
            return file
    return None

In [192]:
def load_json_dict(path):
    with open(source_dir + path, "r") as file:
        return json.load(file)

In [193]:
def remove_temp_json():
    temp_file = find_temp_file()
    if temp_file:
        os.remove(source_dir + temp_file)

In [204]:
def load_temp_json_if_exists():
    temp_file = find_temp_file()
    if temp_file:
        return load_json_dict(temp_file)
    return None

In [205]:
def init_factions_to_fetch():
    temp_dict = load_temp_json_if_exists()
    if temp_dict:
        return temp_dict
    save_dict_to_json(factions, "temp")
    return factions.copy()

In [206]:
factions_to_fetch = init_factions_to_fetch()

try:
    for key in factions_to_fetch.keys():
        for val in factions_to_fetch[key]:
            fetch_clean_image(key, val)
            factions_to_fetch[key].remove(val)
    remove_temp_json()
            
except Exception as e:
    print(e)
    save_dict_to_json(factions_to_fetch, "temp")


Cookies button not found (adepta-sororitas - Canoness)
Save and exit button not found (adepta-sororitas - Canoness)
Cookies button not found (adepta-sororitas - Dogmata)
Save and exit button not found (adepta-sororitas - Dogmata)
Cookies button not found (adepta-sororitas - Ministorum-Priest)
Save and exit button not found (adepta-sororitas - Ministorum-Priest)
Cookies button not found (adepta-sororitas - Triumph-Of-Saint-Katherine)
Save and exit button not found (adepta-sororitas - Triumph-Of-Saint-Katherine)
Cookies button not found (adepta-sororitas - Sororitas-Rhino)
Save and exit button not found (adepta-sororitas - Sororitas-Rhino)
Cookies button not found (adepta-sororitas - Castigator)
Save and exit button not found (adepta-sororitas - Castigator)
Cookies button not found (adepta-sororitas - Mortifiers)
Save and exit button not found (adepta-sororitas - Mortifiers)
Cookies button not found (adepta-sororitas - Retributor-Squad)
Save and exit button not found (adepta-sororitas - 

KeyboardInterrupt: 

In [207]:
save_dict_to_json(factions_to_fetch, "temp")

## Finish job

In [None]:
driver.quit()