In [1]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from PIL import Image
from io import BytesIO
import re

# The script is used to webscrape images from various sources for the Monster Idnetifier.

This notebook supports the "MonsterIdentifier" notebook which outline the main summary of this project. This script simply contains the code I used to webscrape imagers from various wikipedia pages.

## Digimon Images

The first set of images we scraped from are digimon monster images from the digimon wikipedia page: https://wikimon.net/Visual_List_of_Digimon

In [None]:
digimon_DIR = "digimon_images" #Directory to store the images
os.makedirs(digimon_DIR, exist_ok=True)

Digi_URL = "https://wikimon.net/Visual_List_of_Digimon"

In [None]:
# Words to skip in image URLs
SKIP_WORDS = ["logo", "emblem", "Collectors", "Icon", "illustration", "Da-", "Bo-"]

downloaded_basenames = set()

def get_all_images_from_main_page(page_url):
    """Get all valid images directly from the main Digimon list page."""
    try:
        resp = requests.get(page_url)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        img_tags = soup.select("img")
        img_urls = []

        for img in img_tags:
            src = img.get("src")
            if not src:
                continue
            url = urljoin(page_url, src)

            # Skipwords
            if any(word.lower() in url.lower() for word in SKIP_WORDS):
                continue

            #The main image/thumbnail shown on the page was a 120px version but we want the 240px
            # Therefore we replace 120px (thumbnail) with 240px variant
            url = re.sub(r'/\d+px-', '/240px-', url)

            img_urls.append(url)

        return list(set(img_urls))  # remove duplicates

    except Exception as e:
        print("Error fetching images from main page:", e)
        return []

def download_image(url, folder):
    """Download image if it passes filters and hasn't been downloaded yet."""
    try:
        path = urlparse(url).path
        filename = os.path.basename(path)
        filename = filename.split("?")[0]
        basename_no_ext = os.path.splitext(filename)[0]

        if basename_no_ext in downloaded_basenames:
            print("Already downloaded:", basename_no_ext)
            return

        r = requests.get(url)
        r.raise_for_status()
        img = Image.open(BytesIO(r.content))

        # Any image that's small we don;t want so just skip these
        if img.width < 100 or img.height < 100:
            print("Skipped small image:", filename, f"({img.width}x{img.height})")
            return

        # Save safely
        filename = re.sub(r'[^a-zA-Z0-9\-_\.]', '_', filename)
        save_path = os.path.join(folder, filename)
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        img.save(save_path)
        print("Downloaded:", filename)
        downloaded_basenames.add(basename_no_ext)

    except Exception as e:
        print("Failed to download", url, e)

def scrape_digimon_main_page():
    img_urls = get_all_images_from_main_page(Digi_URL)
    print("Found images on main page:", len(img_urls))
    for url in img_urls:
        download_image(url, digimon_DIR)


In [None]:
#Uncomment when you want to run the scraper

#scrape_digimon_main_page()

## Monster Hunter scraper

Next we scraped images of the various monsters in the monster hunter games from this wiki page: https://monsterhunter.fandom.com/wiki/Category:Monsters and this one: "https://monsterhunter.fandom.com/wiki/Category:Photo_Galleries"

In [None]:
#Define URLS and directories for Monster Hunter
#monsterHunter_URL = "https://monsterhunter.fandom.com/wiki/Category:Monsters"

categories = [
    "https://monsterhunter.fandom.com/wiki/Category:Monsters",
    "https://monsterhunter.fandom.com/wiki/Category:Photo_Galleries"
]

monsterHunter_DIR = "monsterHunter_images"

os.makedirs(monsterHunter_DIR, exist_ok=True)

In [3]:
def get_monster_links_from_category(category_url):
    """Get all monster page links from category page."""
    try:
        resp = requests.get(category_url)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        links = soup.select("div.category-page__members a[href^='/wiki/']")
        monster_links = []
        for link in links:
            href = link.get("href")
            if href and not href.startswith("/wiki/Category:"):
                full_url = urljoin(category_url, href)
                monster_links.append(full_url)
        print(f"Found {len(monster_links)} monster links in category: {category_url}")
        return monster_links
    except Exception as e:
        print("Error fetching monster links:", e)
        return []

#I noticed that there were a lot of concept art or comic images. What I wanted were images with 'render' in the URL as they allhad the same style
def get_all_images_from_monster_page(monster_url):
    """Get all image URLs from a monster page that contain 'render'."""
    try:
        resp = requests.get(monster_url)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        img_tags = []

        # Try main profile image first, these are the 'nice' monster images we want
        img_tags.extend(soup.select("aside.portable-infobox img"))

        # Try any image in content / gallery next
        img_tags.extend(soup.select("div.mw-parser-output img"))
        img_tags.extend(soup.select("figure.pi-image-collection img"))

        img_urls = []
        for img_tag in img_tags:
            if img_tag and img_tag.get("src"):
                url = urljoin(monster_url, img_tag["src"])
                # Only include proper renders / large images (skip thumbnails)
                if ("scale-to-width-down" in url or "revision/latest" in url) and "render" in url.lower():
                    img_urls.append(url)

        return list(set(img_urls))  # remove duplicates
    except Exception as e:
        print("Error fetching images from monster page:", monster_url, e)
        return []

def download_image(url, folder):
    """Download and save image from URL only if it doesn't already exist."""
    try:
        path_parts = urlparse(url).path.split("/")
        filename = next((part for part in reversed(path_parts) if "." in part), "image.png")
        filename = filename.split("?")[0]
        filename = re.sub(r'[^a-zA-Z0-9\-_\.]', '_', filename)

        save_path = os.path.join(folder, filename)

        if os.path.exists(save_path):
            print("Already exists, skipping:", filename)
            return

        response = requests.get(url)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        img.save(save_path)
        print("Downloaded:", filename)

    except Exception as e:
        print("Error downloading image:", url, e)

def webScrapeMH(categories, output_dir):
    """Scrape multiple categories for monster images."""
    for category_url in categories:
        print(f"Scraping category: {category_url}")
        monster_links = get_monster_links_from_category(category_url)

        for monster_url in monster_links:
            img_urls = get_all_images_from_monster_page(monster_url)
            for img_url in img_urls:
                download_image(img_url, output_dir)


In [None]:
# webScrapeMH(categories, monsterHunter_DIR)

## Palworld Scraper

Next we webscrape palworld images from these wiki sites: https://palworld.fandom.com/wiki/Category:Images_-_Pals,

    "https://palworld.fandom.com/wiki/Alpha_Pals",
    "https://palworld.fandom.com/wiki/Breeding",
    "https://palworld.fandom.com/wiki/Lucky_Pals",
    "https://palworld.fandom.com/wiki/Legendary_Pals"

In [None]:
Palword_DIR = "palworld_images"
os.makedirs(Palword_DIR, exist_ok=True)

palworld_URL = "https://palworld.fandom.com/wiki/Category:Images_-_Pals"

In [None]:
def get_all_category_pages(category_url):
    """Get all paginated category pages."""
    pages = [category_url]
    while True:
        resp = requests.get(pages[-1])
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        next_link = soup.select_one("a.category-page__pagination-next")
        if next_link and next_link.get("href"):
            next_page = urljoin(category_url, next_link["href"])
            if next_page not in pages:
                pages.append(next_page)
            else:
                break
        else:
            break
    return pages

def get_file_page_urls_pal(category_url):
    """Get all file page URLs from a category (including all paginated pages)."""
    file_urls = []
    pages = get_all_category_pages(category_url)
    print(f"Found {len(pages)} pages in category")

    for page in pages:
        resp = requests.get(page)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        # more general: select all <a> inside the members section
        for link in soup.select("div.category-page__members a"):
            href = link.get("href")
            if href and "File:" in href:
                full_url = urljoin(category_url, href)
                if full_url not in file_urls:
                    file_urls.append(full_url)
    return file_urls

def get_high_res_image_url_pal(file_page_url):
    """Get the high-res image from a Fandom file page, handling redirects like ?file=."""
    resp = requests.get(file_page_url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    
    # Try <div class="fullMedia"> first
    img_link = soup.select_one("div.fullMedia a")
    if img_link and img_link.get("href"):
        return img_link["href"]
    
    # fallback: <a class="image">
    img_link = soup.select_one("a.image")
    if img_link and img_link.get("href"):
        return urljoin(file_page_url, img_link["href"])
    
    # fallback: first <img> tag
    img_tag = soup.select_one("img")
    if img_tag and img_tag.get("src"):
        return urljoin(file_page_url, img_tag["src"])
    
    print("No high-res image found for:", file_page_url)
    return None

def download_image_pal(url, folder):
    """Download and save image from URL."""
    try:
        response = requests.get(url)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))

        path_parts = urlparse(url).path.split("/")
        filename = next((part for part in reversed(path_parts) if "." in part), "image.png")
        filename = filename.split("?")[0]
        filename = re.sub(r'[^a-zA-Z0-9\-_\.]', '_', filename)

        save_path = os.path.join(folder, filename)
        os.makedirs(folder, exist_ok=True)
        img.save(save_path)
        print("Downloaded:", filename)
    except Exception as e:
        print("Error downloading", url, e)

def webScrapePal(BASE_URL, OUTPUT_DIR):
    file_pages = get_file_page_urls_pal(BASE_URL)
    print(f"Found {len(file_pages)} file pages total")

    for file_page in file_pages:
        img_url = get_high_res_image_url_pal(file_page)
        if img_url:
            download_image_pal(img_url, OUTPUT_DIR)
        else:
            print("No high-res image found for:", file_page)

In [None]:
webScrapePal(palworld_URL, Palword_DIR)

Found 2 pages in category
Found 267 file pages total
Downloaded: Anubis.png
Downloaded: Anubis.png
Downloaded: Nitewing.png
Downloaded: Arsox.png
Downloaded: Astegon.png
Downloaded: TerrariaPalworld.png
Downloaded: TerrariaPalworld.png
Downloaded: Azurobe.png
Downloaded: Azurobe.png
Downloaded: TerrariaPalworld.png
Downloaded: Leather_icon.png
Downloaded: TerrariaPalworld.png
Downloaded: Beegarde.png
Downloaded: Leather_icon.png
Downloaded: Leather_icon.png
Downloaded: Leather_icon.png
Downloaded: Boltmane.png
Downloaded: Bristla.png
Downloaded: Broncherry_Aqua.png
Downloaded: Broncherry.png
Downloaded: Bushi.png
Downloaded: TerrariaPalworld.png
Downloaded: Caprity.png
Downloaded: Cattiva.png
Downloaded: Cawgnito.png
Downloaded: Cawgnito.png
Downloaded: Cawgnito.png
Downloaded: Celaray.png
Downloaded: TerrariaPalworld.png
Downloaded: Celaray.png
Downloaded: TerrariaPalworld.png
Downloaded: Chikipi.png
Downloaded: Chikipi.png
Downloaded: Pals.png
Downloaded: Chikipi.png
Downloaded: Chil

alternate palworld image locations:

In [None]:
def download_image(url, folder):
    """Download and save image from URL only if it doesn't already exist."""
    try:
        # Extract filename from URL
        path_parts = urlparse(url).path.split("/")
        filename = None
        for part in reversed(path_parts):
            if "." in part:
                filename = part
                break
        if not filename:
            filename = "image.png"

        filename = filename.split("?")[0]
        filename = re.sub(r'[^a-zA-Z0-9\-_\.]', '_', filename)

        save_path = os.path.join(folder, filename)

        # Skip download if already exists
        if os.path.exists(save_path):
            print("Already exists, skipping:", filename)
            return

        # Download image
        response = requests.get(url)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        img.save(save_path)
        print("Downloaded:", filename)

    except Exception as e:
        print("Error downloading", url, e)

def get_pal_links_from_table(page_url):
    """Get all Pal page links from the main table on a page like Alpha_Pals."""
    try:
        resp = requests.get(page_url)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        # The table has links to individual Pal pages
        links = soup.select("table a[href^='/wiki/']")
        pal_links = []
        for link in links:
            href = link.get("href")
            if href and not href.startswith("/wiki/Category:"):
                full_url = urljoin(page_url, href)
                pal_links.append(full_url)

        print(f"Found {len(pal_links)} Pal links on page: {page_url}")
        return pal_links

    except Exception as e:
        print("Error getting Pal links from table:", e)
        return []

def get_main_image_from_pal_page(pal_page_url):
    """Get the main profile image URL from a Pal page."""
    try:
        resp = requests.get(pal_page_url)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        # Usually the main image is in <aside class="portable-infobox"> or first <img> in content
        img_tag = soup.select_one("aside.portable-infobox img")
        if not img_tag:
            img_tag = soup.select_one("div.mw-parser-output img")

        if img_tag and img_tag.get("src"):
            return urljoin(pal_page_url, img_tag["src"])
        else:
            print("No image found for:", pal_page_url)
            return None

    except Exception as e:
        print("Error getting image from Pal page:", pal_page_url, e)
        return None


In [None]:
# Start scraping
pal_pages_urls = [
    "https://palworld.fandom.com/wiki/Alpha_Pals",
    "https://palworld.fandom.com/wiki/Breeding",
    "https://palworld.fandom.com/wiki/Lucky_Pals",
    "https://palworld.fandom.com/wiki/Legendary_Pals"
]

for page_url in pal_pages_urls:
    print(f"Scraping page: {page_url}")
    pal_links = get_pal_links_from_table(page_url)
    
    for pal_url in pal_links:
        img_url = get_main_image_from_pal_page(pal_url)
        if img_url:
            #download_image(img_url, Palword_DIR)

# yugipedia Images
Next we get a bunch more yugio images. We get them from this wiki site: https://yugipedia.com/wiki/Category:Yu-Gi-Oh!_Duel_Links_monster_images using the selenium package. We only grab images from the first 3 pages of this site (more than enough images).

In [None]:
#Uncomment the url you want to scrape from. The each represent the next page of the wiki site.
urls = [
    #"https://yugipedia.com/wiki/Category:Yu-Gi-Oh!_Duel_Links_monster_images",
    #"https://yugipedia.com/index.php?title=Category:Yu-Gi-Oh!_Duel_Links_monster_images&filefrom=ArcanaForceXXITheWorld-DULI-EN-VG-NC.png#mw-category-media",
    #"https://yugipedia.com/index.php?title=Category:Yu-Gi-Oh!_Duel_Links_monster_images&filefrom=BerserkGorilla-DULI-EN-VG-NC.png#mw-category-media"
]

yugioh_DIR = "yugipedia_images"

os.makedirs(yugioh_DIR, exist_ok=True)

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# --- Selenium setup ---
options = Options()
options.headless = True
driver = webdriver.Chrome(options=options)
driver.set_page_load_timeout(3000) 

folder = yugioh_DIR
os.makedirs(folder, exist_ok=True)

for url in urls:
    print("Scraping page:", url)
    try:
        driver.get(url)
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")

        # --- Find all images ---
        gallery_items = soup.find_all("li", class_="gallerybox")
        for item in gallery_items:
            img_tag = item.find("img")
            if img_tag:
                img_url = img_tag.get("src")
                # Convert relative URL to absolute if needed
                if img_url.startswith("//"):
                    img_url = "https:" + img_url
                
                # Get filename from URL
                filename = os.path.join(folder, img_url.split("/")[-1].split("?")[0])
                
                # Skip if already downloaded
                if os.path.exists(filename):
                    print("Already exists:", filename)
                    continue

                # Download image
                try:
                    response = requests.get(img_url, stream=True)
                    if response.status_code == 200:
                        with open(filename, 'wb') as f:
                            for chunk in response.iter_content(1024):
                                f.write(chunk)
                        print(f"Downloaded: {filename}")
                    else:
                        print(f"Failed to download {img_url}")
                except Exception as e:
                    print(f"Error downloading {img_url}: {e}")
    except Exception as e:
        print(f"Error fetching page {url}: {e}")

driver.quit()


With that, we now have a bunch of images from various sources and monster genres. Ideally, we want images with a creature/mosnter at the center with a white or transparent background. However these webscrapers definetly picked up some additional iamges, which is why I then manually went through each folder and just deleted images that looked wrong (concept art, comic images, other stuff).