In [1]:
import os
import re
import time
import shutil
import requests
from typing import List, Dict, Tuple
from random import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

os.environ['PATH'] += '.'  # Add webdriver to PATH

def create_folder(folder_name: str, root: str) -> None:
    full_path = os.path.join(root, folder_name)
    if not os.path.exists(full_path):
        os.makedirs(full_path, exist_ok=True, )

# Create a folder for dataset    
create_folder(folder_name='dataset', root="./")

# Configure file paths
base_path = f"https://brickset.com/minifigs/"
# current_category = "category-Adventurers"
# current_category_path = f"{BASE_PATH}/{current_category}"

# Configure the size of the window
options = Options()
options.add_argument("--window-size=1920,1080")

# Intitialize the driver
driver = webdriver.Chrome(options=options, service=Service(ChromeDriverManager().install()))
driver.get(base_path)
driver.maximize_window() # Fit the window to your screen

### Helper functions
def clean_name(minifigure_name: str) -> str:
    invalid_chars = r'[\\/:\*\?"<>\|]\& \''
    for c in invalid_chars:
        minifigure_name = minifigure_name.replace(c, '-')
    return re.sub(r'-+', '-', minifigure_name)
    
def clean_category_name(category: str) -> str:
    """
    Cleans the category name
    """

    # Replace all numbers with empty characters
    result = re.sub(r'\d+', '', category)

    # Replace parentheses '()' with empty characters
    result = result.replace('(', '').replace(')', '')

    # Replace all spaces with dash
    result = result.strip().replace(' ', '-')
    
    return result.lower()



In [2]:

def get_categories()-> Tuple[Dict[str, str], Dict[str, int]]:
    """
    Get the list of all categories of minifigures.
    """

    # Find the "Category" button and click on it
    button = driver.find_element(By.XPATH, "//span[@class='selectboxit-text' and text()='Category']")
    button.click()

    # Find all the <li> elements within the dropdown menu
    category_elements = driver.find_elements(By.XPATH, "//ul[@class='selectboxit-options selectboxit-list']//li")

    # Scrape the links and category names and store them in a dictionary
    category_to_link = {} # Maps category name to relative link
    category_to_count = {} # Maps category name to the number of minifigures in that category
    for element in category_elements:
        # Extract the link
        link = element.get_attribute("data-val")
        category = clean_category_name(element.text)
        category_to_link[category] = link
        
        # Extract the category count
        category_to_count[category] = re.findall(r'\d+', element.text)
    
    try:
        del category_to_link['']
    except KeyError:
        pass
    
    return category_to_link, category_to_count

In [3]:
def scrape() -> None:
    """
    Function for scraping a single page. It scrapes the minifigures images and names.
    """
    print(f"Scraping images...")

    # Find things on page
    # all_img = driver.find_elements(By.XPATH, "//img")
    all_img = driver.find_elements(By.XPATH, "//article[@class='set']//img")
    all_minifigure_names = driver.find_elements(By.XPATH, "//article[@class='set']/div[@class='meta']/h1/a")


    # Check if the number of images and captions is equal
    assert len(all_img) == len(all_minifigure_names), f"all_img ({len(all_img)}) and all_minifigure_names ({len(all_minifigure_names)}) should lenghts be equal!"

    # Loop trough all images on page
    for idx, image in enumerate(all_img):
        src = image.get_attribute('src')
        response = requests.get(src, stream=True)

        # Get the original name (it can be usefull for Condiditonal GANs)
        minifigure_name = clean_name(all_minifigure_names[idx].get_attribute('text'))

        # Save image as .png
        with open(f'./dataset/{category}/{minifigure_name}.png', 'wb') as file:
            shutil.copyfileobj(response.raw, file)

In [4]:
def scrape_all_pages(url: str) -> None:
    """
    Gets a url to a specific minifigures category and scrapes all the images (from sub page 1, 2, 3, ... n).
    """

    # Visit the first page
    driver.get(url)

    while True:

        # Call the scrape() function on the current page
        scrape()

        # Find the 'next' button and check if it is disabled (last page)
        next_button = driver.find_element(By.CSS_SELECTOR, "li.next a")
        next_disabled = next_button.get_attribute("aria-disabled")

        # If 'next' button is disabled, it means we reached the last page, so break the loop
        if next_disabled == "true":
            break

        # Otherwise, click the 'next' button to go to the next page
        driver.execute_script("arguments[0].click();", next_button)

        # Wait for a few seconds (you can adjust the time if needed)
        time.sleep(5)
    
    # Exiting driver after scraping all pages from given category
    driver.quit()


In [None]:
# TODO: Code that checks how many images should be in a category and how many are there now
# The information about the desired number of images is removed by the clean_category function
#  so you need to change this.
# This feature will allow you to scrape only the categories that have missing images.
# This approach saves a lot of time! ;-)

# TODO: Implement a solution that closes pop up windows with adds

In [6]:
### Main Scraping Loop

category_to_link, category_to_number = get_categories()

print(f"Scraping lego minifigure images from {len(category_to_link.values())} categopries! :-)")
for category, link in category_to_link.items():
    # Create folder to store images
    create_folder(folder_name=category, root="./dataset")

    # Access the full link
    full_link = "https://brickset.com" + link

    print(f"Current scraped category: {category.upper()} from: {full_link}")

    # Scrape all images from given category and close the page
    scrape_all_pages(full_link)

    # Open the main page again
    driver.get(base_path)
    driver.maximize_window() # Fit the window to your screen

Scraping lego minifigure images from 108 categopries! :-)
Current scraped category: ADVENTURERS from: https://brickset.com/minifigs/category-Adventurers
Scraping images...
Scraping images...


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"li.next a"}
  (Session info: chrome=115.0.5790.110); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
Backtrace:
	GetHandleVerifier [0x004C1C53+49427]
	(No symbol) [0x00455F41]
	(No symbol) [0x0035C64D]
	(No symbol) [0x00389818]
	(No symbol) [0x003898DB]
	(No symbol) [0x003B87D2]
	(No symbol) [0x003A4A64]
	(No symbol) [0x003B6F2A]
	(No symbol) [0x003A4816]
	(No symbol) [0x00381127]
	(No symbol) [0x003822AD]
	GetHandleVerifier [0x00717149+2496009]
	GetHandleVerifier [0x0075D572+2783794]
	GetHandleVerifier [0x00757491+2758993]
	GetHandleVerifier [0x005411D0+571024]
	(No symbol) [0x0045F96A]
	(No symbol) [0x0045BD88]
	(No symbol) [0x0045BE6B]
	(No symbol) [0x0044EA97]
	BaseThreadInitThunk [0x76E500C9+25]
	RtlGetAppContainerNamedObjectPath [0x77867B1E+286]
	RtlGetAppContainerNamedObjectPath [0x77867AEE+238]


In [None]:
driver.quit()