In [5]:
import os
import re
import time
import shutil
import requests
from typing import List
from random import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

# Make executables in the current working directory (e.g. a locally downloaded
# webdriver) discoverable. PATH entries must be joined with os.pathsep; appending
# a bare '.' would fuse it onto the last existing entry ("/usr/bin" -> "/usr/bin.").
# The membership check keeps this cell idempotent when it is re-run.
_current_path = os.environ.get('PATH', '')
if '.' not in _current_path.split(os.pathsep):
    os.environ['PATH'] = f"{_current_path}{os.pathsep}." if _current_path else '.'

def create_folder(folder_name: str) -> None:
    """
    Create `folder_name` (including missing parent folders) if it does not exist yet.

    Calling it again for an existing folder is a no-op.

    Args:
        folder_name: Path of the folder to create.

    Raises:
        FileExistsError: if the path already exists but is not a directory.
    """
    # exist_ok=True replaces the separate "exists?" check, so there is no window
    # between checking and creating in which another process could create the folder
    os.makedirs(folder_name, exist_ok=True)

# Create the root folder that will hold the downloaded dataset
create_folder('dataset')

# Root URL of the minifigure listings; category pages live directly under it,
# e.g. f"{base_path}category-Adventurers"
base_path = "https://brickset.com/minifigs/"

# Request a 1920x1080 browser window
options = Options()
options.add_argument("--window-size=1920,1080")

# Initialize the driver; ChromeDriverManager().install() provides the
# chromedriver executable that the Service wraps
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=options, service=service)
driver.get(base_path)
driver.maximize_window()  # Fit the window to your screen

In [2]:
# Build a {category name: link} lookup from the "Category" dropdown

# Locate the element labelled "Category" and click it
category_button_xpath = "//span[@class='selectboxit-text' and text()='Category']"
button = driver.find_element(By.XPATH, category_button_xpath)
button.click()

# Collect every <li> element found under the dropdown list
dropdown_option_xpath = "//ul[@class='selectboxit-options selectboxit-list']//li"
category_elements = driver.find_elements(By.XPATH, dropdown_option_xpath)

# Key: the option's displayed text; value: its "data-val" attribute
category_to_link = {}
for option in category_elements:
    data_val = option.get_attribute("data-val")
    category_to_link[option.text] = data_val

In [13]:
def clean_category_name(category: str) -> str:
    """
    Turn a category label into a lowercase, dash-separated name.

    Digits and parentheses are dropped, surrounding whitespace is trimmed,
    every remaining space becomes a dash, and the result is lowercased.
    """
    # Drop every run of digits
    digits_removed = re.sub(r'\d+', '', category)

    # Delete both kinds of parentheses in a single pass
    parens_removed = digits_removed.translate(str.maketrans('', '', '()'))

    # Trim first so leading/trailing whitespace does not turn into dashes
    dashed = parens_removed.strip().replace(' ', '-')

    return dashed.lower()

'adventurers'

In [7]:
# Print the scraped links, one line per entry of the category dictionary.
# The dictionary built from the dropdown is named `category_to_link`; the name
# `category_data` used here before is never assigned in the visible cells, so the
# loop raised NameError on a fresh kernel.
for link in category_to_link.values():
    print(f"Link: {link}")

Link: /minifigs/year-0
Link: /minifigs/category-Adventurers
Link: /minifigs/category-Agents
Link: /minifigs/category-Alpha-Team
Link: /minifigs/category-Aquazone
Link: /minifigs/category-Atlantis
Link: /minifigs/category-Avatar
Link: /minifigs/category-Avatar-The-Last-Airbender
Link: /minifigs/category-Back-to-the-Future
Link: /minifigs/category-Basic
Link: /minifigs/category-Batman-I
Link: /minifigs/category-Belville
Link: /minifigs/category-BIONICLE
Link: /minifigs/category-BrickLink-Designer-Program
Link: /minifigs/category-Building-Bigger-Thinking
Link: /minifigs/category-Cars
Link: /minifigs/category-Castle
Link: /minifigs/category-Clikits
Link: /minifigs/category-Collectible-Minifigures
Link: /minifigs/category-DC-Super-Hero-Girls
Link: /minifigs/category-Dimensions
Link: /minifigs/category-Dino
Link: /minifigs/category-Dino-Attack
Link: /minifigs/category-Discovery
Link: /minifigs/category-Disney
Link: /minifigs/category-Disney-s-Mickey-Mouse
Link: /minifigs/category-DUPLO
Link:

In [3]:
# Category to download; its images are stored in a sub-folder of ./dataset
category = 'town'
category_folder = f"./dataset/{category}"
create_folder(category_folder)

In [4]:
import re
def clean_name(minifigure_name: str) -> str:
    """
    Make a minifigure name usable as a file name.

    Every character that occurs in `invalid_chars` is swapped for a dash, then
    each run of consecutive dashes is squeezed into a single dash.
    """
    # NOTE: this string is consumed character by character, not as a regex, so
    # the brackets and backslashes in it are themselves characters to replace
    invalid_chars = r'[\\/:\*\?"<>\|]\& \''
    dash_table = str.maketrans({char: '-' for char in invalid_chars})
    return re.sub(r'-+', '-', minifigure_name.translate(dash_table))
    

In [5]:
def scrape() -> None:
    """
    Scrape the page currently loaded in the driver.

    Downloads the image of every minifigure found on the page and saves it as
    ./dataset/<category>/<cleaned minifigure name>.png.

    Relies on the notebook-level names `driver`, `category` and `clean_name`.

    Raises:
        AssertionError: if the number of images and names found on the page differ.
        requests.HTTPError: if an image download answers with an error status.
    """
    # Find the images and the name links of all minifigures on the page
    all_img = driver.find_elements(By.XPATH, "//article[@class='set']//img")
    all_minifigure_names = driver.find_elements(By.XPATH, "//article[@class='set']/div[@class='meta']/h1/a")

    # Images and names are paired by position, so their counts must match
    assert len(all_img) == len(all_minifigure_names), f"all_img ({len(all_img)}) and all_minifigure_names ({len(all_minifigure_names)}) should lenghts be equal!"

    # Loop through all images on the page together with their names
    for image, name_element in zip(all_img, all_minifigure_names):
        src = image.get_attribute('src')

        # Keep the original name (it can be useful for conditional GANs)
        minifigure_name = clean_name(name_element.get_attribute('text'))

        # The timeout prevents a stalled download from hanging the whole run;
        # the context manager releases the streamed connection when done
        with requests.get(src, stream=True, timeout=30) as response:
            # Fail loudly instead of saving an HTTP error page as an image
            response.raise_for_status()

            # response.raw bypasses requests' content decoding unless told otherwise
            response.raw.decode_content = True

            # The bytes are written unchanged; the .png extension is applied
            # regardless of the format the server actually sent
            with open(f'./dataset/{category}/{minifigure_name}.png', 'wb') as file:
                shutil.copyfileobj(response.raw, file)

In [6]:
def scrape_all_pages(url: str, wait_seconds: float = 5) -> None:
    """
    Scrape `url` and every following page reachable through the 'next' button.

    Relies on the notebook-level `driver` and on `scrape()` for the per-page work.

    Args:
        url: Address of the first page to scrape.
        wait_seconds: Pause after clicking 'next' before the following page is
            scraped. Defaults to the 5 seconds that were previously hard-coded.
    """
    # Visit the first page
    driver.get(url)

    while True:
        # Scrape whatever page is currently loaded
        scrape()

        # find_elements returns an empty list instead of raising when nothing
        # matches, so a page without any 'next' link ends the loop cleanly
        next_buttons = driver.find_elements(By.CSS_SELECTOR, "li.next a")
        if not next_buttons:
            break
        next_button = next_buttons[0]

        # A disabled 'next' button also means the last page has been reached
        if next_button.get_attribute("aria-disabled") == "true":
            break

        # Click through JavaScript to go to the next page
        driver.execute_script("arguments[0].click();", next_button)

        # Pause before scraping the next page
        time.sleep(wait_seconds)


In [7]:
# Scrape the category starting at its first page; following pages are visited in turn
start_url = "https://brickset.com/minifigs/category-Town"
scrape_all_pages(start_url)

KeyboardInterrupt: 

In [None]:
# Quit the WebDriver once scraping is finished
driver.quit()