In [25]:
import os
import time
import shutil
import requests
from typing import List
from random import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

# Add the current directory to PATH so a webdriver binary placed next to the
# notebook can be found. Entries must be joined with os.pathsep (otherwise the
# '.' is glued onto the last existing entry), and the membership check keeps
# the cell idempotent when it is re-run.
if '.' not in os.environ.get('PATH', '').split(os.pathsep):
    os.environ['PATH'] = os.environ.get('PATH', '') + os.pathsep + '.'

def create_folder(folder_name: str) -> None:
    """
    Create the directory ``folder_name`` (including missing parents).

    Calling it again for a directory that already exists is a no-op.

    Args:
        folder_name: Path of the directory to create.

    Raises:
        FileExistsError: If ``folder_name`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test and still surfaces a non-directory at that path.
    os.makedirs(folder_name, exist_ok=True)

# Make sure the root output directory for downloaded images exists
create_folder(folder_name='dataset')


In [19]:
# BASE_PATH = f"https://brickset.com/minifigs/"
# current_category = "category-Adventurers"
# current_category_path = f"{BASE_PATH}/{current_category}"

In [20]:
# Chrome options: request a 1920x1080 window
options = Options()
options.add_argument("--window-size=1920,1080")

# Download/locate a matching chromedriver and wrap it in a Service
chrome_service = Service(ChromeDriverManager().install())

# Start the browser session
driver = webdriver.Chrome(options=options, service=chrome_service)

# Load the Brickset "Town" minifigure category page
driver.get("https://brickset.com/minifigs/category-Town")

# Maximize the browser window
driver.maximize_window()

In [22]:
# Category being scraped; also used as the name of its sub-folder in ./dataset
category = 'town'
create_folder(f"./dataset/{category}")

In [26]:
def scrape() -> None:
    """
    Scrape the page currently loaded in ``driver``: download every minifigure
    image and save it under the minifigure's (cleaned) name.

    Images are taken from ``//article[@class='set']//img`` and names from the
    title links of the same articles; the two lists are paired by position.
    Each image is written to ``./dataset/{category}/{name}.png``.

    Relies on names defined outside this function: ``driver``, ``category``
    and ``clean_name``.

    Raises:
        AssertionError: If the number of images and names on the page differ.
        requests.HTTPError: If an image request returns an error status.
        requests.Timeout: If an image request exceeds the timeout.
    """
    # Find things on page
    all_img = driver.find_elements(By.XPATH, "//article[@class='set']//img")
    all_minifigure_names = driver.find_elements(By.XPATH, "//article[@class='set']/div[@class='meta']/h1/a")

    # Check if the number of images and captions is equal
    assert len(all_img) == len(all_minifigure_names), f"all_img ({len(all_img)}) and all_minifigure_names ({len(all_minifigure_names)}) should lenghts be equal!"

    # Walk images and their captions in lockstep
    for image, name_element in zip(all_img, all_minifigure_names):
        src = image.get_attribute('src')
        if not src:
            # No URL to download (get_attribute returned None/empty) - skip
            # instead of crashing inside requests.
            continue

        # Keep the original name (it can be useful for conditional GANs)
        minifigure_name = clean_name(name_element.get_attribute('text'))

        # The context manager releases the connection even if writing fails;
        # the timeout keeps a stalled download from hanging the notebook.
        with requests.get(src, stream=True, timeout=30) as response:
            # Don't write an HTTP error page to disk as if it were an image
            response.raise_for_status()
            # Have urllib3 undo any Content-Encoding (gzip/deflate) on the raw stream
            response.raw.decode_content = True

            # Save image as .png
            with open(f'./dataset/{category}/{minifigure_name}.png', 'wb') as file:
                shutil.copyfileobj(response.raw, file)

In [None]:
def scrape_all_pages(url, page_delay: float = 5):
    """
    Scrape ``url`` and every following page reachable through the "next" link.

    Opens ``url`` in the module-level ``driver``, calls ``scrape()`` on each
    page, and clicks the ``li.next a`` link until it is missing or marked
    ``aria-disabled="true"``.

    Args:
        url: Address of the first page to scrape.
        page_delay: Seconds to wait after clicking "next" before scraping
            the following page.
    """
    # Visit the first page
    driver.get(url)

    while True:
        # Scrape whatever page is currently loaded
        scrape()

        # find_elements returns an empty list when nothing matches, whereas
        # find_element would raise NoSuchElementException on a page that has
        # no "next" link at all.
        next_buttons = driver.find_elements(By.CSS_SELECTOR, "li.next a")
        if not next_buttons:
            break
        next_button = next_buttons[0]

        # A disabled "next" link also means this is the last page
        if next_button.get_attribute("aria-disabled") == "true":
            break

        # Click through JavaScript to go to the next page
        driver.execute_script("arguments[0].click();", next_button)

        # Give the next page time to load
        time.sleep(page_delay)

# Start scraping from the first page.
# NOTE(review): `base_url` was not assigned in any visible cell, so a fresh
# kernel would hit a NameError here. It is set to the same category page the
# driver opens right after start-up - confirm this is the intended start URL.
base_url = "https://brickset.com/minifigs/category-Town"
scrape_all_pages(base_url)


In [None]:
# def scrape_page(all_img: List[selenium.webdriver.remote.webelement.WebElement]) -> None:

In [None]:
# Shut down the WebDriver session once scraping is finished
driver.quit()