#### Web driver


In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager


class WebDriverHelper:
    """Create and dispose of a Chrome WebDriver session."""

    def __init__(self, headless=True):
        """Store the configuration; no browser is started here.

        Args:
            headless: When true, the options built by ``get_driver_options``
                include the headless-mode arguments.
        """
        self.headless = headless
        self.driver = None

    def get_driver_options(self):
        """Build the Chrome options that correspond to ``self.headless``.

        Returns:
            An ``Options`` instance. Extra command-line arguments are added
            only when ``self.headless`` is true.
        """
        options = Options()
        if self.headless:
            options.add_argument("--headless")
            options.add_argument("--disable-gpu")
            options.add_argument("--disable-images")
            options.add_argument("--remote-debugging-port=9229")
        return options

    def initialize_driver(self):
        """Start a new Chrome session and remember it on ``self.driver``.

        Returns:
            The newly created driver.
        """
        service = ChromeService(ChromeDriverManager().install())
        # Hand the options to Chrome; without them the ``headless`` flag
        # given to the constructor would have no effect at all.
        self.driver = webdriver.Chrome(
            service=service, options=self.get_driver_options()
        )
        return self.driver

    def quit_driver(self):
        """Quit the current session, if there is one, and forget it."""
        if self.driver:
            try:
                self.driver.quit()
            finally:
                # Drop the reference even when quit() fails, so that a dead
                # session is never mistaken for a usable one.
                self.driver = None

#### URL Helper


In [6]:
import re
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse


class URLConstructorHelper:
    """Static helpers that assemble category and filter URLs."""

    @staticmethod
    def construct_group_category_url(base_url, target_group, clothing_url):
        """Return ``{base_url}/{target_group}/kleding/{clothing_url}``.

        Args:
            base_url: Leading part of the URL.
            target_group: Path segment placed before ``kleding``.
            clothing_url: Text appended after ``kleding/`` (the callers in
                this file pass a query string starting with ``?``).
        """
        return f"{base_url}/{target_group}/kleding/{clothing_url}"

    @staticmethod
    def construct_clothing_type_category_url(group_category_url, clothing_type):
        """Insert ``/{clothing_type}`` after each ``kleding`` in the URL.

        Args:
            group_category_url: URL containing the ``kleding`` segment.
            clothing_type: Text to insert; it is used literally.

        Returns:
            The URL with ``kleding`` replaced by ``kleding/{clothing_type}``.
        """
        # A plain string replacement inserts the clothing type verbatim; a
        # regex replacement template would interpret backslashes and group
        # references such as "\1" inside it.
        return group_category_url.replace("kleding", f"kleding/{clothing_type}")

    @staticmethod
    def construct_url_based_on_params(base_url, params):
        """Merge ``params`` into the query string of ``base_url``.

        Args:
            base_url: URL whose existing query parameters are kept.
            params: Mapping of parameters to add; a key that already exists
                in the URL is overwritten.

        Returns:
            The URL with the merged, re-encoded query string.
        """
        url_parts = list(urlparse(base_url))
        # keep_blank_values=True preserves parameters such as "flag=" that
        # parse_qs would otherwise drop from the rebuilt URL.
        query = parse_qs(url_parts[4], keep_blank_values=True)
        query.update(params)
        url_parts[4] = urlencode(query, doseq=True)
        return urlunparse(url_parts)

#### Generic Scraper Helper


In [7]:
import time
from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException


class GenericScraperHelper:
    """Site-independent helpers for loading and parsing pages with a driver."""

    @staticmethod
    def get_soup(driver):
        """Parse the driver's current page source with ``html.parser``."""
        return BeautifulSoup(driver.page_source, "html.parser")

    @staticmethod
    def set_driver_for_page_url(driver, relative_category_url, wait_time=10):
        """Navigate to a URL and wait until a ``body`` element is present.

        Args:
            driver: Driver used for navigation.
            relative_category_url: URL passed to ``driver.get``.
            wait_time: Maximum number of seconds to wait for ``body``.
        """
        driver.get(relative_category_url)
        WebDriverWait(driver, wait_time).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

    @staticmethod
    def load_all_products(driver, scroll_pause=3, max_scrolls=None):
        """Scroll to the bottom repeatedly until the page stops growing.

        Args:
            driver: Driver whose current page is scrolled.
            scroll_pause: Seconds to sleep after each scroll before the page
                height is measured again.
            max_scrolls: Optional upper bound on the number of scrolls that
                increased the page height; ``None`` keeps scrolling until the
                height no longer changes.
        """
        last_height = driver.execute_script("return document.body.scrollHeight")
        growing_scrolls = 0
        while max_scrolls is None or growing_scrolls < max_scrolls:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                # Nothing new was appended, so the end of the list is reached.
                break
            last_height = new_height
            growing_scrolls += 1

#### Guess Scraper


In [8]:
class GuessScraper:
    """Scrape product titles and prices per clothing type for one target group."""

    def __init__(self, BASE_URL, clothing_gender_type, headless=True):
        """Store the configuration and start a first browser session.

        Args:
            BASE_URL: Leading part of every category URL.
            clothing_gender_type: Path segment of the target group; it is also
                stored as ``gender`` on each scraped product.
            headless: Forwarded to ``WebDriverHelper``.
        """
        self.BASE_URL = BASE_URL
        self.clothing_gender_type = clothing_gender_type
        self.driver_helper = WebDriverHelper(headless)
        self.driver = self.driver_helper.initialize_driver()
        # CSS selector of the list entries inside the expanded category filter.
        self.CATEGORY_DROPDOWN_CSS_LIST = (
            "div.mb-lg-0.filters-container__item.refinements__item--category "
            "div.js-collapse.refinements__wrapper.collapse.show div.filters-content "
            "ul.refinements__attribute-wrapper.values.content.pl-0.m-lg-0 "
            "li.refinements__attribute"
        )
        # One entry (a list of product dicts) is appended per scraped page.
        self.all_clothing_items = []

    @staticmethod
    def construct_group_category_url(url, target_group, clothing_url):
        """Return ``{url}/{target_group}/kleding/{clothing_url}``."""
        return f"{url}/{target_group}/kleding/{clothing_url}"

    @staticmethod
    def construct_clothing_type_category_url(group_category_url, clothing_type):
        """Insert ``/{clothing_type}`` after each ``kleding`` in the URL."""
        # Plain string replacement inserts the clothing type verbatim; a regex
        # replacement template would interpret backslashes inside it.
        return group_category_url.replace("kleding", f"kleding/{clothing_type}")

    @staticmethod
    def construct_url_based_on_params(url, params):
        """Merge ``params`` into the query string of ``url`` and return the URL."""
        url_parts = list(urlparse(url))
        # keep_blank_values=True keeps parameters such as "flag=" that
        # parse_qs would otherwise drop from the rebuilt URL.
        query = parse_qs(url_parts[4], keep_blank_values=True)
        query.update(params)
        url_parts[4] = urlencode(query, doseq=True)
        return urlunparse(url_parts)

    def click_specific_button(self, button_text):
        """Scroll to and click the refinement button labelled ``button_text``.

        The button is located, scrolled into view and clicked through
        JavaScript. Up to three attempts are made, with a five second pause
        between attempts.

        Raises:
            Exception: If every attempt failed; the last
                ``WebDriverException`` is attached as the cause.
        """
        attempts = 3
        locator = (
            By.XPATH,
            f"//div[contains(@class, 'refinements__item-button') and .//span[text()='{button_text}']]",
        )
        last_error = None
        for attempt in range(attempts):
            try:
                button = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located(locator)
                )
                self.driver.execute_script("arguments[0].scrollIntoView();", button)
                clickable_button = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable(locator)
                )
                self.driver.execute_script("arguments[0].click();", clickable_button)
                return
            except WebDriverException as e:
                last_error = e
                print(f"Attempt {attempt + 1} failed: {e}")
                # Pausing only helps when another attempt follows.
                if attempt + 1 < attempts:
                    time.sleep(5)
        raise Exception(
            "Failed to click the dropdown button after several attempts"
        ) from last_error

    def extract_clothing_type_titles(self, css_dropdown_list):
        """Return the dash-formatted ``title`` attributes of the matched elements.

        Elements without a title are skipped. Any exception is printed and the
        titles collected so far are returned.
        """
        titles = []
        try:
            list_items = self.driver.find_elements(By.CSS_SELECTOR, css_dropdown_list)
            for item in list_items:
                title = item.get_attribute("title")
                if title:
                    titles.append(self.convert_to_dash_format(title))
        except Exception as e:
            print(f"Exception occurred while extracting titles: {e}")
        return titles

    def extract_product_info(self, str_gender):
        """Parse the current page and return one dict per product with a price block.

        Each dict has the keys ``title``, ``original_price``, ``sale_price``
        and ``gender``. Any exception during extraction is printed and the
        products collected so far are returned.
        """
        soup = GenericScraperHelper.get_soup(self.driver)
        products = []
        try:
            for grid in soup.select("div.small-description"):
                for div in grid.select("div.pdp-link.product-tile__pdp-link"):
                    title = self.extract_title_info(div)
                    price_container = div.find_next("div", class_="product__price")
                    if not price_container:
                        continue
                    original_price, sale_price = self.extract_price_info(
                        price_container
                    )
                    products.append(
                        {
                            "title": title,
                            "original_price": original_price,
                            "sale_price": sale_price,
                            "gender": str_gender,
                        }
                    )
        except Exception as e:
            print(f"Exception occurred while extracting product information: {e}")
        return products

    @staticmethod
    def extract_price_info(price_container):
        """Return ``(original_price, sale_price)`` as stripped text.

        The original price is read from the strike-through span and, when
        that span is absent, from the regular price span. A missing element
        yields an empty string.
        """
        original_price_tag = price_container.find(
            "span", class_="price__strike-through"
        )
        if original_price_tag is None:
            original_price_tag = price_container.find(
                "span", class_="value price__value"
            )
        sale_price_tag = price_container.find("span", class_="value price__value--sale")

        original_price = (
            original_price_tag.get_text(strip=True) if original_price_tag else ""
        )
        sale_price = sale_price_tag.get_text(strip=True) if sale_price_tag else ""

        return original_price, sale_price

    @staticmethod
    def extract_title_info(product_div):
        """Return the stripped text of the product link, or ``""`` if absent."""
        title_tag = product_div.find("a", class_="link product-tile__link js-tile-link")
        return title_tag.get_text(strip=True) if title_tag else ""

    @staticmethod
    def convert_to_dash_format(title):
        """Lower-case and strip ``title`` and replace its spaces with dashes."""
        return title.lower().strip().replace(" ", "-")

    def create_clothing_links_from_titles(self, url, titles):
        """Return one clothing-type URL per entry of ``titles``."""
        return [
            self.construct_clothing_type_category_url(url, title) for title in titles
        ]

    def _ensure_driver(self):
        """Start a browser session unless the helper still holds one."""
        if self.driver_helper.driver is None:
            self.driver = self.driver_helper.initialize_driver()

    def _quit_driver(self):
        """Quit the current session and clear the scraper's reference to it."""
        self.driver_helper.quit_driver()
        self.driver = None

    def scrape(self, clothing_base_urls):
        """Scrape every clothing-type page reachable from the given base URLs.

        For each base URL the category page is opened to read the clothing
        type titles, then each clothing-type page is loaded in its own browser
        session, scrolled to the end and parsed.

        Args:
            clothing_base_urls: Query-string suffixes appended to the category
                URL of this scraper's target group.

        Returns:
            ``self.all_clothing_items``: one list of product dicts per page.
        """
        for clothing_url in clothing_base_urls:
            relative_clothing_category_url = (
                URLConstructorHelper.construct_group_category_url(
                    self.BASE_URL, self.clothing_gender_type, clothing_url
                )
            )
            # The session of the previous base URL has been quit by now, so a
            # fresh one is needed before navigating again.
            self._ensure_driver()
            try:
                GenericScraperHelper.set_driver_for_page_url(
                    self.driver, relative_clothing_category_url
                )
                self.click_specific_button("Op categorie")
                time.sleep(2)
                titles = self.extract_clothing_type_titles(
                    self.CATEGORY_DROPDOWN_CSS_LIST
                )
            finally:
                # Release the browser even when navigation or clicking fails.
                self._quit_driver()

            clothing_type_urls = self.create_clothing_links_from_titles(
                relative_clothing_category_url, titles
            )
            for clothing_type_url in clothing_type_urls:
                if clothing_type_url is None:
                    break
                self.driver = self.driver_helper.initialize_driver()
                try:
                    GenericScraperHelper.set_driver_for_page_url(
                        self.driver, clothing_type_url
                    )
                    GenericScraperHelper.load_all_products(self.driver)
                    products = self.extract_product_info(
                        str_gender=self.clothing_gender_type
                    )
                    print(products)
                    self.all_clothing_items.append(products)
                finally:
                    self._quit_driver()

        return self.all_clothing_items

In [9]:
# Leading part of every category URL.
BASE_URL = "https://www.guess.eu/nl-be/guess"
# Query-string suffixes for the regular clothing listing and the sale listing.
CLOTHING_URL = "?prefn1=guess_visibleInCountries&prefv1=BE%7CALL&psubcat=true"
SALE_URL = "?prefn1=guess_visibleInCountries&prefv1=BE%7CALL&prefn2=isSale&prefv2=BE"
CLOTHING_BASE_URLS = [CLOTHING_URL, SALE_URL]
# Each entry is used verbatim as a URL path segment, so it must not contain
# stray spaces or punctuation.
clothing_gender_types = ["dames", "heren"]

for clothing_gender_type in clothing_gender_types:
    scraper = GuessScraper(BASE_URL=BASE_URL, clothing_gender_type=clothing_gender_type)
    try:
        scraped_data = scraper.scrape(CLOTHING_BASE_URLS)
    finally:
        # Close the browser even when the run fails or is interrupted.
        scraper.driver_helper.quit_driver()
    print(scraped_data)

[{'title': 'Strak aansluitende lange sweaterjurk', 'original_price': '160.00 €', 'sale_price': '', 'gender': 'dames'}, {'title': 'Lurex mini-trui-jurk', 'original_price': '160.00 €', 'sale_price': '', 'gender': 'dames'}, {'title': 'Mini-jurk van kant', 'original_price': '130.00 €', 'sale_price': '', 'gender': 'dames'}, {'title': 'Lange asymmetrische jurk', 'original_price': '230.00 €', 'sale_price': '', 'gender': 'dames'}, {'title': 'Lange jurk met kant', 'original_price': '180.00 €', 'sale_price': '', 'gender': 'dames'}, {'title': 'Lange jurk met bloemenprint', 'original_price': '200.00 €', 'sale_price': '', 'gender': 'dames'}, {'title': 'Strak aansluitende mini-jurk met strik op de voorkant', 'original_price': '99.00 €', 'sale_price': '', 'gender': 'dames'}, {'title': 'Mini-jurk van kant', 'original_price': '150.00 €', 'sale_price': '', 'gender': 'dames'}, {'title': 'Jurk met afneembare rok', 'original_price': '200.00 €', 'sale_price': '', 'gender': 'dames'}, {'title': 'Satijnen slip

KeyboardInterrupt: 

Step 3: Extract all titles in the category


Initialize WebDriver


In [None]:
# Start a Chrome session using the driver path returned by
# ChromeDriverManager().install().
service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

In [None]:
# Leading part of every category URL.
BASE_URL = "https://www.guess.eu/nl-be/guess"

# Query-string suffixes: the regular clothing listing and the sale listing
# (the latter adds the isSale filter).
CLOTHING_URL = "?prefn1=guess_visibleInCountries&prefv1=BE%7CALL&psubcat=true"
SALE_URL = "?prefn1=guess_visibleInCountries&prefv1=BE%7CALL&prefn2=isSale&prefv2=BE"

Ladies


In [None]:
def set_driver_for_page_url(relative_category_url, driver):
    """Open ``relative_category_url`` in ``driver``, then pause for five seconds."""
    settle_seconds = 5
    driver.get(relative_category_url)
    time.sleep(settle_seconds)

In [None]:
# Target groups: women, men, children
def construct_group_category_url(base_url, target_group, clothing_url):
    """Return the URL ``base_url/target_group/kleding/clothing_url``."""
    pieces = (base_url, target_group, clothing_url)
    return "{}/{}/kleding/{}".format(*pieces)

In [None]:
def construct_clothing_type_category_url(group_category_url, clothing_type):
    """Insert ``/{clothing_type}`` after each ``kleding`` in the URL.

    Args:
        group_category_url: URL containing the ``kleding`` segment.
        clothing_type: Text to insert; it is used literally.

    Returns:
        The URL with ``kleding`` replaced by ``kleding/{clothing_type}``.
    """
    # A plain string replacement inserts the clothing type verbatim; a regex
    # replacement template would interpret backslashes and group references
    # such as "\1" inside it.
    return group_category_url.replace("kleding", f"kleding/{clothing_type}")

In [None]:
# Build the category URL for the "dames" target group and show it.
url = construct_group_category_url(BASE_URL, "dames", CLOTHING_URL)
print(url)

https://www.guess.eu/nl-be/guess/dames/kleding/?prefn1=guess_visibleInCountries&prefv1=BE%7CALL&psubcat=true


In [None]:
# Preview the URL produced for the "tshirts" clothing type.
construct_clothing_type_category_url(url, "tshirts")

'https://www.guess.eu/nl-be/guess/dames/kleding/tshirts/?prefn1=guess_visibleInCountries&prefv1=BE%7CALL&psubcat=true'

In [None]:
# Open the category URL built above in the running browser session.
set_driver_for_page_url(url, driver)

In [None]:
def click_specific_button(driver, button_text):
    """Scroll to and click the refinement button labelled ``button_text``.

    The button is located, scrolled into view and clicked through JavaScript.
    Up to three attempts are made, with a five second pause between attempts.

    Args:
        driver: Driver showing the page that contains the button.
        button_text: Exact text of the ``span`` inside the button.

    Raises:
        Exception: If every attempt failed; the last ``WebDriverException``
            is attached as the cause.
    """
    attempts = 3
    locator = (
        By.XPATH,
        f"//div[contains(@class, 'refinements__item-button') and .//span[text()='{button_text}']]",
    )
    last_error = None
    for attempt in range(attempts):
        try:
            button = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(locator)
            )
            driver.execute_script("arguments[0].scrollIntoView();", button)
            clickable_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(locator)
            )
            driver.execute_script("arguments[0].click();", clickable_button)
            return
        except WebDriverException as e:
            last_error = e
            print(f"Attempt {attempt + 1} failed: {e}")
            # Pausing only helps when another attempt follows.
            if attempt + 1 < attempts:
                time.sleep(5)
    raise Exception(
        "Failed to click the dropdown button after several attempts"
    ) from last_error

In [None]:
def extract_titles(driver):
    """Collect the ``title`` attributes of the refinement list entries.

    Waits up to ten seconds for an ``ul.refinements__attribute-wrapper``
    element and reads the ``li.refinements__attribute`` entries inside it.
    Any exception is printed and the titles collected so far are returned.

    Args:
        driver: Driver showing the page with the refinement list.

    Returns:
        The non-empty titles, in document order.
    """
    titles = []
    try:
        # Locate the dropdown list
        dropdown_list = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "ul.refinements__attribute-wrapper")
            )
        )
        # Extract all titles
        items = dropdown_list.find_elements(
            By.CSS_SELECTOR, "li.refinements__attribute"
        )
        for item in items:
            title = item.get_attribute("title")
            # Skip entries without a title so that callers never receive
            # None or an empty string in the result.
            if title:
                titles.append(title)
    except Exception as e:
        print(f"Exception occurred while extracting titles: {e}")
    return titles

In [None]:
# Click the "Op categorie" refinement button, then pause for two seconds.
click_specific_button(driver, "Op categorie")
time.sleep(2)

In [None]:
# CSS selector of the list entries inside the expanded category refinement
# (built from adjacent string literals, joined by descendant combinators).
CATEGORY_DROPDOWN_CSS_LIST = (
    "div.mb-lg-0.filters-container__item.refinements__item--category "
    "div.js-collapse.refinements__wrapper.collapse.show div.filters-content "
    "ul.refinements__attribute-wrapper.values.content.pl-0.m-lg-0 "
    "li.refinements__attribute"
)

In [None]:
def extract_titles(driver, css_dropdown_list):
    """Return the non-empty ``title`` attributes of the elements matching a selector.

    Any exception is printed and the titles gathered up to that point are
    returned.
    """
    collected = []
    try:
        # Walk over every element matched by the given CSS selector
        for element in driver.find_elements(By.CSS_SELECTOR, css_dropdown_list):
            text = element.get_attribute("title")
            if not text:
                continue
            collected.append(text)
    except Exception as e:
        print(f"Exception occurred while extracting titles: {e}")
    return collected


# Read the category titles from the page currently open in the browser.
titles = extract_titles(driver, CATEGORY_DROPDOWN_CSS_LIST)

In [None]:
def convert_to_dash_format(phrase):
    """Lower-case and strip ``phrase``, then join its space-separated parts with dashes."""
    normalized = phrase.lower().strip()
    return "-".join(normalized.split(" "))

In [None]:
# Example: convert one category title to its dash-separated form.
convert_to_dash_format("Jurken en Jumpsuits")

'jurken-en-jumpsuits'

In [None]:
# Display the extracted category titles.
titles

['Jurken en Jumpsuits',
 'T-shirts',
 'Tops en Overhemden',
 'Gebreide Kleding',
 'Sweatshirts',
 'Jasjes',
 'Broeken',
 'Rokken en Shorts',
 'Pakken',
 'Strandkleding',
 'Sportkleding']

In [None]:
# Print the clothing-type URL for every extracted title.
for title in titles:
    # Titles are converted to lower-case, dash-separated form before being
    # inserted into the URL.
    title = convert_to_dash_format(title)
    print(construct_clothing_type_category_url(url, title))

https://www.guess.eu/nl-be/guess/dames/kleding/jurken-en-jumpsuits/?prefn1=guess_visibleInCountries&prefv1=BE%7CALL&psubcat=true
https://www.guess.eu/nl-be/guess/dames/kleding/t-shirts/?prefn1=guess_visibleInCountries&prefv1=BE%7CALL&psubcat=true
https://www.guess.eu/nl-be/guess/dames/kleding/tops-en-overhemden/?prefn1=guess_visibleInCountries&prefv1=BE%7CALL&psubcat=true
https://www.guess.eu/nl-be/guess/dames/kleding/gebreide-kleding/?prefn1=guess_visibleInCountries&prefv1=BE%7CALL&psubcat=true
https://www.guess.eu/nl-be/guess/dames/kleding/sweatshirts/?prefn1=guess_visibleInCountries&prefv1=BE%7CALL&psubcat=true
https://www.guess.eu/nl-be/guess/dames/kleding/jasjes/?prefn1=guess_visibleInCountries&prefv1=BE%7CALL&psubcat=true
https://www.guess.eu/nl-be/guess/dames/kleding/broeken/?prefn1=guess_visibleInCountries&prefv1=BE%7CALL&psubcat=true
https://www.guess.eu/nl-be/guess/dames/kleding/rokken-en-shorts/?prefn1=guess_visibleInCountries&prefv1=BE%7CALL&psubcat=true
https://www.guess.e

Extract using Beautiful Soup per page; implement continuous scrolling


In [None]:
# Jurken en jumpsuits
# Start another Chrome session and open the "jurken-en-jumpsuits" category
# page in it. NOTE(review): `driver` is rebound here without quitting the
# session created earlier in the notebook.
service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)
url = "https://www.guess.eu/nl-be/guess/dames/kleding/jurken-en-jumpsuits/?prefn1=guess_visibleInCountries&prefv1=BE%7CALL&psubcat=true"
set_driver_for_page_url(url, driver=driver)

In [None]:
# scroll the page
def load_all_products(driver, scroll_pause=3, max_scrolls=None):
    """Scroll to the bottom repeatedly until the page stops growing.

    Args:
        driver: Driver whose current page is scrolled.
        scroll_pause: Seconds to sleep after each scroll before the page
            height is measured again.
        max_scrolls: Optional upper bound on the number of scrolls that
            increased the page height; ``None`` keeps scrolling until the
            height no longer changes.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    growing_scrolls = 0
    while max_scrolls is None or growing_scrolls < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Nothing new was appended, so the end of the list is reached.
            break
        last_height = new_height
        growing_scrolls += 1

In [None]:
# Scroll the open page until its height stops changing.
load_all_products(driver)

In [None]:
# soup = BeautifulSoup(driver.page_source, 'html.parser')

In [None]:
def extract_product_info(driver, str_gender):
    """Parse the current page and return a dict per product that has a title and a price.

    Each dict holds the keys ``title``, ``price`` and ``gender``. Any
    exception during extraction is printed and the products gathered up to
    that point are returned.
    """
    page = BeautifulSoup(driver.page_source, "html.parser")
    found = []
    try:
        for description in page.select("div.small-description"):
            for tile in description.select("div.pdp-link.product-tile__pdp-link"):
                link = tile.find("a", class_="link product-tile__link js-tile-link")
                price = tile.find_next("span", class_="value price__value")
                if not (link and price):
                    continue
                found.append(
                    {
                        "title": link.get_text(strip=True),
                        "price": price.get_text(strip=True),
                        "gender": str_gender,
                    }
                )
    except Exception as e:
        print(f"Exception occurred while extracting product information: {e}")
    return found

In [None]:
# Extract the products of the open page, tagging each with gender "dames".
products = extract_product_info(driver, "dames")

Discounts


In [None]:
# Site for the sales
# https://www.guess.eu/nl-be/guess/dames/kleding?prefn1=guess_visibleInCountries&prefv1=BE%7CALL&prefn2=isSale&prefv2=BE
# https://www.guess.eu/nl-be/guess/heren/kleding/?prefn1=guess_visibleInCountries&prefv1=BE%7CALL&psubcat=true

In [None]:
# Rebuild the "dames" category URL as the starting point for the sales URL.
url = construct_group_category_url(BASE_URL, "dames", CLOTHING_URL)
print(url)

https://www.guess.eu/nl-be/guess/dames/kleding/?prefn1=guess_visibleInCountries&prefv1=BE%7CALL&psubcat=true


In [None]:
from urllib.parse import urlencode, urlparse, urlunparse, parse_qs


def construct_sales_url(base_url, params):
    """Merge ``params`` into the query string of ``base_url``.

    Args:
        base_url: URL whose existing query parameters are kept.
        params: Mapping of parameters to add; a key that already exists in
            the URL is overwritten.

    Returns:
        The URL with the merged, re-encoded query string.
    """
    url_parts = list(urlparse(base_url))
    # keep_blank_values=True preserves parameters such as "flag=" that
    # parse_qs would otherwise drop from the rebuilt URL.
    query = parse_qs(url_parts[4], keep_blank_values=True)
    query.update(params)
    url_parts[4] = urlencode(query, doseq=True)
    return urlunparse(url_parts)


# Query parameters that are merged into the category URL.
params = {"prefn2": "isSale", "prefv2": "BE"}

sales_url = construct_sales_url(url, params)
print(sales_url)

https://www.guess.eu/nl-be/guess/dames/kleding/?prefn1=guess_visibleInCountries&prefv1=BE%7CALL&psubcat=true&prefn2=isSale&prefv2=BE


In [None]:
# Scroll the page currently open in `driver` until its height stops changing.
# NOTE(review): no cell in between navigates `driver` to sales_url — confirm
# which page is meant to be scrolled here.
load_all_products(driver)

In [None]:
def generate_clothing_urls(base_url, titles):
    """Build one clothing-type URL per title, in the order of ``titles``."""
    return [
        construct_clothing_type_category_url(base_url, convert_to_dash_format(name))
        for name in titles
    ]

In [None]:
# Hard-coded sales URL of the "jasjes" clothing type for the "dames" group.
sales_url = "https://www.guess.eu/nl-be/guess/dames/kleding/jasjes?prefn1=guess_visibleInCountries&prefn2=isSale&prefv1=BE%7CALL&prefv2=BE&psubcat=true"
from bs4 import BeautifulSoup


from bs4 import BeautifulSoup


def get_soup(driver):
    """Parse the driver's current page source with ``html.parser``."""
    markup = driver.page_source
    return BeautifulSoup(markup, "html.parser")


def extract_price_info(price_container):
    """Return ``(original_price, sale_price)`` as stripped text.

    The original price is read from the strike-through span and, when that
    span is absent (no crossed-out price is shown), from the regular price
    span. A missing element yields an empty string.

    Args:
        price_container: Parsed element that contains the price spans.
    """
    original_price_tag = price_container.find("span", class_="price__strike-through")
    if original_price_tag is None:
        # Without this fallback a product that has no strike-through price
        # would be reported with an empty original price.
        original_price_tag = price_container.find("span", class_="value price__value")
    sale_price_tag = price_container.find("span", class_="value price__value--sale")

    original_price = (
        original_price_tag.get_text(strip=True) if original_price_tag else ""
    )
    sale_price = sale_price_tag.get_text(strip=True) if sale_price_tag else ""

    return original_price, sale_price


def extract_title_info(product_div):
    """Return the stripped text of the product link, or ``""`` when there is none."""
    link = product_div.find("a", class_="link product-tile__link js-tile-link")
    if link:
        return link.get_text(strip=True)
    return ""


def extract_product_info(driver, str_gender):
    """Parse the current page and return a dict per product that has a price block.

    Each dict holds the keys ``title``, ``original_price``, ``sale_price``
    and ``gender``. Any exception during extraction is printed and the
    products gathered up to that point are returned.
    """
    page = get_soup(driver)
    found = []
    try:
        for description in page.select("div.small-description"):
            for tile in description.select("div.pdp-link.product-tile__pdp-link"):
                name = extract_title_info(tile)
                price_block = tile.find_next("div", class_="product__price")
                if not price_block:
                    continue
                regular, reduced = extract_price_info(price_block)
                found.append(
                    {
                        "title": name,
                        "original_price": regular,
                        "sale_price": reduced,
                        "gender": str_gender,
                    }
                )
    except Exception as e:
        print(f"Exception occurred while extracting product information: {e}")
    return found

In [None]:
# Open sales_url in a new Chrome session and scroll until the page stops
# growing.
# NOTE(review): `driver` is rebound here without quitting the previous session.
service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)
set_driver_for_page_url(sales_url, driver=driver)
load_all_products(driver)

In [None]:
# Extract the products of the open sales page, tagging each with gender "dames".
sales = extract_product_info(driver, "dames")

In [None]:
# Display the extracted sale products.
sales

[{'title': 'Imitatieleren jas',
  'original_price': '180.00 €',
  'sale_price': '',
  'gender': 'dames'}]

Creating a class
