In [None]:
#"C:\Program Files (x86)\chromedriver.exe"

In [None]:
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Configuration for the category-scraping stage
# Page whose menu-item links are collected by get_category_links().
BASE_URL = "https://www.bebitalia.com/en-us/"
# Local ChromeDriver binary handed to Selenium's Service below.
CHROMEDRIVER_PATH = r"C:\Program Files (x86)\chromedriver.exe"  # Update this path if needed
OUTPUT_FILE = "bebitalia_categories.json"  # JSON file to store category links

# Setup Selenium WebDriver (Headless Mode)
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run without opening a browser window
service = Service(CHROMEDRIVER_PATH)
# Starts a Chrome session as soon as this cell runs. get_category_links() reads
# this module-level `driver`, and the cell closes it again with driver.quit().
driver = webdriver.Chrome(service=service, options=chrome_options)

def get_category_links():
    """Collect the absolute category URLs exposed by the site's menu items.

    Loads BASE_URL in the shared ``driver``, waits up to 10 seconds for the
    ``a.ui-menu-item-wrapper[role='menuitem']`` anchors to be present, and
    returns the ``href`` of every anchor whose value starts with ``https``.

    Returns:
        list[str]: category URLs in document order (duplicates are kept).
    """
    driver.get(BASE_URL)

    # Block until the menu anchors exist in the DOM
    menu_locator = (By.CSS_SELECTOR, "a.ui-menu-item-wrapper[role='menuitem']")
    menu_anchors = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located(menu_locator)
    )

    # Read each href exactly once, then keep only non-empty https links
    hrefs = (anchor.get_attribute("href") for anchor in menu_anchors)
    return [href for href in hrefs if href and href.startswith("https")]

# Run the scraping process. The try/finally guarantees the headless Chrome
# session is shut down even when scraping or writing the file raises, so no
# orphaned browser/chromedriver process is left behind.
try:
    category_links = get_category_links()

    # Save extracted links to JSON file
    with open(OUTPUT_FILE, "w", encoding="utf-8") as json_file:
        json.dump(category_links, json_file, indent=4, ensure_ascii=False)

    print(f"✅ Scraping complete. {len(category_links)} category links saved to {OUTPUT_FILE}")
finally:
    # Close Selenium browser
    driver.quit()


✅ Scraping complete. 55 category links saved to bebitalia_categories.json


In [6]:
# Configuration for the product-scraping stage.
# NOTE: CHROMEDRIVER_PATH and OUTPUT_FILE rebind names already assigned in the
# category-scraping cell; from here on OUTPUT_FILE refers to the products file.
DOMAIN = "https://www.bebitalia.com"
CHROMEDRIVER_PATH = r"C:\Program Files (x86)\chromedriver.exe"  # Update if needed
CATEGORY_FILE = "bebitalia_categories.json"  # JSON file containing category links (read by load_category_links)
OUTPUT_FILE = "bebitalia_products.json"  # Output file for product data (written by main)


In [7]:
# Headless Chrome session shared by the scraping helpers in the cells below
def setup_driver():
    """Build a headless Chrome WebDriver from the configured ChromeDriver binary.

    Returns:
        webdriver.Chrome: a new browser session started with ``--headless``,
        driven by the executable at CHROMEDRIVER_PATH.
    """
    headless_options = Options()
    headless_options.add_argument("--headless")  # no visible browser window
    return webdriver.Chrome(
        service=Service(CHROMEDRIVER_PATH),
        options=headless_options,
    )

driver = setup_driver()



In [None]:
# 🔹 Load category links
def load_category_links(path=None):
    """Load category links from a JSON file.

    Args:
        path: Optional path of the JSON file to read. When omitted, the
            module-level ``CATEGORY_FILE`` is used, so existing zero-argument
            calls behave exactly as before.

    Returns:
        list: The parsed list of category URLs, or an empty list when the file
        is missing, is not valid JSON (including an empty file), or does not
        contain a JSON array. Each failure is reported on stdout.
    """
    source = CATEGORY_FILE if path is None else path
    try:
        with open(source, "r", encoding="utf-8") as file:
            links = json.load(file)
    except FileNotFoundError:
        print(f"Error: {source} not found.")
        return []
    except json.JSONDecodeError as exc:
        # A truncated or empty file would otherwise abort the whole run.
        print(f"Error: {source} is not valid JSON: {exc}")
        return []

    # Callers iterate over the result as a sequence of URLs; anything else
    # (e.g. a JSON object) would be iterated incorrectly, so reject it here.
    if not isinstance(links, list):
        print(f"Error: {source} does not contain a JSON list.")
        return []

    return links



In [None]:
# Extract product links from a category page
def get_product_links(category_url):
    """Extract all product URLs from a given category page.

    Loads ``category_url`` in the shared ``driver`` and waits up to 10 seconds
    for ``a.product-item-photo`` anchors to be present.

    Args:
        category_url: URL of the category listing page to open.

    Returns:
        list[str]: the distinct ``href`` values containing ``"/en-"``, in the
        order they first appear on the page. An empty list is returned (and
        the error printed) when the wait times out or any other exception is
        raised while reading the anchors.
    """
    driver.get(category_url)

    try:
        # Wait for product elements to load
        product_elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.product-item-photo"))
        )

        # Read each href once. get_attribute() returns None for an anchor
        # without an href; testing `"/en-" in None` would raise TypeError and
        # make the except-branch below discard the entire category.
        hrefs = (elem.get_attribute("href") for elem in product_elements)
        valid_hrefs = (href for href in hrefs if href and "/en-" in href)

        # dict.fromkeys drops repeated links while preserving page order
        product_urls = list(dict.fromkeys(valid_hrefs))

        print(f"Found {len(product_urls)} products in category: {category_url}")
        return product_urls

    except Exception as e:
        print(f"Error fetching products from {category_url}: {e}")
        return []



In [10]:
# 🔹 Extract product description
def extract_description():
    """Extract the product description from the page currently loaded in ``driver``.

    Waits up to 5 seconds for an element with class ``maxi-desc`` and returns
    the stripped text of the first ``<p>`` inside it.

    Returns:
        str: the description text, or ``"No description available"`` when the
        element or its paragraph cannot be found in time.
    """
    try:
        description_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CLASS_NAME, "maxi-desc"))
        )
        return description_element.find_element(By.TAG_NAME, "p").text.strip()
    except Exception:
        # The description is optional. `except Exception` (rather than a bare
        # `except:`) keeps the fallback but lets KeyboardInterrupt/SystemExit
        # through, so a long scraping run can still be interrupted.
        return "No description available"



In [11]:
# 🔹 Extract technical information
def extract_technical_info():
    """Extract technical information stored inside the #technical_information div.

    Works on the page currently loaded in ``driver``. Waits up to 5 seconds
    for the element with id ``technical_information``; for every ``<p>`` that
    contains a ``<strong>`` tag, the strong text becomes the key and the rest
    of the paragraph text becomes the value (non-breaking spaces removed,
    both stripped).

    Returns:
        dict: label -> value pairs, or
        ``{"Technical Information": "Not available"}`` when the section cannot
        be located or read.
    """
    technical_info = {}
    try:
        tech_info_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.ID, "technical_information"))
        )
        paragraphs = tech_info_element.find_elements(By.TAG_NAME, "p")

        for p in paragraphs:
            try:
                # Each .text access is a round trip to the browser, so read
                # the label and the paragraph text once each.
                label_text = p.find_element(By.TAG_NAME, "strong").text
                paragraph_text = p.text
            except Exception:
                continue  # Skip paragraphs without a readable <strong> label

            key = label_text.replace("\u00a0", "").strip()  # Remove &nbsp;
            # Remove only the first occurrence of the label; a label word that
            # is repeated inside the value must stay part of the value.
            value = paragraph_text.replace(label_text, "", 1).replace("\u00a0", "").strip()
            technical_info[key] = value
    except Exception:
        # `except Exception` instead of a bare `except:` so Ctrl-C still
        # aborts the run instead of being reported as "Not available".
        technical_info = {"Technical Information": "Not available"}

    return technical_info



In [None]:
# 🔹 Extract product information from a product page
def extract_product_info(url, product_id):
    """Scrape a single product page into a plain dictionary.

    Opens ``url`` in the shared ``driver``, waits up to 10 seconds for the
    ``product-info`` element, reads the title, designer, year and category
    from it, and then adds the description and technical information taken
    from the same page.

    Args:
        url: address of the product page.
        product_id: identifier stored under the ``"id"`` key.

    Returns:
        dict | None: the product record, or ``None`` (after printing the
        error) when the info element or one of its fields cannot be read.
    """
    driver.get(url)

    try:
        info_block = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "product-info"))
        )

        def field_text(class_name):
            """Return the text of the child of ``info_block`` with this class."""
            return info_block.find_element(By.CLASS_NAME, class_name).text

        # Keys are inserted in the order they should appear in the JSON output
        record = {"id": product_id}
        record["name"] = field_text("product-title")
        record["designer"] = field_text("product-designer")
        record["year"] = field_text("product-year")
        record["category"] = field_text("product-category").strip()
        record["description"] = extract_description()
        record["technical_information"] = extract_technical_info()
        record["link"] = url
        return record

    except Exception as exc:
        print(f"Skipping {url} due to error: {exc}")
        return None



In [None]:
# 🔹 Main Execution
def _save_products(product_data):
    """Write the collected product records to OUTPUT_FILE as indented UTF-8 JSON."""
    with open(OUTPUT_FILE, "w", encoding="utf-8") as json_file:
        json.dump(product_data, json_file, indent=4, ensure_ascii=False)


def main():
    """Main function to orchestrate the scraping process.

    Loads the category links, collects the product links of every category,
    scrapes each product page and writes all records to OUTPUT_FILE.

    A product that is listed in several categories is stored only once: the
    record is built purely from the product page, so a second visit would only
    produce an identical entry under a different id. Links whose scrape failed
    are not marked as done and are retried if another category lists them.

    If the run is aborted by an exception (including Ctrl-C), the records
    collected so far are written to OUTPUT_FILE before the exception is
    re-raised, so a long crawl is not lost.

    When no category links are available the driver is quit and the function
    returns without writing anything.
    """
    category_links = load_category_links()
    if not category_links:
        print("No category links found. Exiting.")
        driver.quit()
        return

    product_data = []
    scraped_links = set()  # product URLs already stored in product_data
    product_id = 1  # Start product ID numbering

    try:
        # Loop through each category and extract product data
        for category_url in category_links:
            print(f"Processing category: {category_url}")
            product_links = get_product_links(category_url)

            for idx, link in enumerate(product_links, start=1):
                if link in scraped_links:
                    print(f"   Skipping product {idx}/{len(product_links)} (already scraped): {link}")
                    continue

                print(f"   📌 Scraping product {idx}/{len(product_links)}: {link}")
                product_info = extract_product_info(link, product_id)
                if product_info:
                    product_data.append(product_info)
                    scraped_links.add(link)
                    product_id += 1  # Increment product ID
    except BaseException:
        # Keep whatever was collected; skip the write when nothing was scraped
        # so an existing output file is not replaced by an empty list.
        if product_data:
            _save_products(product_data)
            print(f"Scraping interrupted. {len(product_data)} products saved to {OUTPUT_FILE}")
        raise

    # Save extracted data to JSON
    _save_products(product_data)

    print(f"✅ Scraping complete. Data saved to {OUTPUT_FILE}")

# Run script. The finally-clause closes the Selenium browser even when main()
# raises, so a failed run does not leave a headless Chrome process behind.
if __name__ == "__main__":
    try:
        main()
    finally:
        driver.quit()  # Close Selenium browser


🔄 Processing category: https://www.bebitalia.com/en-us/modern-furniture.html
✅ Found 12 products in category: https://www.bebitalia.com/en-us/modern-furniture.html
   📌 Scraping product 1/12: https://www.bebitalia.com/en-us/en-madison-complementi-specchi.html
   📌 Scraping product 2/12: https://www.bebitalia.com/en-us/en-naviglio-divani.html
   📌 Scraping product 3/12: https://www.bebitalia.com/en-us/en-p60-complementi-pouf.html
   📌 Scraping product 4/12: https://www.bebitalia.com/en-us/en-us-jens-sedie.html
   📌 Scraping product 5/12: https://www.bebitalia.com/en-us/en-papilio-letti.html
   📌 Scraping product 6/12: https://www.bebitalia.com/en-us/en-outside-oggettistica.html
   📌 Scraping product 7/12: https://www.bebitalia.com/en-us/en-overscale-flames-oggettistica.html
   📌 Scraping product 8/12: https://www.bebitalia.com/en-us/en-pab-contenitori-zona-giorno.html
   📌 Scraping product 9/12: https://www.bebitalia.com/en-us/en-mirto-indoor-tavolini.html
   📌 Scraping product 10/12: h