In [None]:
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Location of the ChromeDriver executable on this machine
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Chrome options (headless mode is left disabled)
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Start a Chrome session through the driver binary configured above
driver = webdriver.Chrome(
    options=chrome_options,
    service=Service(webdriver_path),
)

# Function to attempt element location with retries using CSS selector
def attempt_element_locate(selector, retries=5, wait_time=30):
    """Wait for the element matching a CSS selector, retrying on failure.

    Uses the module-level ``driver``. Each attempt waits up to ``wait_time``
    seconds for the element to be present; a failed attempt is printed and
    the next one starts.

    Returns the located element. Raises ``Exception`` once ``retries``
    attempts have failed.
    """
    for attempt_number in range(1, retries + 1):
        try:
            waiter = WebDriverWait(driver, wait_time)
            return waiter.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, selector))
            )
        except (NoSuchElementException, TimeoutException) as e:
            print(f"Attempt {attempt_number} failed: {e}")
    raise Exception(f"Failed to locate element with CSS Selector: {selector} after {retries} attempts")

# Function to extract text from an element using a CSS selector
def extract_text_from_selector(css_selector, retries=5, wait_time=30):
    """Return the text of the element matching ``css_selector``.

    Any failure is printed and the placeholder ``"Text not found"`` is
    returned instead of raising.
    """
    try:
        located = attempt_element_locate(css_selector, retries, wait_time)
    except Exception as e:
        print(f"Error extracting text: {e}")
        return "Text not found"
    try:
        return located.text
    except Exception as e:
        print(f"Error extracting text: {e}")
        return "Text not found"

# Function to extract image URL using a CSS selector
def extract_image_url_from_selector(css_selector, retries=5, wait_time=30):
    """Return the ``src`` attribute of the element matching ``css_selector``.

    Any failure is printed and the placeholder ``"Image URL not found"`` is
    returned instead of raising.
    """
    placeholder = "Image URL not found"
    try:
        return attempt_element_locate(css_selector, retries, wait_time).get_attribute('src')
    except Exception as e:
        print(f"Error extracting image URL: {e}")
    return placeholder

# Function to extract all image URLs from a parent div
def extract_all_images_from_div(parent_css_selector, retries=5, wait_time=30):
    """Return the non-empty ``src`` values of every <img> under a parent element.

    The parent is located with ``attempt_element_locate``. Any failure is
    printed and an empty list is returned instead of raising.
    """
    try:
        parent_div = attempt_element_locate(parent_css_selector, retries, wait_time)
        image_urls = []
        for img in parent_div.find_elements(By.TAG_NAME, 'img'):
            # Read the attribute once per image instead of once for the
            # filter and again for the value.
            src = img.get_attribute('src')
            if src:
                image_urls.append(src)
        return image_urls
    except Exception as e:
        print(f"Error extracting images from div: {e}")
        return []

# Function to download an image from a URL and save it as PNG
def download_image(image_url, output_path, timeout=30):
    """Stream the image at ``image_url`` into the file ``output_path``.

    Best-effort: every error is printed rather than raised, and the function
    always returns ``None``.

    Args:
        image_url: URL to fetch.
        output_path: Destination file path (written in binary mode).
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the script indefinitely.
    """
    try:
        print(f"Attempting to download image from: {image_url}")
        # The context manager releases the streamed connection even when
        # the status check or the file write fails.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check if the request was successful
            with open(output_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"Image downloaded successfully: {output_path}")
    except Exception as e:
        print(f"Error downloading image: {e}")

# Function to download a file from a URL
def download_file(file_url, output_path, timeout=30):
    """Stream the file at ``file_url`` into ``output_path``.

    Best-effort: every error is printed rather than raised, and the function
    always returns ``None``.

    Args:
        file_url: URL to fetch.
        output_path: Destination file path (written in binary mode).
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the script indefinitely.
    """
    try:
        print(f"Attempting to download file from: {file_url}")
        # The context manager releases the streamed connection even when
        # the status check or the file write fails.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check if the request was successful
            with open(output_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"File downloaded successfully: {output_path}")
    except Exception as e:
        print(f"Error downloading file: {e}")

# URL of the page you want to scrape
url = 'https://www.molteni.it/ap/product/d1542'

# Define the CSS selectors
h1_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > div.block-info-product__top.padding-line-element > a > h1'
img_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-img > div.animation-mask.p81 > div > a > img'
h3_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > h3'
div_selector_1 = '#block-0 > div > div'
div_selector_2 = '#block-2 > div > div'
img_selector_2 = '#block-4 > div > div > div.col-8.block-text-img-img.is-768 > div > img'
div_selector_3 = '#block-5 > div > div'
file_selector = '#specs > div > div.row.product-specs-row > div:nth-child(3) > a'

# Everything that touches the browser runs inside try/finally so the Chrome
# session is always closed, even when a step raises.
try:
    # Open the product page
    driver.get(url)

    # Extract the <h1> text
    h1_text = extract_text_from_selector(h1_selector)
    print(f"Extracted <h1> text: {h1_text}")

    # Create directory based on <h1> text. Characters that are not allowed in
    # Windows folder names are replaced; an empty title falls back to the
    # same placeholder used when the title cannot be read.
    h1_folder_name = h1_text.translate(str.maketrans(dict.fromkeys('/\\:*?"<>|', '-'))) or "Text not found"
    product_folder_path = os.path.join(h1_folder_name)
    os.makedirs(product_folder_path, exist_ok=True)

    # Extract the image URL from the primary image selector
    img_url = extract_image_url_from_selector(img_selector)
    print(f"Extracted image URL: {img_url}")

    # Extract the <h3> text
    h3_text = extract_text_from_selector(h3_selector)
    print(f"Extracted <h3> text: {h3_text}")

    # Extract all image URLs from the specified divs
    image_urls_div_1 = extract_all_images_from_div(div_selector_1)
    print(f"Extracted image URLs from div #block-0: {image_urls_div_1}")

    image_urls_div_2 = extract_all_images_from_div(div_selector_2)
    print(f"Extracted image URLs from div #block-2: {image_urls_div_2}")

    # Extract the image URL from #block-4
    img_url_2 = extract_image_url_from_selector(img_selector_2)
    print(f"Extracted image URL from #block-4: {img_url_2}")

    # Extract all image URLs from #block-5
    image_urls_div_3 = extract_all_images_from_div(div_selector_3)
    print(f"Extracted image URLs from div #block-5: {image_urls_div_3}")

    # List all image URLs to download
    all_image_urls = [img_url] + [img_url_2] + image_urls_div_1 + image_urls_div_2 + image_urls_div_3
    print(f"All image URLs to download: {all_image_urls}")

    # Download images directly in the product folder; the startswith check
    # also skips the "Image URL not found" placeholder.
    for idx, image_url in enumerate(all_image_urls):
        if image_url and image_url.startswith('http'):
            img_file_path = os.path.join(product_folder_path, f'image_{idx + 1}.png')
            download_image(image_url, img_file_path)

    # Extract file download link and download the file. The locator raises
    # when the link is missing, so convert that into "no element" instead of
    # letting it abort the script.
    try:
        file_element = attempt_element_locate(file_selector)
    except Exception as e:
        print(f"Error locating file link: {e}")
        file_element = None

    if file_element:
        file_link = file_element.get_attribute('href')
        if file_link and file_link.startswith('http'):
            print(f"File link extracted: {file_link}")
            download_file(file_link, os.path.join(product_folder_path, 'description.pdf'))
        else:
            print("No valid file link found.")
    else:
        print("File element not found.")
finally:
    # Close the browser
    driver.quit()


In [None]:
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Location of the ChromeDriver executable on this machine
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Chrome options (headless mode is left disabled)
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Start a Chrome session through the driver binary configured above
driver = webdriver.Chrome(
    options=chrome_options,
    service=Service(webdriver_path),
)

# URL to scrape
url = 'https://www.molteni.it/ap/highlights'

try:
    # Open the webpage inside the try block so the browser is still closed
    # by the finally clause if navigation itself fails.
    driver.get(url)

    # Wait until the section containing the articles is loaded
    wait = WebDriverWait(driver, 10)
    section = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section.product-category > div > div > div > section.content-block.catalog-list')))

    # Find all article tags within the section
    articles = section.find_elements(By.TAG_NAME, 'article')

    # Loop through each article tag and find all <a> tags inside it
    for article in articles:
        links = article.find_elements(By.TAG_NAME, 'a')
        for link in links:
            # Print the href attribute of each <a> tag (which contains the URL)
            print(link.get_attribute('href'))

except TimeoutException:
    print("Loading the section took too long.")
except NoSuchElementException:
    print("Could not find the required elements on the page.")
finally:
    # Close the browser
    driver.quit()


In [None]:
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# URL of the page you want to scrape (all products page)
main_url = 'https://www.molteni.it/ap/highlights'

# Location of the ChromeDriver executable on this machine
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Chrome options (headless mode is left disabled)
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Start a Chrome session through the driver binary configured above
driver = webdriver.Chrome(
    options=chrome_options,
    service=Service(webdriver_path),
)

# Function to attempt element location with retries using CSS selector
def attempt_element_locate(driver, selector, retries=5, wait_time=30):
    """Wait for the element matching a CSS selector, retrying on failure.

    Each attempt waits up to ``wait_time`` seconds on ``driver`` for the
    element to be present; a failed attempt is printed and the next starts.

    Returns the located element. Raises ``Exception`` once ``retries``
    attempts have failed.
    """
    for attempt_number in range(1, retries + 1):
        try:
            waiter = WebDriverWait(driver, wait_time)
            return waiter.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, selector))
            )
        except (NoSuchElementException, TimeoutException) as e:
            print(f"Attempt {attempt_number} failed: {e}")
    raise Exception(f"Failed to locate element with CSS Selector: {selector} after {retries} attempts")

# Function to extract all product links
def get_product_links():
    """Return the href of every <a> inside the catalog section's articles.

    Loads ``main_url`` in the module-level ``driver`` and waits up to 10
    seconds for the catalog section. Returns an empty list (after printing a
    message) on ``TimeoutException`` or ``NoSuchElementException``.
    """
    catalog_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section.product-category > div > div > div > section.content-block.catalog-list'
    driver.get(main_url)
    try:
        # Block until the section holding the articles is present
        catalog = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, catalog_selector))
        )

        # Walk article -> anchor lazily, then keep only non-empty hrefs
        hrefs = (
            anchor.get_attribute('href')
            for article in catalog.find_elements(By.TAG_NAME, 'article')
            for anchor in article.find_elements(By.TAG_NAME, 'a')
        )
        return [href for href in hrefs if href]

    except TimeoutException:
        print("Loading the section took too long.")
    except NoSuchElementException:
        print("Could not find the required elements on the page.")
    return []

# Function to scrape product details from each product page
def scrape_product_page(product_url):
    """Scrape one product page and save its images and spec file to disk.

    Loads ``product_url`` in the module-level ``driver``, creates a folder
    named after the page's <h1> text, downloads the images found through the
    selectors below as ``image_<n>.png`` and saves the linked file as
    ``description.pdf``. A missing file link is reported, not raised, so one
    product without it does not stop the caller's loop.
    """
    driver.get(product_url)

    # Define the CSS selectors
    h1_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > div.block-info-product__top.padding-line-element > a > h1'
    img_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-img > div.animation-mask.p81 > div > a > img'
    h3_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > h3'
    div_selector_1 = '#block-0 > div > div'
    div_selector_2 = '#block-2 > div > div'
    img_selector_2 = '#block-4 > div > div > div.col-8.block-text-img-img.is-768 > div > img'
    div_selector_3 = '#block-5 > div > div'
    file_selector = '#specs > div > div.row.product-specs-row > div:nth-child(3) > a'

    # Extract the <h1> text
    h1_text = extract_text_from_selector(driver, h1_selector)
    print(f"Extracted <h1> text: {h1_text}")

    # Create directory based on <h1> text. Characters that are not allowed in
    # Windows folder names are replaced; an empty title falls back to the
    # same placeholder used when the title cannot be read.
    h1_folder_name = h1_text.translate(str.maketrans(dict.fromkeys('/\\:*?"<>|', '-'))) or "Text not found"
    product_folder_path = os.path.join(h1_folder_name)
    os.makedirs(product_folder_path, exist_ok=True)

    # Extract the image URL from the primary image selector
    img_url = extract_image_url_from_selector(driver, img_selector)
    print(f"Extracted image URL: {img_url}")

    # Extract the <h3> text
    h3_text = extract_text_from_selector(driver, h3_selector)
    print(f"Extracted <h3> text: {h3_text}")

    # Extract all image URLs from the specified divs
    image_urls_div_1 = extract_all_images_from_div(driver, div_selector_1)
    image_urls_div_2 = extract_all_images_from_div(driver, div_selector_2)
    img_url_2 = extract_image_url_from_selector(driver, img_selector_2)
    image_urls_div_3 = extract_all_images_from_div(driver, div_selector_3)

    # List all image URLs to download
    all_image_urls = [img_url] + [img_url_2] + image_urls_div_1 + image_urls_div_2 + image_urls_div_3
    print(f"All image URLs to download: {all_image_urls}")

    # Download images directly in the product folder; the startswith check
    # also skips the "Image URL not found" placeholder.
    for idx, image_url in enumerate(all_image_urls):
        if image_url and image_url.startswith('http'):
            img_file_path = os.path.join(product_folder_path, f'image_{idx + 1}.png')
            download_image(image_url, img_file_path)

    # Extract file download link and download the file. The locator raises
    # when the link is missing, so convert that into "no element".
    try:
        file_element = attempt_element_locate(driver, file_selector)
    except Exception as e:
        print(f"Error locating file link: {e}")
        file_element = None

    if file_element:
        file_link = file_element.get_attribute('href')
        if file_link and file_link.startswith('http'):
            download_file(file_link, os.path.join(product_folder_path, 'description.pdf'))
        else:
            print("No valid file link found.")
    else:
        print("File element not found.")

# Helper functions for element extraction, image download, and file download
def extract_text_from_selector(driver, css_selector, retries=5, wait_time=30):
    """Return the text of the element matching ``css_selector``.

    Any failure is printed and the placeholder ``"Text not found"`` is
    returned instead of raising.
    """
    placeholder = "Text not found"
    try:
        return attempt_element_locate(driver, css_selector, retries, wait_time).text
    except Exception as e:
        print(f"Error extracting text: {e}")
    return placeholder

def extract_image_url_from_selector(driver, css_selector, retries=5, wait_time=30):
    """Return the ``src`` attribute of the element matching ``css_selector``.

    Any failure is printed and the placeholder ``"Image URL not found"`` is
    returned instead of raising.
    """
    placeholder = "Image URL not found"
    try:
        return attempt_element_locate(driver, css_selector, retries, wait_time).get_attribute('src')
    except Exception as e:
        print(f"Error extracting image URL: {e}")
    return placeholder

def extract_all_images_from_div(driver, parent_css_selector, retries=5, wait_time=30):
    """Return the non-empty ``src`` values of every <img> under a parent element.

    The parent is located with ``attempt_element_locate``. Any failure is
    printed and an empty list is returned instead of raising.
    """
    try:
        parent_div = attempt_element_locate(driver, parent_css_selector, retries, wait_time)
        image_urls = []
        for img in parent_div.find_elements(By.TAG_NAME, 'img'):
            # Read the attribute once per image instead of once for the
            # filter and again for the value.
            src = img.get_attribute('src')
            if src:
                image_urls.append(src)
        return image_urls
    except Exception as e:
        print(f"Error extracting images from div: {e}")
        return []

def download_image(image_url, output_path, timeout=30):
    """Stream the image at ``image_url`` into the file ``output_path``.

    Best-effort: every error is printed rather than raised, and the function
    always returns ``None``.

    Args:
        image_url: URL to fetch.
        output_path: Destination file path (written in binary mode).
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the crawl indefinitely.
    """
    try:
        print(f"Attempting to download image from: {image_url}")
        # The context manager releases the streamed connection even when
        # the status check or the file write fails.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check if the request was successful
            with open(output_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"Image downloaded successfully: {output_path}")
    except Exception as e:
        print(f"Error downloading image: {e}")

def download_file(file_url, output_path, timeout=30):
    """Stream the file at ``file_url`` into ``output_path``.

    Best-effort: every error is printed rather than raised, and the function
    always returns ``None``.

    Args:
        file_url: URL to fetch.
        output_path: Destination file path (written in binary mode).
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the crawl indefinitely.
    """
    try:
        print(f"Attempting to download file from: {file_url}")
        # The context manager releases the streamed connection even when
        # the status check or the file write fails.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check if the request was successful
            with open(output_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"File downloaded successfully: {output_path}")
    except Exception as e:
        print(f"Error downloading file: {e}")

# Main script execution
try:
    product_links = get_product_links()

    if product_links:
        for product_link in product_links:
            scrape_product_page(product_link)
    else:
        print("No product links found.")
finally:
    # Close the browser even when scraping raised part-way through
    driver.quit()


In [None]:
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# URL of the page you want to scrape (all products page)
main_url = 'https://www.molteni.it/ap/highlights'

# Location of the ChromeDriver executable on this machine
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Chrome options (headless mode is left disabled)
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Start a Chrome session through the driver binary configured above
driver = webdriver.Chrome(
    options=chrome_options,
    service=Service(webdriver_path),
)

# Function to attempt element location with retries using CSS selector
def attempt_element_locate(driver, selector, retries=5, wait_time=30):
    """Wait for the element matching a CSS selector, retrying on failure.

    Each attempt waits up to ``wait_time`` seconds on ``driver`` for the
    element to be present; a failed attempt is printed and the next starts.

    Returns the located element. Raises ``Exception`` once ``retries``
    attempts have failed.
    """
    for attempt_number in range(1, retries + 1):
        try:
            waiter = WebDriverWait(driver, wait_time)
            return waiter.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, selector))
            )
        except (NoSuchElementException, TimeoutException) as e:
            print(f"Attempt {attempt_number} failed: {e}")
    raise Exception(f"Failed to locate element with CSS Selector: {selector} after {retries} attempts")

# Function to extract all product links
def get_product_links():
    """Return the href of every <a> inside the catalog section's articles.

    Loads ``main_url`` in the module-level ``driver`` and waits up to 10
    seconds for the catalog section. Returns an empty list (after printing a
    message) on ``TimeoutException`` or ``NoSuchElementException``.
    """
    catalog_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section.product-category > div > div > div > section.content-block.catalog-list'
    driver.get(main_url)
    try:
        # Block until the section holding the articles is present
        catalog = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, catalog_selector))
        )

        # Walk article -> anchor lazily, then keep only non-empty hrefs
        hrefs = (
            anchor.get_attribute('href')
            for article in catalog.find_elements(By.TAG_NAME, 'article')
            for anchor in article.find_elements(By.TAG_NAME, 'a')
        )
        return [href for href in hrefs if href]

    except TimeoutException:
        print("Loading the section took too long.")
    except NoSuchElementException:
        print("Could not find the required elements on the page.")
    return []

# Function to scrape product details from each product page
def scrape_product_page(product_url):
    """Scrape one product page and save its images and spec file to disk.

    Loads ``product_url`` in the module-level ``driver``, creates a folder
    named after the page's <h1> text, downloads up to the first 18 image
    URLs found on the page as ``image_<n>.png`` and saves the linked file as
    ``description.pdf``. Returns early if the folder cannot be created. A
    missing file link is reported, not raised, so one product without it does
    not stop the caller's loop.
    """
    driver.get(product_url)

    # Define the CSS selectors
    h1_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > div.block-info-product__top.padding-line-element > a > h1'
    h3_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > h3'
    file_selector = '#specs > div > div.row.product-specs-row > div:nth-child(3) > a'

    # Extract the <h1> text
    h1_text = extract_text_from_selector(driver, h1_selector)
    print(f"Extracted <h1> text: {h1_text}")

    # Sanitize folder name: one pass replacing every character that is not
    # allowed in Windows folder names with '-'.
    h1_folder_name = h1_text.translate(str.maketrans(dict.fromkeys('/\\:*?"<>|', '-')))
    product_folder_path = os.path.join(h1_folder_name)
    print(f"Creating directory at path: {product_folder_path}")

    try:
        os.makedirs(product_folder_path, exist_ok=True)
        print(f"Directory created successfully: {product_folder_path}")
    except Exception as e:
        print(f"Error creating directory: {e}")
        return

    # Extract the <h3> text
    h3_text = extract_text_from_selector(driver, h3_selector)
    print(f"Extracted <h3> text: {h3_text}")

    # Extract all image URLs from the entire page (limited to the first 18)
    all_image_urls = extract_all_images_from_page(driver)
    print(f"All image URLs to download: {all_image_urls}")

    # Download images directly in the product folder
    for idx, image_url in enumerate(all_image_urls[:18]):
        if image_url and image_url.startswith('http'):
            img_file_path = os.path.join(product_folder_path, f'image_{idx + 1}.png')
            download_image(image_url, img_file_path)

    # Extract file download link and download the file. The locator raises
    # when the link is missing, so convert that into "no element".
    try:
        file_element = attempt_element_locate(driver, file_selector)
    except Exception as e:
        print(f"Error locating file link: {e}")
        file_element = None

    if file_element:
        file_link = file_element.get_attribute('href')
        if file_link and file_link.startswith('http'):
            download_file(file_link, os.path.join(product_folder_path, 'description.pdf'))
        else:
            print("No valid file link found.")
    else:
        print("File element not found.")

# Helper functions for element extraction, image download, and file download
def extract_text_from_selector(driver, css_selector, retries=5, wait_time=30):
    """Return the text of the element matching ``css_selector``.

    Any failure is printed and the placeholder ``"Text not found"`` is
    returned instead of raising.
    """
    placeholder = "Text not found"
    try:
        return attempt_element_locate(driver, css_selector, retries, wait_time).text
    except Exception as e:
        print(f"Error extracting text: {e}")
    return placeholder

def extract_all_images_from_page(driver, retries=5, wait_time=30):
    """Return the non-empty ``src`` values of every <img> on the current page.

    ``retries`` and ``wait_time`` are accepted for signature parity with the
    other helpers but are not used here. Any failure is printed and an empty
    list is returned instead of raising.
    """
    try:
        image_urls = []
        # Retrieve all img elements on the page
        for img in driver.find_elements(By.TAG_NAME, 'img'):
            # Read the attribute once per image instead of once for the
            # filter and again for the value.
            src = img.get_attribute('src')
            if src:
                image_urls.append(src)
        return image_urls
    except Exception as e:
        print(f"Error extracting images from page: {e}")
        return []

def download_image(image_url, output_path, timeout=30):
    """Stream the image at ``image_url`` into the file ``output_path``.

    Best-effort: every error is printed rather than raised, and the function
    always returns ``None``.

    Args:
        image_url: URL to fetch.
        output_path: Destination file path (written in binary mode).
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the crawl indefinitely.
    """
    try:
        print(f"Attempting to download image from: {image_url}")
        # The context manager releases the streamed connection even when
        # the status check or the file write fails.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check if the request was successful
            with open(output_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"Image downloaded successfully: {output_path}")
    except Exception as e:
        print(f"Error downloading image: {e}")

def download_file(file_url, output_path, timeout=30):
    """Stream the file at ``file_url`` into ``output_path``.

    Best-effort: every error is printed rather than raised, and the function
    always returns ``None``.

    Args:
        file_url: URL to fetch.
        output_path: Destination file path (written in binary mode).
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the crawl indefinitely.
    """
    try:
        print(f"Attempting to download file from: {file_url}")
        # The context manager releases the streamed connection even when
        # the status check or the file write fails.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check if the request was successful
            with open(output_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"File downloaded successfully: {output_path}")
    except Exception as e:
        print(f"Error downloading file: {e}")

# Main script execution
try:
    product_links = get_product_links()

    if product_links:
        for product_link in product_links:
            scrape_product_page(product_link)
    else:
        print("No product links found.")
finally:
    # Close the browser even when scraping raised part-way through
    driver.quit()


In [None]:
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# URL of the page you want to scrape (all products page)
main_url = 'https://www.molteni.it/ap/highlights'

# Location of the ChromeDriver executable on this machine
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Chrome options (headless mode is left disabled)
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Start a Chrome session through the driver binary configured above
driver = webdriver.Chrome(
    options=chrome_options,
    service=Service(webdriver_path),
)

# Function to attempt element location with retries using CSS selector
def attempt_element_locate(driver, selector, retries=5, wait_time=30):
    """Wait for the element matching a CSS selector, retrying on failure.

    Each attempt waits up to ``wait_time`` seconds on ``driver`` for the
    element to be present; a failed attempt is printed and the next starts.

    Returns the located element. Raises ``Exception`` once ``retries``
    attempts have failed.
    """
    for attempt_number in range(1, retries + 1):
        try:
            waiter = WebDriverWait(driver, wait_time)
            return waiter.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, selector))
            )
        except (NoSuchElementException, TimeoutException) as e:
            print(f"Attempt {attempt_number} failed: {e}")
    raise Exception(f"Failed to locate element with CSS Selector: {selector} after {retries} attempts")

# Function to extract all product links
def get_product_links():
    """Return the href of every <a> inside the catalog section's articles.

    Loads ``main_url`` in the module-level ``driver`` and waits up to 10
    seconds for the catalog section. Returns an empty list (after printing a
    message) on ``TimeoutException`` or ``NoSuchElementException``.
    """
    catalog_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section.product-category > div > div > div > section.content-block.catalog-list'
    driver.get(main_url)
    try:
        # Block until the section holding the articles is present
        catalog = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, catalog_selector))
        )

        # Walk article -> anchor lazily, then keep only non-empty hrefs
        hrefs = (
            anchor.get_attribute('href')
            for article in catalog.find_elements(By.TAG_NAME, 'article')
            for anchor in article.find_elements(By.TAG_NAME, 'a')
        )
        return [href for href in hrefs if href]

    except TimeoutException:
        print("Loading the section took too long.")
    except NoSuchElementException:
        print("Could not find the required elements on the page.")
    return []

# Function to scrape product details from each product page
def scrape_product_page(product_url):
    """Scrape one product page and save its images and spec file to disk.

    Loads ``product_url`` in the module-level ``driver``, creates a folder
    named after the page's <h1> text, downloads up to the first 18 image
    URLs returned by ``extract_images_between_sections`` as ``image_<n>.png``
    and saves the linked file as ``description.pdf``. Returns early if the
    folder cannot be created. A missing file link is reported, not raised, so
    one product without it does not stop the caller's loop.
    """
    driver.get(product_url)

    # Define the CSS selectors
    h1_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > div.block-info-product__top.padding-line-element > a > h1'
    h3_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > h3'
    file_selector = '#specs > div > div.row.product-specs-row > div:nth-child(3) > a'

    # Extract the <h1> text
    h1_text = extract_text_from_selector(driver, h1_selector)
    print(f"Extracted <h1> text: {h1_text}")

    # Sanitize folder name: one pass replacing every character that is not
    # allowed in Windows folder names with '-'.
    h1_folder_name = h1_text.translate(str.maketrans(dict.fromkeys('/\\:*?"<>|', '-')))
    product_folder_path = os.path.join(h1_folder_name)
    print(f"Creating directory at path: {product_folder_path}")

    try:
        os.makedirs(product_folder_path, exist_ok=True)
        print(f"Directory created successfully: {product_folder_path}")
    except Exception as e:
        print(f"Error creating directory: {e}")
        return

    # Extract the <h3> text
    h3_text = extract_text_from_selector(driver, h3_selector)
    print(f"Extracted <h3> text: {h3_text}")

    # Extract all image URLs from the page (limited to the first 18)
    all_image_urls = extract_images_between_sections(driver, 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div', '#block-14 > div')
    print(f"All image URLs to download: {all_image_urls}")

    # Download images directly in the product folder
    for idx, image_url in enumerate(all_image_urls[:18]):
        if image_url and image_url.startswith('http'):
            img_file_path = os.path.join(product_folder_path, f'image_{idx + 1}.png')
            download_image(image_url, img_file_path)

    # Extract file download link and download the file. The locator raises
    # when the link is missing, so convert that into "no element".
    try:
        file_element = attempt_element_locate(driver, file_selector)
    except Exception as e:
        print(f"Error locating file link: {e}")
        file_element = None

    if file_element:
        file_link = file_element.get_attribute('href')
        if file_link and file_link.startswith('http'):
            download_file(file_link, os.path.join(product_folder_path, 'description.pdf'))
        else:
            print("No valid file link found.")
    else:
        print("File element not found.")

# Helper functions for element extraction, image download, and file download
def extract_text_from_selector(driver, css_selector, retries=5, wait_time=30):
    """Return the text of the element matching ``css_selector``.

    Any failure is printed and the placeholder ``"Text not found"`` is
    returned instead of raising.
    """
    placeholder = "Text not found"
    try:
        return attempt_element_locate(driver, css_selector, retries, wait_time).text
    except Exception as e:
        print(f"Error extracting text: {e}")
    return placeholder

def extract_images_between_sections(driver, start_selector, end_selector, retries=5, wait_time=30):
    """Collect image URLs from a start section through an end section.

    Every <img> on the page is visited in the order ``find_elements`` returns
    them. Collection begins at the first image whose grandparent element is
    the start element, and the walk stops after the first image whose
    grandparent is the end element (that image is included only if collection
    has already begun).

    NOTE(review): only the exact grandparent of each image is compared with
    the start/end elements, not every ancestor. Images nested at any other
    depth never trigger the start or the end -- confirm this is intended.

    Returns the list of non-empty ``src`` values. Any failure is printed and
    an empty list is returned instead of raising.
    """
    try:
        # Retrieve the start and end elements
        start_element = attempt_element_locate(driver, start_selector, retries, wait_time)
        end_element = attempt_element_locate(driver, end_selector, retries, wait_time)

        image_urls = []
        collecting = False

        for img in driver.find_elements(By.TAG_NAME, 'img'):
            # '..' from the image gives its parent; '..' from the parent
            # gives the grandparent (looked up once and reused below).
            parent_element = img.find_element(By.XPATH, '..')
            grandparents = parent_element.find_elements(By.XPATH, '..')

            if start_element in grandparents:
                collecting = True

            if collecting:
                # Add image URL to the list
                image_url = img.get_attribute('src')
                if image_url:
                    image_urls.append(image_url)

            if end_element in grandparents:
                break

        return image_urls
    except Exception as e:
        print(f"Error extracting images from page: {e}")
        return []

def download_image(image_url, output_path, timeout=30):
    """Stream the image at ``image_url`` into the file ``output_path``.

    Args:
        image_url: URL of the image to fetch.
        output_path: Destination file path; opened in binary write mode.
        timeout: Seconds to wait for the server before the request is
            abandoned, so one stalled server cannot hang the whole run.

    Any exception (HTTP error, timeout, file error, ...) is caught and
    reported with ``print``; nothing is raised to the caller.
    """
    try:
        print(f"Attempting to download image from: {image_url}")
        # The context manager releases the streamed connection even when the
        # status check or a write fails part-way through.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check if the request was successful
            with open(output_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"Image downloaded successfully: {output_path}")
    except Exception as e:
        print(f"Error downloading image: {e}")

def download_file(file_url, output_path, timeout=30):
    """Stream the file at ``file_url`` into ``output_path``.

    Args:
        file_url: URL of the file to fetch.
        output_path: Destination file path; opened in binary write mode.
        timeout: Seconds to wait for the server before giving up, which keeps
            an unresponsive host from blocking the script indefinitely.

    Any exception is caught and reported with ``print``; the function does
    not raise.
    """
    try:
        print(f"Attempting to download file from: {file_url}")
        # Closing the response via ``with`` frees the connection on every
        # path, including a failed status check.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check if the request was successful
            with open(output_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"File downloaded successfully: {output_path}")
    except Exception as e:
        print(f"Error downloading file: {e}")

# Main script execution
try:
    product_links = get_product_links()

    if product_links:
        for product_link in product_links:
            scrape_product_page(product_link)
    else:
        print("No product links found.")
finally:
    # Close the browser even when collecting links or scraping a page raises,
    # so no browser/driver process is left running after a failed run.
    driver.quit()


In [None]:
# This code works for a single product only (from the second category).
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Path to your WebDriver executable (an absolute path on the local machine;
# it has to be adjusted when the script runs elsewhere)
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Set up Chrome options; headless mode is left disabled here
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Create a new instance of the Chrome driver. It is stored at module level
# and used directly by the helper functions and the script below.
driver = webdriver.Chrome(service=Service(webdriver_path), options=chrome_options)

# URL of the page you want to scrape (specific product page)
main_url = 'https://www.molteni.it/ap/product/intersection'

# Open the webpage before any element lookup is attempted
driver.get(main_url)

def download_image(image_url, folder_path, image_name, timeout=30):
    """Download one image and save it as ``folder_path/image_name``.

    Args:
        image_url: URL of the image to fetch.
        folder_path: Directory the image is written into.
        image_name: File name to use inside ``folder_path``.
        timeout: Seconds to wait for the server; without a limit a stalled
            request would block the script forever.

    Any exception is caught and reported with ``print``; nothing is raised.
    """
    try:
        # Get image content; ``with`` closes the streamed response on every
        # path, including HTTP errors and failed writes.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check for HTTP errors

            # Define the path where the image will be saved
            image_path = os.path.join(folder_path, image_name)

            # Save the image
            with open(image_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        print(f"Saved image: {image_name}")

    except Exception as e:
        print(f"Failed to download {image_url}. Reason: {e}")

def download_file(file_url, folder_path, file_name, timeout=30):
    """Download one file and save it as ``folder_path/file_name``.

    Args:
        file_url: URL of the file to fetch.
        folder_path: Directory the file is written into.
        file_name: File name to use inside ``folder_path``.
        timeout: Seconds to wait for the server before giving up.

    Any exception is caught and reported with ``print``; nothing is raised.
    """
    try:
        # Get file content; the context manager guarantees the streamed
        # connection is released even if the download fails half-way.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check for HTTP errors

            # Define the path where the file will be saved
            file_path = os.path.join(folder_path, file_name)

            # Save the file
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        print(f"Saved file: {file_name}")

    except Exception as e:
        print(f"Failed to download {file_url}. Reason: {e}")

def extract_images_from_selector(selector, folder_path, image_counter):
    """Download every image found inside the elements matching ``selector``.

    Args:
        selector: CSS selector of the container element(s) to search.
        folder_path: Directory the images are saved into.
        image_counter: Number used for the next file name
            (``image_<counter>.png``).

    Returns:
        The counter value to use for the next image, so numbering continues
        across successive calls.
    """
    try:
        # Wait (up to 10 seconds) for the containers to be present
        container_divs = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
        )

        # Extract images from all containers
        for container_div in container_divs:
            for img in container_div.find_elements(By.TAG_NAME, "img"):
                image_src = img.get_attribute('src')
                if not image_src:
                    # An <img> without a src has nothing to download; skipping
                    # it also keeps the file numbering free of gaps.
                    continue
                image_name = f"image_{image_counter}.png"
                download_image(image_src, folder_path, image_name)
                image_counter += 1

    except (NoSuchElementException, TimeoutException):
        print(f"No images found inside the container {selector} or timeout occurred.")

    return image_counter

import re  # used only to clean the folder name below

try:
    # Container that holds the product title (an <h1> nested inside an <a>)
    text_selector = "body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > div.block-info-product__top.padding-line-element"

    try:
        # Wait for the div to be present
        info_div = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, text_selector))
        )

        # Find the <a> tag inside the div
        a_tag = info_div.find_element(By.TAG_NAME, "a")

        # Find the <h1> tag inside the <a> tag and get its text
        h1_tag = a_tag.find_element(By.TAG_NAME, "h1")
        h1_text = h1_tag.text.strip()
        print("H1 Text:", h1_text)

        # Create a directory named after the H1 text. Every character that is
        # not allowed in a Windows file name is replaced, not just the path
        # separators; otherwise os.makedirs raises an OSError that none of the
        # handlers in this script catch.
        folder_name = re.sub(r'[<>:"/\\|?*\r\n]', '_', h1_text)
        folder_path = os.path.join(os.getcwd(), folder_name)
        os.makedirs(folder_path, exist_ok=True)

    except (NoSuchElementException, TimeoutException):
        print("H1 tag not found inside the specified container or timeout occurred.")
        folder_path = os.getcwd()  # Use current directory if H1 text not found

    # Extract images from all specified selectors and save to the created folder:
    # the first page section plus the containers #block-0 ... #block-11
    selectors = [
        "body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1)",
        *(f"#block-{block_index}" for block_index in range(12)),
    ]

    image_counter = 1
    for selector in selectors:
        image_counter = extract_images_from_selector(selector, folder_path, image_counter)

    # Download the file from the specified anchor tag
    anchor_selector = "#specs > div > div.row.product-specs-row > div:nth-child(3) > a"

    try:
        # Wait for the anchor tag to be present
        anchor_tag = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, anchor_selector))
        )

        # Get the URL from the href attribute
        file_url = anchor_tag.get_attribute('href')
        file_name = "description.pdf"  # Set filename for the downloaded file

        # Download the file, unless the anchor carries no href at all
        if file_url:
            download_file(file_url, folder_path, file_name)
        else:
            print("Anchor tag has no href attribute; nothing to download.")

    except (NoSuchElementException, TimeoutException):
        print("Anchor tag not found inside the specified container or timeout occurred.")

except (NoSuchElementException, TimeoutException) as e:
    print(f"An error occurred: {e}")

finally:
    # Close the browser window
    driver.quit()


In [None]:
# This code works for the Kitchen category and fetches all of its products.
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Path to your WebDriver executable (an absolute path on the local machine;
# it has to be adjusted when the script runs elsewhere)
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Set up Chrome options; headless mode is left disabled here
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Create a new instance of the Chrome driver. The functions defined below
# read this module-level `driver` directly instead of taking it as a parameter.
driver = webdriver.Chrome(service=Service(webdriver_path), options=chrome_options)

def download_image(image_url, folder_path, image_name, timeout=30):
    """Fetch ``image_url`` and write it to ``folder_path/image_name``.

    Args:
        image_url: URL of the image.
        folder_path: Target directory.
        image_name: File name inside the target directory.
        timeout: Seconds to wait for the server before abandoning the
            request (a request without a timeout may block indefinitely).

    Errors of any kind are caught and printed; the function never raises.
    """
    try:
        # ``with`` closes the streamed response even when an error occurs.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check for HTTP errors
            image_path = os.path.join(folder_path, image_name)
            with open(image_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        print(f"Saved image: {image_name}")
    except Exception as e:
        print(f"Failed to download {image_url}. Reason: {e}")

def download_file(file_url, folder_path, file_name, timeout=30):
    """Fetch ``file_url`` and write it to ``folder_path/file_name``.

    Args:
        file_url: URL of the file.
        folder_path: Target directory.
        file_name: File name inside the target directory.
        timeout: Seconds to wait for the server before abandoning the request.

    Errors of any kind are caught and printed; the function never raises.
    """
    try:
        # ``with`` closes the streamed response even when an error occurs.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check for HTTP errors
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        print(f"Saved file: {file_name}")
    except Exception as e:
        print(f"Failed to download {file_url}. Reason: {e}")

def extract_images_from_selector(selector, folder_path, image_counter):
    """Save all images located inside the elements matching ``selector``.

    Args:
        selector: CSS selector of the container element(s).
        folder_path: Directory the images are written to.
        image_counter: Number for the next ``image_<n>.png`` file name.

    Returns:
        The next unused counter value.
    """
    try:
        # Wait up to 10 seconds for at least one matching container.
        container_divs = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
        )
        for container_div in container_divs:
            for img in container_div.find_elements(By.TAG_NAME, "img"):
                image_src = img.get_attribute('src')
                if not image_src:
                    # No src means nothing to fetch; do not spend a failed
                    # request or a file number on it.
                    continue
                image_name = f"image_{image_counter}.png"
                download_image(image_src, folder_path, image_name)
                image_counter += 1
    except (NoSuchElementException, TimeoutException):
        print(f"No images found inside the container {selector} or timeout occurred.")
    return image_counter

def get_product_links(base_url):
    """Open ``base_url`` and return the hrefs of its catalog entries.

    The page is loaded with the module-level ``driver``; the anchors of the
    catalog list are awaited for up to 10 seconds. When the wait fails the
    error is printed and the links gathered so far (possibly none) are
    returned.
    """
    driver.get(base_url)
    collected = []
    try:
        anchors = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section.product-category > div > div > div > section.content-block.catalog-list > article > a'))
        )
        for anchor in anchors:
            href = anchor.get_attribute('href')
            if not href:
                # Anchors without a target are ignored
                continue
            collected.append(href)
    except (NoSuchElementException, TimeoutException) as e:
        print(f"An error occurred while fetching product links: {e}")
    return collected

def scrape_product_page(product_url):
    """Download the images and the spec PDF of a single product page.

    The page is opened with the module-level ``driver``. A folder named after
    the product's H1 title is created below the current working directory
    (the working directory itself is used when the title cannot be found),
    images from the first section and ``#block-0`` ... ``#block-11`` are
    saved into it, followed by the linked file as ``description.pdf``.

    Args:
        product_url: URL of the product page.

    Any exception raised after the page has been opened is printed rather
    than propagated, so one bad page does not stop the caller's loop.
    """
    import re  # local import: only needed here to clean the folder name

    driver.get(product_url)
    try:
        text_selector = "body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > div.block-info-product__top.padding-line-element"
        try:
            info_div = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, text_selector))
            )
            a_tag = info_div.find_element(By.TAG_NAME, "a")
            h1_tag = a_tag.find_element(By.TAG_NAME, "h1")
            h1_text = h1_tag.text.strip()
            print("H1 Text:", h1_text)
            # Replace every character Windows forbids in file names, not only
            # the path separators; a title containing e.g. ':' or '"' would
            # otherwise make os.makedirs fail and abort the whole product.
            folder_name = re.sub(r'[<>:"/\\|?*\r\n]', '_', h1_text)
            folder_path = os.path.join(os.getcwd(), folder_name)
            os.makedirs(folder_path, exist_ok=True)
        except (NoSuchElementException, TimeoutException):
            print("H1 tag not found inside the specified container or timeout occurred.")
            folder_path = os.getcwd()

        # First page section plus the containers #block-0 ... #block-11
        selectors = [
            "body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1)",
            *(f"#block-{block_index}" for block_index in range(12)),
        ]
        image_counter = 1
        for selector in selectors:
            image_counter = extract_images_from_selector(selector, folder_path, image_counter)

        anchor_selector = "#specs > div > div.row.product-specs-row > div:nth-child(3) > a"
        try:
            anchor_tag = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, anchor_selector))
            )
            file_url = anchor_tag.get_attribute('href')
            if file_url:
                download_file(file_url, folder_path, "description.pdf")
            else:
                # An anchor without href gives the downloader nothing to fetch
                print("Anchor tag has no href attribute; nothing to download.")
        except (NoSuchElementException, TimeoutException):
            print("Anchor tag not found inside the specified container or timeout occurred.")

    except Exception as e:
        print(f"An error occurred while processing the product page {product_url}: {e}")

# Entry point of this cell: collect the product links of the kitchens
# "highlights" category page and scrape each product in turn.
try:
    base_url = 'https://www.molteni.it/ap/kitchens/category/highlights'
    product_links = get_product_links(base_url)
    for link in product_links:
        scrape_product_page(link)

# Only Selenium lookup/timeout errors are reported here; any other exception
# propagates after the finally clause has run.
except (NoSuchElementException, TimeoutException) as e:
    print(f"An error occurred: {e}")

# Always close the browser, whether or not the run succeeded.
finally:
    driver.quit()


In [None]:

import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Path to your WebDriver executable (absolute local path; change it to match
# the machine the script runs on)
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Set up Chrome options; the headless switch below is disabled
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Create a new instance of the Chrome driver; it is kept at module level and
# used by every function in this cell
driver = webdriver.Chrome(service=Service(webdriver_path), options=chrome_options)

def download_image(image_url, folder_path, image_name, timeout=30):
    """Save the image behind ``image_url`` as ``folder_path/image_name``.

    Args:
        image_url: URL of the image.
        folder_path: Directory to write into.
        image_name: Name of the file to create.
        timeout: Seconds to wait for the server; prevents a stalled
            connection from blocking the scraper indefinitely.

    All exceptions are caught and reported with ``print``.
    """
    try:
        # Using the response as a context manager closes the streamed
        # connection on success and on every error path.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check for HTTP errors
            image_path = os.path.join(folder_path, image_name)
            with open(image_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        print(f"Saved image: {image_name}")
    except Exception as e:
        print(f"Failed to download {image_url}. Reason: {e}")

def download_file(file_url, folder_path, file_name, timeout=30):
    """Save the file behind ``file_url`` as ``folder_path/file_name``.

    Args:
        file_url: URL of the file.
        folder_path: Directory to write into.
        file_name: Name of the file to create.
        timeout: Seconds to wait for the server before giving up.

    All exceptions are caught and reported with ``print``.
    """
    try:
        # Using the response as a context manager closes the streamed
        # connection on success and on every error path.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check for HTTP errors
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        print(f"Saved file: {file_name}")
    except Exception as e:
        print(f"Failed to download {file_url}. Reason: {e}")

def extract_images_from_selector(selector, folder_path, image_counter):
    """Download the images contained in the elements matching ``selector``.

    Args:
        selector: CSS selector of the container element(s).
        folder_path: Directory that receives the images.
        image_counter: Index used in the next ``image_<n>.png`` file name.

    Returns:
        The counter after the last saved image, ready for the next call.
    """
    try:
        # Wait up to 10 seconds for the containers to appear in the DOM.
        container_divs = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
        )
        for container_div in container_divs:
            for img in container_div.find_elements(By.TAG_NAME, "img"):
                image_src = img.get_attribute('src')
                if not image_src:
                    # Skip images that expose no src so they neither trigger
                    # a doomed request nor leave a hole in the numbering.
                    continue
                image_name = f"image_{image_counter}.png"
                download_image(image_src, folder_path, image_name)
                image_counter += 1
    except (NoSuchElementException, TimeoutException):
        print(f"No images found inside the container {selector} or timeout occurred.")
    return image_counter

def get_product_links(base_url):
    """Load the category page at ``base_url`` and list its product URLs.

    Uses the module-level ``driver``. The catalog anchors are awaited for up
    to 10 seconds; a failed wait is printed and whatever was collected up to
    that point is returned.
    """
    driver.get(base_url)
    found_links = []
    try:
        catalog_anchors = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section.product-category > div > div > div > section.content-block.catalog-list > article > a'))
        )
        for catalog_anchor in catalog_anchors:
            target = catalog_anchor.get_attribute('href')
            if target:
                found_links.append(target)
    except (NoSuchElementException, TimeoutException) as e:
        print(f"An error occurred while fetching product links: {e}")
    return found_links

def scrape_product_page(product_url):
    """Scrape one product page: title folder, images and spec PDF.

    Opens ``product_url`` with the module-level ``driver``, creates a folder
    named after the H1 title below the current working directory (falling
    back to the working directory when the title is not found), saves the
    images of the first section and of ``#block-0`` ... ``#block-11`` there,
    and finally stores the linked file as ``description.pdf``.

    Args:
        product_url: URL of the product page.

    Exceptions raised after the page load are printed, not propagated.
    """
    import re  # local import: only needed here to clean the folder name

    driver.get(product_url)
    try:
        text_selector = "body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > div.block-info-product__top.padding-line-element"
        try:
            info_div = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, text_selector))
            )
            a_tag = info_div.find_element(By.TAG_NAME, "a")
            h1_tag = a_tag.find_element(By.TAG_NAME, "h1")
            h1_text = h1_tag.text.strip()
            print("H1 Text:", h1_text)
            # Path separators are not the only characters Windows rejects in
            # file names; replace all of them so os.makedirs cannot fail on
            # an unusual product title.
            folder_name = re.sub(r'[<>:"/\\|?*\r\n]', '_', h1_text)
            folder_path = os.path.join(os.getcwd(), folder_name)
            os.makedirs(folder_path, exist_ok=True)
        except (NoSuchElementException, TimeoutException):
            print("H1 tag not found inside the specified container or timeout occurred.")
            folder_path = os.getcwd()

        # First page section plus the containers #block-0 ... #block-11
        selectors = [
            "body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1)",
            *(f"#block-{block_index}" for block_index in range(12)),
        ]
        image_counter = 1
        for selector in selectors:
            image_counter = extract_images_from_selector(selector, folder_path, image_counter)

        anchor_selector = "#specs > div > div.row.product-specs-row > div:nth-child(3) > a"
        try:
            anchor_tag = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, anchor_selector))
            )
            file_url = anchor_tag.get_attribute('href')
            if file_url:
                download_file(file_url, folder_path, "description.pdf")
            else:
                # Without an href there is no URL to hand to the downloader
                print("Anchor tag has no href attribute; nothing to download.")
        except (NoSuchElementException, TimeoutException):
            print("Anchor tag not found inside the specified container or timeout occurred.")

    except Exception as e:
        print(f"An error occurred while processing the product page {product_url}: {e}")

# Entry point of this cell: collect the product links of the Gio Ponti
# "highlights" category page and scrape each product in turn.
try:
    base_url = 'https://www.molteni.it/ap/gio-ponti/category/highlights'
    product_links = get_product_links(base_url)
    for link in product_links:
        scrape_product_page(link)

# Only Selenium lookup/timeout errors are reported here; other exceptions
# propagate once the finally clause has closed the browser.
except (NoSuchElementException, TimeoutException) as e:
    print(f"An error occurred: {e}")

# Always close the browser at the end of the run.
finally:
    driver.quit()


In [None]:
import os
import re
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Path to your WebDriver executable (absolute local path; change it to match
# the machine the script runs on)
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Set up Chrome options; the headless switch below is disabled
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Create a new instance of the Chrome driver, shared at module level by the
# functions defined below
driver = webdriver.Chrome(service=Service(webdriver_path), options=chrome_options)

def sanitize_filename(filename):
    """Return ``filename`` without characters Windows rejects in file names.

    The characters ``< > : " / \\ | ? *`` as well as line feeds and carriage
    returns are dropped, then surrounding whitespace is trimmed.
    """
    forbidden = '<>:"/\\|?*\n\r'
    kept = [character for character in filename if character not in forbidden]
    return "".join(kept).strip()

def download_image(image_url, folder_path, image_name, timeout=30):
    """Download ``image_url`` into ``folder_path`` under ``image_name``.

    Args:
        image_url: URL of the image.
        folder_path: Destination directory.
        image_name: Destination file name.
        timeout: Seconds to wait for the server, so a hanging connection
            cannot stall the run.

    Every exception is caught and printed; the caller is never interrupted.
    """
    try:
        # The ``with`` block closes the streamed response in all cases.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check for HTTP errors
            image_path = os.path.join(folder_path, image_name)
            with open(image_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        print(f"Saved image: {image_name}")
    except Exception as e:
        print(f"Failed to download {image_url}. Reason: {e}")

def download_file(file_url, folder_path, file_name, timeout=30):
    """Download ``file_url`` into ``folder_path`` under ``file_name``.

    Args:
        file_url: URL of the file.
        folder_path: Destination directory.
        file_name: Destination file name.
        timeout: Seconds to wait for the server before giving up.

    Every exception is caught and printed; the caller is never interrupted.
    """
    try:
        # The ``with`` block closes the streamed response in all cases.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check for HTTP errors
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        print(f"Saved file: {file_name}")
    except Exception as e:
        print(f"Failed to download {file_url}. Reason: {e}")

def extract_images_from_selector(selector, folder_path, image_counter):
    """Save the images found inside the elements matching ``selector``.

    Images without a ``src`` are skipped. Files are named
    ``image_<counter>.png``; the counter value for the next image is
    returned so numbering can continue in a later call.
    """
    try:
        locator = (By.CSS_SELECTOR, selector)
        containers = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(locator)
        )
        for container in containers:
            for image in container.find_elements(By.TAG_NAME, "img"):
                source = image.get_attribute('src')
                if not source:
                    continue
                download_image(source, folder_path, f"image_{image_counter}.png")
                image_counter += 1
    except (NoSuchElementException, TimeoutException):
        print(f"No images found inside the container {selector} or timeout occurred.")
    return image_counter

def get_product_links(base_url):
    """Open ``base_url`` and return the hrefs of all anchors in the product list.

    The list container is awaited for up to 10 seconds using the module-level
    ``driver``. Each link is printed as it is collected. When the wait fails,
    the error is printed and the links gathered so far are returned.
    """
    driver.get(base_url)
    collected = []
    try:
        # Container element that wraps the product entries
        container_selector = "#c25957 > div > section.container.prv-list.show-sofa-1"
        list_container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, container_selector))
        )

        # Walk every anchor inside the container and keep those with an href
        for anchor in list_container.find_elements(By.TAG_NAME, 'a'):
            href = anchor.get_attribute('href')
            if not href:
                continue
            collected.append(href)
            print(f"Product link found: {href}")
    except (NoSuchElementException, TimeoutException) as e:
        print(f"An error occurred while fetching product links: {e}")
    return collected

def scrape_product_page(product_url):
    """Scrape one product page: title folder, images and PDF.

    Opens ``product_url`` with the module-level ``driver``, creates a folder
    named after the sanitized H1 text below the current working directory
    (the working directory itself when the H1 is not found), saves the images
    of three page containers into it and stores the linked file as
    ``description.pdf``.

    Args:
        product_url: URL of the product page.

    Exceptions raised after the page load are printed, not propagated.
    """
    driver.get(product_url)
    try:
        # Use the H1 text for folder naming
        h1_selector = "#c205373 > div > section > div > div > h1"
        try:
            h1_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, h1_selector))
            )
            h1_text = h1_element.text.strip()
            print("H1 Text (Folder Name):", h1_text)
            folder_name = sanitize_filename(h1_text)
            folder_path = os.path.join(os.getcwd(), folder_name)
            os.makedirs(folder_path, exist_ok=True)
        except (NoSuchElementException, TimeoutException):
            print("H1 tag not found or timeout occurred.")
            folder_path = os.getcwd()  # Default to current directory

        # Containers whose images are downloaded
        selectors = [
            "#c206647",  # First container with multiple tags with images
            "body > main > section:nth-child(8)",  # Second container with multiple divs containing images
            "body > main > section:nth-child(9)"   # Third container with multiple divs containing images
        ]

        # The counter is threaded through the calls so file names stay unique
        image_counter = 1
        for selector in selectors:
            image_counter = extract_images_from_selector(selector, folder_path, image_counter)

        # Anchor that links to the downloadable file
        file_selector = "#c205373 > div > section > div > div > ul:nth-child(4) > li:nth-child(2) > a"
        try:
            anchor_tag = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, file_selector))
            )
            file_url = anchor_tag.get_attribute('href')
            if file_url:
                download_file(file_url, folder_path, "description.pdf")
            else:
                # An anchor without href gives the downloader nothing to fetch
                print("File download link has no href attribute; nothing to download.")
        except (NoSuchElementException, TimeoutException):
            print("File download link not found or timeout occurred.")

    except Exception as e:
        print(f"An error occurred while processing the product page {product_url}: {e}")

# Entry point of this cell: collect the sofa product links from the Rolf Benz
# listing page and scrape each product page.
try:
    base_url = 'https://www.rolf-benz.com/en_OC/furniture/sofas'
    product_links = get_product_links(base_url)
    print(f"Total product links found: {len(product_links)}")
    
    # Visit each product page and scrape the required data
    for link in product_links:
        scrape_product_page(link)

# Only Selenium lookup/timeout errors are reported here; other exceptions
# propagate once the finally clause has closed the browser.
except (NoSuchElementException, TimeoutException) as e:
    print(f"An error occurred: {e}")

# Always close the browser at the end of the run.
finally:
    driver.quit()


In [11]:
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Path to your WebDriver executable (absolute local path; change it to match
# the machine the script runs on)
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Set up Chrome options; the headless switch below is disabled
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Create a new instance of the Chrome driver, shared at module level by the
# functions defined below
driver = webdriver.Chrome(service=Service(webdriver_path), options=chrome_options)

def get_product_links(base_url):
    """Load the listing page at ``base_url`` and collect its anchor targets.

    Uses the module-level ``driver`` and waits up to 10 seconds for the list
    container. Every anchor inside it that has an ``href`` contributes one
    link, which is also printed. A failed wait is printed and the links
    gathered so far are returned.
    """
    driver.get(base_url)
    product_urls = []
    try:
        # Selector of the section that wraps all product entries
        container_selector = "#c25957 > div > section.container.prv-list.show-sofa-1"
        locator = (By.CSS_SELECTOR, container_selector)
        section = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(locator)
        )

        # Read the href of each anchor found below the section
        for anchor_element in section.find_elements(By.TAG_NAME, 'a'):
            target = anchor_element.get_attribute('href')
            if target:
                product_urls.append(target)
                print(f"Product link found: {target}")
    except (NoSuchElementException, TimeoutException) as e:
        print(f"An error occurred while fetching product links: {e}")
    return product_urls

def download_file(file_url, folder_path, file_name, timeout=30):
    """Stream ``file_url`` to ``folder_path/file_name``.

    Args:
        file_url: URL of the file.
        folder_path: Destination directory.
        file_name: Destination file name.
        timeout: Seconds to wait for the server; a request without a timeout
            can block the whole run on an unresponsive host.

    Every exception is caught and printed; nothing is raised to the caller.
    """
    try:
        # Closing the response through ``with`` releases the streamed
        # connection on success as well as on HTTP or write errors.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check for HTTP errors
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        print(f"Saved file: {file_name}")
    except Exception as e:
        print(f"Failed to download {file_url}. Reason: {e}")

def extract_and_save_product_info(url):
    """Create a folder for one sofa product and download its PDF.

    The page is opened with the module-level ``driver``. The folder name is
    the part of ``url`` that follows "sofas/", prefixed with "rolf-benz "
    when that text does not already contain "rolf-benz". A list of candidate
    XPaths is tried in order (up to 10 seconds each); the first anchor with
    an ``href`` is downloaded as ``description.pdf``.

    Args:
        url: Product page URL.

    Exceptions raised after the page load are printed, not propagated.
    """
    driver.get(url)
    try:
        if "sofas/" not in url:
            print("URL does not contain 'sofas/'.")
            return

        # Extract text after "sofas/"
        text_after_sofas = url.split("sofas/")[1]
        # Add "rolf-benz" if not present
        if "rolf-benz" not in text_after_sofas:
            text_after_sofas = f"rolf-benz {text_after_sofas}"
        print(f"Extracted text: {text_after_sofas}")

        # Create a folder with the extracted text
        folder_name = text_after_sofas.replace('/', '_').replace('\\', '_')
        folder_path = os.path.join(os.getcwd(), folder_name)
        os.makedirs(folder_path, exist_ok=True)

        # XPaths to try for the download link, in priority order. Each entry
        # appears once: a repeated XPath could only add another 10-second
        # wait on a page where it has already failed.
        file_xpaths = [
            "/html/body/main/div[9]/section/div/div/ul/li[1]/a",
            "/html/body/main/div[5]/div/section/div/div/ul[1]/li[2]/a",
            "/html/body/main/div[2]/div/section/div/div/ul[1]/li[1]/a",
            "/html/body/main/div[2]/div/section/div/div/ul[1]/li[2]/a",
            "/html/body/main/div[8]/section/div/div/ul/li[1]/a",
            "/html/body/main/div[8]/section/div/div/ul/li/a",
            "/html/body/main/div[6]/div/section/div/div/ul[1]/li[2]/a",
            "/html/body/main/div[2]/div/section/div/div/ul[1]/li/a",
        ]

        file_url = None
        for xpath in file_xpaths:
            try:
                file_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, xpath))
                )
            except (NoSuchElementException, TimeoutException):
                continue
            # An anchor without href does not end the search
            file_url = file_element.get_attribute('href')
            if file_url:
                break

        if file_url:
            download_file(file_url, folder_path, 'description.pdf')
        else:
            print("File not found using any provided XPaths.")
    except Exception as e:
        print(f"An error occurred while processing the URL {url}: {e}")

# Entry point of this cell: collect the sofa product links from the Rolf Benz
# listing page and process each product URL.
try:
    base_url = 'https://www.rolf-benz.com/en_OC/furniture/sofas'
    product_links = get_product_links(base_url)
    print(f"Total product links found: {len(product_links)}")
    
    # Visit each product page and process the information
    for link in product_links:
        extract_and_save_product_info(link)

# Only Selenium lookup/timeout errors are reported here; other exceptions
# propagate once the finally clause has closed the browser.
except (NoSuchElementException, TimeoutException) as e:
    print(f"An error occurred: {e}")

# Always close the browser at the end of the run.
finally:
    driver.quit()


Product link found: https://www.rolf-benz.com/en_OC/furniture/sofas/sina
Product link found: https://www.rolf-benz.com/en_OC/furniture/sofas/moyo
Product link found: https://www.rolf-benz.com/en_OC/furniture/sofas/mioko
Product link found: https://www.rolf-benz.com/en_OC/furniture/sofas/rolf-benz-kumo
Product link found: https://www.rolf-benz.com/en_OC/furniture/sofas/cara
Product link found: https://www.rolf-benz.com/en_OC/furniture/sofas/ego-1
Product link found: https://www.rolf-benz.com/en_OC/furniture/sofas/jola
Product link found: https://www.rolf-benz.com/en_OC/furniture/sofas/linea
Product link found: https://www.rolf-benz.com/en_OC/furniture/sofas/liv
Product link found: https://www.rolf-benz.com/en_OC/furniture/sofas/mera
Product link found: https://www.rolf-benz.com/en_OC/furniture/sofas/nuvola
Product link found: https://www.rolf-benz.com/en_OC/furniture/sofas/onda
Product link found: https://www.rolf-benz.com/en_OC/furniture/sofas/plura
Product link found: https://www.rolf