In [None]:
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Location of the ChromeDriver binary on this machine
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Chrome launch options; the headless flag below is currently disabled
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Start Chrome through the driver binary configured above
chrome_service = Service(webdriver_path)
driver = webdriver.Chrome(options=chrome_options, service=chrome_service)

def attempt_element_locate(selector, retries=5, wait_time=30):
    """Wait for an element matching a CSS selector, retrying on failure.

    Every attempt waits on the module-level ``driver`` for up to
    ``wait_time`` seconds until an element matching ``selector`` is present.

    Args:
        selector: CSS selector of the element to wait for.
        retries: Maximum number of wait attempts.
        wait_time: Timeout in seconds for each individual attempt.

    Returns:
        The element returned by the first successful wait.

    Raises:
        Exception: If every attempt fails.
    """
    locator = (By.CSS_SELECTOR, selector)
    for attempt_number in range(1, retries + 1):
        try:
            return WebDriverWait(driver, wait_time).until(
                EC.presence_of_element_located(locator)
            )
        except (NoSuchElementException, TimeoutException) as e:
            print(f"Attempt {attempt_number} failed: {e}")
    raise Exception(f"Failed to locate element with CSS Selector: {selector} after {retries} attempts")

def extract_text_from_selector(css_selector, retries=5, wait_time=30):
    """Return the text of the element matching ``css_selector``.

    Args:
        css_selector: CSS selector of the element to read.
        retries: Passed through to ``attempt_element_locate``.
        wait_time: Passed through to ``attempt_element_locate``.

    Returns:
        The element's ``text``, or the string "Text not found" when locating
        or reading the element raises.
    """
    try:
        return attempt_element_locate(css_selector, retries, wait_time).text
    except Exception as e:
        print(f"Error extracting text: {e}")
    return "Text not found"

def extract_image_url_from_selector(css_selector, retries=5, wait_time=30):
    """Return the ``src`` attribute of the element matching ``css_selector``.

    Args:
        css_selector: CSS selector of the image element.
        retries: Passed through to ``attempt_element_locate``.
        wait_time: Passed through to ``attempt_element_locate``.

    Returns:
        The value of the ``src`` attribute, or the string
        "Image URL not found" when locating or reading the element raises.
    """
    try:
        located_img = attempt_element_locate(css_selector, retries, wait_time)
        src = located_img.get_attribute('src')
    except Exception as e:
        print(f"Error extracting image URL: {e}")
        src = "Image URL not found"
    return src

def extract_all_images_from_div(parent_css_selector, retries=5, wait_time=30):
    """Collect the ``src`` of every ``<img>`` found under a parent element.

    Args:
        parent_css_selector: CSS selector of the container element.
        retries: Passed through to ``attempt_element_locate``.
        wait_time: Passed through to ``attempt_element_locate``.

    Returns:
        A list with the non-empty ``src`` values, or an empty list when the
        container cannot be located or reading the images raises.
    """
    try:
        parent_div = attempt_element_locate(parent_css_selector, retries, wait_time)
        # Read each attribute once: every get_attribute call is a separate
        # request to the browser, so filtering and collecting from the same
        # value halves the round trips.
        sources = (
            img.get_attribute('src')
            for img in parent_div.find_elements(By.TAG_NAME, 'img')
        )
        return [src for src in sources if src]
    except Exception as e:
        print(f"Error extracting images from div: {e}")
        return []

def download_image(image_url, output_path, timeout=30):
    """Stream the image at ``image_url`` into the file ``output_path``.

    Failures are printed rather than raised, so one bad URL does not stop
    the caller's download loop.

    Args:
        image_url: Address of the image to fetch.
        output_path: File that receives the response bytes. The bytes are
            written unchanged; no image format conversion takes place.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the script indefinitely.
    """
    try:
        print(f"Attempting to download image from: {image_url}")
        # The context manager releases the streamed connection even when
        # the status check or the file write fails part-way through.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check if the request was successful
            with open(output_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"Image downloaded successfully: {output_path}")
    except Exception as e:
        print(f"Error downloading image: {e}")

def download_file(file_url, output_path, timeout=30):
    """Stream the file at ``file_url`` into ``output_path``.

    Failures are printed rather than raised.

    Args:
        file_url: Address of the file to fetch.
        output_path: File that receives the response bytes.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the script indefinitely.
    """
    try:
        print(f"Attempting to download file from: {file_url}")
        # The context manager releases the streamed connection even when
        # the status check or the file write fails part-way through.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check if the request was successful
            with open(output_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"File downloaded successfully: {output_path}")
    except Exception as e:
        print(f"Error downloading file: {e}")

# URL of the page you want to scrape
url = 'https://www.molteni.it/ap/product/d1542'

# Define the CSS selectors
h1_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > div.block-info-product__top.padding-line-element > a > h1'
img_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-img > div.animation-mask.p81 > div > a > img'
h3_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > h3'
div_selector_1 = '#block-0 > div > div'
div_selector_2 = '#block-2 > div > div'
img_selector_2 = '#block-4 > div > div > div.col-8.block-text-img-img.is-768 > div > img'
div_selector_3 = '#block-5 > div > div'
file_selector = '#specs > div > div.row.product-specs-row > div:nth-child(3) > a'

# Everything that touches the browser runs inside try/finally so Chrome is
# closed even when one of the steps below raises.
try:
    # Open the product page
    driver.get(url)

    # Extract the <h1> text
    h1_text = extract_text_from_selector(h1_selector)
    print(f"Extracted <h1> text: {h1_text}")

    # Create directory based on <h1> text. Every character Windows forbids in
    # file names is replaced, not only the path separators, so that
    # os.makedirs does not fail on headings containing e.g. ':' or '?'.
    h1_folder_name = h1_text.translate(str.maketrans({ch: '-' for ch in '/\\:*?"<>|'}))
    product_folder_path = os.path.join(h1_folder_name)
    os.makedirs(product_folder_path, exist_ok=True)

    # Extract the image URL from the primary image selector
    img_url = extract_image_url_from_selector(img_selector)
    print(f"Extracted image URL: {img_url}")

    # Extract the <h3> text
    h3_text = extract_text_from_selector(h3_selector)
    print(f"Extracted <h3> text: {h3_text}")

    # Extract all image URLs from the specified divs
    image_urls_div_1 = extract_all_images_from_div(div_selector_1)
    print(f"Extracted image URLs from div #block-0: {image_urls_div_1}")

    image_urls_div_2 = extract_all_images_from_div(div_selector_2)
    print(f"Extracted image URLs from div #block-2: {image_urls_div_2}")

    # Extract the image URL from #block-4
    img_url_2 = extract_image_url_from_selector(img_selector_2)
    print(f"Extracted image URL from #block-4: {img_url_2}")

    # Extract all image URLs from #block-5
    image_urls_div_3 = extract_all_images_from_div(div_selector_3)
    print(f"Extracted image URLs from div #block-5: {image_urls_div_3}")

    # List all image URLs to download
    all_image_urls = [img_url, img_url_2, *image_urls_div_1, *image_urls_div_2, *image_urls_div_3]
    print(f"All image URLs to download: {all_image_urls}")

    # Download images directly in the product folder. The startswith check
    # also skips the "Image URL not found" placeholder returned on a miss.
    for idx, image_url in enumerate(all_image_urls, start=1):
        if image_url and image_url.startswith('http'):
            img_file_path = os.path.join(product_folder_path, f'image_{idx}.png')
            download_image(image_url, img_file_path)

    # Extract file download link and download the file.
    # attempt_element_locate raises when nothing matches (it never returns a
    # falsy value), so the "not found" case has to be handled as an exception.
    try:
        file_element = attempt_element_locate(file_selector)
    except Exception:
        print("File element not found.")
    else:
        file_link = file_element.get_attribute('href')
        if file_link and file_link.startswith('http'):
            print(f"File link extracted: {file_link}")
            download_file(file_link, os.path.join(product_folder_path, 'description.pdf'))
        else:
            print("No valid file link found.")
finally:
    # Close the browser
    driver.quit()


In [None]:
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Location of the ChromeDriver binary on this machine
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Chrome launch options; the headless flag below is currently disabled
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Start Chrome through the driver binary configured above
chrome_service = Service(webdriver_path)
driver = webdriver.Chrome(options=chrome_options, service=chrome_service)

# URL to scrape
url = 'https://www.molteni.it/ap/highlights'

try:
    # Navigation happens inside the outer try so the browser is still closed
    # by the finally clause when loading the page itself fails.
    driver.get(url)

    try:
        # Wait until the section containing the articles is loaded
        wait = WebDriverWait(driver, 10)
        section = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section.product-category > div > div > div > section.content-block.catalog-list')))

        # Print the href of every <a> tag inside every <article> of the section
        for article in section.find_elements(By.TAG_NAME, 'article'):
            for link in article.find_elements(By.TAG_NAME, 'a'):
                print(link.get_attribute('href'))

    except TimeoutException:
        print("Loading the section took too long.")
    except NoSuchElementException:
        print("Could not find the required elements on the page.")
finally:
    # Close the browser
    driver.quit()


In [None]:
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Location of the ChromeDriver binary on this machine
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Chrome launch options; the headless flag below is currently disabled
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Start Chrome through the driver binary configured above
chrome_service = Service(webdriver_path)
driver = webdriver.Chrome(options=chrome_options, service=chrome_service)

# Listing page from which the product links are collected
main_url = 'https://www.molteni.it/ap/highlights'

def attempt_element_locate(driver, selector, retries=5, wait_time=30):
    """Wait for an element matching a CSS selector, retrying on failure.

    Args:
        driver: WebDriver instance used for the wait.
        selector: CSS selector of the element to wait for.
        retries: Maximum number of wait attempts.
        wait_time: Timeout in seconds for each individual attempt.

    Returns:
        The element returned by the first successful wait.

    Raises:
        Exception: If every attempt fails.
    """
    locator = (By.CSS_SELECTOR, selector)
    for attempt_number in range(1, retries + 1):
        try:
            return WebDriverWait(driver, wait_time).until(
                EC.presence_of_element_located(locator)
            )
        except (NoSuchElementException, TimeoutException) as e:
            print(f"Attempt {attempt_number} failed: {e}")
    raise Exception(f"Failed to locate element with CSS Selector: {selector} after {retries} attempts")

def get_product_links():
    """Collect the product page URLs listed on the highlights page.

    Loads ``main_url`` in the module-level ``driver``, waits up to 10 seconds
    for the catalog section and gathers the ``href`` of every ``<a>`` found
    inside its ``<article>`` elements.

    Returns:
        The distinct non-empty URLs in first-seen order, or an empty list
        when the wait times out or an element lookup fails.
    """
    driver.get(main_url)
    try:
        # Wait until the section containing the articles is loaded
        wait = WebDriverWait(driver, 10)
        section = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section.product-category > div > div > div > section.content-block.catalog-list')))

        hrefs = (
            link.get_attribute('href')
            for article in section.find_elements(By.TAG_NAME, 'article')
            for link in article.find_elements(By.TAG_NAME, 'a')
        )
        # dict.fromkeys drops repeated URLs while keeping their order, so an
        # article with several anchors to the same page is scraped only once.
        return list(dict.fromkeys(href for href in hrefs if href))

    except TimeoutException:
        print("Loading the section took too long.")
        return []
    except NoSuchElementException:
        print("Could not find the required elements on the page.")
        return []

def scrape_product_page(product_url):
    """Scrape one product page and save its images and spec file to disk.

    Creates a folder named after the page's <h1> text (relative to the
    current working directory), downloads the collected images into it as
    ``image_<n>.png`` and, when the spec link is present, saves the linked
    file as ``description.pdf``.

    Args:
        product_url: Address of the product page, opened in the module-level
            ``driver``.
    """
    driver.get(product_url)

    # Define the CSS selectors
    h1_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > div.block-info-product__top.padding-line-element > a > h1'
    img_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-img > div.animation-mask.p81 > div > a > img'
    h3_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > h3'
    div_selector_1 = '#block-0 > div > div'
    div_selector_2 = '#block-2 > div > div'
    img_selector_2 = '#block-4 > div > div > div.col-8.block-text-img-img.is-768 > div > img'
    div_selector_3 = '#block-5 > div > div'
    file_selector = '#specs > div > div.row.product-specs-row > div:nth-child(3) > a'

    # Extract the <h1> text
    h1_text = extract_text_from_selector(driver, h1_selector)
    print(f"Extracted <h1> text: {h1_text}")

    # Create directory based on <h1> text. Every character Windows forbids in
    # file names is replaced, not only the path separators, so that
    # os.makedirs does not fail on headings containing e.g. ':' or '?'.
    h1_folder_name = h1_text.translate(str.maketrans({ch: '-' for ch in '/\\:*?"<>|'}))
    product_folder_path = os.path.join(h1_folder_name)
    os.makedirs(product_folder_path, exist_ok=True)

    # Extract the image URL from the primary image selector
    img_url = extract_image_url_from_selector(driver, img_selector)
    print(f"Extracted image URL: {img_url}")

    # Extract the <h3> text
    h3_text = extract_text_from_selector(driver, h3_selector)
    print(f"Extracted <h3> text: {h3_text}")

    # Extract all image URLs from the specified divs
    image_urls_div_1 = extract_all_images_from_div(driver, div_selector_1)
    image_urls_div_2 = extract_all_images_from_div(driver, div_selector_2)
    img_url_2 = extract_image_url_from_selector(driver, img_selector_2)
    image_urls_div_3 = extract_all_images_from_div(driver, div_selector_3)

    # List all image URLs to download
    all_image_urls = [img_url, img_url_2, *image_urls_div_1, *image_urls_div_2, *image_urls_div_3]
    print(f"All image URLs to download: {all_image_urls}")

    # Download images directly in the product folder. The startswith check
    # also skips the "Image URL not found" placeholder returned on a miss.
    for idx, image_url in enumerate(all_image_urls, start=1):
        if image_url and image_url.startswith('http'):
            img_file_path = os.path.join(product_folder_path, f'image_{idx}.png')
            download_image(image_url, img_file_path)

    # Extract file download link and download the file.
    # attempt_element_locate raises when nothing matches (it never returns a
    # falsy value), so the "not found" case has to be handled as an exception.
    try:
        file_element = attempt_element_locate(driver, file_selector)
    except Exception:
        print("File element not found.")
        return

    file_link = file_element.get_attribute('href')
    if file_link and file_link.startswith('http'):
        download_file(file_link, os.path.join(product_folder_path, 'description.pdf'))
    else:
        print("No valid file link found.")

# Helper functions for element extraction, image download, and file download
def extract_text_from_selector(driver, css_selector, retries=5, wait_time=30):
    """Return the text of the element matching ``css_selector``.

    Args:
        driver: WebDriver instance handed to ``attempt_element_locate``.
        css_selector: CSS selector of the element to read.
        retries: Passed through to ``attempt_element_locate``.
        wait_time: Passed through to ``attempt_element_locate``.

    Returns:
        The element's ``text``, or the string "Text not found" when locating
        or reading the element raises.
    """
    try:
        return attempt_element_locate(driver, css_selector, retries, wait_time).text
    except Exception as e:
        print(f"Error extracting text: {e}")
    return "Text not found"

def extract_image_url_from_selector(driver, css_selector, retries=5, wait_time=30):
    """Return the ``src`` attribute of the element matching ``css_selector``.

    Args:
        driver: WebDriver instance handed to ``attempt_element_locate``.
        css_selector: CSS selector of the image element.
        retries: Passed through to ``attempt_element_locate``.
        wait_time: Passed through to ``attempt_element_locate``.

    Returns:
        The value of the ``src`` attribute, or the string
        "Image URL not found" when locating or reading the element raises.
    """
    try:
        located_img = attempt_element_locate(driver, css_selector, retries, wait_time)
        src = located_img.get_attribute('src')
    except Exception as e:
        print(f"Error extracting image URL: {e}")
        src = "Image URL not found"
    return src

def extract_all_images_from_div(driver, parent_css_selector, retries=5, wait_time=30):
    """Collect the ``src`` of every ``<img>`` found under a parent element.

    Args:
        driver: WebDriver instance handed to ``attempt_element_locate``.
        parent_css_selector: CSS selector of the container element.
        retries: Passed through to ``attempt_element_locate``.
        wait_time: Passed through to ``attempt_element_locate``.

    Returns:
        A list with the non-empty ``src`` values, or an empty list when the
        container cannot be located or reading the images raises.
    """
    try:
        parent_div = attempt_element_locate(driver, parent_css_selector, retries, wait_time)
        # Read each attribute once: every get_attribute call is a separate
        # request to the browser, so filtering and collecting from the same
        # value halves the round trips.
        sources = (
            img.get_attribute('src')
            for img in parent_div.find_elements(By.TAG_NAME, 'img')
        )
        return [src for src in sources if src]
    except Exception as e:
        print(f"Error extracting images from div: {e}")
        return []

def download_image(image_url, output_path, timeout=30):
    """Stream the image at ``image_url`` into the file ``output_path``.

    Failures are printed rather than raised, so one bad URL does not stop
    the caller's download loop.

    Args:
        image_url: Address of the image to fetch.
        output_path: File that receives the response bytes. The bytes are
            written unchanged; no image format conversion takes place.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the script indefinitely.
    """
    try:
        print(f"Attempting to download image from: {image_url}")
        # The context manager releases the streamed connection even when
        # the status check or the file write fails part-way through.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check if the request was successful
            with open(output_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"Image downloaded successfully: {output_path}")
    except Exception as e:
        print(f"Error downloading image: {e}")

def download_file(file_url, output_path, timeout=30):
    """Stream the file at ``file_url`` into ``output_path``.

    Failures are printed rather than raised.

    Args:
        file_url: Address of the file to fetch.
        output_path: File that receives the response bytes.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the script indefinitely.
    """
    try:
        print(f"Attempting to download file from: {file_url}")
        # The context manager releases the streamed connection even when
        # the status check or the file write fails part-way through.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check if the request was successful
            with open(output_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"File downloaded successfully: {output_path}")
    except Exception as e:
        print(f"Error downloading file: {e}")

# Main script execution
try:
    product_links = get_product_links()

    if product_links:
        for product_link in product_links:
            # A failure on one product must not abort the remaining ones
            try:
                scrape_product_page(product_link)
            except Exception as e:
                print(f"Error scraping {product_link}: {e}")
    else:
        print("No product links found.")
finally:
    # Close the browser even when link collection or scraping raised
    driver.quit()


In [None]:
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Location of the ChromeDriver binary on this machine
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Chrome launch options; the headless flag below is currently disabled
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Start Chrome through the driver binary configured above
chrome_service = Service(webdriver_path)
driver = webdriver.Chrome(options=chrome_options, service=chrome_service)

# Listing page from which the product links are collected
main_url = 'https://www.molteni.it/ap/highlights'

def attempt_element_locate(driver, selector, retries=5, wait_time=30):
    """Wait for an element matching a CSS selector, retrying on failure.

    Args:
        driver: WebDriver instance used for the wait.
        selector: CSS selector of the element to wait for.
        retries: Maximum number of wait attempts.
        wait_time: Timeout in seconds for each individual attempt.

    Returns:
        The element returned by the first successful wait.

    Raises:
        Exception: If every attempt fails.
    """
    css_locator = (By.CSS_SELECTOR, selector)
    for attempt_number in range(1, retries + 1):
        waiter = WebDriverWait(driver, wait_time)
        try:
            return waiter.until(EC.presence_of_element_located(css_locator))
        except (NoSuchElementException, TimeoutException) as e:
            print(f"Attempt {attempt_number} failed: {e}")
    raise Exception(f"Failed to locate element with CSS Selector: {selector} after {retries} attempts")

def get_product_links():
    """Collect the product page URLs listed on the highlights page.

    Loads ``main_url`` in the module-level ``driver``, waits up to 10 seconds
    for the catalog section and gathers the ``href`` of every ``<a>`` found
    inside its ``<article>`` elements.

    Returns:
        The distinct non-empty URLs in first-seen order, or an empty list
        when the wait times out or an element lookup fails.
    """
    driver.get(main_url)
    try:
        # Wait until the section containing the articles is loaded
        wait = WebDriverWait(driver, 10)
        section = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section.product-category > div > div > div > section.content-block.catalog-list')))

        hrefs = (
            link.get_attribute('href')
            for article in section.find_elements(By.TAG_NAME, 'article')
            for link in article.find_elements(By.TAG_NAME, 'a')
        )
        # dict.fromkeys drops repeated URLs while keeping their order, so an
        # article with several anchors to the same page is scraped only once.
        return list(dict.fromkeys(href for href in hrefs if href))

    except TimeoutException:
        print("Loading the section took too long.")
        return []
    except NoSuchElementException:
        print("Could not find the required elements on the page.")
        return []

def scrape_product_page(product_url):
    """Scrape one product page and save its images and spec file to disk.

    Creates a folder named after the page's <h1> text (relative to the
    current working directory), downloads up to the first 18 image URLs found
    on the page into it as ``image_<n>.png`` and, when the spec link is
    present, saves the linked file as ``description.pdf``.

    Args:
        product_url: Address of the product page, opened in the module-level
            ``driver``.
    """
    driver.get(product_url)

    # Define the CSS selectors
    h1_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > div.block-info-product__top.padding-line-element > a > h1'
    h3_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > h3'
    file_selector = '#specs > div > div.row.product-specs-row > div:nth-child(3) > a'

    # Extract the <h1> text
    h1_text = extract_text_from_selector(driver, h1_selector)
    print(f"Extracted <h1> text: {h1_text}")

    # Sanitize folder name: replace each character that is invalid in a
    # Windows file name with '-' in a single pass
    h1_folder_name = h1_text.translate(str.maketrans({ch: '-' for ch in '/\\:*?"<>|'}))
    product_folder_path = os.path.join(h1_folder_name)
    print(f"Creating directory at path: {product_folder_path}")

    try:
        # exist_ok avoids the race between checking for and creating the folder
        os.makedirs(product_folder_path, exist_ok=True)
        print(f"Directory created successfully: {product_folder_path}")
    except Exception as e:
        print(f"Error creating directory: {e}")
        return

    # Extract the <h3> text
    h3_text = extract_text_from_selector(driver, h3_selector)
    print(f"Extracted <h3> text: {h3_text}")

    # Extract all image URLs from the entire page (limited to the first 18)
    all_image_urls = extract_all_images_from_page(driver)
    print(f"All image URLs to download: {all_image_urls}")

    # Download images directly in the product folder
    for idx, image_url in enumerate(all_image_urls[:18], start=1):
        if image_url and image_url.startswith('http'):
            img_file_path = os.path.join(product_folder_path, f'image_{idx}.png')
            download_image(image_url, img_file_path)

    # Extract file download link and download the file.
    # attempt_element_locate raises when nothing matches (it never returns a
    # falsy value), so the "not found" case has to be handled as an exception.
    try:
        file_element = attempt_element_locate(driver, file_selector)
    except Exception:
        print("File element not found.")
        return

    file_link = file_element.get_attribute('href')
    if file_link and file_link.startswith('http'):
        download_file(file_link, os.path.join(product_folder_path, 'description.pdf'))
    else:
        print("No valid file link found.")

# Helper functions for element extraction, image download, and file download
def extract_text_from_selector(driver, css_selector, retries=5, wait_time=30):
    """Return the text of the element matching ``css_selector``.

    Args:
        driver: WebDriver instance handed to ``attempt_element_locate``.
        css_selector: CSS selector of the element to read.
        retries: Passed through to ``attempt_element_locate``.
        wait_time: Passed through to ``attempt_element_locate``.

    Returns:
        The element's ``text``, or the string "Text not found" when locating
        or reading the element raises.
    """
    try:
        located = attempt_element_locate(driver, css_selector, retries, wait_time)
        text = located.text
    except Exception as e:
        print(f"Error extracting text: {e}")
        text = "Text not found"
    return text

def extract_all_images_from_page(driver, retries=5, wait_time=30):
    """Collect the ``src`` of every ``<img>`` element on the current page.

    Args:
        driver: WebDriver instance whose current page is searched.
        retries: Not used by this function; kept so existing calls that pass
            it keep working.
        wait_time: Not used by this function; kept for the same reason.

    Returns:
        A list with the non-empty ``src`` values, or an empty list when
        reading the images raises.
    """
    try:
        # Read each attribute once: every get_attribute call is a separate
        # request to the browser, so filtering and collecting from the same
        # value halves the round trips.
        sources = (
            img.get_attribute('src')
            for img in driver.find_elements(By.TAG_NAME, 'img')
        )
        return [src for src in sources if src]
    except Exception as e:
        print(f"Error extracting images from page: {e}")
        return []

def download_image(image_url, output_path, timeout=30):
    """Stream the image at ``image_url`` into the file ``output_path``.

    Failures are printed rather than raised, so one bad URL does not stop
    the caller's download loop.

    Args:
        image_url: Address of the image to fetch.
        output_path: File that receives the response bytes. The bytes are
            written unchanged; no image format conversion takes place.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the script indefinitely.
    """
    try:
        print(f"Attempting to download image from: {image_url}")
        # The context manager releases the streamed connection even when
        # the status check or the file write fails part-way through.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check if the request was successful
            with open(output_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"Image downloaded successfully: {output_path}")
    except Exception as e:
        print(f"Error downloading image: {e}")

def download_file(file_url, output_path, timeout=30):
    """Stream the file at ``file_url`` into ``output_path``.

    Failures are printed rather than raised.

    Args:
        file_url: Address of the file to fetch.
        output_path: File that receives the response bytes.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the script indefinitely.
    """
    try:
        print(f"Attempting to download file from: {file_url}")
        # The context manager releases the streamed connection even when
        # the status check or the file write fails part-way through.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check if the request was successful
            with open(output_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"File downloaded successfully: {output_path}")
    except Exception as e:
        print(f"Error downloading file: {e}")

# Main script execution
try:
    product_links = get_product_links()

    if product_links:
        for product_link in product_links:
            # A failure on one product must not abort the remaining ones
            try:
                scrape_product_page(product_link)
            except Exception as e:
                print(f"Error scraping {product_link}: {e}")
    else:
        print("No product links found.")
finally:
    # Close the browser even when link collection or scraping raised
    driver.quit()


In [None]:
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Location of the ChromeDriver binary on this machine
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Chrome launch options; the headless flag below is currently disabled
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Start Chrome through the driver binary configured above
chrome_service = Service(webdriver_path)
driver = webdriver.Chrome(options=chrome_options, service=chrome_service)

# Listing page from which the product links are collected
main_url = 'https://www.molteni.it/ap/highlights'

def attempt_element_locate(driver, selector, retries=5, wait_time=30):
    """Wait for an element matching a CSS selector, retrying on failure.

    Args:
        driver: WebDriver instance used for the wait.
        selector: CSS selector of the element to wait for.
        retries: Maximum number of wait attempts.
        wait_time: Timeout in seconds for each individual attempt.

    Returns:
        The element returned by the first successful wait.

    Raises:
        Exception: If every attempt fails.
    """
    condition = EC.presence_of_element_located((By.CSS_SELECTOR, selector))
    for attempt_number in range(1, retries + 1):
        try:
            return WebDriverWait(driver, wait_time).until(condition)
        except (NoSuchElementException, TimeoutException) as e:
            print(f"Attempt {attempt_number} failed: {e}")
    raise Exception(f"Failed to locate element with CSS Selector: {selector} after {retries} attempts")

def get_product_links():
    """Collect the product page URLs listed on the highlights page.

    Loads ``main_url`` in the module-level ``driver``, waits up to 10 seconds
    for the catalog section and gathers the ``href`` of every ``<a>`` found
    inside its ``<article>`` elements.

    Returns:
        The distinct non-empty URLs in first-seen order, or an empty list
        when the wait times out or an element lookup fails.
    """
    driver.get(main_url)
    try:
        # Wait until the section containing the articles is loaded
        wait = WebDriverWait(driver, 10)
        section = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section.product-category > div > div > div > section.content-block.catalog-list')))

        hrefs = (
            link.get_attribute('href')
            for article in section.find_elements(By.TAG_NAME, 'article')
            for link in article.find_elements(By.TAG_NAME, 'a')
        )
        # dict.fromkeys drops repeated URLs while keeping their order, so an
        # article with several anchors to the same page is scraped only once.
        return list(dict.fromkeys(href for href in hrefs if href))

    except TimeoutException:
        print("Loading the section took too long.")
        return []
    except NoSuchElementException:
        print("Could not find the required elements on the page.")
        return []

def scrape_product_page(product_url):
    """Scrape one product page and save its images and spec file to disk.

    Creates a folder named after the page's <h1> text (relative to the
    current working directory), downloads up to the first 18 image URLs
    returned by ``extract_images_between_sections`` into it as
    ``image_<n>.png`` and, when the spec link is present, saves the linked
    file as ``description.pdf``.

    Args:
        product_url: Address of the product page, opened in the module-level
            ``driver``.
    """
    driver.get(product_url)

    # Define the CSS selectors
    h1_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > div.block-info-product__top.padding-line-element > a > h1'
    h3_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > h3'
    file_selector = '#specs > div > div.row.product-specs-row > div:nth-child(3) > a'

    # Extract the <h1> text
    h1_text = extract_text_from_selector(driver, h1_selector)
    print(f"Extracted <h1> text: {h1_text}")

    # Sanitize folder name: replace each character that is invalid in a
    # Windows file name with '-' in a single pass
    h1_folder_name = h1_text.translate(str.maketrans({ch: '-' for ch in '/\\:*?"<>|'}))
    product_folder_path = os.path.join(h1_folder_name)
    print(f"Creating directory at path: {product_folder_path}")

    try:
        # exist_ok avoids the race between checking for and creating the folder
        os.makedirs(product_folder_path, exist_ok=True)
        print(f"Directory created successfully: {product_folder_path}")
    except Exception as e:
        print(f"Error creating directory: {e}")
        return

    # Extract the <h3> text
    h3_text = extract_text_from_selector(driver, h3_selector)
    print(f"Extracted <h3> text: {h3_text}")

    # Extract all image URLs from the page (limited to the first 18)
    all_image_urls = extract_images_between_sections(driver, 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div', '#block-14 > div')
    print(f"All image URLs to download: {all_image_urls}")

    # Download images directly in the product folder
    for idx, image_url in enumerate(all_image_urls[:18], start=1):
        if image_url and image_url.startswith('http'):
            img_file_path = os.path.join(product_folder_path, f'image_{idx}.png')
            download_image(image_url, img_file_path)

    # Extract file download link and download the file.
    # attempt_element_locate raises when nothing matches (it never returns a
    # falsy value), so the "not found" case has to be handled as an exception.
    try:
        file_element = attempt_element_locate(driver, file_selector)
    except Exception:
        print("File element not found.")
        return

    file_link = file_element.get_attribute('href')
    if file_link and file_link.startswith('http'):
        download_file(file_link, os.path.join(product_folder_path, 'description.pdf'))
    else:
        print("No valid file link found.")

# Helper functions for element extraction, image download, and file download
def extract_text_from_selector(driver, css_selector, retries=5, wait_time=30):
    """Return the text of the element matching ``css_selector``.

    Args:
        driver: WebDriver instance handed to ``attempt_element_locate``.
        css_selector: CSS selector of the element to read.
        retries: Passed through to ``attempt_element_locate``.
        wait_time: Passed through to ``attempt_element_locate``.

    Returns:
        The element's ``text``, or the string "Text not found" when locating
        or reading the element raises.
    """
    fallback = "Text not found"
    try:
        return attempt_element_locate(driver, css_selector, retries, wait_time).text
    except Exception as e:
        print(f"Error extracting text: {e}")
        return fallback

def extract_images_between_sections(driver, start_selector, end_selector, retries=5, wait_time=30):
    """Collect image URLs from a start marker element up to an end marker element.

    Walks every ``<img>`` on the page in the order ``find_elements`` returns
    them. Collection starts at the first image whose grandparent element
    equals the start element and stops after the first image whose
    grandparent equals the end element.

    Args:
        driver: WebDriver instance whose current page is searched.
        start_selector: CSS selector of the element that switches collection on.
        end_selector: CSS selector of the element that ends the walk.
        retries: Passed through to ``attempt_element_locate``.
        wait_time: Passed through to ``attempt_element_locate``.

    Returns:
        The non-empty ``src`` values collected, or an empty list when any
        step raises (including a start or end element that cannot be located).
    """
    try:
        # Retrieve the start and end elements
        start_element = attempt_element_locate(driver, start_selector, retries, wait_time)
        end_element = attempt_element_locate(driver, end_selector, retries, wait_time)

        # Find all img elements on the page
        img_elements = driver.find_elements(By.TAG_NAME, 'img')
        image_urls = []
        start_found = False
        end_found = False

        for img in img_elements:
            # Check if the image is located within the start section
            parent_element = img.find_element(By.XPATH, '..')  # Find the parent of the img
            # NOTE(review): parent_id is never read afterwards
            parent_id = parent_element.get_attribute('id')

            # '..' evaluated on the parent yields the img's grandparent, so this
            # only matches when the start element is exactly two levels above
            # the img. NOTE(review): images nested deeper inside the start
            # section do not trigger it -- confirm this is intended.
            if start_element in parent_element.find_elements(By.XPATH, '..'):
                start_found = True

            # end_found is only set right before the break below, so it is
            # always False when this condition is evaluated
            if start_found and not end_found:
                # Add image URL to the list
                image_url = img.get_attribute('src')
                if image_url:
                    image_urls.append(image_url)

            # Same grandparent test for the end element; the image that
            # triggers it has already been handled by the branch above
            if end_element in parent_element.find_elements(By.XPATH, '..'):
                end_found = True
                break

        return image_urls
    except Exception as e:
        print(f"Error extracting images from page: {e}")
        return []

def download_image(image_url, output_path, timeout=30):
    """Download ``image_url`` and write the response body to ``output_path``.

    Args:
        image_url: URL of the image to fetch.
        output_path: Path of the file to create (opened in binary write mode).
        timeout: Value passed to ``requests.get`` as ``timeout`` so a stalled
            server cannot block the scrape indefinitely.

    Failures are printed rather than raised (best-effort download).
    """
    try:
        print(f"Attempting to download image from: {image_url}")
        # The context manager releases the streamed connection on every path,
        # including an HTTP error status or a failed file write.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check if the request was successful
            with open(output_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"Image downloaded successfully: {output_path}")
    except Exception as e:
        print(f"Error downloading image: {e}")

def download_file(file_url, output_path, timeout=30):
    """Download ``file_url`` and write the response body to ``output_path``.

    Args:
        file_url: URL of the file to fetch.
        output_path: Path of the file to create (opened in binary write mode).
        timeout: Value passed to ``requests.get`` as ``timeout`` so a stalled
            server cannot block the scrape indefinitely.

    Failures are printed rather than raised (best-effort download).
    """
    try:
        print(f"Attempting to download file from: {file_url}")
        # The context manager releases the streamed connection on every path,
        # including an HTTP error status or a failed file write.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check if the request was successful
            with open(output_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"File downloaded successfully: {output_path}")
    except Exception as e:
        print(f"Error downloading file: {e}")

# Main script execution: gather the product links, scrape each product page,
# and always shut the browser down afterwards.
try:
    product_links = get_product_links()

    if product_links:
        for product_link in product_links:
            scrape_product_page(product_link)
    else:
        print("No product links found.")
finally:
    # Close the browser even when scraping raised, so no Chrome/chromedriver
    # process is left running.
    driver.quit()


In [None]:
# This code works for a single product only (from the second category).
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Location of the chromedriver binary on this machine
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Product page scraped by this cell
main_url = 'https://www.molteni.it/ap/product/intersection'

# Browser options; headless mode stays off unless the next line is enabled
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Start Chrome through the configured driver binary and load the page
driver = webdriver.Chrome(
    options=chrome_options,
    service=Service(webdriver_path),
)
driver.get(main_url)

def download_image(image_url, folder_path, image_name, timeout=30):
    """Download ``image_url`` and save it as ``image_name`` in ``folder_path``.

    Args:
        image_url: URL of the image to fetch.
        folder_path: Directory to write into (it is not created here).
        image_name: Name of the file to create inside ``folder_path``.
        timeout: Value passed to ``requests.get`` as ``timeout`` so a stalled
            server cannot block the scrape indefinitely.

    Failures are printed rather than raised (best-effort download).
    """
    try:
        # The context manager releases the streamed connection on every path,
        # including an HTTP error status or a failed file write.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check for HTTP errors

            # Define the path where the image will be saved
            image_path = os.path.join(folder_path, image_name)

            # Save the image
            with open(image_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        print(f"Saved image: {image_name}")

    except Exception as e:
        print(f"Failed to download {image_url}. Reason: {e}")

def download_file(file_url, folder_path, file_name, timeout=30):
    """Download ``file_url`` and save it as ``file_name`` in ``folder_path``.

    Args:
        file_url: URL of the file to fetch.
        folder_path: Directory to write into (it is not created here).
        file_name: Name of the file to create inside ``folder_path``.
        timeout: Value passed to ``requests.get`` as ``timeout`` so a stalled
            server cannot block the scrape indefinitely.

    Failures are printed rather than raised (best-effort download).
    """
    try:
        # The context manager releases the streamed connection on every path,
        # including an HTTP error status or a failed file write.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check for HTTP errors

            # Define the path where the file will be saved
            file_path = os.path.join(folder_path, file_name)

            # Save the file
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        print(f"Saved file: {file_name}")

    except Exception as e:
        print(f"Failed to download {file_url}. Reason: {e}")

def extract_images_from_selector(selector, folder_path, image_counter):
    """Download every image found inside the elements matching ``selector``.

    Args:
        selector: CSS selector of the container element(s) to search.
        folder_path: Directory the images are saved into.
        image_counter: Number used for the next file name
            (``image_<counter>.png``).

    Returns:
        The counter value to use for the next image, i.e. ``image_counter``
        advanced by one for each image handed to ``download_image``.
    """
    try:
        # Wait (up to 10 seconds) for the containers to be present
        container_divs = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
        )

        for container_div in container_divs:
            for img in container_div.find_elements(By.TAG_NAME, "img"):
                image_src = img.get_attribute('src')
                # An <img> without a src has nothing to download; skipping it
                # avoids a doomed request and a gap in the file numbering.
                if not image_src:
                    continue
                image_name = f"image_{image_counter}.png"
                download_image(image_src, folder_path, image_name)
                image_counter += 1

    except (NoSuchElementException, TimeoutException):
        print(f"No images found inside the container {selector} or timeout occurred.")

    return image_counter

# Scrape the single product page opened above: name a folder after the product
# title, save all images from the known page blocks, then save the spec PDF.
try:
    # Container that holds the product title (an <h1> nested inside an <a>)
    text_selector = "body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > div.block-info-product__top.padding-line-element"

    try:
        # Wait for the div to be present
        info_div = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, text_selector))
        )

        # Find the <a> tag inside the div, then the <h1> inside that <a>
        a_tag = info_div.find_element(By.TAG_NAME, "a")
        h1_tag = a_tag.find_element(By.TAG_NAME, "h1")
        h1_text = h1_tag.text.strip()
        print("H1 Text:", h1_text)

        # Replace every character Windows rejects in file names (not just the
        # path separators) so os.makedirs cannot fail on an unusual title.
        folder_name = h1_text.translate(str.maketrans({ch: '_' for ch in '<>:"/\\|?*'}))
        folder_path = os.path.join(os.getcwd(), folder_name)
        os.makedirs(folder_path, exist_ok=True)

    except (NoSuchElementException, TimeoutException):
        print("H1 tag not found inside the specified container or timeout occurred.")
        folder_path = os.getcwd()  # Use current directory if H1 text not found

    # Extract images from the first page section and from #block-0 .. #block-11
    selectors = [
        "body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1)",
    ] + [f"#block-{index}" for index in range(12)]

    image_counter = 1
    for selector in selectors:
        image_counter = extract_images_from_selector(selector, folder_path, image_counter)

    # Download the file from the specified anchor tag
    anchor_selector = "#specs > div > div.row.product-specs-row > div:nth-child(3) > a"

    try:
        # Wait for the anchor tag to be present
        anchor_tag = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, anchor_selector))
        )

        # Get the URL from the href attribute
        file_url = anchor_tag.get_attribute('href')
        file_name = "description.pdf"  # Set filename for the downloaded file

        # Only attempt the download when the anchor actually carries a URL
        if file_url:
            download_file(file_url, folder_path, file_name)
        else:
            print("Anchor tag has no href; nothing to download.")

    except (NoSuchElementException, TimeoutException):
        print("Anchor tag not found inside the specified container or timeout occurred.")

except (NoSuchElementException, TimeoutException) as e:
    print(f"An error occurred: {e}")

finally:
    # Close the browser window
    driver.quit()


In [None]:
# This code works for the Kitchen category and scrapes all of its products.
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Location of the chromedriver binary on this machine
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Browser options; headless mode stays off unless the next line is enabled
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Start Chrome through the configured driver binary
driver = webdriver.Chrome(
    options=chrome_options,
    service=Service(webdriver_path),
)

def download_image(image_url, folder_path, image_name, timeout=30):
    """Download ``image_url`` and save it as ``image_name`` in ``folder_path``.

    Args:
        image_url: URL of the image to fetch.
        folder_path: Directory to write into (it is not created here).
        image_name: Name of the file to create inside ``folder_path``.
        timeout: Value passed to ``requests.get`` as ``timeout`` so a stalled
            server cannot block the scrape indefinitely.

    Failures are printed rather than raised (best-effort download).
    """
    try:
        # The context manager releases the streamed connection on every path,
        # including an HTTP error status or a failed file write.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check for HTTP errors
            image_path = os.path.join(folder_path, image_name)
            with open(image_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        print(f"Saved image: {image_name}")
    except Exception as e:
        print(f"Failed to download {image_url}. Reason: {e}")

def download_file(file_url, folder_path, file_name, timeout=30):
    """Download ``file_url`` and save it as ``file_name`` in ``folder_path``.

    Args:
        file_url: URL of the file to fetch.
        folder_path: Directory to write into (it is not created here).
        file_name: Name of the file to create inside ``folder_path``.
        timeout: Value passed to ``requests.get`` as ``timeout`` so a stalled
            server cannot block the scrape indefinitely.

    Failures are printed rather than raised (best-effort download).
    """
    try:
        # The context manager releases the streamed connection on every path,
        # including an HTTP error status or a failed file write.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check for HTTP errors
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        print(f"Saved file: {file_name}")
    except Exception as e:
        print(f"Failed to download {file_url}. Reason: {e}")

def extract_images_from_selector(selector, folder_path, image_counter):
    """Download every image found inside the elements matching ``selector``.

    Args:
        selector: CSS selector of the container element(s) to search.
        folder_path: Directory the images are saved into.
        image_counter: Number used for the next file name
            (``image_<counter>.png``).

    Returns:
        The counter value to use for the next image, i.e. ``image_counter``
        advanced by one for each image handed to ``download_image``.
    """
    try:
        # Wait up to 10 seconds for at least one matching container
        container_divs = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
        )
        for container_div in container_divs:
            for img in container_div.find_elements(By.TAG_NAME, "img"):
                image_src = img.get_attribute('src')
                # An <img> without a src has nothing to download; skipping it
                # avoids a doomed request and a gap in the file numbering.
                if not image_src:
                    continue
                image_name = f"image_{image_counter}.png"
                download_image(image_src, folder_path, image_name)
                image_counter += 1
    except (NoSuchElementException, TimeoutException):
        print(f"No images found inside the container {selector} or timeout occurred.")
    return image_counter

def get_product_links(base_url):
    """Open ``base_url`` and return the ``href`` of every catalog product link.

    Waits up to 10 seconds for the product anchors to be present. Anchors
    without an ``href`` are ignored. A lookup failure or timeout is printed
    and whatever was gathered so far (possibly nothing) is returned.
    """
    driver.get(base_url)

    product_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section.product-category > div > div > div > section.content-block.catalog-list > article > a'
    links = []
    try:
        anchors = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, product_selector))
        )
        # Read each href lazily, in order, keeping only the non-empty ones.
        hrefs = (anchor.get_attribute('href') for anchor in anchors)
        for href in filter(None, hrefs):
            links.append(href)
    except (NoSuchElementException, TimeoutException) as error:
        print(f"An error occurred while fetching product links: {error}")
    return links

def scrape_product_page(product_url):
    """Scrape one product page: create its folder, save images and the spec PDF.

    The product title (``<h1>``) names the output folder under the current
    working directory; when the title cannot be located, files go into the
    current working directory itself. Images are taken from the first page
    section and from ``#block-0`` through ``#block-11``; the file linked from
    the specs row is saved as ``description.pdf``.

    Any exception raised after the page has been requested is printed and
    swallowed, so one bad page does not abort the caller's loop.
    """
    driver.get(product_url)
    try:
        text_selector = "body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > div.block-info-product__top.padding-line-element"
        try:
            info_div = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, text_selector))
            )
            a_tag = info_div.find_element(By.TAG_NAME, "a")
            h1_tag = a_tag.find_element(By.TAG_NAME, "h1")
            h1_text = h1_tag.text.strip()
            print("H1 Text:", h1_text)
            # Replace every character Windows rejects in file names (not just
            # the path separators); otherwise os.makedirs raises and the whole
            # page is skipped by the outer handler.
            invalid_chars = str.maketrans({ch: '_' for ch in '<>:"/\\|?*'})
            folder_path = os.path.join(os.getcwd(), h1_text.translate(invalid_chars))
            os.makedirs(folder_path, exist_ok=True)
        except (NoSuchElementException, TimeoutException):
            print("H1 tag not found inside the specified container or timeout occurred.")
            folder_path = os.getcwd()

        # First page section, then #block-0 .. #block-11
        selectors = [
            "body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1)",
        ] + [f"#block-{index}" for index in range(12)]
        image_counter = 1
        for selector in selectors:
            image_counter = extract_images_from_selector(selector, folder_path, image_counter)

        anchor_selector = "#specs > div > div.row.product-specs-row > div:nth-child(3) > a"
        try:
            anchor_tag = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, anchor_selector))
            )
            file_url = anchor_tag.get_attribute('href')
            file_name = "description.pdf"
            # Only attempt the download when the anchor actually carries a URL
            if file_url:
                download_file(file_url, folder_path, file_name)
            else:
                print("Anchor tag has no href; nothing to download.")
        except (NoSuchElementException, TimeoutException):
            print("Anchor tag not found inside the specified container or timeout occurred.")

    except Exception as e:
        print(f"An error occurred while processing the product page {product_url}: {e}")

# Entry point for this cell: collect the product URLs listed on the kitchens
# "highlights" category page and scrape each product page in turn.
try:
    base_url = 'https://www.molteni.it/ap/kitchens/category/highlights'
    product_links = get_product_links(base_url)
    for link in product_links:
        scrape_product_page(link)

# Catches lookup/timeout errors that escape the helpers (both call driver.get
# outside their own try blocks).
except (NoSuchElementException, TimeoutException) as e:
    print(f"An error occurred: {e}")

finally:
    # Always shut the browser down, whether or not scraping succeeded.
    driver.quit()


In [None]:

import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Location of the chromedriver binary on this machine
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Browser options; headless mode stays off unless the next line is enabled
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Start Chrome through the configured driver binary
driver = webdriver.Chrome(
    options=chrome_options,
    service=Service(webdriver_path),
)

def download_image(image_url, folder_path, image_name, timeout=30):
    """Download ``image_url`` and save it as ``image_name`` in ``folder_path``.

    Args:
        image_url: URL of the image to fetch.
        folder_path: Directory to write into (it is not created here).
        image_name: Name of the file to create inside ``folder_path``.
        timeout: Value passed to ``requests.get`` as ``timeout`` so a stalled
            server cannot block the scrape indefinitely.

    Failures are printed rather than raised (best-effort download).
    """
    try:
        # The context manager releases the streamed connection on every path,
        # including an HTTP error status or a failed file write.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check for HTTP errors
            image_path = os.path.join(folder_path, image_name)
            with open(image_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        print(f"Saved image: {image_name}")
    except Exception as e:
        print(f"Failed to download {image_url}. Reason: {e}")

def download_file(file_url, folder_path, file_name, timeout=30):
    """Download ``file_url`` and save it as ``file_name`` in ``folder_path``.

    Args:
        file_url: URL of the file to fetch.
        folder_path: Directory to write into (it is not created here).
        file_name: Name of the file to create inside ``folder_path``.
        timeout: Value passed to ``requests.get`` as ``timeout`` so a stalled
            server cannot block the scrape indefinitely.

    Failures are printed rather than raised (best-effort download).
    """
    try:
        # The context manager releases the streamed connection on every path,
        # including an HTTP error status or a failed file write.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check for HTTP errors
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        print(f"Saved file: {file_name}")
    except Exception as e:
        print(f"Failed to download {file_url}. Reason: {e}")

def extract_images_from_selector(selector, folder_path, image_counter):
    """Download every image found inside the elements matching ``selector``.

    Args:
        selector: CSS selector of the container element(s) to search.
        folder_path: Directory the images are saved into.
        image_counter: Number used for the next file name
            (``image_<counter>.png``).

    Returns:
        The counter value to use for the next image, i.e. ``image_counter``
        advanced by one for each image handed to ``download_image``.
    """
    try:
        # Wait up to 10 seconds for at least one matching container
        container_divs = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
        )
        for container_div in container_divs:
            for img in container_div.find_elements(By.TAG_NAME, "img"):
                image_src = img.get_attribute('src')
                # An <img> without a src has nothing to download; skipping it
                # avoids a doomed request and a gap in the file numbering.
                if not image_src:
                    continue
                image_name = f"image_{image_counter}.png"
                download_image(image_src, folder_path, image_name)
                image_counter += 1
    except (NoSuchElementException, TimeoutException):
        print(f"No images found inside the container {selector} or timeout occurred.")
    return image_counter

def get_product_links(base_url):
    """Open ``base_url`` and return the ``href`` of every catalog product link.

    Waits up to 10 seconds for the product anchors to be present. Anchors
    without an ``href`` are ignored. A lookup failure or timeout is printed
    and whatever was gathered so far (possibly nothing) is returned.
    """
    driver.get(base_url)

    product_selector = 'body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section.product-category > div > div > div > section.content-block.catalog-list > article > a'
    links = []
    try:
        anchors = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, product_selector))
        )
        # Read each href lazily, in order, keeping only the non-empty ones.
        hrefs = (anchor.get_attribute('href') for anchor in anchors)
        for href in filter(None, hrefs):
            links.append(href)
    except (NoSuchElementException, TimeoutException) as error:
        print(f"An error occurred while fetching product links: {error}")
    return links

def scrape_product_page(product_url):
    """Scrape one product page: create its folder, save images and the spec PDF.

    The product title (``<h1>``) names the output folder under the current
    working directory; when the title cannot be located, files go into the
    current working directory itself. Images are taken from the first page
    section and from ``#block-0`` through ``#block-11``; the file linked from
    the specs row is saved as ``description.pdf``.

    Any exception raised after the page has been requested is printed and
    swallowed, so one bad page does not abort the caller's loop.
    """
    driver.get(product_url)
    try:
        text_selector = "body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1) > div > article > div.block-text-img-text.animation-fade-in > div > div > div.block-info-product__top.padding-line-element"
        try:
            info_div = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, text_selector))
            )
            a_tag = info_div.find_element(By.TAG_NAME, "a")
            h1_tag = a_tag.find_element(By.TAG_NAME, "h1")
            h1_text = h1_tag.text.strip()
            print("H1 Text:", h1_text)
            # Replace every character Windows rejects in file names (not just
            # the path separators); otherwise os.makedirs raises and the whole
            # page is skipped by the outer handler.
            invalid_chars = str.maketrans({ch: '_' for ch in '<>:"/\\|?*'})
            folder_path = os.path.join(os.getcwd(), h1_text.translate(invalid_chars))
            os.makedirs(folder_path, exist_ok=True)
        except (NoSuchElementException, TimeoutException):
            print("H1 tag not found inside the specified container or timeout occurred.")
            folder_path = os.getcwd()

        # First page section, then #block-0 .. #block-11
        selectors = [
            "body > div.wrapper-site.avgrund-contents.no-ecommerce-bar > section:nth-child(1)",
        ] + [f"#block-{index}" for index in range(12)]
        image_counter = 1
        for selector in selectors:
            image_counter = extract_images_from_selector(selector, folder_path, image_counter)

        anchor_selector = "#specs > div > div.row.product-specs-row > div:nth-child(3) > a"
        try:
            anchor_tag = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, anchor_selector))
            )
            file_url = anchor_tag.get_attribute('href')
            file_name = "description.pdf"
            # Only attempt the download when the anchor actually carries a URL
            if file_url:
                download_file(file_url, folder_path, file_name)
            else:
                print("Anchor tag has no href; nothing to download.")
        except (NoSuchElementException, TimeoutException):
            print("Anchor tag not found inside the specified container or timeout occurred.")

    except Exception as e:
        print(f"An error occurred while processing the product page {product_url}: {e}")

# Entry point for this cell: collect the product URLs listed on the Gio Ponti
# "highlights" category page and scrape each product page in turn.
try:
    base_url = 'https://www.molteni.it/ap/gio-ponti/category/highlights'
    product_links = get_product_links(base_url)
    for link in product_links:
        scrape_product_page(link)

# Catches lookup/timeout errors that escape the helpers (both call driver.get
# outside their own try blocks).
except (NoSuchElementException, TimeoutException) as e:
    print(f"An error occurred: {e}")

finally:
    # Always shut the browser down, whether or not scraping succeeded.
    driver.quit()


In [None]:
import os
import re
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Location of the chromedriver binary on this machine
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Browser options; headless mode stays off unless the next line is enabled
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Start Chrome through the configured driver binary
driver = webdriver.Chrome(
    options=chrome_options,
    service=Service(webdriver_path),
)

def sanitize_filename(filename):
    """Return ``filename`` with characters Windows forbids in names removed.

    Removes ``< > : " / \\ | ? *`` and every ASCII control character
    (codes 0-31, which covers newline, carriage return and tab), then trims
    surrounding whitespace and any trailing dots or spaces, since Windows
    does not allow a name to end with either.

    Args:
        filename: Raw text to turn into a file or folder name.

    Returns:
        The cleaned name; may be an empty string if nothing valid remains.
    """
    cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', filename)
    return cleaned.strip().rstrip('. ')

def download_image(image_url, folder_path, image_name, timeout=30):
    """Download ``image_url`` and save it as ``image_name`` in ``folder_path``.

    Args:
        image_url: URL of the image to fetch.
        folder_path: Directory to write into (it is not created here).
        image_name: Name of the file to create inside ``folder_path``.
        timeout: Value passed to ``requests.get`` as ``timeout`` so a stalled
            server cannot block the scrape indefinitely.

    Failures are printed rather than raised (best-effort download).
    """
    try:
        # The context manager releases the streamed connection on every path,
        # including an HTTP error status or a failed file write.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check for HTTP errors
            image_path = os.path.join(folder_path, image_name)
            with open(image_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        print(f"Saved image: {image_name}")
    except Exception as e:
        print(f"Failed to download {image_url}. Reason: {e}")

def download_file(file_url, folder_path, file_name, timeout=30):
    """Download ``file_url`` and save it as ``file_name`` in ``folder_path``.

    Args:
        file_url: URL of the file to fetch.
        folder_path: Directory to write into (it is not created here).
        file_name: Name of the file to create inside ``folder_path``.
        timeout: Value passed to ``requests.get`` as ``timeout`` so a stalled
            server cannot block the scrape indefinitely.

    Failures are printed rather than raised (best-effort download).
    """
    try:
        # The context manager releases the streamed connection on every path,
        # including an HTTP error status or a failed file write.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check for HTTP errors
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        print(f"Saved file: {file_name}")
    except Exception as e:
        print(f"Failed to download {file_url}. Reason: {e}")

def extract_images_from_selector(selector, folder_path, image_counter):
    """Download every image with a ``src`` inside elements matching ``selector``.

    Each image is saved through ``download_image`` as
    ``image_<counter>.png`` in ``folder_path``; images without a ``src`` are
    skipped and do not consume a counter value. A lookup failure or timeout
    is printed. Returns the counter value to use for the next image.
    """
    try:
        containers = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
        )
        for container in containers:
            for image in container.find_elements(By.TAG_NAME, "img"):
                source = image.get_attribute('src')
                if not source:
                    continue
                download_image(source, folder_path, f"image_{image_counter}.png")
                image_counter += 1
    except (NoSuchElementException, TimeoutException):
        print(f"No images found inside the container {selector} or timeout occurred.")
    return image_counter

def get_product_links(base_url):
    """Open ``base_url`` and return the distinct product URLs found on it.

    Every ``<a>`` inside the product list container is inspected. Anchors
    without an ``href`` are ignored, and a URL that appears on several
    anchors is returned only once (first occurrence wins), so the caller
    does not scrape the same page repeatedly.

    Returns:
        List of URLs in page order. A lookup failure or timeout is printed
        and whatever was gathered so far (possibly nothing) is returned.
    """
    driver.get(base_url)
    links = []
    try:
        # Use the provided selector to locate the container with multiple tags
        container_selector = "#c25957 > div > section.container.prv-list.show-sofa-1"

        # Wait for the container to be present and extract anchor elements from it
        container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, container_selector))
        )

        # Find all <a> tags within the container
        product_elements = container.find_elements(By.TAG_NAME, 'a')

        # Extract the URLs from the href attributes, keeping each URL once
        seen = set()
        for element in product_elements:
            link = element.get_attribute('href')
            if link and link not in seen:
                seen.add(link)
                links.append(link)
                print(f"Product link found: {link}")
    except (NoSuchElementException, TimeoutException) as e:
        print(f"An error occurred while fetching product links: {e}")
    return links

def scrape_product_page(product_url):
    """Scrape one product page: create its folder, save images and the PDF.

    The page's ``<h1>`` text, passed through ``sanitize_filename``, names the
    output folder under the current working directory; when the heading
    cannot be located, files go into the current working directory itself.
    Images are collected from three fixed containers and the linked file is
    saved as ``description.pdf``.

    Any exception raised after the page has been requested is printed and
    swallowed, so one bad page does not abort the caller's loop.
    """
    driver.get(product_url)
    try:
        # Use the new H1 selector for folder naming
        h1_selector = "#c205373 > div > section > div > div > h1"
        try:
            h1_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, h1_selector))
            )
            h1_text = h1_element.text.strip()
            print("H1 Text (Folder Name):", h1_text)
            folder_name = sanitize_filename(h1_text)
            folder_path = os.path.join(os.getcwd(), folder_name)
            os.makedirs(folder_path, exist_ok=True)
        except (NoSuchElementException, TimeoutException):
            print("H1 tag not found or timeout occurred.")
            folder_path = os.getcwd()  # Default to current directory

        # New selectors for images
        selectors = [
            "#c206647",  # First container with multiple tags with images
            "body > main > section:nth-child(8)",  # Second container with multiple divs containing images
            "body > main > section:nth-child(9)"   # Third container with multiple divs containing images
        ]

        # Extract images from new selectors
        image_counter = 1
        for selector in selectors:
            image_counter = extract_images_from_selector(selector, folder_path, image_counter)

        # New selector for file download
        file_selector = "#c205373 > div > section > div > div > ul:nth-child(4) > li:nth-child(2) > a"
        try:
            anchor_tag = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, file_selector))
            )
            file_url = anchor_tag.get_attribute('href')
            file_name = "description.pdf"
            # Only attempt the download when the anchor actually carries a URL
            if file_url:
                download_file(file_url, folder_path, file_name)
            else:
                print("File download link has no href; nothing to download.")
        except (NoSuchElementException, TimeoutException):
            print("File download link not found or timeout occurred.")

    except Exception as e:
        print(f"An error occurred while processing the product page {product_url}: {e}")

# Entry point for this cell: collect the product URLs from the sofas listing
# page, report how many were found, and scrape each product page in turn.
try:
    base_url = 'https://www.rolf-benz.com/en_OC/furniture/sofas'
    product_links = get_product_links(base_url)
    print(f"Total product links found: {len(product_links)}")

    # Visit each product page and scrape the required data
    for link in product_links:
        scrape_product_page(link)

# Catches lookup/timeout errors that escape the helpers (both call driver.get
# outside their own try blocks).
except (NoSuchElementException, TimeoutException) as e:
    print(f"An error occurred: {e}")

finally:
    # Always shut the browser down, whether or not scraping succeeded.
    driver.quit()


In [None]:
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Location of the chromedriver binary on this machine
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Browser options; headless mode stays off unless the next line is enabled
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Start Chrome through the configured driver binary
driver = webdriver.Chrome(
    options=chrome_options,
    service=Service(webdriver_path),
)

def get_product_links(base_url):
    """Open ``base_url`` and return the distinct product URLs found on it.

    Every ``<a>`` inside the product list container is inspected. Anchors
    without an ``href`` are ignored, and a URL that appears on several
    anchors is returned only once (first occurrence wins), so the caller
    does not process the same page repeatedly.

    Returns:
        List of URLs in page order. A lookup failure or timeout is printed
        and whatever was gathered so far (possibly nothing) is returned.
    """
    driver.get(base_url)
    links = []
    try:
        # Use the provided selector to locate the container with multiple tags
        container_selector = "#c25957 > div > section.container.prv-list.show-sofa-1"

        # Wait for the container to be present and extract anchor elements from it
        container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, container_selector))
        )

        # Find all <a> tags within the container
        product_elements = container.find_elements(By.TAG_NAME, 'a')

        # Extract the URLs from the href attributes, keeping each URL once
        seen = set()
        for element in product_elements:
            link = element.get_attribute('href')
            if link and link not in seen:
                seen.add(link)
                links.append(link)
                print(f"Product link found: {link}")
    except (NoSuchElementException, TimeoutException) as e:
        print(f"An error occurred while fetching product links: {e}")
    return links

def download_file(file_url, folder_path, file_name, timeout=30):
    """Download ``file_url`` and save it as ``file_name`` in ``folder_path``.

    Args:
        file_url: URL of the file to fetch.
        folder_path: Directory to write into (it is not created here).
        file_name: Name of the file to create inside ``folder_path``.
        timeout: Value passed to ``requests.get`` as ``timeout`` so a stalled
            server cannot block the scrape indefinitely.

    Failures are printed rather than raised (best-effort download).
    """
    try:
        # The context manager releases the streamed connection on every path,
        # including an HTTP error status or a failed file write.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check for HTTP errors
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        print(f"Saved file: {file_name}")
    except Exception as e:
        print(f"Failed to download {file_url}. Reason: {e}")

def extract_and_save_product_info(url):
    """Open a sofa product page and save its linked file as ``description.pdf``.

    The folder name is the part of ``url`` after ``"sofas/"``, prefixed with
    ``"rolf-benz "`` when that text does not already contain ``rolf-benz``,
    with path separators replaced by underscores. The folder is created
    under the current working directory. URLs without ``"sofas/"`` are
    reported and skipped.

    The download link is searched with a list of candidate XPaths, each
    waited on for up to 10 seconds; the first one that yields an ``href``
    wins. Any exception raised after the page has been requested is printed
    and swallowed.
    """
    driver.get(url)
    try:
        if "sofas/" not in url:
            print("URL does not contain 'sofas/'.")
            return

        # Extract text after "sofas/"
        text_after_sofas = url.split("sofas/")[1]
        # Add "rolf-benz" if not present
        if "rolf-benz" not in text_after_sofas:
            text_after_sofas = f"rolf-benz {text_after_sofas}"
        print(f"Extracted text: {text_after_sofas}")

        # Create a folder with the extracted text
        folder_name = text_after_sofas.replace('/', '_').replace('\\', '_')
        folder_path = os.path.join(os.getcwd(), folder_name)
        os.makedirs(folder_path, exist_ok=True)

        # Candidate XPaths for the download link, in priority order. Each
        # entry appears once: a repeated entry can only add another 10-second
        # wait when the link is missing.
        file_xpaths = [
            "/html/body/main/div[9]/section/div/div/ul/li[1]/a",
            "/html/body/main/div[5]/div/section/div/div/ul[1]/li[2]/a",
            "/html/body/main/div[2]/div/section/div/div/ul[1]/li[1]/a",
            "/html/body/main/div[2]/div/section/div/div/ul[1]/li[2]/a",
            "/html/body/main/div[8]/section/div/div/ul/li[1]/a",
            "/html/body/main/div[8]/section/div/div/ul/li/a",
            "/html/body/main/div[6]/div/section/div/div/ul[1]/li[2]/a",
            "/html/body/main/div[2]/div/section/div/div/ul[1]/li/a",
        ]

        file_url = None
        for xpath in file_xpaths:
            try:
                file_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, xpath))
                )
                file_url = file_element.get_attribute('href')
                if file_url:
                    break
            except (NoSuchElementException, TimeoutException):
                continue

        if file_url:
            download_file(file_url, folder_path, 'description.pdf')
        else:
            print("File not found using any provided XPaths.")
    except Exception as e:
        print(f"An error occurred while processing the URL {url}: {e}")

# Entry point for this cell: collect the product URLs from the sofas listing
# page, report how many were found, and process each product page in turn.
try:
    base_url = 'https://www.rolf-benz.com/en_OC/furniture/sofas'
    product_links = get_product_links(base_url)
    print(f"Total product links found: {len(product_links)}")

    # Visit each product page and process the information
    for link in product_links:
        extract_and_save_product_info(link)

# Catches lookup/timeout errors that escape the helpers (both call driver.get
# outside their own try blocks).
except (NoSuchElementException, TimeoutException) as e:
    print(f"An error occurred: {e}")

finally:
    # Always shut the browser down, whether or not processing succeeded.
    driver.quit()


In [None]:
import os
import requests
from io import BytesIO
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Location of the chromedriver binary on this machine
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Browser options; headless mode stays off unless the next line is enabled
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Start Chrome through the configured driver binary
driver = webdriver.Chrome(
    options=chrome_options,
    service=Service(webdriver_path),
)

def get_product_links(base_url):
    """Open the listing page and collect the product URLs it links to.

    Args:
        base_url: URL of the category listing page to load in the shared driver.

    Returns:
        The ``http``/``https`` URLs found inside the listing container, in
        page order and without repeats. The list is empty when the container
        does not appear within 10 seconds.
    """
    driver.get(base_url)
    links = []
    seen = set()
    try:
        # Absolute XPath of the element that wraps the product tiles.
        container_xpath = "/html/body/main/div/div/section[2]/div"
        container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, container_xpath))
        )

        for anchor in container.find_elements(By.TAG_NAME, 'a'):
            link = anchor.get_attribute('href')
            # Skip anchors without an href and anything that is not a web URL.
            if not link or not link.startswith("http"):
                continue
            # Several anchors can point at the same product page; keeping one
            # entry per URL prevents the caller from scraping a product twice.
            if link in seen:
                continue
            seen.add(link)
            links.append(link)
            print(f"Product link found: {link}")
    except (NoSuchElementException, TimeoutException) as e:
        print(f"An error occurred while fetching product links: {e}")
    return links

def download_image(image_url, folder_path, image_counter):
    """Download an image and store it as ``image_<image_counter>.png``.

    Args:
        image_url: URL of the image to fetch.
        folder_path: Existing directory that receives the file.
        image_counter: Number used in the saved file name.

    The image is decoded with Pillow, converted to RGBA and re-encoded as
    PNG. Any failure is printed and swallowed so one bad image does not stop
    the scrape.
    """
    try:
        # Without a timeout a stalled server would block the scrape forever.
        response = requests.get(image_url, stream=True, timeout=30)
        response.raise_for_status()

        # Decode whatever format the server sent.
        image = Image.open(BytesIO(response.content))

        # RGBA keeps transparency and is always writable as PNG.
        image = image.convert("RGBA")

        image_name = f"image_{image_counter}.png"
        image_path = os.path.join(folder_path, image_name)

        image.save(image_path, format="PNG")
        print(f"Saved image: {image_name}")
    except Exception as e:
        print(f"Failed to download {image_url}. Reason: {e}")

def extract_images_from_div(div_element, folder_path, image_counter):
    """Download every image found below *div_element*.

    Args:
        div_element: Selenium element whose descendant ``img`` tags are scanned.
        folder_path: Directory handed to ``download_image``.
        image_counter: Number to use for the first downloaded image.

    Returns:
        The counter advanced by one for each ``img`` that had a ``src``,
        including progress made before an error interrupted the scan.
    """
    next_number = image_counter
    try:
        for tag in div_element.find_elements(By.TAG_NAME, "img"):
            source = tag.get_attribute('src')
            if not source:
                continue
            download_image(source, folder_path, next_number)
            next_number += 1
    except Exception as e:
        print(f"An error occurred while extracting images: {e}")
    return next_number

def download_file(file_url, folder_path, file_name):
    """Stream a remote file to ``folder_path/file_name``.

    Args:
        file_url: URL of the file to fetch.
        folder_path: Existing directory that receives the file.
        file_name: Name to save the file under.

    Any failure is printed and swallowed.
    """
    try:
        # The timeout bounds the connect and each read, so a stalled server
        # cannot hang the scrape.
        response = requests.get(file_url, stream=True, timeout=30)
        response.raise_for_status()

        # Write the body chunk by chunk instead of holding it all in memory.
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'wb') as file:
            for chunk in response.iter_content(1024):
                file.write(chunk)
        print(f"Saved file: {file_name}")
    except Exception as e:
        print(f"Failed to download {file_url}. Reason: {e}")

def extract_files_from_page(folder_path):
    """Download the product PDF linked on the current page as ``description.pdf``.

    Args:
        folder_path: Directory that receives the PDF.

    The candidate XPaths are tried in order, each for up to 10 seconds. The
    function stops at the first one that yields a ``.pdf`` link.
    """
    file_xpaths = [
        "/html/body/main/div[2]/div/section/div/div/ul[1]/li/a",
        "/html/body/main/div[2]/div/section/div/div/ul[1]/li[1]/a"
    ]

    for file_xpath in file_xpaths:
        try:
            file_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, file_xpath))
            )
        except (NoSuchElementException, TimeoutException):
            continue

        file_url = file_element.get_attribute('href')
        if file_url and file_url.endswith(".pdf"):
            download_file(file_url, folder_path, "description.pdf")
            # Both XPaths can resolve to the same link and every hit is saved
            # under the same name, so trying the rest would only repeat the
            # download.
            return

def extract_and_save_product_info(url):
    """Scrape one bed product page: save its images and its PDF.

    Args:
        url: Product page URL to load in the shared driver.

    A folder named after the URL is created under the current working
    directory. Images from each known page container are saved there as
    ``image_<n>.png``, followed by the PDF. Errors raised after the page load
    are printed and swallowed.
    """
    driver.get(url)
    try:
        # Derive the folder name from the part of the URL after "beds/".
        if "beds/" in url:
            text_after_beds = url.split("beds/")[1]
        else:
            # Otherwise use the last path segment and mark it as a bed.
            text_after_beds = url.split('/')[-1]
            text_after_beds = f"beds_{text_after_beds}"

        if "rolf-benz" not in text_after_beds:
            text_after_beds = f"rolf-benz {text_after_beds}"
        print(f"Extracted text: {text_after_beds}")

        # Path separators would otherwise create nested directories.
        folder_name = text_after_beds.replace('/', '_').replace('\\', '_')
        folder_path = os.path.join(os.getcwd(), folder_name)
        os.makedirs(folder_path, exist_ok=True)

        # Containers that may hold product images; page layouts differ.
        # Each entry appears once: a repeated XPath would re-download the same
        # images under new numbers and add up to 10 seconds of waiting.
        # NOTE(review): some entries look nested inside others (e.g. paths
        # below div[6]), so an image may still be saved more than once.
        div_xpaths = [
            "/html/body/main/div[1]/section/div/div",
            "/html/body/main/div[5]/section/div/div/div/div",
            "/html/body/main/div[6]/section/div/div/div/div",
            "/html/body/main/section[1]",
            "/html/body/main/section[2]",
            "/html/body/main/div[1]/section/div/div/div/div[1]",
            "/html/body/main/div[3]/section",
            "/html/body/main/div[3]/section/div/div/div/div[1]",
            "/html/body/main/section[1]/div/div[1]/div/div/div",
            "/html/body/main/div[6]",
            "/html/body/main/div[5]/section/div/div/div/div[1]",
        ]

        image_counter = 1
        for xpath in div_xpaths:
            try:
                div_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, xpath))
                )
            except (NoSuchElementException, TimeoutException):
                # This layout does not have the container; try the next one.
                continue
            image_counter = extract_images_from_div(div_element, folder_path, image_counter)

        # Download files (PDFs)
        extract_files_from_page(folder_path)

    except Exception as e:
        print(f"An error occurred while processing the URL {url}: {e}")

# Script entry point for this cell: scrape every product of the beds category.
try:
    base_url = 'https://www.rolf-benz.com/en_OC/furniture/beds'
    product_links = get_product_links(base_url)
    print(f"Total product links found: {len(product_links)}")
    
    # Process the product pages one after another in the shared browser.
    for link in product_links:
        extract_and_save_product_info(link)

# Only Selenium lookup/timeout errors are reported here; any other exception
# still propagates after the finally clause has run.
except (NoSuchElementException, TimeoutException) as e:
    print(f"An error occurred: {e}")

# Always close the browser, whether or not the scrape succeeded.
finally:
    driver.quit()


In [None]:
import os
import requests
from io import BytesIO
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from urllib.parse import urljoin

# Path to the ChromeDriver executable.
# NOTE: machine-specific absolute Windows path; adjust it before running elsewhere.
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Chrome options are left at their defaults (the headless switch below is disabled).
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Module-level driver instance shared by the functions defined below in this cell.
driver = webdriver.Chrome(service=Service(webdriver_path), options=chrome_options)

def download_image(image_url, folder_path, image_counter, base_url):
    """Download an image and store it as ``image_<image_counter>.png``.

    Args:
        image_url: Absolute or relative URL of the image.
        folder_path: Existing directory that receives the file.
        image_counter: Number used in the saved file name.
        base_url: Page URL used to resolve a relative *image_url*.

    The image is decoded with Pillow, converted to RGBA and re-encoded as
    PNG. Any failure is printed and swallowed.
    """
    try:
        # Resolve relative URLs against the page they came from.
        if not image_url.startswith("http"):
            image_url = urljoin(base_url, image_url)

        # Without a timeout a stalled server would block the scrape forever.
        response = requests.get(image_url, stream=True, timeout=30)
        response.raise_for_status()

        # Decode whatever format the server sent.
        image = Image.open(BytesIO(response.content))

        # RGBA keeps transparency and is always writable as PNG.
        image = image.convert("RGBA")

        image_name = f"image_{image_counter}.png"
        image_path = os.path.join(folder_path, image_name)

        image.save(image_path, format="PNG")
        print(f"Saved image: {image_name}")
    except Exception as e:
        print(f"Failed to download {image_url}. Reason: {e}")

def download_pdf(pdf_url, folder_path, base_url=None):
    """Download a PDF and store it as ``description.pdf`` inside *folder_path*.

    Args:
        pdf_url: Absolute or relative URL of the PDF.
        folder_path: Existing directory that receives the file.
        base_url: Page URL used to resolve a relative *pdf_url*. When omitted,
            the URL of the page currently loaded in the shared driver is used.

    Any failure is printed and swallowed.
    """
    try:
        # Resolve relative links against an explicit base rather than an
        # undeclared module-level variable.
        if not pdf_url.startswith("http"):
            pdf_url = urljoin(base_url or driver.current_url, pdf_url)

        # Without a timeout a stalled server would block the scrape forever.
        response = requests.get(pdf_url, stream=True, timeout=30)
        response.raise_for_status()

        pdf_path = os.path.join(folder_path, "description.pdf")
        with open(pdf_path, 'wb') as f:
            f.write(response.content)

        print("Saved PDF: description.pdf")
    except Exception as e:
        print(f"Failed to download PDF {pdf_url}. Reason: {e}")

def extract_images_from_main(main_element, folder_path, image_counter, base_url):
    """Download every image found below *main_element*.

    Args:
        main_element: Selenium element whose descendant ``img`` tags are scanned.
        folder_path: Directory handed to ``download_image``.
        image_counter: Number to use for the first downloaded image.
        base_url: Page URL handed to ``download_image`` for relative sources.

    Returns:
        The counter advanced by one for each image that had a usable source,
        including progress made before an error interrupted the scan.
    """
    next_number = image_counter
    try:
        for tag in main_element.find_elements(By.TAG_NAME, "img"):
            # Lazy-loaded images keep their address in 'data-src'.
            source = tag.get_attribute('src')
            if not source:
                source = tag.get_attribute('data-src')
            if not source:
                continue

            print(f"Trying to download image: {source}")
            download_image(source, folder_path, next_number, base_url)
            next_number += 1
    except Exception as e:
        print(f"An error occurred while extracting images: {e}")
    return next_number

def extract_and_save_product_info(url):
    """Scrape the wardrobe page at *url*: save its images and its PDF.

    Args:
        url: Page URL to load in the shared driver.

    Everything is written to ``rolf-benz-wardrobe`` under the current working
    directory. Errors raised after the page load are printed and swallowed.
    """
    driver.get(url)

    try:
        # Fixed output folder for this single page.
        folder_path = os.path.join(os.getcwd(), "rolf-benz-wardrobe")
        os.makedirs(folder_path, exist_ok=True)

        # All product content sits inside the page's main element.
        main_locator = (By.CSS_SELECTOR, "body > main")
        main_element = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located(main_locator)
        )

        # Image numbering starts at 1.
        extract_images_from_main(main_element, folder_path, 1, url)

        # Page-specific selector of the PDF download link.
        pdf_selector = "#c162599 > div > section > div > div > ul:nth-child(3) > li > a"
        pdf_url = driver.find_element(By.CSS_SELECTOR, pdf_selector).get_attribute('href')
        if not pdf_url:
            return

        print(f"Trying to download PDF: {pdf_url}")
        download_pdf(pdf_url, folder_path)

    except Exception as e:
        print(f"An error occurred while processing the URL {url}: {e}")

# Script entry point for this cell: scrape a single wardrobe product page.
try:
    # URL for the Rolf Benz Stretto wardrobe furniture page
    url = 'https://www.rolf-benz.com/en_OC/furniture/wardrobe-furniture/stretto'
    extract_and_save_product_info(url)

# Only Selenium lookup/timeout errors are reported here; any other exception
# still propagates after the finally clause has run.
except (NoSuchElementException, TimeoutException) as e:
    print(f"An error occurred: {e}")

# Always close the browser, whether or not the scrape succeeded.
finally:
    driver.quit()


In [None]:
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException

# Path to the ChromeDriver executable.
# NOTE: machine-specific absolute Windows path; adjust it before running elsewhere.
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Chrome options are left at their defaults (the headless switch below is disabled).
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Module-level driver instance shared by the functions defined below in this cell.
driver = webdriver.Chrome(service=Service(webdriver_path), options=chrome_options)

def load_all_images():
    """Keep pressing the listing's 'Load More' button until it is gone.

    Works on whatever page the shared driver currently shows. Sleeps 15
    seconds up front, then repeatedly looks the button up through JavaScript,
    clicks it, sleeps 10 seconds and waits up to 10 more for the clicked
    button to go stale. The loop ends when the button is missing or hidden,
    or when one of the handled Selenium exceptions occurs. Errors are printed,
    never raised.
    """
    try:
        # JavaScript expression that resolves to the button (or null).
        button_query = "document.querySelector('#container-6ac6ef4ec3 > div > div.productlist.productcollection.margin-bottom-xxl.aem-GridColumn.aem-GridColumn--default--12 > article > button')"

        # Give the page time to render before touching it.
        print("Waiting for the page to load for 15 seconds...")
        time.sleep(15)

        while True:
            try:
                button = driver.execute_script(f"return {button_query};")
                if not button or not button.is_displayed():
                    break

                # Click through JavaScript rather than a native click.
                driver.execute_script("arguments[0].click();", button)
                print("Clicked 'Load More' button.")

                # Pause, then wait for the clicked button to leave the DOM.
                time.sleep(10)
                WebDriverWait(driver, 10).until(EC.staleness_of(button))
            except (NoSuchElementException, TimeoutException, ElementClickInterceptedException) as e:
                print(f"No more 'Load More' button or unable to click. Error: {e}")
                break
    except Exception as e:
        print(f"An error occurred while loading all images: {e}")

def get_product_name():
    """Read the product name from the page currently loaded in the driver.

    Returns:
        The stripped text of the name element, or ``None`` when the element
        does not appear within 10 seconds.
    """
    # Absolute XPath of the element holding the product name.
    name_xpath = "/html/body/div[1]/div/main/div/div/div/div/div/div[2]/article/section/div[3]/div[1]/div/a/div[2]"

    try:
        name_holder = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, name_xpath))
        )
        product_name = name_holder.text.strip()
    except (NoSuchElementException, TimeoutException) as e:
        print(f"Error fetching product name: {e}")
        return None

    print(f"Product name: {product_name}")
    return product_name

def get_product_links(base_url):
    """Collect every product link on the listing page, then visit each one.

    Args:
        base_url: URL of the listing page to load in the shared driver.

    Returns:
        The unique ``http``/``https`` product URLs in the order they were
        found.

    Product pages are visited only after link collection has finished.
    Navigating away while iterating over the listing invalidates the
    remaining element references and cuts collection short.
    """
    driver.get(base_url)

    # Wait for the page to fully load before extracting product links
    print("Waiting for the page to load for 15 seconds...")
    time.sleep(15)

    links = []
    try:
        # CSS selector of the container that holds the product tiles.
        container_selector = "#container-6ac6ef4ec3 > div > div.productlist.productcollection.margin-bottom-xxl.aem-GridColumn.aem-GridColumn--default--12 > article > section > div.gallery__items.search__items"
        # JavaScript expression that resolves to the 'Load More' button (or null).
        load_more_button_js = "document.querySelector('#container-6ac6ef4ec3 > div > div.productlist.productcollection.margin-bottom-xxl.aem-GridColumn.aem-GridColumn--default--12 > article > button')"

        while True:
            container = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, container_selector))
            )

            # Record the links now on the page; earlier passes may already
            # have seen some of them.
            for element in container.find_elements(By.TAG_NAME, 'a'):
                link = element.get_attribute('href')
                if link and link.startswith("http") and link not in links:
                    links.append(link)
                    print(f"Product link found: {link}")

            try:
                load_more_button = driver.execute_script(f"return {load_more_button_js};")
                if not (load_more_button and load_more_button.is_displayed()):
                    break

                driver.execute_script("arguments[0].click();", load_more_button)
                print("Clicked 'Load More' button.")

                # NOTE(review): this assumes the site replaces the button after
                # each click; if the same node stays in the DOM the wait times
                # out and paging stops early. Confirm against the live page.
                WebDriverWait(driver, 10).until(
                    EC.staleness_of(load_more_button)
                )
            except (NoSuchElementException, TimeoutException) as e:
                print(f"No more 'Load More' button or unable to click. Error: {e}")
                break

    except Exception as e:
        print(f"An error occurred while fetching product links: {e}")

    # The listing page is no longer needed, so it is now safe to navigate.
    for link in links:
        visit_product_link_and_create_folder(link)

    return links

def visit_product_link_and_create_folder(link):
    """Open a product page and create a folder named after the product.

    Args:
        link: Product page URL to load in the shared driver.

    The folder is created under the current working directory. Nothing is
    created when no product name could be read. Errors are printed, never
    raised.
    """
    try:
        driver.get(link)
        print(f"Visited product link: {link}")

        # Fixed pause so the page can finish rendering.
        time.sleep(5)

        product_name = get_product_name()
        if not product_name:
            return

        # Keep only letters, digits, spaces and underscores for the folder name.
        kept_chars = [c for c in product_name if c.isalnum() or c in (" ", "_")]
        sanitized_name = "".join(kept_chars).strip()

        product_folder = os.path.join(os.getcwd(), sanitized_name)
        os.makedirs(product_folder, exist_ok=True)
        print(f"Created folder '{product_folder}' for product '{product_name}'")

    except Exception as e:
        print(f"An error occurred while visiting {link}: {e}")

# Example usage: collect the product links of one category and create a folder
# per product.
try:
    base_url = 'https://www.poltronafrau.com/ww/en/products/products.87.html?category_id=87&selectedFilters=pf_info_categoria&pf_info_categoria=9088'

    # get_product_links() opens base_url itself and pages through the
    # 'Load More' button. load_all_images() was previously called here before
    # any page had been opened, so it only slept on a blank tab, and whatever
    # it loaded would have been discarded by the reload in get_product_links().
    product_links = get_product_links(base_url)
    print(f"Total product links found and processed: {len(product_links)}")

# Only Selenium lookup/timeout errors are reported here; any other exception
# still propagates after the finally clause has run.
except (NoSuchElementException, TimeoutException) as e:
    print(f"An error occurred: {e}")

# Always close the browser, whether or not the scrape succeeded.
finally:
    driver.quit()


In [None]:
import os
import time
import requests
import zipfile
from io import BytesIO
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Path to the ChromeDriver executable.
# NOTE: machine-specific absolute Windows path; adjust it before running elsewhere.
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Chrome options are left at their defaults (the headless switch below is disabled).
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Module-level driver instance shared by the functions defined below in this cell.
driver = webdriver.Chrome(service=Service(webdriver_path), options=chrome_options)

def sanitize_filename(filename):
    """Return *filename* with every character listed in ``forbidden`` turned into ``_``."""
    forbidden = '<>:"/\\|?*'
    # One translation table handles all substitutions in a single pass.
    underscore_map = str.maketrans(forbidden, '_' * len(forbidden))
    return filename.translate(underscore_map)

def download_image(image_url, folder_name, idx):
    """Download one image into ``<cwd>/<folder_name>/image_<idx + 1>.png``.

    Args:
        image_url: Absolute, protocol-relative or site-relative image URL.
        folder_name: Target folder name; created under the current working
            directory when missing.
        idx: Zero-based position of the image; the file is numbered ``idx + 1``.

    Any failure is printed and swallowed.
    """
    # Local import so this function does not depend on the cell's import list.
    from urllib.parse import urljoin

    try:
        # urljoin also handles "//host/path" and "path" forms, which plain
        # string concatenation turns into broken URLs.
        if not image_url.startswith("http"):
            image_url = urljoin("https://www.poltronafrau.com/", image_url)

        download_folder = os.path.join(os.getcwd(), folder_name)
        os.makedirs(download_folder, exist_ok=True)

        # Without a timeout a stalled server would block the scrape forever.
        response = requests.get(image_url, timeout=30)
        if response.status_code == 200:
            # NOTE: the bytes are stored as received, so the file keeps the
            # server's image format even though it is named ".png".
            image_path = os.path.join(download_folder, f'image_{idx + 1}.png')
            with open(image_path, 'wb') as file:
                file.write(response.content)
            print(f"Downloaded image {idx + 1} in folder '{folder_name}' from URL: {image_url}")
        else:
            print(f"Failed to download image {idx + 1}, status code: {response.status_code}")
    except Exception as e:
        print(f"Error downloading image {idx + 1}: {e}")

def download_images_from_product_page(folder_name):
    """Download the largest variant of every product image on the current page.

    Args:
        folder_name: Folder name handed to ``download_image``.

    For each ``img.cmp-image__image`` element the last ``srcset`` (or
    ``data-srcset``) candidate is downloaded, falling back to ``src`` when no
    srcset is present. A timeout while locating the images is printed.
    """
    try:
        image_locator = (By.CSS_SELECTOR, "img.cmp-image__image")
        found_images = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(image_locator)
        )

        for idx, img_element in enumerate(found_images):
            # Lazy-loaded images carry the candidates in 'data-srcset'.
            srcset = img_element.get_attribute('srcset') or img_element.get_attribute('data-srcset')

            if srcset:
                # Candidates are comma separated as "<url> <descriptor>"; the
                # last one is assumed to be the highest resolution.
                highest_res_image = srcset.split(",")[-1].split()[0]
                print(f"Highest resolution image URL: {highest_res_image}")
                download_image(highest_res_image, folder_name, idx)
                continue

            # No srcset at all: use the plain src attribute if there is one.
            src = img_element.get_attribute('src')
            if not src:
                print(f"No src or srcset found for image {idx + 1}")
                continue

            print(f"Falling back to src image: {src}")
            download_image(src, folder_name, idx)

    except TimeoutException as e:
        print(f"Error locating images: {e}")

def get_folder_name_from_url(product_url):
    """Derive a folder name from the part of the URL between 'products/' and '.html'.

    Returns the sanitized name, or ``"default_folder"`` when the URL does not
    have the expected shape.
    """
    try:
        # Element 1 is the text that follows the first 'products/'.
        after_products = product_url.split('products/', 2)[1]
        # Keep what precedes the first '.html' (everything if there is none).
        stem = after_products.partition('.html')[0]
        return sanitize_filename(stem)
    except Exception as e:
        print(f"Error extracting folder name: {e}")
        return "default_folder"

def visit_product_page_and_download_images(product_url):
    """Open a product page and save its images and its download archive.

    Args:
        product_url: Product page URL to load in the shared driver.

    Steps run in order: load the page, pause 5 seconds, download the images,
    open the 'Downloads' tab, then fetch and unpack the ZIP. Any error is
    printed and swallowed.
    """
    try:
        print(f"Visiting product page: {product_url}")
        driver.get(product_url)
        time.sleep(5)  # fixed pause so the page can finish rendering

        # Every artefact of this product goes into one folder named after its URL.
        target_folder = get_folder_name_from_url(product_url)

        download_images_from_product_page(target_folder)

        # The ZIP link only becomes reachable once the tab has been opened.
        click_download_tab()
        download_and_extract_zip_file(target_folder)

    except Exception as e:
        print(f"Error visiting product page {product_url}: {e}")

def click_download_tab():
    """Open the 'Downloads' tab of the current product page.

    Waits up to 10 seconds for the tab to become clickable. A timeout is
    printed instead of raised.
    """
    tab_selector = "#producttabs-3baf65de06-item-25a16ecc4d-tab"
    try:
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, tab_selector))
        ).click()
        print("Successfully clicked the 'Downloads' tab.")
    except TimeoutException as e:
        print(f"Error clicking the 'Downloads' tab: {e}")

def download_and_extract_zip_file(folder_name):
    """Download the ZIP linked in the 'Downloads' tab and unpack it.

    Args:
        folder_name: Target folder name; created under the current working
            directory when missing.

    The request reuses the browser session's cookies and sends the current
    page as referer. A missing link, a network failure, a non-200 response
    and a corrupt archive are each printed rather than raised.
    """
    try:
        # Anchor that appears once the 'Downloads' tab is open.
        a_tag_selector = "#professionals > div > div > div:nth-child(1) > div > h3 > button > a"
        a_tag_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, a_tag_selector))
        )

        file_url = a_tag_element.get_attribute('href')
        if not file_url:
            print("No href found for the download link.")
            return
        print(f"Found ZIP file download URL: {file_url}")

        download_folder = os.path.join(os.getcwd(), folder_name)
        os.makedirs(download_folder, exist_ok=True)

        # Send browser-like headers and the Selenium session's cookies.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
            'Referer': driver.current_url,
        }
        cookies = {cookie['name']: cookie['value'] for cookie in driver.get_cookies()}

        # Without a timeout a stalled server would block the scrape forever.
        response = requests.get(file_url, headers=headers, cookies=cookies, timeout=60)
        if response.status_code != 200:
            print(f"Failed to download ZIP file, status code: {response.status_code}")
            return

        # Unpack straight from memory; the archive itself is not kept.
        with zipfile.ZipFile(BytesIO(response.content)) as zip_file:
            zip_file.extractall(download_folder)
            print(f"Extracted ZIP contents to folder '{folder_name}'")

            # Report the PDFs in the archive, whatever the extension's case.
            for file_name in zip_file.namelist():
                if file_name.lower().endswith('.pdf'):
                    pdf_path = os.path.join(download_folder, file_name)
                    print(f"Found PDF file: {pdf_path}")

    except TimeoutException as e:
        print(f"Error locating the download link: {e}")
    except requests.RequestException as e:
        print(f"Failed to download ZIP file: {e}")
    except zipfile.BadZipFile as e:
        # The server can answer 200 with an HTML page instead of an archive.
        print(f"Downloaded file is not a valid ZIP archive: {e}")

def extract_product_links(list_page_url):
    """Load the listing page, expand it fully and collect the product URLs.

    Args:
        list_page_url: URL of the listing page to load in the shared driver.

    Returns:
        The unique product URLs in page order. The same list is written to
        ``product_links.txt``, one URL per line. Returns an empty list and
        writes nothing when anything goes wrong.
    """
    try:
        print(f"Visiting list page: {list_page_url}")
        driver.get(list_page_url)

        # Press 'Load More' until it stops becoming clickable within 10 seconds.
        load_more_button_selector = "#container-6ac6ef4ec3 > div > div.productlist.productcollection.margin-bottom-xxl.aem-GridColumn.aem-GridColumn--default--12 > article > button"
        while True:
            try:
                load_more_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, load_more_button_selector))
                )
                load_more_button.click()
                print("Clicked 'Load More' button.")
                time.sleep(3)  # Wait for new products to load
            except TimeoutException:
                break

        # All anchors inside the product gallery.
        product_selector = "#container-6ac6ef4ec3 > div > div.productlist.productcollection.margin-bottom-xxl.aem-GridColumn.aem-GridColumn--default--12 > article > section > div.gallery__items.search__items a"
        product_elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, product_selector))
        )

        # Keep one entry per URL so no product page is scraped twice, and read
        # each href only once.
        product_links = []
        seen = set()
        for elem in product_elements:
            href = elem.get_attribute('href')
            if href and href not in seen:
                seen.add(href)
                product_links.append(href)
        print(f"Found {len(product_links)} product links.")

        # Persist the links for the main script, which reads this file back.
        with open('product_links.txt', 'w') as file:
            file.writelines(f"{link}\n" for link in product_links)

        return product_links
    except Exception as e:
        print(f"Error extracting product links: {e}")
        return []

# Main script execution: collect the product links of one listing page, then
# scrape each product page.
try:
    list_page_url = 'https://www.poltronafrau.com/ww/en/products/products.87.html?_gl=1*1fbjz9y*_up*MQ..*_ga*MTY0MDIyNDUyNy4xNzI1OTcyOTc1*_ga_YGJJL14S4G*MTcyNTk3Mjk3NC4xLjEuMTcyNTk3MzA5NC4wLjAuMA..&pf_info_categoria=9103&selectedFilters=pf_info_categoria'
    
    # Extract and save all product links
    product_links = extract_product_links(list_page_url)

    # Read the product links from the file and visit each page
    # NOTE(review): the list returned above is not used; the links are re-read
    # from product_links.txt. If extraction fails, a file left over from an
    # earlier run would be processed instead. Confirm this is intended.
    with open('product_links.txt', 'r') as file:
        for line in file:
            product_url = line.strip()
            if product_url:
                visit_product_page_and_download_images(product_url)

# Every error, including a missing links file, is reported here.
except Exception as e:
    print(f"An error occurred: {e}")

# Always close the browser, whether or not the scrape succeeded.
finally:
    driver.quit()


In [33]:
import os
import time
import requests
import zipfile
import urllib.request
from io import BytesIO
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Path to the ChromeDriver executable.
# NOTE: machine-specific absolute Windows path; adjust it before running elsewhere.
webdriver_path = 'D:/Internship_Developers_den/web_scraping/folder_driver/chromedriver-win64/chromedriver.exe'

# Chrome options are left at their defaults (the headless switch below is disabled).
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment if you want to run it headlessly

# Module-level driver instance shared by the functions defined below in this cell.
driver = webdriver.Chrome(service=Service(webdriver_path), options=chrome_options)

def sanitize_filename(filename):
    """Swap each character found in ``unsafe`` for an underscore and return the result."""
    unsafe = set('<>:"/\\|?*')
    return "".join('_' if ch in unsafe else ch for ch in filename)

def download_image(image_url, folder_name, idx):
    """Download one image into ``<cwd>/<folder_name>/image_<idx + 1>.png``.

    Args:
        image_url: Absolute, protocol-relative or site-relative image URL.
        folder_name: Target folder name; created under the current working
            directory when missing.
        idx: Zero-based position of the image; the file is numbered ``idx + 1``.

    Any failure is printed and swallowed.
    """
    # Local import so this function does not depend on the cell's import list.
    from urllib.parse import urljoin

    try:
        # urljoin also handles "//host/path" and "path" forms, which plain
        # string concatenation turns into broken URLs.
        if not image_url.startswith("http"):
            image_url = urljoin("https://www.poltronafrau.com/", image_url)

        download_folder = os.path.join(os.getcwd(), folder_name)
        os.makedirs(download_folder, exist_ok=True)

        # Without a timeout a stalled server would block the scrape forever.
        response = requests.get(image_url, timeout=30)
        if response.status_code == 200:
            # NOTE: the bytes are stored as received, so the file keeps the
            # server's image format even though it is named ".png".
            image_path = os.path.join(download_folder, f'image_{idx + 1}.png')
            with open(image_path, 'wb') as file:
                file.write(response.content)
            print(f"Downloaded image {idx + 1} in folder '{folder_name}' from URL: {image_url}")
        else:
            print(f"Failed to download image {idx + 1}, status code: {response.status_code}")
    except Exception as e:
        print(f"Error downloading image {idx + 1}: {e}")

def download_images_from_product_page(folder_name):
    """Save every product image of the current page into *folder_name*.

    Args:
        folder_name: Folder name handed to ``download_image``.

    Images are the ``img.cmp-image__image`` elements. The last candidate of
    ``srcset`` / ``data-srcset`` is preferred; ``src`` is the fallback. A
    timeout while locating the images is printed.
    """
    try:
        css = "img.cmp-image__image"
        elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, css))
        )

        for idx, img_element in enumerate(elements):
            candidates = img_element.get_attribute('srcset')
            if not candidates:
                # Lazy-loaded variant of the same attribute.
                candidates = img_element.get_attribute('data-srcset')

            if not candidates:
                # Neither srcset flavour exists; try the plain src.
                src = img_element.get_attribute('src')
                if src:
                    print(f"Falling back to src image: {src}")
                    download_image(src, folder_name, idx)
                else:
                    print(f"No src or srcset found for image {idx + 1}")
                continue

            # Entries look like "<url> <descriptor>"; the final entry is
            # assumed to be the largest one.
            last_entry = candidates.split(",")[-1].strip()
            highest_res_image = last_entry.split()[0]
            print(f"Highest resolution image URL: {highest_res_image}")
            download_image(highest_res_image, folder_name, idx)

    except TimeoutException as e:
        print(f"Error locating images: {e}")

def get_folder_name_from_url(product_url):
    """Build the per-product folder name from a product page URL.

    The name is the text between the first 'products/' and the following
    '.html' in the URL, passed through sanitize_filename. Any failure is
    printed and the fixed name "default_folder" is returned instead.
    """
    try:
        # Keep what follows 'products/' and cut it at the first '.html'
        slug = product_url.split('products/')[1].partition('.html')[0]
        return sanitize_filename(slug)
    except Exception as err:
        print(f"Error extracting folder name: {err}")
        return "default_folder"

def visit_product_page_and_download_images(product_url):
    """Open one product page and run every download step against it.

    Steps, in order: load the page, pause 5 seconds, derive the target
    folder name from the URL, download the page images, click the
    'Downloads' tab, download and extract the ZIP file, then download any
    additional files. Any exception raised along the way is printed and
    swallowed, so the remaining steps for this product are skipped.
    """
    try:
        print(f"Visiting product page: {product_url}")
        driver.get(product_url)

        # Fixed pause to give the page time to load
        time.sleep(5)

        # All downloads for this product share one folder named after the URL
        target_folder = get_folder_name_from_url(product_url)

        download_images_from_product_page(target_folder)

        # The ZIP link is looked up after the 'Downloads' tab has been clicked
        click_download_tab()
        download_and_extract_zip_file(target_folder)

        download_additional_files(target_folder)

    except Exception as err:
        print(f"Error visiting product page {product_url}: {err}")

def click_download_tab(tab_selector="#producttabs-3baf65de06-item-25a16ecc4d-tab", timeout=10):
    """Click the 'Downloads' tab on the currently loaded product page.

    Args:
        tab_selector: CSS selector of the tab element. The default is the
            id this script has used so far; it looks auto-generated, so
            pass a different selector for pages where the id differs
            (NOTE(review): confirm whether the id is stable across products).
        timeout: Seconds to wait for the tab to become clickable.

    A TimeoutException (tab not clickable in time) is printed, not raised.
    """
    try:
        # Wait for the tab element to be clickable and then click it
        download_tab = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, tab_selector))
        )
        download_tab.click()
        print("Successfully clicked the 'Downloads' tab.")

    except TimeoutException as e:
        print(f"Error clicking the 'Downloads' tab: {e}")

def download_and_extract_zip_file(folder_name):
    """Download the ZIP linked on the page and extract it into ``folder_name``.

    The link is the first <a> matched by the '#professionals ...' selector
    below. The file is fetched with requests, reusing the Selenium
    session's cookies and the current page as Referer, and is unpacked in
    memory into ``<cwd>/<folder_name>`` (created if missing). Every PDF
    found in the archive is reported on stdout.

    Args:
        folder_name: Name of the folder, relative to the current working
            directory, that receives the extracted files.

    Failures (link not found within 10 seconds, network error or timeout,
    non-200 response, invalid archive) are printed, not raised, so the
    caller can carry on with its remaining steps.
    """
    try:
        # Wait for the <a> tag to be present in the DOM
        a_tag_selector = "#professionals > div > div > div:nth-child(1) > div > h3 > button > a"
        a_tag_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, a_tag_selector))
        )

        # Get the URL from the href attribute
        file_url = a_tag_element.get_attribute('href')
        if not file_url:
            print("No href found for the download link.")
            return
        print(f"Found ZIP file download URL: {file_url}")

        # Create the download folder if it doesn't exist
        download_folder = os.path.join(os.getcwd(), folder_name)
        os.makedirs(download_folder, exist_ok=True)

        # Send browser-like headers plus the Selenium session's cookies
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
            'Referer': driver.current_url,
        }
        cookies = {cookie['name']: cookie['value'] for cookie in driver.get_cookies()}

        # Without a timeout requests waits indefinitely on a stalled server
        response = requests.get(file_url, headers=headers, cookies=cookies, timeout=60)
        if response.status_code != 200:
            print(f"Failed to download ZIP file, status code: {response.status_code}")
            return

        # Extract the ZIP file in-memory
        with zipfile.ZipFile(BytesIO(response.content)) as zip_file:
            zip_file.extractall(download_folder)
            print(f"Extracted ZIP contents to folder '{folder_name}'")

            # Report the PDF files inside the archive (extension matched
            # case-insensitively so '.PDF' is not missed)
            for file_name in zip_file.namelist():
                if file_name.lower().endswith('.pdf'):
                    pdf_path = os.path.join(download_folder, file_name)
                    print(f"Found PDF file: {pdf_path}")
                    # Process the PDF file as needed (e.g., move, read, etc.)

    except TimeoutException as e:
        print(f"Error locating the download link: {e}")
    except requests.RequestException as e:
        print(f"Error downloading the ZIP file: {e}")
    except zipfile.BadZipFile as e:
        print(f"Downloaded file is not a valid ZIP archive: {e}")


def download_additional_files(folder_name):
    """Download the ZIP behind the page's download button and extract it.

    Two CSS selectors are tried in turn (20 seconds each) to find a
    clickable download link. The linked file is fetched with urllib using
    browser-like headers, saved as ``<cwd>/<folder_name>/additional_files.zip``
    and extracted into that same folder, which is created if missing.

    Args:
        folder_name: Name of the target folder, relative to the current
            working directory.

    Failures (no link found, network error or timeout, HTTP error status,
    file-system error, invalid archive) are printed, not raised.
    """
    try:
        # Define selectors to attempt
        primary_selector = '#professionals > div > div > div:nth-child(1) > div > h3 > button > a'
        secondary_selector = '#professionals > div > div > div:nth-child(1) > div:nth-child(2) > h3 > button > a'

        download_url = None

        # Attempt to read the link from the primary selector
        try:
            download_button = WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, primary_selector))
            )
            download_url = download_button.get_attribute('href')
            print(f"Primary download URL found: {download_url}")
        except TimeoutException:
            print("Primary download button not found, trying secondary selector.")

        # If the primary selector gave no URL, try the secondary selector
        if not download_url:
            try:
                download_button = WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, secondary_selector))
                )
                download_url = download_button.get_attribute('href')
                print(f"Secondary download URL found: {download_url}")
            except TimeoutException:
                print("Secondary download button not found or the page took too long to load.")
                return  # Exit if neither URL is found

        # Download the ZIP file using urllib and custom headers to bypass protection
        if download_url:
            req = urllib.request.Request(download_url)
            req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
            req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
            req.add_header('Accept-Language', 'en-US,en;q=0.5')

            # urlopen raises HTTPError for 4xx/5xx responses; the timeout
            # keeps a stalled server from blocking the run indefinitely.
            with urllib.request.urlopen(req, timeout=60) as response:
                if response.status == 200:
                    print("Successfully fetched the file.")
                    zip_data = response.read()

                    # The folder may not exist yet if earlier steps failed,
                    # so create it before writing into it.
                    target_folder = os.path.join(os.getcwd(), folder_name)
                    os.makedirs(target_folder, exist_ok=True)

                    # Define the path for saving the ZIP file
                    zip_file_path = os.path.join(target_folder, 'additional_files.zip')
                    with open(zip_file_path, 'wb') as f:
                        f.write(zip_data)

                    # Extract the ZIP file into the folder
                    with zipfile.ZipFile(zip_file_path) as z:
                        z.extractall(target_folder)
                    print("File downloaded and extracted successfully into the folder.")
                else:
                    print(f"Failed to download the file. Status code: {response.status}")

    except TimeoutException:
        print("Download button not found or the page took too long to load.")
    except zipfile.BadZipFile as e:
        print(f"Downloaded file is not a valid ZIP archive: {e}")
    except OSError as e:
        # Covers urllib's URLError/HTTPError, socket timeouts and file errors
        print(f"Error downloading or saving the additional files: {e}")


# Example usage: scrape every product linked from one listing page
try:
    list_page_url = 'https://www.poltronafrau.com/ww/en/products/products.87.html?_gl=1*7lpixs*_up*MQ..*_ga*MTkwNTg1OTczLjE3MjYwNDI4MDY.*_ga_YGJJL14S4G*MTcyNjA0MjgwNS4xLjAuMTcyNjA0MjgwNS4wLjAuMA..&selectedFilters=pf_info_categoria&pf_info_categoria=11307'

    # Collect the product links from the listing page
    # NOTE(review): presumably this call also writes product_links.txt,
    # which is what the loop below reads -- confirm in extract_product_links.
    product_links = extract_product_links(list_page_url)

    # One product URL per line; blank lines are ignored
    with open('product_links.txt', 'r') as file:
        for line in file:
            product_url = line.strip()
            if not product_url:
                continue
            visit_product_page_and_download_images(product_url)

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    # Always close the browser, even when the run failed
    driver.quit()


Visiting list page: https://www.poltronafrau.com/ww/en/products/products.87.html?_gl=1*7lpixs*_up*MQ..*_ga*MTkwNTg1OTczLjE3MjYwNDI4MDY.*_ga_YGJJL14S4G*MTcyNjA0MjgwNS4xLjAuMTcyNjA0MjgwNS4wLjAuMA..&selectedFilters=pf_info_categoria&pf_info_categoria=11307
Found 6 product links.
Visiting product page: https://www.poltronafrau.com/ww/en/products/obi-drawer-chest.html
Falling back to src image: https://www.poltronafrau.com/content/experience-fragments/poltronafrau/ww/en/site/header/master/_jcr_content/root/header/logoImage.coreimg.65.768.jpeg/1690973285872/logo.jpeg
Downloaded image 1 in folder 'obi-drawer-chest' from URL: https://www.poltronafrau.com/content/experience-fragments/poltronafrau/ww/en/site/header/master/_jcr_content/root/header/logoImage.coreimg.65.768.jpeg/1690973285872/logo.jpeg
Falling back to src image: https://www.poltronafrau.com/etc.clientlibs/poltronafrau/clientlibs/clientlib-site/resources/icons/heart2.svg
Downloaded image 2 in folder 'obi-drawer-chest' from URL: http