In [5]:
import requests
from bs4 import BeautifulSoup

def scrape_prada_product(url):
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    product = {}

    # Extract product name
    title_element = soup.find('h1', {'data-element': 'product-title'})
    if title_element:
        product['name'] = title_element.text.strip()

    # Extract product price
    price_element = soup.find('p', {'data-element': 'product-current-price'})
    if price_element:
        product['price'] = price_element.text.strip()

    # Extract product color
    color_element = soup.find('div', class_='block mb-sp-8 lg:mb-sp-12 leading-none')
    if color_element:
        color_name = color_element.find_all('p')[1].text.strip()
        product['color'] = color_name

    # Extract product sizes
    sizes = []
    size_elements = soup.find_all('option')
    for option in size_elements:
        if option.get('disabled') is None:  # Only add sizes that are not disabled
            sizes.append(option.text.strip())
    product['sizes'] = sizes

    # Extract product images (first three)
    images = []
    image_elements = soup.find_all('img', class_='pdp-product-img', limit=3)
    for img in image_elements:
        if 'data-srcset' in img.attrs:
            images.append(img['data-srcset'].split(', ')[-1].split(' ')[0])  # Take the highest resolution image URL

    product['images'] = images

    # Extract product details
    details_element = soup.find('div', {'data-element': 'product-details'})
    if details_element:
        product['details'] = details_element.text.strip()

    # Extract composition
    composition_element = soup.find('div', class_='product-composition')
    if composition_element:
        product['composition'] = composition_element.text.strip()

    return product

# URL of the Prada product page (example)
url = 'https://www.prada.com/it/en/p/striped-poplin-shirt/P436H_15DT_F0013_S_OOO'
product_data = scrape_prada_product(url)

if product_data:
    print(product_data)


{'name': 'Striped poplin shirt', 'price': '€ 1.050', 'color': 'Sapphire Blue', 'sizes': [], 'images': ['https://www.prada.com/content/dam/pradabkg_products/P/P43/P436H/15DTF0013/P436H_15DT_F0013_S_OOO_MDF.jpg/_jcr_content/renditions/cq5dam.web.hebebed.1800.1800.jpg', 'https://www.prada.com/content/dam/pradabkg_products/P/P43/P436H/15DTF0013/P436H_15DT_F0013_S_OOO_MDD.jpg/_jcr_content/renditions/cq5dam.web.hebebed.1800.1800.jpg'], 'details': 'Product detailsThis poplin shirt with a classic menswear silhouette is animated by a stripe motif. The style, characterized by the patch pocket on the chest, is enhanced by the emblematic fabric triangle logo reinterpreted with a conceptual design. Product code: P436H_15DT_F0013_S_OOOMenswear fitShirt collarFront button closureSleeves with shirt cuffsPatch pocket on the frontTriangle logo with sartorial logo label The model is 178 cm tall and wears a size 38Height: 69cm'}


In [6]:
import requests
from bs4 import BeautifulSoup
import time
import random

def get_soup(url, retries=3):
    """Fetch the page content and return the BeautifulSoup object."""
    for _ in range(retries):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return BeautifulSoup(response.text, 'html.parser')
            else:
                print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        except Exception as e:
            print(f"Error fetching the URL: {e}")
        time.sleep(random.uniform(1, 3))  # Random delay between retries
    return None

def scrape_prada_product(url):
    soup = get_soup(url)
    if not soup:
        return None

    product = {}

    # Extract product name
    title_element = soup.find('h1', {'data-element': 'product-title'})
    if title_element:
        product['name'] = title_element.text.strip()

    # Extract product price
    price_element = soup.find('p', {'data-element': 'product-current-price'})
    if price_element:
        product['price'] = price_element.text.strip()

    # Extract product color
    color_element = soup.find('div', class_='block mb-sp-8 lg:mb-sp-12 leading-none')
    if color_element:
        color_name = color_element.find_all('p')[1].text.strip()
        product['color'] = color_name

    # Extract product sizes
    sizes = []
    size_elements = soup.select('select[data-element="sizepicker"] option')
    for option in size_elements:
        if option.get('disabled') is None:  # Only add sizes that are not disabled
            sizes.append(option.text.strip())
    product['sizes'] = sizes

    # Extract product images (first three)
    images = []
    image_elements = soup.find_all('img', class_='pdp-product-img', limit=3)
    for img in image_elements:
        if 'data-srcset' in img.attrs:
            images.append(img['data-srcset'].split(', ')[-1].split(' ')[0])  # Take the highest resolution image URL

    product['images'] = images

    # Extract product details
    details_element = soup.find('div', {'data-element': 'product-details'})
    if details_element:
        product['details'] = details_element.text.strip()

    # Extract composition
    composition_element = soup.find('div', class_='product-composition')
    if composition_element:
        product['composition'] = composition_element.text.strip()

    return product

def scrape_catalog(url):
    soup = get_soup(url)
    if not soup:
        return []

    product_links = [a['href'] for a in soup.select('a[href*="/it/en/p/"]')]
    base_url = "https://www.prada.com"
    full_product_links = [base_url + link for link in product_links]

    products = []

    for link in full_product_links:
        print(f"Scraping {link}")
        product_data = scrape_prada_product(link)
        if product_data:
            products.append(product_data)
        time.sleep(random.uniform(2, 5))  # Random delay between product page scrapes

    return products

# URL of the Prada catalog page (example)
catalog_url = 'https://www.prada.com/it/en/womens/ready-to-wear/shirts-and-tops/c/10058EU:1'
product_data = scrape_catalog(catalog_url)

if product_data:
    for product in product_data:
        print(product)

Scraping https://www.prada.comhttps://www.prada.com/it/en/p/checked-cotton-bandana-top/P921MR_15GD_F0028_S_OOO
Error fetching the URL: HTTPSConnectionPool(host='www.prada.comhttps', port=443): Max retries exceeded with url: //www.prada.com/it/en/p/checked-cotton-bandana-top/P921MR_15GD_F0028_S_OOO (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x78369d5cc1c0>: Failed to resolve 'www.prada.comhttps' ([Errno -2] Name or service not known)"))
Error fetching the URL: HTTPSConnectionPool(host='www.prada.comhttps', port=443): Max retries exceeded with url: //www.prada.com/it/en/p/checked-cotton-bandana-top/P921MR_15GD_F0028_S_OOO (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x78369d5ccac0>: Failed to resolve 'www.prada.comhttps' ([Errno -2] Name or service not known)"))
Error fetching the URL: HTTPSConnectionPool(host='www.prada.comhttps', port=443): Max retries exceeded with url: //www.prada.com/it/en/p/checked-cotton-bandana-t