In [8]:
import requests
from bs4 import BeautifulSoup
import time
import random

In [9]:
import sys, os
# Make the project's DB_and_Azure directory importable.
# NOTE: '__file__' is a quoted string literal here, not the module attribute, so
# os.path.dirname() returns '' and the entry appended is the relative path
# '../DB_and_Azure' (i.e. it depends on the current working directory).
sys.path.append(os.path.join(os.path.dirname('__file__'), '..', 'DB_and_Azure'))
# Project-local module; used below as SQLf.sql_db_functions.<function>(...).
import sql_db_functions as SQLf

In [10]:
def get_soup(url, retries=3, timeout=10):
    """Download ``url`` and return it parsed, retrying on failure.

    Args:
        url: Address of the page to fetch.
        retries: Maximum number of GET attempts.
        timeout: Seconds to wait for the server on each attempt; forwarded to
            ``requests.get`` so a stalled connection cannot block forever.

    Returns:
        A ``BeautifulSoup`` object (``html.parser`` backend) built from the
        first response whose status code is 200, or ``None`` when every
        attempt fails.
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                return BeautifulSoup(response.text, 'html.parser')
            print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        except Exception as e:
            # Deliberately broad: any failure just counts as one failed attempt.
            print(f"Error fetching the URL: {e}")
        # Random back-off between attempts; pointless after the final one.
        if attempt < retries - 1:
            time.sleep(random.uniform(1, 3))
    return None



def scrape_prada_product(url):
    """Scrape one Prada product page into a dictionary.

    Args:
        url: Address of the product page.

    Returns:
        ``None`` when the page could not be fetched. Otherwise a dict that
        always contains ``'link'`` (the given URL), ``'sizes'`` (list of
        option texts that are not disabled) and ``'images'`` (up to three
        image URLs). The keys ``'name'``, ``'price'``, ``'color'``,
        ``'details'`` and ``'composition'`` are present only when the
        corresponding element was found on the page.
    """
    soup = get_soup(url)
    if not soup:
        return None

    product = {'link': url}

    # Product name
    title_element = soup.find('h1', {'data-element': 'product-title'})
    if title_element:
        product['name'] = title_element.text.strip()

    # Product price (raw text, currency symbol included)
    price_element = soup.find('p', {'data-element': 'product-current-price'})
    if price_element:
        product['price'] = price_element.text.strip()

    # Product color: taken from the second <p> of the colour block.
    color_element = soup.find('div', class_='block mb-sp-8 lg:mb-sp-12 leading-none')
    if color_element:
        paragraphs = color_element.find_all('p')
        # Guard the index: a block with fewer than two <p> tags used to raise
        # IndexError and abort the whole scrape.
        if len(paragraphs) > 1:
            product['color'] = paragraphs[1].text.strip()

    # Product sizes: keep only options without a ``disabled`` attribute.
    size_elements = soup.select('select[data-element="sizepicker"] option')
    product['sizes'] = [
        option.text.strip()
        for option in size_elements
        if option.get('disabled') is None
    ]

    # Product images (first three matching <img> tags)
    images = []
    for img in soup.find_all('img', class_='pdp-product-img', limit=3):
        if 'data-srcset' in img.attrs:
            # Last srcset candidate, URL part only (assumed to be the highest
            # resolution -- TODO confirm against the live markup).
            images.append(img['data-srcset'].split(', ')[-1].split(' ')[0])
    product['images'] = images

    # Product details
    details_element = soup.find('div', {'data-element': 'product-details'})
    if details_element:
        product['details'] = details_element.text.strip()

    # Composition
    composition_element = soup.find('div', class_='product-composition')
    if composition_element:
        product['composition'] = composition_element.text.strip()

    return product



In [11]:
import re

def get_price(prod_soup):
    """Parse a European-formatted price into a float.

    The previous body read an undefined name ``text`` and therefore raised
    ``NameError`` on every call; the text is now derived from ``prod_soup``.

    Args:
        prod_soup: Either the price text itself (``str``) or a parsed
            page/tag. For a page/tag, the element
            ``<p data-element="product-current-price">`` is looked up (the
            same selector ``scrape_prada_product`` uses); when it is not
            found, the text of ``prod_soup`` itself is used.
            NOTE(review): the original extraction code is missing, so this
            lookup is an assumption -- confirm it matches the intended input.

    Returns:
        The price as a float. ``.`` is treated as a thousands separator and
        ``,`` as the decimal separator (``"1.200,50"`` -> ``1200.5``).

    Raises:
        ValueError: If no parseable number remains after cleaning.
    """
    if isinstance(prod_soup, str):
        text = prod_soup
    else:
        price_element = prod_soup.find('p', {'data-element': 'product-current-price'})
        source = price_element if price_element is not None else prod_soup
        text = source.text

    # Remove any non-numeric characters except for ',' and '.'
    cleaned_text = re.sub(r'[^\d,\.]', '', text)

    has_comma = ',' in cleaned_text
    has_period = '.' in cleaned_text

    if has_comma and has_period:
        # Both present: periods group thousands, the comma marks decimals.
        cleaned_text = cleaned_text.replace('.', '').replace(',', '.')
    elif has_comma:
        # Comma only: it is the decimal separator.
        cleaned_text = cleaned_text.replace(',', '.')
    elif has_period:
        # Period only: it is a thousands separator, so drop it.
        cleaned_text = cleaned_text.replace('.', '')

    return float(cleaned_text)

In [13]:
def scrape_catalog(url, Testing):
    """Scrape every product linked from a Prada catalog page and store it.

    Fixes over the previous version: product pages are requested through
    their absolute URL (the loop used the site-relative hrefs), all products
    are processed (a ``return`` inside the loop stopped after the first one),
    pages that fail to scrape are skipped instead of crashing, and the
    database cursor/connection are always closed.

    Args:
        url: Address of the catalog page.
        Testing: Forwarded unchanged to
            ``SQLf.sql_db_functions.insert_description_image_to_db``.

    Returns:
        A list with one product dict (as returned by
        ``scrape_prada_product``) per successfully scraped product; an empty
        list when the catalog page cannot be fetched or has no product links.
    """
    # Local import so this cell does not depend on another cell's imports.
    from urllib.parse import urljoin

    soup = get_soup(url)
    if not soup:
        return []

    base_url = "https://www.prada.com"
    # urljoin resolves site-relative hrefs and leaves absolute ones untouched;
    # dict.fromkeys drops repeated links while keeping page order.
    product_links = list(dict.fromkeys(
        urljoin(base_url, a['href'])
        for a in soup.select('a[href*="/it/en/p/"]')
    ))

    products = []
    last_index = len(product_links) - 1
    for index, link in enumerate(product_links):
        print(f"Scraping {link}")
        product_data = scrape_prada_product(link)

        if product_data:
            conn, cursor = SQLf.sql_db_functions.connect_sql()
            try:
                SQLf.sql_db_functions.insert_description_image_to_db(
                    conn=conn,
                    cursor=cursor,
                    brand='Prada',
                    # 'details' and 'price' are optional keys in the product
                    # dict; None is passed when the page lacked them.
                    # NOTE(review): confirm the DB helper accepts None here.
                    descript=product_data.get('details'),
                    price=product_data.get('price'),
                    prod_link=product_data['link'],
                    Clothing_type='Shirts and tops',
                    images_links=product_data.get('images', []),
                    Testing=Testing,
                )
            finally:
                # Close the cursor before the connection that owns it.
                cursor.close()
                conn.close()
            products.append(product_data)

        # Random delay between product page scrapes (not needed after the last).
        if index < last_index:
            time.sleep(random.uniform(3, 6))

    return products

In [None]:
# Example Prada catalog page to scrape (women's ready-to-wear, shirts and tops).
catalog_url = 'https://www.prada.com/it/en/womens/ready-to-wear/shirts-and-tops/c/10058EU:1'
product_data = scrape_catalog(catalog_url, Testing=True)

# Print each item of the result; an empty or None result prints nothing.
for product in product_data or []:
    print(product)