### 1- import libraries

In [1]:
from bs4 import BeautifulSoup
import requests 
import time
import datetime
import smtplib
import csv

### 2- requests handling

Some background notes:
- The User-Agent request header is a characteristic string that lets servers and network peers identify the application, operating system, vendor, and/or version of the requesting user agent. 

In [12]:
# URL and headers
URL = 'https://www.amazon.co.uk/Johnsons-Cotton-Buds-200/dp/B09843WY1B/ref=zg_bs_g_baby_d_sccl_3/257-9862628-6554964'
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
}

# Fetching the page content.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of letting it show up
# later as an AttributeError on a missing element.
page = requests.get(URL, headers=headers, timeout=10)
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")

# Extracting product details
# The product ID is the path segment right after '/dp/'; a trailing query
# string is dropped in case the URL ends directly after the ID.
product_id = URL.split('/dp/')[1].split('/')[0].split('?')[0]

# Each element is looked up once; a placeholder is used when it is absent so a
# missing element does not abort the cell.
title_element = soup.find(id='productTitle')
product_title = title_element.get_text().strip() if title_element else 'Title not available'

description_element = soup.find(id='feature-bullets')
product_description = description_element.get_text().strip() if description_element else 'Description not available'

store = 'Amazon UK'  # Store name, generally static for Amazon

rating_element = soup.find(id='averageCustomerReviews')
rating = rating_element.get_text().strip() if rating_element else 'Rating not available'

# Initialize variables
i = 0
size_price_dict = {}

# Size variants are probed by consecutive element IDs (size_name_0,
# size_name_1, ...) until the first index with no size or no price element.
while True:
    size_id = f'size_name_{i}'
    price_id = f'size_name_{i}_price'

    size_element = soup.find(id=size_id)
    price_element = soup.find(id=price_id)

    # Break the loop if no more sizes or prices are found
    if not size_element or not price_element:
        break

    # The size element holds more than the label; keep only its first line
    size = size_element.get_text().strip().split("\n")[0].strip()
    # Only the exact "1 option from " prefix is removed; other prefixes
    # (e.g. "5 options from ") are kept as part of the price text
    price = price_element.get_text().strip().replace("1 option from ", "").strip()

    # Add to dictionary
    size_price_dict[size] = price

    i += 1

# Print product details
print(f"Product ID: {product_id}")
print(f"Product Name: {product_title}")
print(f"Description: {product_description}")
print(f"Store: {store}")
print(f"Rating: {rating}\n")

# Print the size and price dictionary
for size, price in size_price_dict.items():
    print(f"Size: {size}\nPrice: {price}\n")

Product ID: B09843WY1B
Product Name:         Johnson's Baby Cotton Buds 100s (Pack of 12)       
Description: Gentle for use in delicate areas around the eyes and the outer ear    Cleans in between baby’s fingers, toes and other creases on the skin    Naturally absorbent with 100% pure cotton tips    100% paper sticks    Plastic-free recyclable packaging
Store: Amazon UK
Rating: 4.7  4.7 out of 5 stars    
    22,801 ratings

Size: 100 Count (Pack of 12)
Price: 5 options from £13.60

Size: 200 Count (Pack of 1)
Price: 5 options from £3.49

Size: 200 Count (Pack of 3)
Price: £3.75

Size: Small
Price: 2 options from £3.69



In [3]:
# URL and headers
URL = 'https://www.amazon.co.uk/Amazon-Brand-Sensitive-Unscented-wipes/dp/B07V2N4SJY/ref=zg_bs_g_baby_d_sccl_1/257-9862628-6554964?psc=1'
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
}

# Fetching the page content.
# A timeout bounds how long the cell can block on the network, and
# raise_for_status() turns an HTTP error response into an immediate, explicit
# failure rather than a later AttributeError on a missing element.
page = requests.get(URL, headers=headers, timeout=10)
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")

# Extracting product details
# The product ID is the path segment following '/dp/'; any query string that
# directly follows it is discarded as well.
product_id = URL.split('/dp/')[1].split('/')[0].split('?')[0]

# Find every element once and substitute a placeholder when it is not present,
# so that one missing element does not stop the whole cell.
title_element = soup.find(id='productTitle')
product_title = title_element.get_text().strip() if title_element else 'Title not available'

description_element = soup.find(id='feature-bullets')
product_description = description_element.get_text().strip() if description_element else 'Description not available'

store = 'Amazon UK'  # Store name, generally static for Amazon

rating_element = soup.find(id='averageCustomerReviews')
rating = rating_element.get_text().strip() if rating_element else 'Rating not available'

# Initialize variables
i = 0
size_price_dict = {}

# Walk the numbered variant elements (size_name_0, size_name_1, ...) and stop
# at the first index where either the size or the price element is missing.
while True:
    size_id = f'size_name_{i}'
    price_id = f'size_name_{i}_price'

    size_element = soup.find(id=size_id)
    price_element = soup.find(id=price_id)

    # Break the loop if no more sizes or prices are found
    if not size_element or not price_element:
        break

    # Keep only the first line of the size element's text (the label itself)
    size = size_element.get_text().strip().split("\n")[0].strip()
    # Remove the exact "1 option from " prefix; any other prefix is left as is
    price = price_element.get_text().strip().replace("1 option from ", "").strip()

    # Add to dictionary
    size_price_dict[size] = price

    i += 1

# Print product details
print(f"Product ID: {product_id}")
print(f"Product Name: {product_title}")
print(f"Description: {product_description}")
print(f"Store: {store}")
print(f"Rating: {rating}\n")

# Print the size and price dictionary
for size, price in size_price_dict.items():
    print(f"Size: {size}\nPrice: {price}\n")

Product ID: B07V2N4SJY
Product Name: Amazon Brand – Mama Bear Sensitive Unscented Baby Wipes, 1008 Count (18 Packs of 56)
Description: About this item    MULTI-BENEFIT PACK: 18 packs; 56 water-based wipes per pack    PERFUME FREE: Fragrance-free, pre-moistened wipes    FOR SENSITIVE SKIN: Gently formulated and hypoallergenic for baby's delicate skin    VERSATILE: For hands, face and the diaper area    SOOTHING INGREDIENTS: Made with aloe vera and chamomile    RECOMMENDED BY DERMATOLOGISTS: Dermatologically approved, balanced pH value
Store: Amazon UK
Rating: 4.8  4.8 out of 5 stars    
    25,825 ratings

Size: 56 count (Pack of 6)
Price: £5.17

Size: 56 count (Pack of 18)
Price: £13.65

Size: 64 count (Pack of 12)
Price: £11.44



In [13]:
def fetch_page(url, headers, timeout=10):
    """Download a page and return its raw body.

    Args:
        url: Address of the page to request.
        headers: HTTP headers sent with the request (e.g. a User-Agent).
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The response body as bytes, or ``None`` when the request fails
        (connection problem, timeout, or an HTTP error status). Failures are
        reported with a printed message instead of being raised.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
        return response.content
    except requests.RequestException as e:
        print(f"Error fetching page: {e}")
        return None

def extract_text(element, default='Not available', separator=' '):
    """Return the visible text of a BeautifulSoup element, or a fallback.

    Args:
        element: Result of a ``soup.find(...)`` call; may be ``None``.
        default: Value returned when ``element`` is missing.
        separator: String placed between the element's text fragments.

    Returns:
        The element's text with every fragment stripped and the fragments
        joined by ``separator``, or ``default`` when there is no element.
    """
    if not element:
        return default
    # With strip=True each text fragment is stripped individually; joining
    # them with the default empty separator would run neighbouring fragments
    # together (e.g. "4.7" + "4.7 out of 5 stars" -> "4.74.7 out of 5 stars").
    return element.get_text(separator=separator, strip=True)

def extract_product_details(soup, product_id):
    """Collect the title, description and rating text of a product page.

    Args:
        soup: Parsed page to search.
        product_id: Accepted for the caller's convenience; not used here.

    Returns:
        A ``(title, description, rating)`` tuple. Each entry is whatever
        ``extract_text`` yields for the matching element, which includes its
        fallback value when the element is not on the page.
    """
    # Locate the three elements first, then convert them in one place.
    title_node = soup.find(id='productTitle')
    bullets_node = soup.find('div', {'id': 'feature-bullets'})
    reviews_node = soup.find(id='averageCustomerReviews')

    return (
        extract_text(title_node),
        extract_text(bullets_node),
        extract_text(reviews_node),
    )

def extract_size_price(soup):
    """Map each size variant on a product page to its displayed price.

    Variants are probed by consecutive element IDs (``size_name_0`` with
    ``size_name_0_price``, then ``size_name_1`` ...). Probing stops at the
    first index for which either element is missing.

    Args:
        soup: Parsed page to search.

    Returns:
        A dict of ``{size label: price text}`` in page order; empty when the
        page has no such variant elements. The exact prefix "1 option from "
        is removed from prices; any other text is kept as found.
    """
    size_price_dict = {}
    i = 0
    while True:
        size_element = soup.find(id=f'size_name_{i}')
        price_element = soup.find(id=f'size_name_{i}_price')

        if not size_element or not price_element:
            break

        # Read the raw text here rather than a fragment-stripped version:
        # stripping each fragment removes the newlines between fragments, and
        # the first-line split below needs them to isolate the size label.
        size = size_element.get_text().strip().split("\n")[0].strip()
        price = price_element.get_text().strip().replace("1 option from ", "").strip()

        size_price_dict[size] = price
        i += 1
    return size_price_dict

def scrape_and_save(urls, headers, output_file):
    """Scrape product details from several URLs into one CSV file.

    One row is written per size variant of each product. A product without
    any size variants still gets a single row, with 'Not available' as its
    size and price, so that it is not silently missing from the output.

    Args:
        urls: Iterable of product page URLs containing a '/dp/<product id>'
            path segment.
        headers: HTTP headers passed on to ``fetch_page``.
        output_file: Path of the CSV file to create; an existing file is
            overwritten.

    URLs without a '/dp/' segment and pages that cannot be fetched are
    skipped; the remaining URLs are still processed.
    """
    fieldnames = ['Product ID', 'Product Name', 'Description', 'Store', 'Rating', 'Size', 'Price']

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for url in urls:
            # The product ID is the path segment right after '/dp/'. Check for
            # the marker before fetching so a malformed URL costs no request
            # and cannot abort the run with an IndexError.
            _, marker, tail = url.partition('/dp/')
            if not marker:
                print(f"Skipping {url}: no '/dp/' segment found")
                continue
            # Also drop a query string that directly follows the ID.
            product_id = tail.split('/')[0].split('?')[0]

            page_content = fetch_page(url, headers)
            if not page_content:
                # fetch_page has already reported the failure.
                continue

            soup = BeautifulSoup(page_content, "html.parser")
            product_title, product_description, rating = extract_product_details(soup, product_id)
            size_price_dict = extract_size_price(soup)

            # Keep products that expose no size variants in the output.
            if not size_price_dict:
                size_price_dict = {'Not available': 'Not available'}

            for size, price in size_price_dict.items():
                writer.writerow({
                    'Product ID': product_id,
                    'Product Name': product_title,
                    'Description': product_description,
                    'Store': 'Amazon UK',
                    'Rating': rating,
                    'Size': size,
                    'Price': price
                })
            print(f"Processed {url}")

# Example usage: scrape two product pages and store every variant in one CSV
output_file = 'products.csv'

# Request headers sent with every page request
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
}

# Product pages to process, in the order they are written to the file
urls = [
    'https://www.amazon.co.uk/Amazon-Brand-Sensitive-Unscented-wipes/dp/B07V2N4SJY/ref=zg_bs_g_baby_d_sccl_1/257-9862628-6554964?psc=1',
    'https://www.amazon.co.uk/WaterWipes-Sensitive-Newborn-Biodegradable-Unscented/dp/B08MXSBRSB/ref=zg_bs_g_baby_d_sccl_2/257-9862628-6554964?psc=1',
]

scrape_and_save(urls=urls, headers=headers, output_file=output_file)

Processed https://www.amazon.co.uk/Amazon-Brand-Sensitive-Unscented-wipes/dp/B07V2N4SJY/ref=zg_bs_g_baby_d_sccl_1/257-9862628-6554964?psc=1
Processed https://www.amazon.co.uk/WaterWipes-Sensitive-Newborn-Biodegradable-Unscented/dp/B08MXSBRSB/ref=zg_bs_g_baby_d_sccl_2/257-9862628-6554964?psc=1
