In [32]:
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Browser configuration: each entry is passed to Chrome as a command-line flag.
chrome_options = Options()
for chrome_flag in (
    "--headless",  # Optional, run in headless mode
    "--disable-gpu",
    "--window-size=1920,1080",
):
    chrome_options.add_argument(chrome_flag)

# webdriver_manager resolves a chromedriver binary path; Selenium then starts
# Chrome through it. `driver` is the single shared browser session used by the
# scraping functions in this file.
chromedriver_path = ChromeDriverManager().install()
service = Service(chromedriver_path)
driver = webdriver.Chrome(options=chrome_options, service=service)

def _digits_only(text):
    """Return *text* with every non-digit character removed."""
    return re.sub(r'\D', '', text)


def get_product_details(product_url):
    """Scrape the title and price of one product page.

    Loads ``product_url`` in the shared module-level ``driver``, waits a fixed
    three seconds, and parses the rendered HTML with BeautifulSoup.

    Args:
        product_url: Absolute URL of the product page to open.

    Returns:
        A dict with two keys: ``'title'`` (the stripped text of the
        ``#productTitle`` span, or ``'Product title not found'``) and
        ``'price/$'`` (the price as a float, or ``0.0`` when no price could be
        located or parsed).
    """
    driver.get(product_url)
    time.sleep(3)  # Wait for the page to load

    # Parse the HTML as rendered by the browser (after any scripts have run).
    product_soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Product title
    title_element = product_soup.find('span', {'id': 'productTitle'})
    title = title_element.text.strip() if title_element else 'Product title not found'

    # Product price: preferred source is the split whole/fraction spans.
    price_whole_element = product_soup.find('span', {'class': 'a-price-whole'})
    price_fraction_element = product_soup.find('span', {'class': 'a-price-fraction'})

    if price_whole_element is not None and price_fraction_element is not None:
        # The whole-part text can carry its own trailing decimal point (which
        # previously produced "19..99") and may carry other separators such as
        # thousands commas. Keeping only the digits of each part handles both,
        # instead of letting float() fail and silently reporting 0.0.
        whole_digits = _digits_only(price_whole_element.text) or '0'
        fraction_digits = _digits_only(price_fraction_element.text) or '0'
        price = f"{whole_digits}.{fraction_digits}"
    else:
        price = '0.00'  # Default value if price is not found
        price_element_alt = product_soup.find('span', {'class': 'a-price aok-align-center'})
        # The inner span may be absent even when the wrapper exists; guard it
        # so a missing node falls back to the default instead of raising
        # AttributeError on `.text`.
        offscreen_element = (
            price_element_alt.find('span', {'class': 'a-offscreen'})
            if price_element_alt is not None
            else None
        )
        if offscreen_element is not None:
            # Remove the currency symbol and any commas, keeping digits and dots.
            price = re.sub(r'[^\d.]', '', offscreen_element.text.strip())

    try:
        price_float = float(price)
    except ValueError:
        price_float = 0.00  # Set price to 0.00 if conversion fails

    return {
        'title': title,
        'price/$': price_float
    }

def collect_product_links(start_url):
    """Return the product-page URLs linked from one search-results page.

    Loads ``start_url`` in the shared module-level ``driver``, waits a fixed
    three seconds, and collects every matching anchor whose ``href`` contains
    ``/dp/``.

    Args:
        start_url: Absolute URL of the search-results page to open.

    Returns:
        A list of absolute product URLs in page order, with exact duplicates
        removed. The list is empty when no matching link is found.
    """
    driver.get(start_url)
    time.sleep(3)  # Wait for the page to load

    # Parse the HTML as rendered by the browser.
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    product_links = []
    seen_urls = set()
    for link in soup.find_all('a', class_='a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'):
        href = link.get('href')
        if not href or '/dp/' not in href:
            continue
        # urljoin resolves root-relative hrefs against the site and leaves
        # already-absolute hrefs intact; plain string concatenation would
        # produce "https://www.amazon.comhttps://..." for the latter.
        full_url = urljoin("https://www.amazon.com", href)
        # Skip exact repeats so the same page is not scraped and exported twice.
        if full_url in seen_urls:
            continue
        seen_urls.add(full_url)
        product_links.append(full_url)

    return product_links

# Base URL template for pages; both placeholders receive the page number.
base_url_template = 'https://www.amazon.com/s?k=Computer+Audio+%26+Video+Accessories&i=computers&rh=n%3A11548951011&page={}&c=ts&qid=1722601561&ts_id=11548951011&ref=sr_pg_{}'

# Category name derived from the initial URL; also used for the output filename.
category_name = "Computer Audio & Video Accessories"

# Collect all product details from all 24 pages
all_products = []

try:
    for page_number in range(1, 25):  # Pages from 1 to 24
        page_url = base_url_template.format(page_number, page_number)
        product_links = collect_product_links(page_url)

        for link in product_links:
            product_details = get_product_details(link)
            all_products.append(product_details)
finally:
    # Close the browser even if a page load or parse step raises, so a failed
    # run does not leave a Chrome process behind.
    driver.quit()

# Export all data to a single Excel file
output_filename = f'{category_name.replace(" ", "_")}.xlsx'
df = pd.DataFrame(all_products)
df.to_excel(output_filename, index=False)

print(f"Data has been successfully collected from all 24 pages and saved to {output_filename}.")


Data has been successfully collected from all 24 pages and saved to an Excel file by categories.
