In [6]:
import time
import json
import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

def write_to_file(data, filename='products_combined.json'):
    with open(filename, 'a', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)
        file.write('\n')

def extract_image(main_driver):
    image_dict = {}
    try:
        landing_image = main_driver.find_element(By.ID, "landingImage")
        landing_image = landing_image.get_attribute('src')
        image_dict['Landing_Image'] = landing_image
    except:
        image_dict['Landing_Image'] = None
    images = main_driver.find_elements(By.XPATH, '//div[@id="altImages"]//li[contains(@class, "item")]//img')
    i_list = []
    for image in images:
        image = image.get_attribute('src')
        i_list.append(image)
    image_dict['Other_Images'] = i_list
    return image_dict

def extract_colors(driver, product_info):
    color_list = []
  
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//li[contains(@id, 'color_name_')]"))
    )
    options = driver.find_elements(By.XPATH, "//li[contains(@id, 'color_name_')]")

    for option in options:
        color_span = option.find_element(By.XPATH, ".//span[@class='a-list-item']")
        color_span.click()
        time.sleep(random.uniform(3, 5))

        color_dict = {}
        images = extract_image(driver)
        try:
            color_name = driver.find_element(By.ID, 'variation_color_name').find_element(By.CSS_SELECTOR, 'span.selection').text
            color_dict['Color'] = color_name
            color_dict['Images'] = images
            color_list.append(color_dict)
            
            # Write color information to file
            product_info['Colors'] = color_list
            write_to_file(product_info)
        except Exception as e:
            print("Error extracting color:", e)
            continue
    return color_list

def extract_reviews(main_driver, product_info):
    reviews=[]
    try:
        print("Fetching Reviews")
        review_path=main_driver.find_element(By.XPATH,"//a[normalize-space()='See more reviews']")
        review_link=review_path.get_attribute('href')
        main_driver.execute_script("window.open(arguments[0], '_blank');", review_link)
        main_driver.switch_to.window(main_driver.window_handles[-1])
        time.sleep(random.uniform(3, 5)) 
        for i in range(0,10):
            extract_stars=main_driver.find_elements(By.XPATH,"//i[@data-hook='review-star-rating']")
            extract_content=main_driver.find_elements(By.XPATH,"//span[@data-hook='review-body']")
            for star_element, content_element in zip(extract_stars, extract_content):
                if star_element and content_element is not None:
                    star_rating = star_element.get_attribute("class")
                    review_content = content_element.text
                    review={}
                    review['stars'] = star_rating
                    review['content'] = review_content
                    reviews.append(review)

                    # Write review information to file
                    product_info['Reviews'] = reviews
                    write_to_file(product_info)
            try:
                next_page=main_driver.find_element(By.XPATH,"//a[contains(text(),'Next page')]")
                next_page.click()
                time.sleep(random.uniform(3, 5)) 
            except:
                main_driver.close()
                main_driver.switch_to.window(main_driver.window_handles[-1])
                time.sleep(random.uniform(3, 5)) 
                return reviews
    except:
        time.sleep(random.uniform(3, 5)) 
        return None
    time.sleep(2)
    main_driver.close()
    main_driver.switch_to.window(main_driver.window_handles[-1])
    time.sleep(random.uniform(3, 5)) 
    return reviews

# Initialize the Chrome driver
chrome_options = Options()
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36"
chrome_options.add_argument(f"user-agent={USER_AGENT}")
driver = webdriver.Chrome(options=chrome_options)

driver.get("https://www.amazon.com/s?i=specialty-aps&bbn=16225019011&rh=n%3A7141123011%2Cn%3A16225019011%2Cn%3A679255011&ref=nav_em__nav_desktop_sa_intl_shoes_0_2_13_3")
time.sleep(5)  

# Extract product URLs
product_links = driver.find_elements(By.XPATH, '//div[contains(@class, "s-result-item")]//h2/a')
product_urls = [link.get_attribute('href') for link in product_links]

# Visit each product URL
for url in product_urls[:3]:
    driver.get(url)
    time.sleep(2)  

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    breadcrumbs = soup.select('div#wayfinding-breadcrumbs_feature_div ul a')
    category = breadcrumbs[-2].get_text(strip=True) if len(breadcrumbs) > 1 else None
    sub_category = breadcrumbs[-1].get_text(strip=True) if breadcrumbs else None

    product_info = {
        'product_url': url,
        'Category': category,
        'Sub-Category': sub_category
    }

    details_fields = [
        ('title', 'span', 'id', 'productTitle'),
        ('price', 'span', 'class_', 'a-price'),
        ('rating', 'span', 'class_', 'a-icon-alt'),
        ('reviews', 'span', 'id', 'acrCustomerReviewText')
    ]

    for field in details_fields:
        try:
            element = soup.find(field[1], {field[2]: field[3]})
            product_info[field[0]] = element.text.strip() if element else 'N/A'
        except AttributeError:
            product_info[field[0]] = 'N/A'

    product_description_div = soup.find('div', id='productDescription')
    product_info['product-description'] = product_description_div.get_text(strip=True) if product_description_div else 'N/A'

    write_to_file(product_info)  # Write initial product info to file

    # Extract colors, images, and reviews
    product_info['Colors'] = extract_colors(driver, product_info)
    product_info['Reviews'] = extract_reviews(driver, product_info)

    # Print the scraped data
    print(f"Scraped: {product_info['title']} - Category: {category}, Sub-Category: {sub_category}")

# Close the driver
driver.quit()


TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF6B80F3E42+31618]
	(No symbol) [0x00007FF6B806B0A9]
	(No symbol) [0x00007FF6B7F2888A]
	(No symbol) [0x00007FF6B7F78524]
	(No symbol) [0x00007FF6B7F7862C]
	(No symbol) [0x00007FF6B7FBF787]
	(No symbol) [0x00007FF6B7F9D14F]
	(No symbol) [0x00007FF6B7FBCA80]
	(No symbol) [0x00007FF6B7F9CEB3]
	(No symbol) [0x00007FF6B7F6A46B]
	(No symbol) [0x00007FF6B7F6B001]
	GetHandleVerifier [0x00007FF6B83FA01D+3202397]
	GetHandleVerifier [0x00007FF6B8446A3D+3516285]
	GetHandleVerifier [0x00007FF6B843C4B0+3473904]
	GetHandleVerifier [0x00007FF6B81A5D46+760454]
	(No symbol) [0x00007FF6B8076B4F]
	(No symbol) [0x00007FF6B8071CE4]
	(No symbol) [0x00007FF6B8071E72]
	(No symbol) [0x00007FF6B806121F]
	BaseThreadInitThunk [0x00007FFDC8E77034+20]
	RtlUserThreadStart [0x00007FFDCA822651+33]
