## This script's goal is to gather information about a given product

### 1) First we'll collect the links to many of these products

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup  # Import BeautifulSoup


# Build the Chrome options. The browser window stays visible unless the
# "--headless" line below is re-enabled.
options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # Run headless (without opening browser)
for chrome_flag in ("--disable-gpu", "--no-sandbox"):
    options.add_argument(chrome_flag)

# The value returned by ChromeDriverManager().install() is handed to Service.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)

# Root URL of the Amazon storefront (the home page, not a search results page);
# the search itself is typed into the page afterwards.
url = "https://www.amazon.es"
driver.get(url)

In [2]:
# Wait up to 10 seconds for the search input to be present in the DOM,
# then type the query and submit it with the Return key.
wait = WebDriverWait(driver, 10)
search_box_locator = (By.XPATH, '//*[@id="twotabsearchtextbox"]')
search_box = wait.until(EC.presence_of_element_located(search_box_locator))
for keystrokes in ("aspiradora inalámbrica", Keys.RETURN):
    search_box.send_keys(keystrokes)

In [3]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time

# Function to navigate through Amazon search pages and extract data
def scrape_amazon_pages(driver, base_url, max_pages=5, page_load_delay=5):
    """Collect product titles and links from consecutive search result pages.

    Parsing starts from whatever page ``driver`` currently shows. After each
    page is parsed, the "next" pagination link is clicked and the function
    sleeps for ``page_load_delay`` seconds before parsing again.

    Args:
        driver: Selenium WebDriver already showing a search results page.
        base_url: Prefix prepended to every link that does not already
            contain "https://".
        max_pages: Maximum number of result pages to read.
        page_load_delay: Seconds to sleep after clicking "next".

    Returns:
        A list of ``{"title": ..., "link": ...}`` dicts, one per product for
        which both a title span and a ``/dp/`` link were found, in page order.
    """
    product_infos = []
    missed_products = []

    for page in range(1, max_pages + 1):
        # Parse a snapshot of the page currently loaded in the browser.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        products = soup.find_all('div', {'data-cy': 'title-recipe'})

        initial_length = len(product_infos)
        initial_missed_length = len(missed_products)
        for product in products:
            try:
                title_span = product.select_one("span[class*='a-text-normal']")
                link_tag = product.select_one("a[href*='/dp/']")

                # Check for missing elements BEFORE using the href: a
                # membership test against None raises TypeError, which would
                # abort the whole scrape instead of skipping one product.
                if title_span is None or link_tag is None:
                    missed_products.append(product)
                    continue

                title = title_span.get_text(strip=True)
                link = link_tag['href']
                # Links lacking a scheme get the storefront prefix prepended.
                full_link = link if "https://" in link else f"{base_url}{link}"
                product_infos.append({"title": title, "link": full_link})

            except AttributeError:
                missed_products.append(product)

        print(f"Found {len(product_infos) - initial_length} products on the page, and missed {len(missed_products) - initial_missed_length} products.")

        if page < max_pages:
            # Try to navigate to the next page; when no clickable "next"
            # anchor exists the pagination is over.
            try:
                next_button = driver.find_element("css selector", "a.s-pagination-next")
                # Click through JavaScript rather than WebElement.click().
                driver.execute_script("arguments[0].click();", next_button)
                time.sleep(page_load_delay)  # Wait for the page to load
            except NoSuchElementException:
                print("Next button not found or disabled, stopping pagination.")
                break

    return product_infos

# Run the scraper from the results page currently open in `driver`,
# reading at most 50 pages, then report how many products were kept.
products_data = scrape_amazon_pages(driver, base_url=url, max_pages=50)
total_collected = len(products_data)
print("Total products collected:", total_collected)


Found 50 products on the page, and missed 0 products.
Found 49 products on the page, and missed 0 products.
Found 49 products on the page, and missed 0 products.
Found 49 products on the page, and missed 0 products.
Found 49 products on the page, and missed 0 products.
Found 49 products on the page, and missed 0 products.
Found 19 products on the page, and missed 0 products.
Next button not found or disabled, stopping pagination.
Total products collected: 314


In [5]:
import pandas as pd
import os 
# Turn the scraped title/link records into a table and persist it as CSV
# (without the index column) inside the "Databases" folder.
products_information = pd.DataFrame(data=products_data)
os.makedirs("Databases", exist_ok=True)
products_information.to_csv(path_or_buf="Databases/products_information.csv", index=False)

### 2) Now we'll go to each of those pages and retrieve the information

In [24]:
import pandas as pd
# Reload the saved title/link table from disk.
products_information = pd.read_csv(filepath_or_buffer="Databases/products_information.csv")

In [25]:
# Start a Chrome session for downloading the individual product pages.
# The browser window stays visible unless the "--headless" line is re-enabled.
options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # Run headless (without opening browser)
for chrome_flag in ("--disable-gpu", "--no-sandbox"):
    options.add_argument(chrome_flag)

# The value returned by ChromeDriverManager().install() is handed to Service.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)

In [26]:
from selenium import webdriver

# Download each product page once and store its HTML under `save_dir`.
# A file that already exists is treated as "already downloaded" and skipped,
# so the cell can be re-run to resume an interrupted crawl.
save_dir = "Webpages/Aspiradoras"
os.makedirs(save_dir, exist_ok=True)

# File paths claimed during this run. Titles are truncated to 40 characters,
# so two different products can map to the same file; the later one would be
# silently skipped by the "already exists" check, so such cases are recorded
# and reported at the end.
seen_locations = set()
colliding_titles = []

for title, link in zip(products_information["title"], products_information["link"]):
    # Build the file name outside the f-string: reusing double quotes inside
    # an f-string replacement field is a SyntaxError before Python 3.12.
    safe_title = title[:40].replace("/", "").replace("|", "")
    save_location = f"{save_dir}/{safe_title}.html"

    if save_location in seen_locations:
        colliding_titles.append(title)
        continue
    seen_locations.add(save_location)

    if os.path.exists(save_location):
        continue

    driver.get(link)
    html_content = driver.page_source
    with open(save_location, "w", encoding="utf-8") as file:
        file.write(html_content)

    time.sleep(3)  # Pause between requests

if colliding_titles:
    print(f"{len(colliding_titles)} products share a file name with an earlier product and were not saved separately.")

In [29]:
# Display the (rows, columns) shape of the table after dropping rows that are
# exact duplicates across all columns.
products_information.drop_duplicates().shape

(314, 2)