In [2]:
pip install --user webdriver_manager

Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver_manager
Successfully installed webdriver_manager-4.0.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
import time
import re
import os
import random
import pandas as pd
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException

# Scraping link

In [None]:
# Module-level accumulators for this cell.
# `extracted_data` collects one {'title', 'url'} dict per product (appended by product_finder).
extracted_data = list()
# `data` is initialised here but not referenced by the code visible in this notebook section.
data = list()

def cookie_handler_button(driver, cookie_accept_button_selector):
    """Best-effort dismissal of the cookie consent banner.

    Sleeps 3 seconds, looks up every element matching the selector and clicks
    the first one. Any exception is printed and swallowed so that a missing or
    unclickable banner never stops the scrape.

    Args:
        driver: Selenium WebDriver instance to search in.
        cookie_accept_button_selector (str): CSS selector of the accept button.

    Returns:
        None, in every case.
    """
    try:
        print("Checking for and attempting to dismiss cookie consent banner...")
        time.sleep(3)  # fixed pause before looking for the banner
        matches = driver.find_elements(By.CSS_SELECTOR, cookie_accept_button_selector)

        # Guard clause: nothing matched, so there is nothing to click.
        if not matches:
            print("No cookie consent banner found or dismissed.")
            return None

        matches[0].click()
        print("Cookie consent banner dismissed.")
        time.sleep(3)  # fixed pause after the click
    except Exception as e:
        print(f"Could not dismiss cookie consent banner: {e}")
    return None


def product_finder(driver, url):
    """Collect the title and link of every product card on the current page.

    Waits for the product container, then appends one
    ``{'title': ..., 'url': ...}`` dict to the module-level ``extracted_data``
    list for each card whose title link can be read.

    Relies on these module-level names being defined before the call:
    ``wait``, ``product_container_selector``, ``single_product_selector`` and
    ``extracted_data``.

    Args:
        driver: Selenium WebDriver already showing a results page.
        url: Not used by the body; kept for the caller's signature.

    Returns:
        None, in every case. All errors are printed and swallowed.
    """
    try:
        container_locator = (By.CSS_SELECTOR, product_container_selector)
        wait.until(EC.presence_of_element_located(container_locator))
        print("User review elements selector found. Attempting to find all matching elements.")
        cards = driver.find_elements(By.CSS_SELECTOR, single_product_selector)

        # Guard clause: an empty page is reported and nothing is recorded.
        if not cards:
            print("No products found matching the selector.")
            return None

        print(f"Found {len(cards)} products.")
        for position, card in enumerate(cards, start=1):
            print(f"\n--- Product {position} ---")
            try:
                # The title anchor carries both the product link (href) and its visible name.
                anchor = card.find_element(By.CSS_SELECTOR, 'a.poly-component__title')
                href = anchor.get_attribute('href')
                title = anchor.text

                print(f"Title: {title}")
                print(f"URL: {href}")

                extracted_data.append({'title': title, 'url': href})
            except NoSuchElementException:
                # A card without a title link is reported and skipped.
                print("Could not find the title/link element for this product.")
            except Exception as e_inner:
                print(f"An error occurred while extracting details for this product: {e_inner}")

    except Exception as e:
        print(f"Error in product_finder: {e}")

    return None

def _click_next_and_confirm(driver, wait, next_selector, container_selector, original_url, attempt):
    """Click the "next page" link once and confirm that a new results page loaded.

    Args:
        driver: Selenium WebDriver showing the current results page.
        wait: WebDriverWait used for the button lookup and the container check.
        next_selector (str): CSS selector of the "next page" link.
        container_selector (str): CSS selector of the product list container.
        original_url (str): URL before the click; navigation is confirmed once it changes.
        attempt (int): Attempt number, used only in the progress messages.

    Returns:
        bool: True when the URL changed within 10 seconds and the product
        container was then located; False when the click was sent but that
        confirmation timed out.

    Raises:
        TimeoutException: the "next page" link never became clickable.
        Any other exception raised while scrolling or clicking (for example
        StaleElementReferenceException) propagates to the caller, which
        decides whether to retry.
    """
    print(f"Scrolling to bottom of page to find 'Next' button (Attempt {attempt})...")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2.5)

    print(f"Looking for next page button with CSS selector: '{next_selector}' (Attempt {attempt})")
    next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, next_selector)))

    # Centre the link in the viewport before clicking it.
    driver.execute_script(
        "arguments[0].scrollIntoView({behavior: 'auto', block: 'center', inline: 'center'});",
        next_button,
    )
    time.sleep(0.5)

    print(f"Clicking next page button (Attempt {attempt})...")
    next_button.click()

    print(f"Click command sent (Attempt {attempt}). Verifying navigation...")
    try:
        WebDriverWait(driver, 10).until(EC.url_changes(original_url))
        print(f"URL successfully changed to: {driver.current_url}")
        # The URL changed; now make sure the product list of the new page is present.
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, container_selector)))
        time.sleep(2)  # allow content to render
        return True
    except TimeoutException:
        print(f"URL did not change from '{original_url}' or new content did not load in time.")
        if driver.current_url == original_url:
            print("Navigation failed: Page URL is still the same.")
        return False


if __name__ == "__main__":
    # --- Setup ---
    try:
        service = Service(ChromeDriverManager().install())
        print("Chrome WebDriver installed and initialized using WebDriver Manager.")
    except Exception as e:
        print(f"Error initializing Chrome WebDriver with WebDriver Manager: {e}")
        print("Please ensure Chrome is installed and webdriver-manager is correctly installed (`pip install webdriver-manager`).")
        raise SystemExit(1)

    # --- Start URL ---
    # Exactly one start URL must be active per run. Previously every candidate was
    # commented out, so `driver.get(url)` failed with NameError. Swap the active
    # line for one of the alternatives below to scrape a different range.
    url = "https://listado.mercadolibre.com.mx/smartphone?sb=all_mercadolibre#D[A:smartphone]"  # first 9 pages
    #url = "https://listado.mercadolibre.com.mx/distribuidor-autorizado_Desde_49_NoIndex_True?sb=all_mercadolibre"  # distribuidor-autorizado

    # -- keyword in the search bar: smartphone --
    #url="https://listado.mercadolibre.com.mx/celulares-telefonia/celulares-smartphones/smartphone_Desde_451_NoIndex_True?sb=all_mercadolibre" # other 3 pages
    #url="https://listado.mercadolibre.com.mx/celulares-telefonia/celulares-smartphones/smartphone_Desde_601_NoIndex_True?sb=all_mercadolibre" # other 8
    #url="https://listado.mercadolibre.com.mx/celulares-telefonia/celulares-smartphones/smartphone_Desde_951_NoIndex_True?sb=all_mercadolibre" # other 6

    # -- keyword in the search bar: iphone --
    #url="https://listado.mercadolibre.com.mx/iphone?sb=all_mercadolibre#D[A:iphone]" # up to 8
    #url="https://listado.mercadolibre.com.mx/celulares-telefonia/celulares-smartphones/iphone_Desde_401_NoIndex_True?sb=all_mercadolibre" # other 6
    #url="https://listado.mercadolibre.com.mx/celulares-telefonia/celulares-smartphones/iphone_Desde_701_NoIndex_True?sb=all_mercadolibre" # other 9
    #url="https://listado.mercadolibre.com.mx/celulares-telefonia/celulares-smartphones/iphone_Desde_1151_NoIndex_True?sb=all_mercadolibre" # other 8

    # --- Selectors ---
    # These stay module-level: product_finder(driver, url) reads
    # product_container_selector and single_product_selector as globals.
    cookie_accept_button_selector = "[data-testid='action:understood-button']"
    product_container_selector = "ol.ui-search-layout"
    single_product_selector = 'li.ui-search-layout__item'
    next_page_button_selector = "a.andes-pagination__link[title='Siguiente']"

    max_pages_to_scrape_total = 40

    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=service, options=options)
    try:
        # `wait` is also read as a global by product_finder.
        wait = WebDriverWait(driver, 20)
        driver.get(url)
        cookie_handler_button(driver, cookie_accept_button_selector)

        # --- PAGINATION LOGIC ---
        for current_page_count in range(1, max_pages_to_scrape_total + 1):
            print(f"\n--- Scraping Page {current_page_count} ---")

            if current_page_count > 1:
                # We arrived here through a "next" click; give the page a moment to settle.
                time.sleep(3)
            product_finder(driver, driver.current_url)

            if current_page_count >= max_pages_to_scrape_total:
                print(f"Reached maximum of {max_pages_to_scrape_total} pages to scrape.")
                break

            # --- Navigate to the Next Page ---
            print(f"\nAttempting to navigate from page {current_page_count} to next page...")
            original_url = driver.current_url
            navigation_successful_for_new_content = False

            # One retry is allowed, and only when the button went stale between lookup and click.
            for attempt in (1, 2):
                try:
                    navigation_successful_for_new_content = _click_next_and_confirm(
                        driver,
                        wait,
                        next_page_button_selector,
                        product_container_selector,
                        original_url,
                        attempt,
                    )
                    break
                except StaleElementReferenceException:
                    if attempt == 1:
                        print("StaleElementReferenceException on Attempt 1. Retrying to find and click...")
                        time.sleep(1)
                    else:
                        print("StaleElementReferenceException again on Attempt 2. Giving up on this page.")
                except TimeoutException:
                    print(f"Next page button not found or not clickable (selector: '{next_page_button_selector}'). Ending pagination.")
                    break
                except Exception as e:
                    print(f"An unexpected error occurred trying to click 'Next' (Attempt {attempt}): {e}")
                    break

            if not navigation_successful_for_new_content:
                print("Could not confirm navigation to a new page with new content. Stopping pagination.")
                break
        # --- END OF PAGINATION LOGIC ---

        # After the loop, print all collected data.
        print("\n--- All Extracted Data ---")
        if extracted_data:
            for item_index, item_data in enumerate(extracted_data):
                print(f"Item {item_index + 1}: {item_data}")
    finally:
        # --- Close the Browser ---
        # Runs on success and on any error, so no Chrome process is left behind.
        print("Closing browser.")
        driver.quit()


Chrome WebDriver installed and initialized using WebDriver Manager.
Checking for and attempting to dismiss cookie consent banner...
Cookie consent banner dismissed.

--- Scraping Page 1 ---
User review elements selector found. Attempting to find all matching elements.
Found 52 products.

--- Product 1 ---
Title: Aiek A8 Pro-a 3.0-inch Mini 3g Red Phone 2+16g Android 9.0 Niños Teléfono Móvil Regalo De Cumpleaños Dar Pegatinas
URL: https://click1.mercadolibre.com.mx/mclics/clicks/external/MLM/count?a=2KAkCrYTb9ZfMuYeYiE%2FhtdwLG60ZZYQXUHUWUHaEgh1Ifhh3REsEmQ0noUx9rs5ycC83kgcvZRKL04%2FFtDME4GrUiVZddrcg7gWwhRLlodpRc5Kqzlb65WhApRImFhtFgvlXPwfvRsN7ffCb%2Bd148la2kSrG6EdlQRkuhjoL6raF5lqDFarOto1%2BhBZ1ZCGVvXhr243H3waTOJzsWauuXexpAc0UkjqvcRYXxxh9FaqWZbjMLEkh4s8a0MSH6Z06qJ6P4b9bk9u4h2gYIcjwy0zEpPwDwK8pr7FHl3aMG%2FR%2BGhv0pig0us6vqNyhHmI6MlJXFf05ImRTYvEc9L1JZ1xH8eAd1TXa7Vy8i8my71R000KYWteFepk%2FWkYhi3EsRtGVGw2tfLFp22G%2Fm4APGud5QewYzpmWP0IVZSr4FZd7RFBshi9rI1%2BRryimscGJrggbsyvrZBLzkth%2FlCEd0X8xWwx

Simplified version (pages are reached by incrementing the `?page=` number in the URL instead of clicking "Next")

In [None]:
# --- Global Data List ---
# Reset the shared accumulator; product_finder appends one dict per scraped product to it.
extracted_data = list()

# --- Helper function to safely get text and convert ---
def get_element_text_or_default(product_element, selector, default_value, data_type='text'):
    """Read the text of a child element and optionally convert it to a number.

    Args:
        product_element: Element exposing ``find_element(by, selector)``.
        selector (str): CSS selector of the child to read.
        default_value: Returned when the child is missing, its stripped text
            is empty, or the conversion fails.
        data_type (str): ``'float'`` converts the text (commas removed) with
            ``float``; ``'int'`` returns the first run of digits (commas
            removed) as an ``int``; any other value returns the stripped text.

    Returns:
        The converted value, the stripped text, or ``default_value``.
    """
    try:
        raw = product_element.find_element(By.CSS_SELECTOR, selector).text.strip()
        if not raw:
            return default_value

        # Plain-text requests need no further processing.
        if data_type not in ('float', 'int'):
            return raw

        # Both numeric modes first drop ',' thousands separators.
        without_commas = raw.replace(',', '')
        if data_type == 'float':
            return float(without_commas)

        # 'int': take the first digit run, which copes with wrappers such as '(316)'.
        found = re.search(r'\d+', without_commas)
        if found is None:
            return default_value
        return int(found.group(0))
    except (NoSuchElementException, ValueError, AttributeError):
        return default_value

# --- cookie_handler_button: explicit-wait version with a JavaScript click fallback ---
def cookie_handler_button(driver, cookie_accept_button_selector):
    """Best-effort dismissal of the cookie consent banner using an explicit wait.

    Waits up to 5 seconds for elements matching the selector to be present,
    clicks the first one, and falls back to a JavaScript click when the normal
    click raises. Every failure is printed and swallowed.

    Args:
        driver: Selenium WebDriver instance to search in.
        cookie_accept_button_selector (str): CSS selector of the accept button.

    Returns:
        None, in every case.
    """
    try:
        print("Checking for and attempting to dismiss cookie consent banner...")
        banner_locator = (By.CSS_SELECTOR, cookie_accept_button_selector)
        found = WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located(banner_locator)
        )

        # Guard clause: the wait returned nothing usable.
        if not found:
            print("No cookie consent banner found.")
            return None

        accept_button = found[0]
        try:
            accept_button.click()
        except Exception:
            # The normal click failed; trigger the click from JavaScript instead.
            print("Normal click failed, trying JavaScript click...")
            driver.execute_script("arguments[0].click();", accept_button)

        print("Cookie consent banner dismissed.")
        time.sleep(2)
    except TimeoutException:
        print("No cookie consent banner found within the timeout period.")
    except Exception as e:
        print(f"Could not dismiss cookie consent banner: {e}")
    return None

# --- UPDATED product_finder function ---
def product_finder(driver, url, wait, product_container_selector, single_product_selector):
    """Scrape every product card on the current page into ``extracted_data``.

    For each card with a readable title link, one dict with the keys
    ``title``, ``url``, ``offer_type``, ``rating``, ``review_count``,
    ``old_price`` and ``new_price`` is appended to the module-level
    ``extracted_data`` list. Cards without a title link are skipped.

    Args:
        driver: Selenium WebDriver already showing the page to scrape.
        url (str): Used only in the message printed when the container times out.
        wait: WebDriverWait used to wait for the product container.
        product_container_selector (str): CSS selector of the list container.
        single_product_selector (str): CSS selector matching one product card.

    Returns:
        int: Number of card elements matched on the page (skipped cards
        included), or 0 when none matched or an error occurred.
    """
    # (record key, printed label, CSS selector, fallback value, conversion mode)
    optional_fields = (
        ('offer_type', 'Offer Type', 'span.poly-component__highlight', "na", 'text'),
        ('rating', 'Rating', 'span.poly-reviews__rating', 0.0, 'float'),
        ('review_count', 'Review Count', 'span.poly-reviews__total', 0, 'int'),
        ('old_price', 'Old Price',
         'div.andes-money-amount--previous span.andes-money-amount__fraction', 0.0, 'float'),
        ('new_price', 'New Price',
         'div.poly-price__current span.andes-money-amount__fraction', 0.0, 'float'),
    )

    try:
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, product_container_selector)))
        print("Product container found. Attempting to find all product elements.")
        cards = driver.find_elements(By.CSS_SELECTOR, single_product_selector)

        if not cards:
            print("No products found matching the selector on this page.")
            return 0

        print(f"Found {len(cards)} products on this page.")
        for position, card in enumerate(cards, start=1):
            print(f"\n--- Product {position} ---")

            # Title and URL are mandatory: a card without them is not recorded.
            try:
                anchor = card.find_element(By.CSS_SELECTOR, 'a.poly-component__title')
                href = anchor.get_attribute('href')
                title = anchor.text.strip()
                print(f"Title: {title}")
                print(f"URL: {href}")
            except NoSuchElementException:
                print("Could not find the title/link element for this product. Skipping.")
                continue

            # Optional fields fall back to their default when absent or unparsable.
            record = {'title': title, 'url': href}
            for key, label, css, fallback, mode in optional_fields:
                record[key] = get_element_text_or_default(card, css, fallback, mode)
                print(f"{label}: {record[key]}")

            extracted_data.append(record)

        return len(cards)

    except TimeoutException:
        print(f"Error in product_finder: Product container '{product_container_selector}' not found on {url}.")
        return 0
    except Exception as e:
        print(f"Error in product_finder: {e}")
        return 0

# --- Main Execution Block (Adjust as needed) ---
if __name__ == "__main__":
    # Scrape the "ofertas" listing page by page by requesting ?page=1, ?page=2, ...
    try:
        service = Service(ChromeDriverManager().install())
        print("Chrome WebDriver installed and initialized.")
    except Exception as e:
        print(f"Error initializing Chrome WebDriver: {e}")
        raise SystemExit(1)

    base_url = "https://www.mercadolibre.com.mx/ofertas"

    # --- Selectors ---
    cookie_accept_button_selector = "[data-testid='action:understood-button']"
    product_container_selector = "div.items-list"
    single_product_selector = 'div.poly-card--grid-card'

    max_pages_to_scrape_total = 15  # upper bound on the number of pages requested

    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=service, options=options)
    try:
        wait = WebDriverWait(driver, 20)

        # The first page is loaded up front so the cookie banner can be dismissed once.
        first_page_url = f"{base_url}?page=1"
        print(f"Navigating to the first page: {first_page_url}")
        driver.get(first_page_url)
        cookie_handler_button(driver, cookie_accept_button_selector)

        for current_page_number in range(1, max_pages_to_scrape_total + 1):
            print(f"\n--- Scraping Page {current_page_number} ---")
            current_url = f"{base_url}?page={current_page_number}"

            if current_page_number > 1:
                print(f"Navigating to: {current_url}")
                driver.get(current_url)
                time.sleep(3)  # give the page time to load

            products_found = product_finder(
                driver,
                current_url,
                wait,
                product_container_selector,
                single_product_selector
            )

            # An empty page after the first one is treated as the end of the listing.
            if products_found == 0 and current_page_number > 1:
                print(f"No products found on page {current_page_number}. Stopping.")
                break

        print("\n--- All Extracted Data ---")
        if extracted_data:
            for item_index, item_data in enumerate(extracted_data):
                print(f"Item {item_index + 1}: {item_data}")
        else:
            print("No data was extracted.")
    finally:
        # Runs on success and on any error, so no Chrome process is left behind.
        print("\nClosing browser.")
        driver.quit()

Chrome WebDriver installed and initialized.
Navigating to the first page: https://www.mercadolibre.com.mx/ofertas?page=1
Checking for and attempting to dismiss cookie consent banner...
Cookie consent banner dismissed.

--- Scraping Page 1 ---
Product container found. Attempting to find all product elements.
Found 54 products on this page.

--- Product 1 ---
Title: Cubot Kingkong X 5g Dual Sim 512 Gb Negro 16 Gb Ram Android 14
URL: https://www.mercadolibre.com.mx/cubot-kingkong-x-5g-dual-sim-512-gb-negro-16-gb-ram-android-14/p/MLM40378423?pdp_filters=item_id%3AMLM3543571006#polycard_client=offers&deal_print_id=3c545f9a-fde3-4e39-b619-c30bf05fee6a&position=1&tracking_id=5db496e9-2587-47be-904e-035e9bc54e8f&wid=MLM3543571006&sid=offers
Offer Type: OFERTA DEL DÍA
Rating: 4.5
Review Count: 316
Old Price: 0.0
New Price: 5335.0

--- Product 2 ---
Title: Smart Tv Pantalla 32 Pulgadas Weyon Hd Led Android Tv 32wdsnmx
URL: https://www.mercadolibre.com.mx/smart-tv-pantalla-32-pulgadas-weyon-hd-le

## Refine extracted URLs

In [None]:
def _to_reviews_url(url):
    """Return ``(new_url, action)`` for one product URL.

    Everything from the first '?' onwards is replaced by '#reviews'. When
    there is no '?', everything from '#polycard' onwards is replaced instead.
    A URL containing neither is returned unchanged with action "unchanged".
    """
    query_at = url.find('?')
    if query_at != -1:
        return url[:query_at] + '#reviews', "processed '?'"

    polycard_at = url.find('#polycard')
    if polycard_at != -1:
        return url[:polycard_at] + '#reviews', "processed '#polycard'"

    return url, "unchanged"


def _refine_urls(records):
    """Rewrite the 'url' of every record in place and log each decision.

    Records with a missing or empty 'url' are skipped. Working inside a
    function keeps the loop temporaries out of the notebook namespace, where
    names such as ``base_url`` are used by the scraping cells.
    """
    for record in records:
        current = record.get('url', '')
        if not current:
            continue

        refined, action = _to_reviews_url(current)
        if refined != current:
            record['url'] = refined
            print(f"  Original: {current}")
            print(f"  Modified: {refined} (Action: {action})")
        else:
            print(f"  URL unchanged: {current}")


# Output location, relative to the notebook: the folder the CSV-merging cell reads
# its numbered per-run files from. Bump the trailing number in the file name per run.
REVIEW_LINKS_DIR = Path("../data/URLs")
REVIEW_LINKS_FILE = "mercadolibre_products_reviews_links_iphone4.csv"

print("\nProcessing URLs")
_refine_urls(extracted_data)
print("URL processing complete.")

print("\n--- All Extracted Data (with refined URL modifications) ---")
if extracted_data:
    for item_index, item_data in enumerate(extracted_data):
        print(f"Item {item_index + 1}: {item_data}")

    # Save the refined records to CSV.
    df = pd.DataFrame(extracted_data)
    REVIEW_LINKS_DIR.mkdir(parents=True, exist_ok=True)

    file_path = REVIEW_LINKS_DIR / REVIEW_LINKS_FILE
    print(f"Saving to: {file_path}")
    try:
        df.to_csv(file_path, index=False, encoding='utf-8')
        print(f"\nData with refined URLs saved to {file_path.name}")
    except Exception as e:
        print(f"Error saving data to CSV: {e}")
else:
    print("No data was extracted.")


Processing URLs
  Original: https://click1.mercadolibre.com.mx/mclics/clicks/external/MLM/count?a=2KAkCrYTb9ZfMuYeYiE%2FhtdwLG60ZZYQXUHUWUHaEgh1Ifhh3REsEmQ0noUx9rs5ycC83kgcvZRKL04%2FFtDME4GrUiVZddrcg7gWwhRLlodpRc5Kqzlb65WhApRImFhtFgvlXPwfvRsN7ffCb%2Bd148la2kSrG6EdlQRkuhjoL6raF5lqDFarOto1%2BhBZ1ZCGVvXhr243H3waTOJzsWauuXexpAc0UkjqvcRYXxxh9FaqWZbjMLEkh4s8a0MSH6Z06qJ6P4b9bk9u4h2gYIcjwy0zEpPwDwK8pr7FHl3aMG%2FR%2BGhv0pig0us6vqNyhHmI6MlJXFf05ImRTYvEc9L1JZ1xH8eAd1TXa7Vy8i8my71R000KYWteFepk%2FWkYhi3EsRtGVGw2tfLFp22G%2Fm4APGud5QewYzpmWP0IVZSr4FZd7RFBshi9rI1%2BRryimscGJrggbsyvrZBLzkth%2FlCEd0X8xWwxzq%2FxiyPYQc7RgkNugJ2jpCkQLJ2ivUnLIua6z7y5LMgGjSS4eswSXNT6Bhcb5EOT3kc%2BStk2LGMqd5XEE0%2B39%2BMWF51ddM2ZZ2SeJuIwRiS3pa0vz%2FWcJdDQOj1BUAlUl12J2i8jF4M4T2P26NVMWtzxs0fGI97XU1rFcL%2FeN7EEyExo2EgZE6fJV9TN2TKUYDsDg4c0vcXwMEQeFfDBecmITKGIj%2BQnv9H9eFffti38ACrgZ0clpYd%2FgHor%2FIRGItgOcHs1bd%2F7QkP5sFnJymBRi00FIrpIr9l460bHeEGG178WQy0FqBIcEAs8asVU5dakqt2MNMLbFhlvrACldVtPS%2BlHfcUTYYbLzKDU5yS0zWmtoszDxooynkGPMEF

## Merging the per-run CSV files

In [4]:
# Merge the numbered per-run CSVs (suffix 1-4) of each keyword into one "<base>_combined.csv".
keywords = ["smartphone", "iphone"]
base_dir = Path("../data/URLs")

for keyword in keywords:
    # Only "smartphone" uses the un-suffixed stem; any other keyword uses the "_iphone" stem.
    base_name = (
        "mercadolibre_products_reviews_links"
        if keyword == "smartphone"
        else "mercadolibre_products_reviews_links_iphone"
    )

    dfs = []
    for i in range(1, 5):
        file_name = f"{base_name}{i}.csv"
        file_path = base_dir / file_name

        # Missing runs are reported and skipped rather than treated as errors.
        if not file_path.exists():
            print(f"File not found: {file_path}")
            continue

        try:
            df = pd.read_csv(file_path)
            dfs.append(df)
            print(f"Loaded: {file_path}")
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

    if not dfs:
        print(f"⚠️ No files found for keyword: {keyword}")
        continue

    combined_df = pd.concat(dfs, ignore_index=True)
    combined_path = base_dir / f"{base_name}_combined.csv"
    combined_df.to_csv(combined_path, index=False, encoding='utf-8')
    print(f"📁 Combined CSV saved to: {combined_path}")
            
            

File not found: ..\data\URLs\mercadolibre_products_reviews_links1.csv
Loaded: ..\data\URLs\mercadolibre_products_reviews_links2.csv
Loaded: ..\data\URLs\mercadolibre_products_reviews_links3.csv
Loaded: ..\data\URLs\mercadolibre_products_reviews_links4.csv
📁 Combined CSV saved to: ..\data\URLs\mercadolibre_products_reviews_links_combined.csv
File not found: ..\data\URLs\mercadolibre_products_reviews_links_iphone1.csv
Loaded: ..\data\URLs\mercadolibre_products_reviews_links_iphone2.csv
Loaded: ..\data\URLs\mercadolibre_products_reviews_links_iphone3.csv
Loaded: ..\data\URLs\mercadolibre_products_reviews_links_iphone4.csv
📁 Combined CSV saved to: ..\data\URLs\mercadolibre_products_reviews_links_iphone_combined.csv


In [None]:

def classify_urls_in_csv(input_csv_path, output_csv_path):
    """Add a 0/1 ``JM`` column flagging URLs that contain the substring "JM".

    The input CSV is read with pandas, the flag is computed from its ``url``
    column, and the result is written to ``output_csv_path`` without the index.
    Problems (missing file, missing ``url`` column, any other error) are
    printed rather than raised, and no output is written for the first two.

    Args:
        input_csv_path (str): Path of the CSV to read.
        output_csv_path (str): Path of the CSV to write.

    Returns:
        None.
    """
    try:
        frame = pd.read_csv(input_csv_path)

        if 'url' not in frame.columns:
            print(f"Error: The file {input_csv_path} does not have a 'url' column.")
            return

        # Cast to str first so NaN / non-string cells are searchable, then
        # turn the True/False match result into 1/0.
        contains_jm = frame['url'].astype(str).str.contains("JM")
        flagged = frame.assign(JM=contains_jm.astype(int))

        flagged.to_csv(output_csv_path, index=False)
        print(f"Successfully processed {input_csv_path} and saved to {output_csv_path}")

    except FileNotFoundError:
        print(f"Error: The file {input_csv_path} was not found.")
    except Exception as e:
        print(f"An error occurred while processing {input_csv_path}: {e}")

if __name__ == "__main__":
    # --- Configuration ---
    # Combined CSVs to classify; each result is written next to its input
    # as "<name>_classified<ext>".
    csv_file_paths = [
        "../data/URLs/mercadolibre_products_reviews_links_combined.csv",
        "../data/URLs/mercadolibre_products_reviews_links_iphone_combined.csv",
    ]

    # --- Processing ---
    for file_path in csv_file_paths:
        # Guard clause: report and skip inputs that are not on disk.
        if not os.path.exists(file_path):
            print(f"Skipping {file_path} as it does not exist. Please check the path.")
            continue

        # e.g. "data.csv" -> "data_classified.csv"
        stem, suffix = os.path.splitext(file_path)
        output_file_path = f"{stem}_classified{suffix}"
        classify_urls_in_csv(file_path, output_file_path)

    print("\nProcessing complete.")


Successfully processed ../data/URLs/mercadolibre_products_reviews_links_combined.csv and saved to ../data/URLs/mercadolibre_products_reviews_links_combined_classified.csv
Successfully processed ../data/URLs/mercadolibre_products_reviews_links_iphone_combined.csv and saved to ../data/URLs/mercadolibre_products_reviews_links_iphone_combined_classified.csv

Processing complete.
Make sure to check the generated '_classified.csv' files for the results.


In [None]:
# Show complete URLs instead of pandas' truncated column display.
pd.options.display.max_colwidth = None

# Preview the first 60 URLs of the classified iphone file.
df = pd.read_csv("../data/URLs/mercadolibre_products_reviews_links_iphone_combined_classified.csv")
print(df["url"].iloc[:60])

# Price + Review finder

In [None]:
def cookie_handler_button(driver, cookie_accept_button_selector, wait_time=10):
    """
    Dismiss the cookie consent banner if one shows up.

    Waits until the accept button is clickable, clicks it, and falls back to
    a JavaScript click when the native click is intercepted by another element.

    Args:
        driver: The Selenium WebDriver instance.
        cookie_accept_button_selector (str): CSS selector of the accept button.
        wait_time (int): Maximum number of seconds to wait for the button.

    Returns:
        bool: True when the button was clicked (natively or through
              JavaScript); False when it never became clickable or the
              click failed.
    """
    try:
        print(f"Checking for cookie banner (waiting up to {wait_time}s for it to be CLICKABLE)...")

        # Block until the button can actually receive a click.
        banner_wait = WebDriverWait(driver, wait_time)
        accept_button = banner_wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, cookie_accept_button_selector))
        )
        print("Cookie button found and clickable. Attempting to click...")

        try:
            accept_button.click()
            success_message = "Cookie consent banner dismissed via standard click."
        except ElementClickInterceptedException:
            # Something overlays the button: click it from the page's own JS.
            print("Standard click intercepted, trying JavaScript click...")
            driver.execute_script("arguments[0].click();", accept_button)
            success_message = "Cookie consent banner dismissed via JavaScript click."
        except Exception as e:
            print(f"An error occurred during click: {e}")
            return False

        # Both click strategies share the same success tail.
        print(success_message)
        time.sleep(1)  # short pause so the page can react to the dismissal
        return True

    except TimeoutException:
        # Expected whenever the banner is absent (e.g. on later page loads).
        print("No cookie consent banner found/clickable within the wait time.")
        return False

    except Exception as e:
        print(f"An unexpected error occurred while handling cookie banner: {e}")
        return False

def check_page_status(driver, wait_short):
    """
    Classify the page currently loaded in ``driver``.

    The two rate-limit layouts are probed immediately (no waiting); only when
    neither is present does the function wait for the product title that marks
    a normal product page.

    Args:
        driver: The Selenium WebDriver instance.
        wait_short: A short WebDriverWait instance (e.g., 5 seconds) used to
            wait for the product title.

    Returns:
        str: "RATE_LIMITED_BUTTON" when an element with id ``reload-button``
             exists, "RATE_LIMITED_JSON" when a ``<pre>`` holds a 429 /
             ``local_rate_limited`` JSON body, "OK" when the product title is
             present, otherwise "UNKNOWN_ERROR".
    """
    try:
        # --- 429 rendered as an HTML page with a reload button ---
        if driver.find_elements(By.ID, "reload-button"):
            print("Page Status: RATE_LIMITED_BUTTON")
            return "RATE_LIMITED_BUTTON"

        # --- 429 rendered as raw JSON inside a <pre> ---
        for pre in driver.find_elements(By.TAG_NAME, "pre"):
            pre_text = pre.text.lower()
            # JSON allows whitespace around ':', so '"status": 429' must match
            # just like the compact '"status":429'.
            has_429_status = re.search(r'"status"\s*:\s*429', pre_text) is not None
            if has_429_status and '"local_rate_limited"' in pre_text:
                print("Page Status: RATE_LIMITED_JSON")
                return "RATE_LIMITED_JSON"

        # --- Neither 429 layout: wait for the essential product element ---
        wait_short.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1.ui-pdp-title')))
        print("Page Status: OK")
        return "OK"

    except TimeoutException:
        # No 429 marker and no product title: unexpected page type.
        print("Page Status: UNKNOWN_ERROR (Timeout finding OK element, no 429 found)")
        return "UNKNOWN_ERROR"

    except Exception as e:
        print(f"Page Status: UNKNOWN_ERROR during check ({e})")
        return "UNKNOWN_ERROR"
    
# Integer that may carry '.' or ',' thousands separators ("1,234", "1.234", "57").
_GROUPED_INT_PATTERN = r'\d{1,3}(?:[.,]\d{3})+|\d+'


def _element_text(element):
    """Return the element's stripped text, falling back to its ``textContent``."""
    text = (element.text or "").strip()
    if not text:
        # NOTE(review): the rating span is a "visually hidden" node, for which
        # the rendered-text lookup may come back empty; textContent is the
        # usual fallback -- confirm against the live page.
        text = (element.get_attribute("textContent") or "").strip()
    return text


def _parse_rating(text):
    """Return the first number in ``text`` as a float ('.' or ',' decimals), or None."""
    match = re.search(r'\d+(?:[.,]\d+)?', text)
    if match is None:
        return None
    return float(match.group(0).replace(',', '.'))


def _parse_review_count(text):
    """Return the review count in ``text`` (parenthesised value preferred), or None."""
    match = re.search(r'\(\s*(' + _GROUPED_INT_PATTERN + r')\s*\)', text)
    if match is None:
        match = re.search(r'(' + _GROUPED_INT_PATTERN + r')', text)
    if match is None:
        return None
    # Strip thousands separators so "(1,234)" is 1234 rather than 1.
    return int(re.sub(r'[.,]', '', match.group(1)))


def find_product_details(driver, wait):
    """
    Finds the price, condition, rating, and number of reviews
    of a product on its details page. Rating/reviews default to 0 when the
    corresponding element is missing or cannot be parsed.

    Args:
        driver: The Selenium WebDriver instance.
        wait: The WebDriverWait instance used to wait for the price element.

    Returns:
        dict: A dictionary with 'price' (str), 'condition' (str),
              'rating' (float) and 'num_reviews' (int), or None when the
              price cannot be found (the price is treated as essential).
    """
    price_selector = 'span[data-testid="price-part"] span.andes-money-amount'
    condition_selector = 'span.ui-pdp-subtitle'
    rating_text_selector = 'a.ui-pdp-review__label span.andes-visually-hidden'
    num_reviews_selector = 'a.ui-pdp-review__label span.ui-pdp-review__amount'

    price = None
    condition = None
    rating = 0.0
    num_reviews = 0

    # --- Price (essential: without it the product is not reported) ---
    try:
        print("Waiting for price element...")
        price_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, price_selector)))
        print("Price element found.")
        # Prefer the aria-label, fall back to the element text.
        price = (price_element.get_attribute("aria-label") or price_element.text or "").strip()
    except TimeoutException:
        print("Could not find the price element. Cannot proceed.")
        return None
    except Exception as e:
        print(f"An error occurred while finding the price: {e}")
        return None
    if not price:
        print("Price element found but no price text extracted.")
        return None

    # --- Condition (text before the first '|' of the subtitle) ---
    try:
        print(f"Looking for condition element ({condition_selector})...")
        condition_elements = driver.find_elements(By.CSS_SELECTOR, condition_selector)
        if condition_elements:
            full_condition_text = condition_elements[0].text.strip()
            print(f"Full condition text found: {full_condition_text}")
            condition = full_condition_text.split('|')[0].strip()
            print(f"Cleaned condition found: {condition}")
        else:
            print("Condition element not found, assuming 'Nuevo'.")
            condition = "Nuevo"
    except Exception as e:
        print(f"An error occurred while finding the condition: {e}")
        condition = "Unknown"

    # --- Rating (handled separately so a failure here keeps the review count) ---
    try:
        print(f"Looking for rating text element ({rating_text_selector})...")
        rating_elements = driver.find_elements(By.CSS_SELECTOR, rating_text_selector)
        if rating_elements:
            full_rating_text = _element_text(rating_elements[0])
            print(f"Full rating text found: {full_rating_text}")
            parsed_rating = _parse_rating(full_rating_text)
            if parsed_rating is None:
                print("Could not find a number in the rating text, using 0.0.")
            else:
                rating = parsed_rating
                print(f"Cleaned rating found: {rating}")
        else:
            print("Rating text element not found, using 0.0.")
    except Exception as e:
        print(f"An error occurred while finding the rating: {e}")
        rating = 0.0

    # --- Number of reviews ---
    try:
        print(f"Looking for number of reviews element ({num_reviews_selector})...")
        num_reviews_elements = driver.find_elements(By.CSS_SELECTOR, num_reviews_selector)
        if num_reviews_elements:
            full_num_text = _element_text(num_reviews_elements[0])
            print(f"Full num reviews text found: {full_num_text}")
            parsed_count = _parse_review_count(full_num_text)
            if parsed_count is None:
                print("Could not find number in num_reviews text, using 0.")
            else:
                num_reviews = parsed_count
                print(f"Cleaned num reviews found: {num_reviews}")
        else:
            print("Number of reviews element not found, using 0.")
    except Exception as e:
        print(f"An error occurred while finding the number of reviews: {e}")
        num_reviews = 0

    print(f"Returning: Price={price}, Condition={condition}, Rating={rating}, NumReviews={num_reviews}")
    return {
        "price": price,
        "condition": condition,
        "rating": rating,
        "num_reviews": num_reviews
    }

# CSS selectors used while scraping the reviews iframe.
_SEE_MORE_BUTTON_SELECTOR = "[data-testid='see-more']"
_REVIEWS_IFRAME_SELECTOR = "[data-testid='ui-pdp-iframe-reviews']"
_REVIEW_ITEM_SELECTOR = "[data-testid='comment-component']"
_REVIEW_TEXT_SELECTOR = "[data-testid='comment-content-component']"
_REVIEW_LIKE_BUTTON_SELECTOR = "[data-testid='like-button']"
_REVIEW_HEADER_SELECTOR = "div.ui-review-capability-comments__comment__header"
_REVIEW_DATE_SELECTOR = "span.ui-review-capability-comments__comment__date"
_REVIEW_RATING_TEXT_SELECTOR = "div.ui-review-capability-comments__comment__rating > p.andes-visually-hidden"
_REVIEW_STAR_SVG_SELECTOR = "div.ui-review-capability-comments__comment__rating svg.ui-review-capability-comments__comment__rating_star"


def _find_optional(parent, selector):
    """Return the first descendant of ``parent`` matching ``selector``, or None."""
    try:
        return parent.find_element(By.CSS_SELECTOR, selector)
    except NoSuchElementException:
        return None


def _extract_star_rating(header_element):
    """Return the star rating found in a review header as an int, or "N/A"."""
    rating_element = _find_optional(header_element, _REVIEW_RATING_TEXT_SELECTOR)
    if rating_element is None:
        # No rating text at all: fall back to counting the star icons.
        star_svgs = header_element.find_elements(By.CSS_SELECTOR, _REVIEW_STAR_SVG_SELECTOR)
        return len(star_svgs) if star_svgs else "N/A"

    rating_text = (rating_element.text or "").strip()
    if not rating_text:
        # NOTE(review): the rating paragraph is a "visually hidden" node, for
        # which the rendered-text lookup may be empty; textContent is the
        # usual fallback -- confirm against the live page.
        rating_text = (rating_element.get_attribute("textContent") or "").strip()
    rating_match = re.search(r'(\d+)', rating_text)
    return int(rating_match.group(1)) if rating_match else "N/A"


def _extract_review(review_element, position):
    """Build the record for one review element; missing parts keep their defaults."""
    review = {"index": position, "review": "N/A", "likes": 0, "stars": "N/A", "date": "N/A"}

    text_element = _find_optional(review_element, _REVIEW_TEXT_SELECTOR)
    if text_element is not None:
        review["review"] = text_element.text.strip()

    like_button = _find_optional(review_element, _REVIEW_LIKE_BUTTON_SELECTOR)
    if like_button is not None:
        like_match = re.search(r'\d+', like_button.text.strip())
        review["likes"] = int(like_match.group(0)) if like_match else 0

    header_element = _find_optional(review_element, _REVIEW_HEADER_SELECTOR)
    if header_element is not None:
        date_element = _find_optional(header_element, _REVIEW_DATE_SELECTOR)
        if date_element is not None:
            review["date"] = date_element.text.strip()
        review["stars"] = _extract_star_rating(header_element)

    return review


def _scroll_reviews_to_end(driver, max_attempts, pause_range):
    """Scroll the current document to the bottom until its height stops growing."""
    print("Scrolling down to load more reviews within the iframe...")
    last_height = driver.execute_script("return document.body.scrollHeight")
    for attempt in range(1, max_attempts + 1):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.uniform(*pause_range))  # randomised pause between scrolls
        new_height = driver.execute_script("return document.body.scrollHeight")
        print(f"Scroll attempt {attempt}, new height: {new_height}, last height: {last_height}")
        # A height of 0 is not treated as "end of content"; keep trying.
        if new_height == last_height and new_height > 0:
            print("Scroll height did not change, assuming end of content.")
            break
        last_height = new_height
    print("Finished scrolling within the iframe.")


def scrape_reviews(driver, wait, max_scroll_attempts=15, scroll_pause_range=(5, 10)):
    """
    Clicks 'See More', switches to the reviews iframe, scrolls, and scrapes reviews.

    A review element that goes stale while it is being read is skipped, so a
    single detached node no longer discards every review that follows it.

    Args:
        driver: The Selenium WebDriver instance.
        wait: The WebDriverWait instance.
        max_scroll_attempts (int): Upper bound on scroll iterations inside the iframe.
        scroll_pause_range (tuple): (min, max) seconds to sleep after each scroll;
            the actual pause is drawn uniformly from this range.

    Returns:
        A list of dictionaries with keys 'index' (1-based position in the
        iframe), 'review', 'likes', 'stars' and 'date'. Whatever was collected
        before an error is returned; the list is empty on early failure.
    """
    reviews_data = []
    iframe_switched = False  # True once the driver context is inside the iframe

    try:
        # --- Click the "Mostrar todas las opiniones" button (best effort) ---
        try:
            print("\nAttempting to find and click 'Mostrar todas las opiniones' button...")
            show_more_button = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, _SEE_MORE_BUTTON_SELECTOR))
            )
            # Scroll the button into view first so the click is less likely to be intercepted.
            driver.execute_script("arguments[0].scrollIntoView(true);", show_more_button)
            time.sleep(0.5)
            show_more_button.click()
            print("'Mostrar todas las opiniones' button clicked.")
            time.sleep(7)  # give the iframe time to load
        except Exception as e:
            # Not fatal: the reviews may already be loaded or the layout changed.
            print(f"INFO: Could not find or click the 'Mostrar todas las opiniones' button: {e}. Trying to proceed anyway...")

        # --- Switch to the reviews iframe ---
        print(f"Attempting to switch to iframe with selector: {_REVIEWS_IFRAME_SELECTOR}")
        wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, _REVIEWS_IFRAME_SELECTOR)))
        iframe_switched = True  # set only AFTER a successful switch
        print("Switched to reviews iframe.")

        _scroll_reviews_to_end(driver, max_scroll_attempts, scroll_pause_range)

        # --- Locate review elements ---
        print(f"Waiting for review elements within the iframe ({_REVIEW_ITEM_SELECTOR})...")
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, _REVIEW_ITEM_SELECTOR)))
        review_elements = driver.find_elements(By.CSS_SELECTOR, _REVIEW_ITEM_SELECTOR)
        print(f"Found {len(review_elements)} potential user review elements inside iframe.")

        # --- Extract data ---
        print("Starting data extraction...")
        for position, review_element in enumerate(review_elements, start=1):
            try:
                reviews_data.append(_extract_review(review_element, position))
            except StaleElementReferenceException:
                print(f"Review {position} went stale while being read; skipping it.")
        print(f"Data extraction complete. Found {len(reviews_data)} reviews.")

    except Exception as e:
        # Keep whatever was collected before the failure.
        print(f"An error occurred during review scraping: {e}")

    finally:
        # Always leave the driver in the top-level document for the caller.
        if iframe_switched:
            print("Switching back to default content.")
            driver.switch_to.default_content()

    return reviews_data

def get_urls_to_scrape(csv_filepaths):
    """
    Reads one or more CSV files, keeps the rows where 'JM' is 0 and a URL is
    present, and returns them as a list of dictionaries with 'title' and 'url'.

    Args:
        csv_filepaths (list, tuple or str): A list/tuple of paths, or a single
            path, to your CSV file(s).

    Returns:
        list: A list of dictionaries [{'title': ..., 'url': ...}] in file/row
              order. Files that are missing, unreadable or lack a required
              column are reported and skipped, so the list may be empty.
    """
    all_products_to_scrape = []
    required_columns = {'url', 'JM', 'title'}

    # Lists and tuples are collections of paths; anything else is one path.
    if not isinstance(csv_filepaths, (list, tuple)):
        csv_filepaths = [csv_filepaths]

    for csv_filepath in csv_filepaths:
        try:
            df = pd.read_csv(csv_filepath)
            print(f"Successfully read {csv_filepath}. Found {len(df)} rows.")

            # --- Input validation ---
            if not required_columns.issubset(df.columns):
                print(f"Error: CSV '{csv_filepath}' must contain 'url', 'JM', and 'title' columns. Skipping this file.")
                continue

            # --- Filtering ---
            # Non-numeric JM values become NaN and therefore never equal 0.
            jm_flag = pd.to_numeric(df['JM'], errors='coerce')
            wanted = jm_flag == 0
            # A row without a URL cannot be loaded by the scraper, so drop it here.
            has_url = df['url'].notna() & df['url'].astype(str).str.strip().ne('')
            missing_url_count = int((wanted & ~has_url).sum())
            if missing_url_count:
                print(f"Ignoring {missing_url_count} row(s) with JM = 0 but no URL.")

            products = df.loc[wanted & has_url, ['title', 'url']].to_dict('records')
            print(f"Found {len(products)} products with JM = 0 in this file.")
            all_products_to_scrape.extend(products)

        except FileNotFoundError:
            print(f"Error: CSV file not found at {csv_filepath}")
            continue
        except Exception as e:
            print(f"An error occurred while reading or filtering '{csv_filepath}': {e}")
            continue

    print(f"\nFound a total of {len(all_products_to_scrape)} products to scrape across all files.")
    return all_products_to_scrape

def save_to_csv(scraped_data, output_filename="mercado_libre_results.csv", output_directory=None):
    """
    Flattens the scraped data and saves it to a CSV file.

    Each product contributes one row per review; a product without reviews
    contributes a single row whose review columns are 'N/A'. Missing product
    fields default to 'N/A', except 'rating' (0.0) and 'num_reviews' (0).

    Args:
        scraped_data (list): Product dictionaries with optional keys 'url',
            'title', 'price', 'condition', 'rating', 'num_reviews' and
            'reviews' (a list of dicts with 'index', 'review', 'likes',
            'stars', 'date').
        output_filename (str): Name of the CSV file to write.
        output_directory (str): Directory to write into; it is created when
            missing. Defaults to "data/results" relative to the current
            working directory.

    Returns:
        None. Nothing is written when there is no data; write errors are
        reported with a message instead of being raised.
    """
    if output_directory is None:
        output_directory = os.path.join("data", "results")
    full_output_path = os.path.join(output_directory, output_filename)

    flattened_list = []
    print("\nPreparing data for saving...")

    for product in scraped_data:
        product_columns = {
            'url': product.get('url', 'N/A'),
            'title': product.get('title', 'N/A'),
            'price': product.get('price', 'N/A'),
            'condition': product.get('condition', 'N/A'),
            'rating': product.get('rating', 0.0),
            'num_reviews': product.get('num_reviews', 0),
        }
        # An empty placeholder review yields one row with 'N/A' review columns.
        for review in (product.get('reviews') or [{}]):
            flattened_list.append({
                **product_columns,
                'review_index': review.get('index', 'N/A'),
                'review_text': review.get('review', 'N/A'),
                'review_likes': review.get('likes', 'N/A'),
                'review_stars': review.get('stars', 'N/A'),
                'review_date': review.get('date', 'N/A'),
            })

    if not flattened_list:
        print("No data to save.")
        return

    df = pd.DataFrame(flattened_list)

    try:
        os.makedirs(output_directory, exist_ok=True)
        print(f"Ensured directory exists: {output_directory}")
        # utf-8-sig writes a BOM at the start of the file.
        df.to_csv(full_output_path, index=False, encoding='utf-8-sig')
        print(f"Successfully saved {len(df)} rows to {full_output_path}")
    except Exception as e:
        print(f"Error saving data to CSV at {full_output_path}: {e}")

if __name__ == "__main__":
    # Scrapes price, condition, rating and reviews for every product with
    # JM = 0 in the classified CSV, retrying once after a rate-limit pause.
    # Results are written by save_to_csv and the browser is closed even when
    # the run is interrupted or fails part-way through.

    # --- Setup ---
    all_scraped_data = []
    try:
        options = webdriver.ChromeOptions()
        options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36")
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=options)
        print("Chrome WebDriver initialized.")
        wait = WebDriverWait(driver, 20)
        wait_short = WebDriverWait(driver, 5)  # For status check
    except Exception as e:
        print(f"Error initializing Chrome WebDriver: {e}")
        # exit() is a helper meant for the interactive shell; raise the
        # underlying exception instead.
        raise SystemExit(1) from e

    try:
        # --- Get Products ---
        csv_files = ["../data/URLs/classified/mercadolibre_products_reviews_links_iphone_combined_classified.csv"]  # Adjust path
        products_to_process = get_urls_to_scrape(csv_files)
        cookie_accept_button_selector = "button[data-testid='action:understood-button']"

        # --- Main Loop Config ---
        rate_limit_pause_seconds = 120  # 2 minutes
        max_retries = 1  # One initial attempt plus one retry after the pause.

        # --- Main Loop ---
        for product_data in products_to_process:
            url = product_data['url']
            title = product_data['title']
            attempts = 0
            # Kept outside the while loop so it survives 'continue': the next
            # attempt needs to know the previous attempt's status.
            status = "INIT"

            print(f"\n=======================================")
            print(f"Processing: {title} | URL: {url}")
            print(f"=======================================")

            while attempts <= max_retries:
                attempts += 1
                print(f"--- Attempt {attempts} ---")

                try:
                    # Load the page, unless this is the retry that follows a
                    # click on the rate-limit page's reload button (the click
                    # itself already reloaded it).
                    if not (attempts > 1 and status == "RATE_LIMITED_BUTTON"):
                        driver.get(url)
                        time.sleep(random.uniform(2, 4))
                        cookie_handler_button(driver, cookie_accept_button_selector, wait_time=10)

                    status = check_page_status(driver, wait_short)

                    # Handle rate limiting
                    if status == "RATE_LIMITED_BUTTON":
                        if attempts > max_retries:
                            break  # Already retried
                        print(f"--- RATE LIMITED (Button). Waiting {rate_limit_pause_seconds}s and clicking 'Ricarica'...")
                        time.sleep(rate_limit_pause_seconds)
                        try:
                            driver.find_element(By.ID, "reload-button").click()
                            time.sleep(5)  # Wait for the reload triggered by the click
                            continue  # Next attempt (status is preserved)
                        except Exception as click_err:
                            print(f"Error clicking reload button: {click_err}")
                            status = "CRITICAL_ERROR"
                            break

                    elif status == "RATE_LIMITED_JSON":
                        if attempts > max_retries:
                            break  # Already retried
                        print(f"--- RATE LIMITED (JSON). Waiting {rate_limit_pause_seconds}s and reloading URL...")
                        time.sleep(rate_limit_pause_seconds)
                        continue  # Next attempt (will call driver.get)

                    # Not rate-limited: stop retrying.
                    break

                except Exception as e:
                    print(f"--- CRITICAL ERROR during attempt {attempts} for {url}: {e} ---")
                    status = "CRITICAL_ERROR"
                    break

            # --- Process the FINAL status after all attempts ---
            if status == "OK":
                product_info = find_product_details(driver, wait)
                if product_info:
                    reviews = scrape_reviews(driver, wait)
                    product_info['url'] = url
                    product_info['title'] = title
                    product_info['reviews'] = reviews
                    all_scraped_data.append(product_info)
                    print(f"--- SUCCESS: Scraped {url} ---")
                else:
                    print(f"--- FAILED (Layout?): Could not scrape details for OK page {url} ---")
                    all_scraped_data.append({"url": url, "title": title, "price": "FAILED_LAYOUT", "condition": "FAILED_LAYOUT", "reviews": []})

            elif status.startswith("RATE_LIMITED"):
                print(f"--- FAILED (Rate Limited After Retries): Flagging {url} ---")
                all_scraped_data.append({"url": url, "title": title, "price": "RATE_LIMITED", "condition": "RATE_LIMITED", "reviews": []})

            elif status == "UNKNOWN_ERROR":
                print(f"--- FAILED (Unknown Error): Skipping {url} ---")
                all_scraped_data.append({"url": url, "title": title, "price": "UNKNOWN_ERROR", "condition": "UNKNOWN_ERROR", "reviews": []})

            elif status == "CRITICAL_ERROR":
                print(f"--- FAILED (Critical Error): Skipping {url} ---")
                all_scraped_data.append({"url": url, "title": title, "price": "CRITICAL_ERROR", "condition": "CRITICAL_ERROR", "reviews": []})

            # Delay before the next URL (skipped after a 429, where we already waited)
            if not status.startswith("RATE_LIMITED"):
                delay = random.uniform(8, 20)
                print(f"Waiting for {delay:.1f} seconds before next URL...")
                time.sleep(delay)

    finally:
        # Runs on normal completion, on errors and on KeyboardInterrupt, so a
        # long run that is stopped early still keeps the rows scraped so far
        # and does not leave a Chrome process behind.
        try:
            # --- Results ---
            save_to_csv(all_scraped_data)
        finally:
            driver.quit()
            print("Driver closed.")

#url = "https://www.mercadolibre.com.mx/apple-iphone-15-plus-128gb-sim-fisica/up/MLMU3179137272#reviews"

Chrome WebDriver initialized.
Successfully read ../data/URLs/classified/mercadolibre_products_reviews_links_iphone_combined_classified.csv. Found 1196 rows.
Found 529 products with JM = 0 in this file.

Found a total of 529 products to scrape across all files.

Processing: Aiek A8 Pro-a 3.0-inch Mini 3g Red Phone 2+16g Android 9.0 Niños Teléfono Móvil Regalo De Cumpleaños Dar Pegatinas | URL: https://click1.mercadolibre.com.mx/mclics/clicks/external/MLM/count#reviews
--- Attempt 1 ---
Checking for cookie banner (waiting up to 10s for it to be CLICKABLE)...
Cookie button found and clickable. Attempting to click...
Cookie consent banner dismissed via standard click.
Page Status: UNKNOWN_ERROR (Timeout finding OK element, no 429 found)
--- FAILED (Unknown Error): Skipping https://click1.mercadolibre.com.mx/mclics/clicks/external/MLM/count#reviews ---
Waiting for 14.0 seconds before next URL...

Processing: Teléfono Para Ancianos Flip 4g De Doble Pantalla, Teclado Grande, Volumen Alto, Sos

KeyboardInterrupt: 

# Product Scraping

In [None]:
data =[]

# --- Setup ---
try:
    service = Service(ChromeDriverManager().install())
    print("Chrome WebDriver installed and initialized using WebDriver Manager.")
except Exception as e:
    print(f"Error initializing Chrome WebDriver with WebDriver Manager: {e}")
    print("Please ensure Chrome is installed and webdriver-manager is correctly installed (`pip install webdriver-manager`).")
    exit()

options = webdriver.ChromeOptions()
# options.add_argument('--headless') # Headless might make iframe interaction tricky, keep it off for now
# options.add_argument('--no-sandbox')
# options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(service=service, options=options)

url_base = "https://www.mercadolibre.com.mx/apple-iphone-13-128-gb-blanco-estelar-distribuidor-autorizado/p/MLM1018500855#reviews"


driver.get(url_base)

wait = WebDriverWait(driver, 20) # Increased wait time again

# --- Handle Cookie Consent Banner ---
cookie_accept_button_selector = "[data-testid='action:understood-button']"

try:
    print("Checking for and attempting to dismiss cookie consent banner...")
    time.sleep(3)
    cookie_buttons = driver.find_elements(By.CSS_SELECTOR, cookie_accept_button_selector)
    if cookie_buttons:
        cookie_buttons[0].click()
        print("Cookie consent banner dismissed.")
        time.sleep(3)
    else:
        print("No cookie consent banner found or dismissed.")
except Exception as e:
    print(f"Could not dismiss cookie consent banner: {e}")


# --- Click the "Mostrar todas las opiniones" button ---
show_more_button_selector = "[data-testid='see-more']"
try:
    print("Attempting to find and click 'Mostrar todas las opiniones' button...")
    show_more_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, show_more_button_selector)))
    show_more_button.click()
    print("'Mostrar todas las opiniones' button clicked.")
    # Give time for the iframe to load after the button click
    time.sleep(7)
except Exception as e:
    print(f"Could not find or click the 'Mostrar todas las opiniones' button: {e}")
    driver.quit() # Exit if we can't even get to the reviews

# --- Switch to the Reviews Iframe ---
iframe_selector = "[data-testid='ui-pdp-iframe-reviews']"
try:
    print(f"Attempting to switch to iframe with selector: {iframe_selector}")
    # Wait for the iframe to be present
    wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, iframe_selector)))
    print("Switched to reviews iframe.")
    # Now we are inside the iframe's document context
except Exception as e:
    print(f"Could not switch to reviews iframe using selector '{iframe_selector}': {e}")
    driver.quit() # Exit if we can't access the reviews iframe


# --- IMPORTANT: Scroll to load reviews within the Iframe ---
# We need to scroll the document *within the iframe*.
try:
    print("Scrolling down to load more reviews within the iframe...")
    # Execute script on the iframe's document body
    last_height = driver.execute_script("return document.body.scrollHeight")
    scroll_attempts = 0
    while scroll_attempts < 15:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        current_time_scroll = current_sleep_duration = random.uniform(5, 10)
        time.sleep(current_time_scroll)
        new_height = driver.execute_script("return document.body.scrollHeight")

        print(f"Scroll attempt {scroll_attempts + 1}, current height: {driver.execute_script('return window.scrollY')}, new height: {new_height}, last height: {last_height}") # More detailed scroll debug

        if new_height == last_height:
            print("Scroll height did not change, assuming end of content or scrolling failed.")
            # You might need to find a specific scrollable element inside the iframe
            # if document.body scrolling isn't working.
            break
        last_height = new_height
        scroll_attempts += 1
    print("Finished scrolling within the iframe.")

except Exception as e:
    print(f"Could not perform scrolling within the iframe: {e}")


# --- Locate Review Elements (NOW INSIDE THE IFRAME) ---
review_elements_selector = "[data-testid='comment-component']" 
review_text_element_selector = "[data-testid='comment-content-component']" #
like_button_selector = "[data-testid='like-button']" 

# --- Define the selector for the header component within each review ---
header_element_selector = "div.ui-review-capability-comments__comment__header"
date_selector_within_header = "span.ui-review-capability-comments__comment__date"
rating_text_selector_within_header = "div.ui-review-capability-comments__comment__rating > p.andes-visually-hidden"



# Collect every review container that is present in the current (iframe)
# context. Any failure is reported and leaves an empty list, so the
# extraction step below simply has nothing to iterate over.
try:
    print(f"Waiting for review elements within the iframe with selector: {review_elements_selector}")
    review_locator = (By.CSS_SELECTOR, review_elements_selector)

    # Block until at least one review container exists in the DOM.
    wait.until(EC.presence_of_element_located(review_locator))
    print("User review elements selector found inside iframe. Attempting to find all matching elements.")

    # Gather all matches, not only the first one the wait observed.
    review_elements = driver.find_elements(*review_locator)
    print(f"Found {len(review_elements)} potential user review elements inside iframe AFTER scrolling.")

except Exception as lookup_error:
    print(f"Could not find user review elements inside iframe using selector '{review_elements_selector}' after scrolling: {lookup_error}")
    review_elements = []


# --- Extract Data from Each Review Element (INSIDE IFRAME) ---
# Every field is extracted on a best-effort basis: a review that lacks a
# field keeps that field's default ("N/A", or 0 for likes) instead of
# aborting the whole run.

# SVG stars drawn inside a review header. Counting them is the fallback used
# when the textual rating cannot be read.
star_svg_selector_within_header = (
    "div.ui-review-capability-comments__comment__rating "
    "svg.ui-review-capability-comments__comment__rating_star"
)


def _first_int(text, default):
    """Return the first run of digits in ``text`` as an int, or ``default``."""
    match = re.search(r"\d+", text or "")
    return int(match.group(0)) if match else default


def _rendered_or_dom_text(element):
    """Return the element's rendered text, falling back to its DOM text.

    ``WebElement.text`` reports rendered text only. The rating selector
    targets a ``p.andes-visually-hidden`` element, for which the rendered
    text may be empty, so the ``textContent`` property is used as a fallback.
    """
    rendered = (element.text or "").strip()
    if rendered:
        return rendered
    return (element.get_property("textContent") or "").strip()


def _star_rating_from_header(header_element):
    """Return the star rating found in a review header, or "N/A".

    The first number in the rating text is preferred. If the text element is
    missing or holds no number, the star SVGs in the header are counted
    instead; a count of zero leaves the rating as "N/A".
    """
    try:
        rating_text_element = header_element.find_element(By.CSS_SELECTOR, rating_text_selector_within_header)
        parsed_rating = _first_int(_rendered_or_dom_text(rating_text_element), None)
        if parsed_rating is not None:
            return parsed_rating
    except Exception:
        # Rating text not available for this review; try the SVG count below.
        pass

    try:
        star_svgs = header_element.find_elements(By.CSS_SELECTOR, star_svg_selector_within_header)
    except Exception:
        return "N/A"
    return len(star_svgs) if star_svgs else "N/A"


reviews_data = []

if review_elements:
    print("Starting data extraction from elements inside iframe...")
    for index, review_element in enumerate(review_elements):
        review_text = "N/A"
        like_count = 0
        review_date = "N/A"
        star_rating = "N/A"

        # Review text, searched within the current review element only.
        try:
            review_text_element = review_element.find_element(By.CSS_SELECTOR, review_text_element_selector)
            review_text = review_text_element.text.strip()
        except Exception:
            pass  # No text element for this review; keep "N/A".

        # Like count: first number in the like button's text, 0 if none.
        # NOTE(review): the like-button selector is unverified inside the
        # iframe according to the original author - confirm against the page.
        try:
            like_button_element = review_element.find_element(By.CSS_SELECTOR, like_button_selector)
            like_count = _first_int(like_button_element.text.strip(), 0)
        except Exception:
            like_count = 0

        # Date and rating both live inside the header, so locate it first.
        try:
            header_element = review_element.find_element(By.CSS_SELECTOR, header_element_selector)
        except Exception:
            header_element = None  # No header: date and rating stay "N/A".

        if header_element is not None:
            try:
                date_element = header_element.find_element(By.CSS_SELECTOR, date_selector_within_header)
                review_date = date_element.text.strip()
            except Exception:
                pass  # No date element for this review; keep "N/A".

            star_rating = _star_rating_from_header(header_element)

        reviews_data.append({
            "index": index,
            "review": review_text,
            "likes": like_count,
            "stars": star_rating,
            "date": review_date
        })
    print("Data extraction complete.")
else:
    print("No user review elements found inside iframe to extract data from after attempting to scroll.")


# --- Return to the top-level document ---
# Leave the iframe context so later driver calls address the main page again.
# A failure here is only reported; it does not stop the rest of the cell.
try:
    driver.switch_to.default_content()
    print("Switched back to default content.")
except Exception as switch_error:
    print(f"Could not switch back to default content: {switch_error}")


# --- Process or Store the Data, then Close the Browser ---
# Printing and exporting run inside ``try`` so that the browser is always
# closed by the ``finally`` clause, even when the export fails (for example
# because the Excel writer engine is missing or the output file is locked).
try:
    print("\n--- Extracted User Reviews ---")
    if reviews_data:
        for position, item in enumerate(reviews_data, start=1):
            print(f"Review {position}:")
            print(f"   Text: {item['review']}")
            print(f"   Likes: {item['likes']}")
            print(f"   Stars: {item['stars']}")
            print(f"   Date: {item['date']}")
            print("-" * 20)
    else:
        print("No data was extracted.")

    # One row per review; columns follow the keys of the collected dicts.
    df = pd.DataFrame(reviews_data)
    df.to_excel('review_data.xlsx', index=False) # Saving the data in an excel sheet
finally:
    # --- Close the Browser ---
    print("Closing browser.")
    driver.quit()

Optional: use a random user agent

In [None]:
# Send a randomly chosen user agent with the browser session to avoid 429 errors.
# Pool of user-agent strings to pick from.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0',
]

# Pick one entry per run and hand it to Chrome as a command-line argument.
selected_user_agent = random.choice(user_agents)
options = webdriver.ChromeOptions()
options.add_argument(f"user-agent={selected_user_agent}")
