In [2]:
pip install --user webdriver_manager

Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver_manager
Successfully installed webdriver_manager-4.0.2
Note: you may need to restart the kernel to use updated packages.


In [None]:
import random
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Scraping product links

In [None]:
# Module-level accumulators for this cell.
data = []            # not referenced by the code visible in this cell
extracted_data = []  # product_finder appends {'title': ..., 'url': ...} dicts here

def cookie_handler_button(driver, cookie_accept_button_selector):
    """Click the first element matching the cookie-banner selector, if any.

    The function is best effort: any exception raised while looking for or
    clicking the button is printed and swallowed.

    Args:
        driver: Selenium WebDriver with the target page already loaded.
        cookie_accept_button_selector: CSS selector of the banner's accept button.

    Returns:
        None in every case.
    """
    try:
        print("Checking for and attempting to dismiss cookie consent banner...")
        time.sleep(3)  # fixed pause before looking for the banner
        matching_buttons = driver.find_elements(By.CSS_SELECTOR, cookie_accept_button_selector)
        if not matching_buttons:
            print("No cookie consent banner found or dismissed.")
        else:
            matching_buttons[0].click()
            print("Cookie consent banner dismissed.")
            time.sleep(3)  # fixed pause after the click
    except Exception as banner_error:
        print(f"Could not dismiss cookie consent banner: {banner_error}")


def product_finder(driver, url, max_pages_to_scrape=5):
    """Collect the title and link of every product card on the page loaded in ``driver``.

    Each product whose title link is found is appended to the module-level
    ``extracted_data`` list as ``{'title': ..., 'url': ...}``. Failures are
    printed instead of raised, so one bad card or page does not stop the caller.

    The body reads these module-level names, which must be defined before the
    call: ``wait``, ``product_container_selector``, ``single_product_selector``
    and ``extracted_data``.

    Args:
        driver: Selenium WebDriver positioned on a search-results page.
        url: Not used by the body; kept so existing callers keep working.
        max_pages_to_scrape: Not used by the body; kept for the same reason.

    Returns:
        None in every case.
    """
    try:
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, product_container_selector)))
        print("Product container found. Attempting to find all matching elements.")
        # Query the product cards once; the result is reused for the whole loop.
        product_elements_list = driver.find_elements(By.CSS_SELECTOR, single_product_selector)

        if not product_elements_list:
            print("No products found matching the selector.")
            return None

        print(f"Found {len(product_elements_list)} products.")
        for index, product_element in enumerate(product_elements_list):
            print(f"\n--- Product {index + 1} ---")

            try:
                # The title anchor carries both the product URL (href) and its visible title.
                link_element = product_element.find_element(By.CSS_SELECTOR, 'a.poly-component__title')
                product_url = link_element.get_attribute('href')
                product_title = link_element.text

                print(f"Title: {product_title}")
                print(f"URL: {product_url}")

                extracted_data.append({'title': product_title, 'url': product_url})

            except NoSuchElementException:
                print("Could not find the title/link element for this product.")
            except Exception as e_inner:
                print(f"An error occurred while extracting details for this product: {e_inner}")

    except Exception as e:
        print(f"Error in product_finder: {e}")

    return None

if __name__ == "__main__":
    # --- Setup ---
    try:
        service = Service(ChromeDriverManager().install())
        print("Chrome WebDriver installed and initialized using WebDriver Manager.")
    except Exception as e:
        print(f"Error initializing Chrome WebDriver with WebDriver Manager: {e}")
        print("Please ensure Chrome is installed and webdriver-manager is correctly installed (`pip install webdriver-manager`).")
        # Re-raise rather than calling exit(): exit() ends the whole interpreter/kernel
        # session, while re-raising stops only this run and keeps the original traceback.
        raise

    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=service, options=options)

    # Everything that needs the browser runs inside try/finally so the Chrome
    # session is always closed, even if scraping fails part-way through.
    try:
        wait = WebDriverWait(driver, 20)  # shared explicit wait, also read by product_finder
        url = "https://listado.mercadolibre.com.mx/distribuidor-autorizado_Desde_49_NoIndex_True?sb=all_mercadolibre"
        driver.get(url)

        # Selectors (product_finder reads the product ones as module-level names)
        cookie_accept_button_selector = "[data-testid='action:understood-button']"
        product_container_selector = "ol.ui-search-layout"
        single_product_selector = 'li.ui-search-layout__item'
        next_page_button_selector = "a.andes-pagination__link[title='Siguiente']"

        cookie_handler_button(driver, cookie_accept_button_selector)

        # --- PAGINATION LOGIC ---
        max_pages_to_scrape_total = 3
        current_page_count = 0

        while current_page_count < max_pages_to_scrape_total:
            current_page_count += 1
            print(f"\n--- Scraping Page {current_page_count} ---")

            if current_page_count > 1:
                # Pages after the first were reached by clicking "next"; pause so the page can settle.
                time.sleep(3)
            product_finder(driver, driver.current_url)

            if current_page_count >= max_pages_to_scrape_total:
                print(f"Reached maximum of {max_pages_to_scrape_total} pages to scrape.")
                break

            # --- Navigate to the Next Page ---
            print(f"\nAttempting to navigate from page {current_page_count} to next page...")
            try:
                print("Scrolling to bottom of page to find 'Next' button...")
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2.5)  # allow time for lazy loading or for the button to become interactable

                print(f"Looking for next page button with CSS selector: '{next_page_button_selector}'")
                next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, next_page_button_selector)))

                # Centre the button in the viewport before clicking it.
                driver.execute_script("arguments[0].scrollIntoView({behavior: 'auto', block: 'center', inline: 'center'});", next_button)
                time.sleep(1)

                print("Clicking next page button...")
                next_button.click()

                print(f"Successfully clicked 'Next'. Waiting for page {current_page_count + 1} to load...")
                # Fixed pause; product_finder additionally waits for the product container.
                time.sleep(5)

            except TimeoutException:
                print(f"Next page button not found or not clickable after waiting (selector: '{next_page_button_selector}'). Ending pagination.")
                break
            except NoSuchElementException:
                print(f"Next page button element not found (selector: '{next_page_button_selector}'). Ending pagination.")
                break
            except Exception as e:
                print(f"An error occurred while trying to navigate to the next page: {e}")
                break

        # --- END OF PAGINATION LOGIC ---

        # After the loop, print all collected data
        print("\n--- All Extracted Data ---")
        if extracted_data:
            for item_index, item_data in enumerate(extracted_data):
                print(f"Item {item_index + 1}: {item_data}")
    finally:
        # --- Close the Browser ---
        print("Closing browser.")
        driver.quit()


## Refine extracted URLs

In [None]:
def _to_reviews_url(url):
    """Point a product URL at its reviews section.

    The URL is cut at whichever comes first, the query string ('?') or the
    '#polycard' fragment, and '#reviews' is appended. Cutting at the earlier
    marker matters when a '?' sits inside the '#polycard' fragment: cutting at
    that '?' would keep the fragment and produce two '#' parts.

    Args:
        url: Non-empty product URL.

    Returns:
        A ``(new_url, action)`` tuple; ``action`` is "processed '?'",
        "processed '#polycard'" or "unchanged".
    """
    query_start_index = url.find('?')
    polycard_start_index = url.find('#polycard')

    polycard_comes_first = polycard_start_index != -1 and (
        query_start_index == -1 or polycard_start_index < query_start_index
    )
    if polycard_comes_first:
        return url[:polycard_start_index] + '#reviews', "processed '#polycard'"
    if query_start_index != -1:
        return url[:query_start_index] + '#reviews', "processed '?'"
    return url, "unchanged"


print("\nProcessing URLs")

for item in extracted_data:
    original_url = item.get('url', '')
    if not original_url:
        # Nothing to rewrite for entries without a URL.
        continue

    modified_url, processed_action = _to_reviews_url(original_url)

    # Update the item's URL in place only if it actually changed.
    if original_url != modified_url:
        item['url'] = modified_url
        print(f"  Original: {original_url}")
        print(f"  Modified: {item['url']} (Action: {processed_action})")
    else:
        print(f"  URL unchanged: {original_url}")

print("URL processing complete.")

print("\n--- All Extracted Data (with refined URL modifications) ---")
if extracted_data:
    for item_index, item_data in enumerate(extracted_data):
        print(f"Item {item_index + 1}: {item_data}")

    # Optional: save to CSV
    df = pd.DataFrame(extracted_data)
    try:
        df.to_csv('mercadolibre_products_reviews_links_v2.csv', index=False, encoding='utf-8')
        print("\nData with refined URLs saved to mercadolibre_products_reviews_links_v2.csv")
    except Exception as e:
        print(f"Error saving data to CSV: {e}")
else:
    # Paired with the `if extracted_data` check, so it is reported only when the list is empty.
    print("No data was extracted.")

# Product Scraping

In [2]:
data = []

# --- Setup ---
try:
    service = Service(ChromeDriverManager().install())
    print("Chrome WebDriver installed and initialized using WebDriver Manager.")
except Exception as e:
    print(f"Error initializing Chrome WebDriver with WebDriver Manager: {e}")
    print("Please ensure Chrome is installed and webdriver-manager is correctly installed (`pip install webdriver-manager`).")
    # Re-raise rather than calling exit(): exit() ends the whole interpreter/kernel
    # session, while re-raising stops only this cell and keeps the original traceback.
    raise

options = webdriver.ChromeOptions()
# options.add_argument('--headless') # Headless might make iframe interaction tricky, keep it off for now
# options.add_argument('--no-sandbox')
# options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(service=service, options=options)

# Product page to scrape; the '#reviews' fragment targets the reviews section.
url_base = "https://www.mercadolibre.com.mx/apple-iphone-13-128-gb-blanco-estelar-distribuidor-autorizado/p/MLM1018500855#reviews"

driver.get(url_base)

# Shared explicit wait (20 s timeout) used by the steps that follow.
wait = WebDriverWait(driver, 20)

# --- Handle Cookie Consent Banner ---
# Best effort: click the first matching accept button if there is one; any
# failure is printed and the cell carries on.
cookie_accept_button_selector = "[data-testid='action:understood-button']"

try:
    print("Checking for and attempting to dismiss cookie consent banner...")
    time.sleep(3)  # fixed pause before looking for the banner
    cookie_buttons = driver.find_elements(By.CSS_SELECTOR, cookie_accept_button_selector)
    if not cookie_buttons:
        print("No cookie consent banner found or dismissed.")
    else:
        cookie_buttons[0].click()
        print("Cookie consent banner dismissed.")
        time.sleep(3)  # fixed pause after the click
except Exception as banner_error:
    print(f"Could not dismiss cookie consent banner: {banner_error}")


# --- Click the "Mostrar todas las opiniones" button ---
show_more_button_selector = "[data-testid='see-more']"
try:
    print("Attempting to find and click 'Mostrar todas las opiniones' button...")
    show_more_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, show_more_button_selector)))
    show_more_button.click()
    print("'Mostrar todas las opiniones' button clicked.")
    # Give time for the iframe to load after the button click
    time.sleep(7)
except Exception as e:
    print(f"Could not find or click the 'Mostrar todas las opiniones' button: {e}")
    driver.quit()
    # The browser is closed now, so nothing below can work; re-raise to stop
    # the cell here instead of running the remaining steps against a dead session.
    raise

# --- Switch to the Reviews Iframe ---
iframe_selector = "[data-testid='ui-pdp-iframe-reviews']"
try:
    print(f"Attempting to switch to iframe with selector: {iframe_selector}")
    # Waits until the iframe is available and switches the driver's context into it.
    wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, iframe_selector)))
    print("Switched to reviews iframe.")
    # From here on, element lookups and scripts run against the iframe's document.
except Exception as e:
    print(f"Could not switch to reviews iframe using selector '{iframe_selector}': {e}")
    driver.quit()
    # The browser is closed now, so nothing below can work; re-raise to stop
    # the cell here instead of running the remaining steps against a dead session.
    raise


# --- IMPORTANT: Scroll to load reviews within the Iframe ---
# Scroll to the bottom repeatedly (at most 15 times) until the document height
# stops growing. Assuming the earlier iframe switch succeeded, the scripts
# below run against the iframe's document.
try:
    print("Scrolling down to load more reviews within the iframe...")
    last_height = driver.execute_script("return document.body.scrollHeight")
    scroll_attempts = 0
    while scroll_attempts < 15:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Randomised 5-10 s pause between scrolls.
        current_time_scroll = random.uniform(5, 10)
        time.sleep(current_time_scroll)
        new_height = driver.execute_script("return document.body.scrollHeight")
        scroll_position = driver.execute_script("return window.scrollY")

        # window.scrollY is the scroll offset, not a height, so it is labelled as such.
        print(f"Scroll attempt {scroll_attempts + 1}, scroll position: {scroll_position}, new height: {new_height}, last height: {last_height}")

        if new_height == last_height:
            print("Scroll height did not change, assuming end of content or scrolling failed.")
            # If document.body scrolling has no effect, a specific scrollable
            # element inside the iframe may need to be scrolled instead.
            break
        last_height = new_height
        scroll_attempts += 1
    print("Finished scrolling within the iframe.")

except Exception as e:
    print(f"Could not perform scrolling within the iframe: {e}")


# --- Locate Review Elements (NOW INSIDE THE IFRAME) ---
# Selectors for one review card and the parts read from it.
review_elements_selector = "[data-testid='comment-component']"
review_text_element_selector = "[data-testid='comment-content-component']"
like_button_selector = "[data-testid='like-button']"

# Selectors for the header of a review and the date / rating inside that header.
header_element_selector = "div.ui-review-capability-comments__comment__header"
date_selector_within_header = "span.ui-review-capability-comments__comment__date"
rating_text_selector_within_header = "div.ui-review-capability-comments__comment__rating > p.andes-visually-hidden"

# Wait for at least one review card, then collect all of them; on any failure
# fall back to an empty list so the extraction step has something to iterate.
review_locator = (By.CSS_SELECTOR, review_elements_selector)
try:
    print(f"Waiting for review elements within the iframe with selector: {review_elements_selector}")
    wait.until(EC.presence_of_element_located(review_locator))
    print("User review elements selector found inside iframe. Attempting to find all matching elements.")

    review_elements = driver.find_elements(*review_locator)
    print(f"Found {len(review_elements)} potential user review elements inside iframe AFTER scrolling.")
except Exception as lookup_error:
    print(f"Could not find user review elements inside iframe using selector '{review_elements_selector}' after scrolling: {lookup_error}")
    review_elements = []


# --- Extract Data from Each Review Element (INSIDE IFRAME) ---
star_svg_selector_within_header = "div.ui-review-capability-comments__comment__rating svg.ui-review-capability-comments__comment__rating_star"


def _extract_star_rating(header_element):
    """Return the star rating found in a review header, or "N/A" if none is found.

    The first integer in the rating label text is used. If the label is
    missing, has no digits, or reading it fails, the number of star SVG
    elements in the header is used instead (when there is at least one).

    Args:
        header_element: Selenium element for one review's header.

    Returns:
        An int rating, or the string "N/A".
    """
    try:
        rating_text_element = header_element.find_element(By.CSS_SELECTOR, rating_text_selector_within_header)
        full_rating_text = rating_text_element.text.strip()
        if not full_rating_text:
            # Selenium's .text returns rendered text only, and this label's selector targets a
            # visually-hidden element, so read the raw DOM text when .text comes back empty.
            full_rating_text = (rating_text_element.get_attribute("textContent") or "").strip()
        rating_match = re.search(r'(\d+)', full_rating_text)  # first number in the label
        if rating_match:
            return int(rating_match.group(1))
    except Exception:
        pass  # best effort: fall through to the SVG count below

    try:
        star_svgs = header_element.find_elements(By.CSS_SELECTOR, star_svg_selector_within_header)
        if star_svgs:
            return len(star_svgs)
    except Exception:
        pass  # best effort: report "N/A" below

    return "N/A"


reviews_data = []

if review_elements:
    print("Starting data extraction from elements inside iframe...")
    for index, review_element in enumerate(review_elements):
        # Defaults used whenever a piece of the review cannot be read.
        review_text = "N/A"
        like_count = 0
        review_date = "N/A"
        star_rating = "N/A"

        # Review text (looked up inside the current review element).
        try:
            review_text_element = review_element.find_element(By.CSS_SELECTOR, review_text_element_selector)
            review_text = review_text_element.text.strip()
        except Exception:
            pass  # best effort: keep "N/A"

        # Like count: first number in the like button's text, 0 if there is none.
        try:
            like_button_element = review_element.find_element(By.CSS_SELECTOR, like_button_selector)
            button_text = like_button_element.text.strip()
            match = re.search(r'\d+', button_text)
            like_count = int(match.group(0)) if match else 0
        except Exception:
            like_count = 0

        # Header: date and star rating are both read from inside it.
        try:
            header_element = review_element.find_element(By.CSS_SELECTOR, header_element_selector)
        except Exception:
            header_element = None  # date and rating stay "N/A"

        if header_element is not None:
            try:
                date_element = header_element.find_element(By.CSS_SELECTOR, date_selector_within_header)
                review_date = date_element.text.strip()
            except Exception:
                pass  # best effort: keep "N/A"

            star_rating = _extract_star_rating(header_element)

        reviews_data.append({
            "index": index,
            "review": review_text,
            "likes": like_count,
            "stars": star_rating,
            "date": review_date
        })
    print("Data extraction complete.")
else:
    print("No user review elements found inside iframe to extract data from after attempting to scroll.")


# --- Switch back to the Default Content ---
# Leave the iframe context and return to the top-level document.
try:
    driver.switch_to.default_content()
except Exception as switch_error:
    print(f"Could not switch back to default content: {switch_error}")
else:
    print("Switched back to default content.")


# --- Process or Store the Data ---
# Print every extracted review, one field per line.
print("\n--- Extracted User Reviews ---")
if reviews_data:
    for i, item in enumerate(reviews_data):
        print(f"Review {i + 1}:")
        print(f"   Text: {item['review']}")
        print(f"   Likes: {item['likes']}")
        print(f"   Stars: {item['stars']}")
        print(f"   Date: {item['date']}")  # this line was previously labelled "Likes"
        print("-" * 20)
else:
    print("No data was extracted.")

# Save the reviews to an Excel sheet. The export sits inside try/finally so the
# browser is closed even if building or writing the file raises.
try:
    df = pd.DataFrame(reviews_data)
    df.to_excel('review_data.xlsx', index=False)
finally:
    # --- Close the Browser ---
    print("Closing browser.")
    driver.quit()

Chrome WebDriver installed and initialized using WebDriver Manager.
Checking for and attempting to dismiss cookie consent banner...
Cookie consent banner dismissed.
Attempting to find and click 'Mostrar todas las opiniones' button...
'Mostrar todas las opiniones' button clicked.
Attempting to switch to iframe with selector: [data-testid='ui-pdp-iframe-reviews']
Switched to reviews iframe.
Scrolling down to load more reviews within the iframe...
Scroll attempt 1, current height: 3335.333251953125, new height: 6674, last height: 3815
Scroll attempt 2, current height: 6194, new height: 8818, last height: 6674
Scroll attempt 3, current height: 8338, new height: 10984, last height: 8818
Scroll attempt 4, current height: 10504, new height: 13358, last height: 10984
Scroll attempt 5, current height: 12878, new height: 15769, last height: 13358
Scroll attempt 6, current height: 15288.6669921875, new height: 17913, last height: 15769
Scroll attempt 7, current height: 17432.666015625, new height

# Optional: random user agent

In [None]:
# Pick a random user agent for the browser to avoid 429 errors.
# NOTE(review): `options` only takes effect once it is passed to webdriver.Chrome(...);
# nothing in this cell creates a driver with it.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0',
]

chosen_user_agent = random.choice(user_agents)
options = webdriver.ChromeOptions()
options.add_argument(f"user-agent={chosen_user_agent}")
