In [2]:
pip install --user webdriver_manager

Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver_manager
Successfully installed webdriver_manager-4.0.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
import time
import re
import os
import csv 
import random
import pandas as pd
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException

# Scraping link

Simplified version (loop change page #)

In [6]:
# --- Global Data List ---
extracted_data = []

# --- Helper function to safely get text and convert ---
def get_element_text_or_default(product_element, selector, default_value, data_type='text'):
    """
    Tries to find an element, get its text, clean it, and return it.
    Returns the default_value if not found or on error.
    """
    try:
        element = product_element.find_element(By.CSS_SELECTOR, selector)
        text = element.text.strip()

        if not text:
            return default_value

        if data_type == 'float':
            cleaned_text = text.replace(',', '')
            return float(cleaned_text)
        elif data_type == 'int':
            match = re.search(r'\d+', text.replace(',', ''))
            return int(match.group(0)) if match else default_value
        else: # 'text'
            return text
    except (NoSuchElementException, ValueError, AttributeError):
        return default_value

# --- Your cookie_handler_button function (Unchanged) ---
def cookie_handler_button(driver, cookie_accept_button_selector):
    try:
        print("Checking for and attempting to dismiss cookie consent banner...")
        wait = WebDriverWait(driver, 5)
        cookie_buttons = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, cookie_accept_button_selector)))
        if cookie_buttons:
            try:
                cookie_buttons[0].click()
            except Exception:
                 print("Normal click failed, trying JavaScript click...")
                 driver.execute_script("arguments[0].click();", cookie_buttons[0])
            print("Cookie consent banner dismissed.")
            time.sleep(2)
        else:
            print("No cookie consent banner found.")
    except TimeoutException:
         print("No cookie consent banner found within the timeout period.")
    except Exception as e:
        print(f"Could not dismiss cookie consent banner: {e}")
    return None

# --- Your product_finder function (Unchanged from last version) ---
def product_finder(driver, url, wait, product_container_selector, single_product_selector):
    try:
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, product_container_selector)))
        print("Product container found. Attempting to find all product elements.")
        product_elements_list = driver.find_elements(By.CSS_SELECTOR, single_product_selector)

        if product_elements_list:
            print(f"Found {len(product_elements_list)} products on this page.")
            for index, product_element in enumerate(product_elements_list):
                print(f"\n--- Product {index + 1} ---")

                product_url = "Not found"
                product_title = "Not found"
                product_offer_type = "na"
                product_rating = 0.0
                product_review_count = 0
                product_old_price = 0.0
                product_new_price = 0.0

                try:
                    link_element = product_element.find_element(By.CSS_SELECTOR, 'a.poly-component__title')
                    product_url = link_element.get_attribute('href')
                    product_title = link_element.text.strip()
                    print(f"Title: {product_title}")
                    print(f"URL: {product_url}")
                except NoSuchElementException:
                    print("Could not find the title/link element for this product. Skipping.")
                    continue

                product_offer_type = get_element_text_or_default(
                    product_element, 'span.poly-component__highlight', "na", 'text'
                )
                print(f"Offer Type: {product_offer_type}")

                product_rating = get_element_text_or_default(
                    product_element, 'span.poly-reviews__rating', 0.0, 'float'
                )
                print(f"Rating: {product_rating}")

                product_review_count = get_element_text_or_default(
                    product_element, 'span.poly-reviews__total', 0, 'int'
                )
                print(f"Review Count: {product_review_count}")

                product_old_price = get_element_text_or_default(
                    product_element, 'div.andes-money-amount--previous span.andes-money-amount__fraction', 0.0, 'float'
                )
                print(f"Old Price: {product_old_price}")

                product_new_price = get_element_text_or_default(
                    product_element, 'div.poly-price__current span.andes-money-amount__fraction', 0.0, 'float'
                )
                print(f"New Price: {product_new_price}")

                extracted_data.append({
                    'title': product_title,
                    'url': product_url,
                    'offer_type': product_offer_type,
                    'rating': product_rating,
                    'review_count': product_review_count,
                    'old_price': product_old_price,
                    'new_price': product_new_price
                })

            return len(product_elements_list)
        else:
            print("No products found matching the selector on this page.")
            return 0

    except TimeoutException:
        print(f"Error in product_finder: Product container '{product_container_selector}' not found on {url}.")
        return 0
    except Exception as e:
        print(f"Error in product_finder: {e}")
        return 0

# --- Main Execution Block ---
if __name__ == "__main__":
    try:
        service = Service(ChromeDriverManager().install())
        print("Chrome WebDriver installed and initialized.")
    except Exception as e:
        print(f"Error initializing Chrome WebDriver: {e}")
        exit()

    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=service, options=options)
    wait = WebDriverWait(driver, 20)

    base_url = "https://www.mercadolibre.com.mx/ofertas"
    cookie_accept_button_selector = "[data-testid='action:understood-button']"
    product_container_selector = "div.items-list" # <-- Updated selector
    single_product_selector = 'div.poly-card--grid-card'

    max_pages_to_scrape_total = 20
    current_page_number = 13

    first_page_url = f"{base_url}?page={current_page_number}"
    print(f"Navigating to the first page: {first_page_url}")
    driver.get(first_page_url)
    cookie_handler_button(driver, cookie_accept_button_selector)

    while current_page_number <= max_pages_to_scrape_total:
        print(f"\n--- Scraping Page {current_page_number} ---")
        current_url = f"{base_url}?page={current_page_number}"

        if current_page_number > 1:
            print(f"Navigating to: {current_url}")
            driver.get(current_url)
            time.sleep(3)

        products_found = product_finder(
            driver,
            current_url,
            wait,
            product_container_selector,
            single_product_selector
        )

        if products_found == 0 and current_page_number > 1:
            print(f"No products found on page {current_page_number}. Stopping.")
            break

        current_page_number += 1

    # --- Print All Extracted Data (Optional, but good for checking) ---
    print("\n--- All Extracted Data ---")
    if extracted_data:
        for item_index, item_data in enumerate(extracted_data):
            print(f"Item {item_index + 1}: {item_data}")
    else:
        print("No data was extracted.")


    # --- SAVE DATA TO CSV (UPDATED) ---
    print("\n--- Saving Data to CSV ---")
    if extracted_data:
        # Define the path structure using pathlib
        project_root = Path("C:/Users/giaco/Documents/JAKE/Education/DSS/II Semester/Block 4/scraping_assigment")
        base_data_dir = project_root / "data" / "URLs" # Use / to join path components
        
        # Create the directory (and any parent directories) if it doesn't exist
        base_data_dir.mkdir(parents=True, exist_ok=True)

        # Define the full file path including the desired name
        file_path = base_data_dir / "mercadolibre_ofertas_2.csv"
        
        print(f"Saving to: {file_path}")

        # Get the keys from the first dictionary
        fieldnames = extracted_data[0].keys()

        try:
            # Open the file using the Path object
            with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(extracted_data)

            print(f"\nSuccessfully saved {len(extracted_data)} items to {file_path}")

        except IOError as e:
            print(f"Error saving data to CSV: {e}")
        except Exception as e:
            print(f"An unexpected error occurred during CSV saving: {e}")

    else:
        print("No data to save.")
    # --- END OF CSV SAVING ---

    # --- Close the Browser ---
    print("\nClosing browser.")
    driver.quit()

Chrome WebDriver installed and initialized.
Navigating to the first page: https://www.mercadolibre.com.mx/ofertas?page=13
Checking for and attempting to dismiss cookie consent banner...
Cookie consent banner dismissed.

--- Scraping Page 13 ---
Navigating to: https://www.mercadolibre.com.mx/ofertas?page=13
Product container found. Attempting to find all product elements.
Found 54 products on this page.

--- Product 1 ---
Title: Guitarra Bigview Clásica 38in Para Diestros Color Negro Con Paquete De Accesorios Qc-38
URL: https://www.mercadolibre.com.mx/guitarra-bigview-clasica-38in-para-diestros-color-negro-con-paquete-de-accesorios-qc-38/p/MLM39437145?pdp_filters=item_id%3AMLM2126791193#polycard_client=offers&deal_print_id=d3af09e7-a783-4b84-881a-29d91e7eaba8&tracking_id=8085191b-5369-4b32-b3c4-7f57224ab2dc&wid=MLM2126791193&sid=offers
Offer Type: MÁS VENDIDO
Rating: 4.5
Review Count: 326
Old Price: 0.0
New Price: 985.0

--- Product 2 ---
Title: Reloj Smartwatch 2.01'' Reloj Inteligente

## Merging csv

In [None]:

# --- Define File Paths ---
# Adjust this to your actual project root if different
project_root = Path("C:/Users/giaco/Documents/JAKE/Education/DSS/II Semester/Block 4/scraping_assigment")
base_data_dir = project_root / "data" / "URLs"

# Names of your input files
file1_name = "mercadolibre_ofertas.csv"
file2_name = "mercadolibre_ofertas_2.csv" # Assuming this is the second file

# Name for your output merged file
merged_file_name = "mercadolibre_ofertas_merged.csv"

# Construct full paths
file1_path = base_data_dir / file1_name
file2_path = base_data_dir / file2_name
merged_file_path = base_data_dir / merged_file_name

print(f"Attempting to merge:\n  1: {file1_path}\n  2: {file2_path}")
print(f"Output will be saved to: {merged_file_path}")

try:
    # --- Read the CSV files into pandas DataFrames ---
    print(f"\nReading {file1_name}...")
    df1 = pd.read_csv(file1_path, encoding='utf-8')
    print(f"Read {len(df1)} rows from {file1_name}.")

    print(f"\nReading {file2_name}...")
    df2 = pd.read_csv(file2_path, encoding='utf-8')
    print(f"Read {len(df2)} rows from {file2_name}.")

    # --- Concatenate (merge) the DataFrames ---
    # This simply appends df2 to the end of df1
    merged_df = pd.concat([df1, df2], ignore_index=True)
    print(f"\nTotal rows after merging (before duplicate check): {len(merged_df)}")

    # --- Optional: Remove duplicate rows ---
    # If there's a chance of overlapping data, you might want to remove duplicates.
    # This keeps the first occurrence of a duplicate row.
    # You can specify a subset of columns to consider for identifying duplicates,
    # e.g., subset=['url'] if the 'url' should be unique.
    # If all columns should be identical for a row to be a duplicate, don't specify subset.
    initial_row_count = len(merged_df)
    merged_df.drop_duplicates(inplace=True) # inplace=True modifies the DataFrame directly
    rows_dropped = initial_row_count - len(merged_df)
    if rows_dropped > 0:
        print(f"Removed {rows_dropped} duplicate rows.")
    print(f"Total rows after duplicate check: {len(merged_df)}")


    # --- Save the merged DataFrame to a new CSV file ---
    merged_df.to_csv(merged_file_path, index=False, encoding='utf-8')
    print(f"\nSuccessfully merged data and saved to {merged_file_path}")

except FileNotFoundError as e:
    print(f"\nError: One of the input files was not found.")
    print(e)
except Exception as e:
    print(f"\nAn error occurred: {e}")

In [None]:
def classify_urls_in_csv(input_csv_path, output_csv_path):
    """
    Reads a CSV file, classifies URLs based on the presence of "JM",
    and saves the result to a new CSV file.

    Args:
        input_csv_path (Path or str): The path to the input CSV file.
        output_csv_path (Path or str): The path to save the modified CSV file.
    """
    try:
        print(f"Reading {input_csv_path}...")
        # Read the CSV file into a pandas DataFrame
        df = pd.read_csv(input_csv_path, encoding='utf-8') 

        # Ensure the 'url' column exists
        if 'url' not in df.columns:
            print(f"Error: The file {input_csv_path} does not have a 'url' column.")
            return

        print("Classifying URLs...")
        # Create the 'JM' column
        # 1 if JM is present 0 if not. the web page they have different strucuture
        df['JM'] = df['url'].astype(str).str.contains("JM", na=False).astype(int)

        # Save the modified DataFrame to a new CSV file
        df.to_csv(output_csv_path, index=False, encoding='utf-8')
        print(f"Successfully processed and saved {len(df)} rows to {output_csv_path}")

    except FileNotFoundError:
        print(f"Error: The file {input_csv_path} was not found.")
    except Exception as e:
        print(f"An error occurred while processing {input_csv_path}: {e}")

# --- Main Execution Block ---
if __name__ == "__main__":
    print("Starting URL classification process...")
    
    # --- Configuration using pathlib ---
    # !! Ensure this project_root path is correct !!
    project_root = Path("C:/Users/giaco/Documents/JAKE/Education/DSS/II Semester/Block 4/scraping_assigment")
    base_data_dir = project_root / "data" / "URLs"

    # Define the input file (UPDATED to 'ofertas')
    input_filename = "mercadolibre_ofertas_merged.csv" # <--- UPDATED
    input_file_path = base_data_dir / input_filename

    # Define the output file (UPDATED to 'ofertas')
    output_filename = "mercadolibre_ofertas_merged_classified.csv" # <--- UPDATED
    output_file_path = base_data_dir / output_filename
    
    # --- Processing ---
    # Check if the specific input file exists
    if input_file_path.exists():
        classify_urls_in_csv(input_file_path, output_file_path)
    else:
        print(f"Error: The input file {input_file_path} was not found. Please ensure it exists.")

    print("\nProcessing complete.")

Starting URL classification process...
Reading C:\Users\giaco\Documents\JAKE\Education\DSS\II Semester\Block 4\scraping_assigment\data\URLs\mercadolibre_ofertas_merged.csv...
Classifying URLs...
Successfully processed and saved 1065 rows to C:\Users\giaco\Documents\JAKE\Education\DSS\II Semester\Block 4\scraping_assigment\data\URLs\mercadolibre_ofertas_merged_classified.csv

Processing complete.


In [None]:
pd.set_option('display.max_colwidth', None)
df = pd.read_csv("../data/URLs/mercadolibre_ofertas_merged_classified.csv")
print(df.head(60))

Subset

In [13]:
# --- Configuration ---
# Define the path to your project and data directory
project_root = Path("C:/Users/giaco/Documents/JAKE/Education/DSS/II Semester/Block 4/scraping_assigment")
base_data_dir = project_root / "data" / "URLs"

# Define the input file (the one with the 'JM' column)
input_filename = "mercadolibre_ofertas_merged_classified.csv"
input_file_path = base_data_dir / input_filename

print(f"Processing file: {input_file_path}")

try:
    # --- Read the CSV file ---
    df = pd.read_csv(input_file_path, encoding='utf-8')
    print(f"Successfully read {len(df)} rows.")

    # --- Check for necessary columns ---
    if 'JM' not in df.columns or 'review_count' not in df.columns:
        print("Error: The CSV must contain both 'JM' and 'review_count' columns.")
    else:
        # --- Apply the filters ---
        # Condition 1: JM column is exactly 0
        condition1 = df['JM'] == 0
        
        # Condition 2: review_count column is greater than 100
        condition2 = df['review_count'] > 100

        # Combine both conditions using the logical AND (&) operator
        # This selects rows where *both* conditions are True
        subset_df = df[condition1 & condition2]

        # --- Get and print the length ---
        subset_length = len(subset_df)
        print(f"\nNumber of products with 'JM' = 0 AND 'review_count' > 100: {subset_length}")

        # Optional: If you want to save this subset to a new CSV:
        output_subset_filename = "mercadolibre_ofertas_subset.csv"
        output_subset_path = base_data_dir / output_subset_filename
        subset_df.to_csv(output_subset_path, index=False, encoding='utf-8')
        print(f"Subset saved to {output_subset_path}")


except FileNotFoundError:
    print(f"Error: The file {input_file_path} was not found. Please check the path and filename.")
except Exception as e:
    print(f"An error occurred: {e}")

Processing file: C:\Users\giaco\Documents\JAKE\Education\DSS\II Semester\Block 4\scraping_assigment\data\URLs\mercadolibre_ofertas_merged_classified.csv
Successfully read 1065 rows.

Number of products with 'JM' = 0 AND 'review_count' > 100: 490
Subset saved to C:\Users\giaco\Documents\JAKE\Education\DSS\II Semester\Block 4\scraping_assigment\data\URLs\mercadolibre_ofertas_subset.csv


# Price + Review finder

In [None]:
# --- cookie_handler_button (Keep as is) ---
def cookie_handler_button(driver, cookie_accept_button_selector, wait_time=10):
    try:
        print(f"Checking for cookie banner (waiting up to {wait_time}s)...")
        wait = WebDriverWait(driver, wait_time)
        cookie_button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, cookie_accept_button_selector))
        )
        print("Cookie button found. Clicking...")
        try:
            cookie_button.click()
            print("Cookie banner dismissed via standard click.")
        except ElementClickInterceptedException:
            print("Standard click intercepted, trying JavaScript click...")
            driver.execute_script("arguments[0].click();", cookie_button)
            print("Cookie banner dismissed via JavaScript click.")
        time.sleep(1)
        return True
    except TimeoutException:
        print("No cookie banner found/clickable.")
        return False
    except Exception as e:
        print(f"Error handling cookie banner: {e}")
        return False

# --- check_page_status (Keep as is, but check 'h1.ui-pdp-title') ---
def check_page_status(driver, wait_short):
    try:
        if driver.find_elements(By.ID, "reload-button"):
            print("Page Status: RATE_LIMITED_BUTTON")
            return "RATE_LIMITED_BUTTON"
        if '"status":429' in driver.page_source and '"local_rate_limited"' in driver.page_source:
            print("Page Status: RATE_LIMITED_JSON")
            return "RATE_LIMITED_JSON"
        # Check for a reliable element on a *product* page
        wait_short.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1.ui-pdp-title')))
        print("Page Status: OK")
        return "OK"
    except TimeoutException:
        print("Page Status: UNKNOWN_ERROR (Timeout/Not OK)")
        return "UNKNOWN_ERROR"
    except Exception as e:
        print(f"Page Status: UNKNOWN_ERROR ({e})")
        return "UNKNOWN_ERROR"

# --- find_product_details (MODIFIED: Only get condition) ---
def find_product_details(driver, wait):
    """
    Finds ONLY the condition of a product on its details page.

    Args:
        driver: The Selenium WebDriver instance.
        wait: The WebDriverWait instance.

    Returns:
        str: The product condition (e.g., 'Nuevo', 'Usado') or 'Unknown'.
    """
    condition_selector = 'span.ui-pdp-subtitle'
    condition = "Unknown" # Default to Unknown

    try:
        print(f"Looking for condition element ({condition_selector})...")
        # Use a wait to ensure the element is present before checking
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, condition_selector)))
        condition_elements = driver.find_elements(By.CSS_SELECTOR, condition_selector)
        
        if condition_elements:
            full_condition_text = condition_elements[0].text.strip()
            print(f"Full condition text found: {full_condition_text}")
            # Extract the part before the first '|' or use the whole string
            condition = full_condition_text.split('|')[0].strip()
            print(f"Cleaned condition found: {condition}")
        else:
            # If the subtitle span isn't found, it might be 'Nuevo' implicitly
            # or the selector might be wrong. 'Unknown' is safer if unsure.
            print("Condition element not found, setting to 'Unknown'.")

    except TimeoutException:
         print("Condition element not found within timeout, setting to 'Unknown'.")
    except Exception as e:
        print(f"An error occurred while finding the condition: {e}")

    return condition

# --- scrape_reviews (Keep as is) ---
def scrape_reviews(driver, wait):
    reviews_data = []
    iframe_switched = False
    show_more_button_selector = "[data-testid='see-more']"
    iframe_selector = "[data-testid='ui-pdp-iframe-reviews']"
    review_elements_selector = "[data-testid='comment-component']"
    review_text_element_selector = "[data-testid='comment-content-component']"
    like_button_selector = "[data-testid='like-button']"
    header_element_selector = "div.ui-review-capability-comments__comment__header"
    date_selector_within_header = "span.ui-review-capability-comments__comment__date"
    rating_text_selector_within_header = "div.ui-review-capability-comments__comment__rating > p.andes-visually-hidden"

    try:
        try:
            print("Attempting to click 'Mostrar todas las opiniones'...")
            show_more_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, show_more_button_selector)))
            driver.execute_script("arguments[0].scrollIntoView(true);", show_more_button)
            time.sleep(0.5)
            show_more_button.click()
            print("Button clicked.")
            time.sleep(7)
        except Exception as e:
            print(f"INFO: Could not click 'Mostrar todas': {e}. Proceeding...")

        print("Attempting to switch to iframe...")
        wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, iframe_selector)))
        iframe_switched = True
        print("Switched to iframe.")

        print("Scrolling within iframe...")
        last_height = driver.execute_script("return document.body.scrollHeight")
        scroll_attempts = 0
        while scroll_attempts < 15:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(random.uniform(5, 10))
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height and new_height > 0: break
            last_height = new_height
            scroll_attempts += 1
        print("Finished scrolling.")

        print("Extracting reviews...")
        review_elements = driver.find_elements(By.CSS_SELECTOR, review_elements_selector)
        print(f"Found {len(review_elements)} reviews.")

        for index, review_element in enumerate(review_elements):
            review_text, review_date, star_rating = "N/A", "N/A", "N/A"
            like_count = 0
            try: review_text = review_element.find_element(By.CSS_SELECTOR, review_text_element_selector).text.strip()
            except NoSuchElementException: pass
            try:
                button_text = review_element.find_element(By.CSS_SELECTOR, like_button_selector).text.strip()
                match = re.search(r'\d+', button_text)
                like_count = int(match.group(0)) if match else 0
            except NoSuchElementException: pass
            try:
                header = review_element.find_element(By.CSS_SELECTOR, header_element_selector)
                try: review_date = header.find_element(By.CSS_SELECTOR, date_selector_within_header).text.strip()
                except NoSuchElementException: pass
                try:
                    rating_text = header.find_element(By.CSS_SELECTOR, rating_text_selector_within_header).text.strip()
                    match = re.search(r'(\d+)', rating_text)
                    star_rating = int(match.group(1)) if match else "N/A"
                except NoSuchElementException: pass
            except NoSuchElementException: pass
            reviews_data.append({
                "review_index": index + 1, "review_text": review_text, "review_likes": like_count,
                "review_stars": star_rating, "review_date": review_date
            })
    except Exception as e:
        print(f"Error scraping reviews: {e}")
    finally:
        if iframe_switched:
            print("Switching back to default content.")
            driver.switch_to.default_content()
    return reviews_data

# --- get_urls_to_scrape (MODIFIED: Read correct file, apply both filters, return full dicts) ---
def get_products_to_scrape(csv_filepath):
    """
    Reads a CSV file, filters for rows where 'JM' is 0 and 'review_count' > 100,
    and returns a list of dictionaries containing all original data.

    Args:
        csv_filepath (Path or str): The path to the input CSV file.

    Returns:
        list: A list of full dictionaries, or an empty list on error.
    """
    all_products_to_scrape = []
    try:
        df = pd.read_csv(csv_filepath, encoding='utf-8')
        print(f"Successfully read {csv_filepath}. Found {len(df)} rows.")

        # --- Input Validation ---
        required_cols = ['url', 'JM', 'review_count']
        if not all(col in df.columns for col in required_cols):
            print(f"Error: CSV '{csv_filepath}' must contain {required_cols}. Aborting.")
            return []

        # --- Filtering ---
        df['JM'] = pd.to_numeric(df['JM'], errors='coerce')
        df['review_count'] = pd.to_numeric(df['review_count'], errors='coerce')
        df.dropna(subset=['JM', 'review_count'], inplace=True) # Drop rows where conversion failed

        condition1 = df['JM'] == 0
        condition2 = df['review_count'] > 100
        filtered_df = df[condition1 & condition2]

        all_products_to_scrape = filtered_df.to_dict('records')
        print(f"Found {len(all_products_to_scrape)} products matching criteria (JM=0 & Reviews>100).")

    except FileNotFoundError:
        print(f"Error: CSV file not found at {csv_filepath}")
    except Exception as e:
        print(f"An error occurred while reading or filtering '{csv_filepath}': {e}")

    return all_products_to_scrape

# --- save_to_csv (MODIFIED: Include all data, use pathlib) ---
def save_to_csv(scraped_data, output_filepath):
    """
    Flattens the scraped data (including original info) and saves it to a CSV file.
    """
    flattened_list = []
    print("\nPreparing data for saving...")

    for product in scraped_data:
        # Get original data, use .get() for safety
        base_data = {
            'title': product.get('title'),
            'offer_type': product.get('offer_type'),
            'rating': product.get('rating'),
            'review_count': product.get('review_count'),
            'old_price': product.get('old_price'),
            'new_price': product.get('new_price'),
            'JM': product.get('JM'),
            'condition': product.get('condition_scraped', 'N/A') # Use newly scraped condition
        }
        reviews = product.get('reviews', [])

        if reviews:
            for review in reviews:
                row = base_data.copy() # Start with all base data
                row.update(review)     # Add review data
                flattened_list.append(row)
        else: # If no reviews, add one row with N/A for review columns
            row = base_data.copy()
            row.update({
                "review_index": 'N/A', "review_text": 'N/A', "review_likes": 'N/A',
                "review_stars": 'N/A', "review_date": 'N/A'
            })
            flattened_list.append(row)

    if not flattened_list:
        print("No data to save.")
        return

    df = pd.DataFrame(flattened_list)

    # Define the order of columns (ensure all desired columns are listed)
    output_columns = [
        'title', 'offer_type', 'rating', 'review_count', 'old_price', 
        'new_price', 'JM', 'condition', 'review_index', 'review_text', 
        'review_likes', 'review_stars', 'review_date'
    ]
    # Reorder and only keep these columns
    df = df[output_columns]

    try:
        # Ensure directory exists (handled by pathlib in main)
        print(f"Attempting to save {len(df)} rows to {output_filepath}...")
        df.to_csv(output_filepath, index=False, encoding='utf-8-sig') # utf-8-sig is often better for Excel
        print(f"Successfully saved data to {output_filepath}")
    except Exception as e:
        print(f"Error saving data to CSV at {output_filepath}: {e}")


# --- Main Execution Block (MODIFIED) ---
if __name__ == "__main__":
    # --- Setup ---
    try:
        options = webdriver.ChromeOptions()
        # Add a more common user agent
        options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36")
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=options)
        print("Chrome WebDriver initialized.")
        wait = WebDriverWait(driver, 20) 
        wait_short = WebDriverWait(driver, 5)
        all_scraped_data = [] # Will hold original_data + new_condition + reviews
    except Exception as e:
        print(f"Error initializing Chrome WebDriver: {e}")
        exit()

    # --- Path Config ---
    project_root = Path("C:/Users/giaco/Documents/JAKE/Education/DSS/II Semester/Block 4/scraping_assigment")
    base_data_dir = project_root / "data" / "URLs"
    input_filename = "mercadolibre_ofertas_merged_classified.csv"
    output_filename = "mercadolibre_final_reviews_data.csv" # New name for final output
    input_csv_path = base_data_dir / input_filename
    output_csv_path = base_data_dir / output_filename
    cookie_accept_button_selector = "button[data-testid='action:understood-button']"

    # --- Get Products (Using modified function) ---
    products_to_process = get_products_to_scrape(input_csv_path)

    # --- Main Loop Config ---
    rate_limit_pause_seconds = 120
    max_retries = 1

# --- Main Loop ---
    for product_data in products_to_process:
        url = product_data.get('url')
        title = product_data.get('title', 'N/A')
        attempts = 0
        status = "INIT"
        
        if not url:
            print(f"Skipping product with missing URL: {title}")
            continue

        print(f"\n=======================================")
        print(f"Processing: {title} | URL: {url}")
        print(f"=======================================")

        while attempts <= max_retries:
            attempts += 1
            print(f"--- Attempt {attempts} ---")
            
            try:
                # Load page (unless it's a retry after clicking Ricarica)
                if not (attempts > 1 and status == "RATE_LIMITED_BUTTON_CLICKED"): # Check new status
                    driver.get(url)
                    time.sleep(random.uniform(2, 4))
                    cookie_handler_button(driver, cookie_accept_button_selector, wait_time=10)

                status = check_page_status(driver, wait_short) # Check status

                # --- Handle Rate Limiting (MODIFIED) ---
                if status == "RATE_LIMITED_BUTTON":
                    if attempts > max_retries: break # Already retried, break loop
                    
                    print(f"--- RATE LIMITED (Button). Waiting {rate_limit_pause_seconds}s...")
                    time.sleep(rate_limit_pause_seconds) # Wait 120 seconds
                    
                    print("--- Attempting to click 'Ricarica' button... ---")
                    try:
                        # WAIT for the button to be clickable
                        reload_btn = wait.until(
                            EC.element_to_be_clickable((By.ID, "reload-button"))
                        )
                        # Try standard click, then JS click
                        try:
                            reload_btn.click()
                            print("Clicked 'Ricarica' via standard click.")
                        except ElementClickInterceptedException:
                            print("Standard click failed, trying JS click...")
                            driver.execute_script("arguments[0].click();", reload_btn)
                            print("Clicked 'Ricarica' via JS click.")
                            
                        status = "RATE_LIMITED_BUTTON_CLICKED" # Set a new status to avoid re-getting URL
                        time.sleep(5) # Wait for click reload
                        continue # Loop for next attempt (will check status again)
                        
                    except Exception as click_err:
                        print(f"Error clicking reload button: {click_err}")
                        status = "CRITICAL_ERROR" 
                        break # Critical error, stop trying

                elif status == "RATE_LIMITED_JSON":
                    if attempts > max_retries: break 
                    print(f"--- RATE LIMITED (JSON). Waiting {rate_limit_pause_seconds}s and reloading URL...")
                    time.sleep(rate_limit_pause_seconds)
                    status = "INIT" # Reset status so it reloads
                    continue 

                # If not rate-limited or if retries exhausted, break the while loop
                break

            except Exception as e:
                print(f"--- CRITICAL ERROR during attempt {attempts} for {url}: {e} ---")
                status = "CRITICAL_ERROR"
                break

        # --- Process the FINAL status (Unchanged) ---
        enriched_product_data = product_data.copy()
        if status == "OK":
            condition = find_product_details(driver, wait)
            reviews = scrape_reviews(driver, wait)
            enriched_product_data['condition_scraped'] = condition
            enriched_product_data['reviews'] = reviews
            print(f"--- SUCCESS: Scraped {url} ---")
        else:
            # If we ended after clicking, it's still a fail for this product
            if status == "RATE_LIMITED_BUTTON_CLICKED": status = "RATE_LIMITED_BUTTON" 
            enriched_product_data['condition_scraped'] = status 
            enriched_product_data['reviews'] = [] 
            print(f"--- FAILED ({status}): Could not scrape {url} ---")
        all_scraped_data.append(enriched_product_data)

        # --- MODIFIED: Set a fixed 45-second delay before the next URL ---
        print(f"Waiting for 45 seconds before next URL...")
        time.sleep(45)

    # --- Results (Keep as is) ---
    save_to_csv(all_scraped_data, output_csv_path) 
    driver.quit()
    print("Driver closed.")

Error initializing Chrome WebDriver: name 'webdriver' is not defined


NameError: name 'Path' is not defined

: 

# Product Scraping

In [None]:
data =[]

# --- Setup ---
try:
    service = Service(ChromeDriverManager().install())
    print("Chrome WebDriver installed and initialized using WebDriver Manager.")
except Exception as e:
    print(f"Error initializing Chrome WebDriver with WebDriver Manager: {e}")
    print("Please ensure Chrome is installed and webdriver-manager is correctly installed (`pip install webdriver-manager`).")
    exit()

options = webdriver.ChromeOptions()
# options.add_argument('--headless') # Headless might make iframe interaction tricky, keep it off for now
# options.add_argument('--no-sandbox')
# options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(service=service, options=options)

url_base = "https://www.mercadolibre.com.mx/apple-iphone-13-128-gb-blanco-estelar-distribuidor-autorizado/p/MLM1018500855#reviews"


driver.get(url_base)

wait = WebDriverWait(driver, 20) # Increased wait time again

# --- Handle Cookie Consent Banner ---
cookie_accept_button_selector = "[data-testid='action:understood-button']"

try:
    print("Checking for and attempting to dismiss cookie consent banner...")
    time.sleep(3)
    cookie_buttons = driver.find_elements(By.CSS_SELECTOR, cookie_accept_button_selector)
    if cookie_buttons:
        cookie_buttons[0].click()
        print("Cookie consent banner dismissed.")
        time.sleep(3)
    else:
        print("No cookie consent banner found or dismissed.")
except Exception as e:
    print(f"Could not dismiss cookie consent banner: {e}")


# --- Click the "Mostrar todas las opiniones" button ---
show_more_button_selector = "[data-testid='see-more']"
try:
    print("Attempting to find and click 'Mostrar todas las opiniones' button...")
    show_more_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, show_more_button_selector)))
    show_more_button.click()
    print("'Mostrar todas las opiniones' button clicked.")
    # Give time for the iframe to load after the button click
    time.sleep(7)
except Exception as e:
    print(f"Could not find or click the 'Mostrar todas las opiniones' button: {e}")
    driver.quit() # Exit if we can't even get to the reviews

# --- Switch to the Reviews Iframe ---
iframe_selector = "[data-testid='ui-pdp-iframe-reviews']"
try:
    print(f"Attempting to switch to iframe with selector: {iframe_selector}")
    # Wait for the iframe to be present
    wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, iframe_selector)))
    print("Switched to reviews iframe.")
    # Now we are inside the iframe's document context
except Exception as e:
    print(f"Could not switch to reviews iframe using selector '{iframe_selector}': {e}")
    driver.quit() # Exit if we can't access the reviews iframe


# --- IMPORTANT: Scroll to load reviews within the Iframe ---
# We need to scroll the document *within the iframe*.
try:
    print("Scrolling down to load more reviews within the iframe...")
    # Execute script on the iframe's document body
    last_height = driver.execute_script("return document.body.scrollHeight")
    scroll_attempts = 0
    while scroll_attempts < 15:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        current_time_scroll = current_sleep_duration = random.uniform(5, 10)
        time.sleep(current_time_scroll)
        new_height = driver.execute_script("return document.body.scrollHeight")

        print(f"Scroll attempt {scroll_attempts + 1}, current height: {driver.execute_script('return window.scrollY')}, new height: {new_height}, last height: {last_height}") # More detailed scroll debug

        if new_height == last_height:
            print("Scroll height did not change, assuming end of content or scrolling failed.")
            # You might need to find a specific scrollable element inside the iframe
            # if document.body scrolling isn't working.
            break
        last_height = new_height
        scroll_attempts += 1
    print("Finished scrolling within the iframe.")

except Exception as e:
    print(f"Could not perform scrolling within the iframe: {e}")


# --- Locate Review Elements (NOW INSIDE THE IFRAME) ---
review_elements_selector = "[data-testid='comment-component']" 
review_text_element_selector = "[data-testid='comment-content-component']" #
like_button_selector = "[data-testid='like-button']" 

# --- Define the selector for the header component within each review ---
header_element_selector = "div.ui-review-capability-comments__comment__header"
date_selector_within_header = "span.ui-review-capability-comments__comment__date"
rating_text_selector_within_header = "div.ui-review-capability-comments__comment__rating > p.andes-visually-hidden"



try:
    print(f"Waiting for review elements within the iframe with selector: {review_elements_selector}")
    # Wait for review elements to be present inside the iframe
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, review_elements_selector)))
    print("User review elements selector found inside iframe. Attempting to find all matching elements.")

    # Find ALL elements matching the selector *within the current iframe context*
    review_elements = driver.find_elements(By.CSS_SELECTOR, review_elements_selector)
    print(f"Found {len(review_elements)} potential user review elements inside iframe AFTER scrolling.")

except Exception as e:
    print(f"Could not find user review elements inside iframe using selector '{review_elements_selector}' after scrolling: {e}")
    review_elements = []


# --- Extract Data from Each Review Element (INSIDE IFRAME) ---
reviews_data = []

if review_elements:
    print("Starting data extraction from elements inside iframe...")
    for index, review_element in enumerate(review_elements):
        review_text = "N/A"
        like_count = 0
        review_date = "N/A" # Initialize new variable
        star_rating = "N/A" # Initialize new variable

        # Find review text element *within* the current review_element (still inside iframe)
        try:
            review_text_element = review_element.find_element(By.CSS_SELECTOR, review_text_element_selector)
            review_text = review_text_element.text.strip()
            # print(f"   Extracted text for review {index + 1}") # Uncomment for detailed debugging
        except Exception as e:
            # print(f"   Could not find review text for review {index + 1} inside iframe using selector '{review_text_element_selector}': {e}") # Uncomment for detailed debugging
            pass # Element not found for this specific review

        # Find the like BUTTON element *within* the current review_element (still inside iframe)
        try:
            # *** USING THE (POTENTIALLY INCORRECT) SELECTOR HERE - VERIFY INSIDE IFRAME ***
            like_button_element = review_element.find_element(By.CSS_SELECTOR, like_button_selector)
            button_text = like_button_element.text.strip()
            match = re.search(r'\d+', button_text)
            if match:
                like_count = int(match.group(0))
            else:
                like_count = 0
            # print(f"   Extracted likes for review {index + 1}: {like_count}") # Uncomment for detailed debugging
        except Exception as e:
            # print(f"   Could not find like button for review {index + 1} inside iframe using selector '{like_button_selector}': {e}") # Uncomment for detailed debugging
            like_count = 0
            pass # Element not found for this specific review
        
# --- NEW APPROACH: First find the header element ---
        try:
            header_element = review_element.find_element(By.CSS_SELECTOR, header_element_selector)
            # print(f"Review {index+1}: Header element found.") # For debugging

            # --- Extract Date from within the header ---
            try:
                date_element = header_element.find_element(By.CSS_SELECTOR, date_selector_within_header)
                review_date = date_element.text.strip()
            except Exception as e:
                # print(f"Review {index+1}: Could not find date within header: {e}")
                pass

            # --- Extract Star Rating from within the header ---
            try:
                rating_text_element = header_element.find_element(By.CSS_SELECTOR, rating_text_selector_within_header)
                full_rating_text = rating_text_element.text.strip()
                rating_match = re.search(r'(\d+)', full_rating_text) # Extracts the first number
                if rating_match:
                    star_rating = int(rating_match.group(1))
                else:
                    # Fallback: If text parsing fails, try counting star SVGs within the header
                    try:
                        star_svg_selector_within_header = "div.ui-review-capability-comments__comment__rating svg.ui-review-capability-comments__comment__rating_star"
                        star_svgs = header_element.find_elements(By.CSS_SELECTOR, star_svg_selector_within_header)
                        if len(star_svgs) > 0 : # Ensure we found some before assigning
                           star_rating = len(star_svgs)
                        # print(f"Review {index+1}: Found {star_rating} star SVGs as fallback.") # For debugging
                    except Exception as e_svg:
                        # print(f"Review {index+1}: Could not count star SVGs: {e_svg}")
                        pass # Keep previous N/A or parsed value if SVG counting fails
            except Exception as e:
                # print(f"Review {index+1}: Could not find rating text within header: {e}")
                # Fallback to SVG count if rating_text_element itself not found
                try:
                    star_svg_selector_within_header = "div.ui-review-capability-comments__comment__rating svg.ui-review-capability-comments__comment__rating_star"
                    star_svgs = header_element.find_elements(By.CSS_SELECTOR, star_svg_selector_within_header)
                    if len(star_svgs) > 0 :
                        star_rating = len(star_svgs)
                    # print(f"Review {index+1}: Found {star_rating} star SVGs as primary fallback.") # For debugging
                except Exception as e_svg_primary_fallback:
                    # print(f"Review {index+1}: Could not count star SVGs on primary fallback: {e_svg_primary_fallback}")
                    pass


        except Exception as e:
            # print(f"Review {index+1}: Could not find header element: {e}")
            pass # If header is not found, date and rating will remain "N/A"


        reviews_data.append({
            "index": index,
            "review": review_text,
            "likes": like_count,
            "stars": star_rating,
            "date": review_date
        })
    print("Data extraction complete.")
else:
    print("No user review elements found inside iframe to extract data from after attempting to scroll.")


# --- Switch back to the Default Content ---
try:
    driver.switch_to.default_content()
    print("Switched back to default content.")
except Exception as e:
    print(f"Could not switch back to default content: {e}")


# --- Process or Store the Data ---
print("\n--- Extracted User Reviews ---")
if reviews_data:
    for i, item in enumerate(reviews_data):
        print(f"Review {i + 1}:")
        print(f"   Text: {item['review']}")
        print(f"   Likes: {item['likes']}")
        print(f"   Stars: {item['stars']}")
        print(f"   Likes: {item['date']}")
        print("-" * 20)
else:
    print("No data was extracted.")

df = pd.DataFrame(reviews_data)
df.to_excel('review_data.xlsx', index=False) # Saving the data in an excel sheet

# --- Close the Browser ---
print("Closing browser.")
driver.quit()

Optional - random user agent

In [None]:
#add user to avoid 429 error
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0',
]

options = webdriver.ChromeOptions()
options.add_argument(f"user-agent={random.choice(user_agents)}")
