In [2]:
#!pip install selenium
import os
import csv
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def scrape_traderjoes_page(url, max_scrolls=50):
    """
    Scrapes a single Trader Joe's food category page.

    A fresh headless Chrome session is started for the call and always shut
    down before returning, even when an error occurs.

    Args:
        url: Address of the category listing page to load.
        max_scrolls: Upper bound on scroll-to-bottom passes. Scrolling stops
            earlier as soon as the page height stops changing; the cap only
            prevents an endless loop on a page that keeps growing.

    Returns:
        A list of dictionaries with keys 'name' and 'url', one for each
        product list item whose <h2><a ...> link could be read.

    Raises:
        TimeoutException: If no product list item is present within
            15 seconds. The paging loop in this notebook treats that as the
            signal that there are no more pages.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=chrome_options)

    results = []
    try:
        driver.get(url)

        # allow initial scripts to run
        time.sleep(3)

        # Scroll to the bottom repeatedly so lazily loaded items are rendered.
        # Stop when the document height no longer changes, or after
        # max_scrolls passes at the latest.
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(max_scrolls):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Wait for product list; a TimeoutException here is deliberately
        # allowed to propagate to the caller.
        locator = (By.CSS_SELECTOR, "li[class^='ProductList_productList__item__']")
        product_items = WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located(locator)
        )

        skipped = 0
        for item in product_items:
            try:
                # Finds product name in a <h2><a ...> element
                link_elem = item.find_element(By.CSS_SELECTOR, "h2 > a")
                product_name = link_elem.text.strip()
                product_href = link_elem.get_attribute("href")
            except Exception:
                # Best effort: one unreadable item must not abort the page,
                # but count it so the loss is visible instead of silent.
                skipped += 1
                continue
            results.append({
                "name": product_name,
                "url": product_href
            })

        if skipped:
            print(f"Skipped {skipped} product item(s) without a readable link on {url}")

    finally:
        driver.quit()

    return results

if __name__ == "__main__":
    base_url = "https://www.traderjoes.com/home/products/category/food-8"
    csv_filename = "trader_joes_products_TEST.csv"
    all_products = []

    # The header is needed when the file is missing OR present but empty;
    # checking existence alone would leave an empty file without a header.
    needs_header = (
        not os.path.exists(csv_filename) or os.path.getsize(csv_filename) == 0
    )

    # Rows are written page by page so that an unexpected error (or a kernel
    # interrupt) part-way through a long run does not discard the pages that
    # were already scraped.
    with open(csv_filename, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["name", "url"])
        if needs_header:
            writer.writeheader()

        # Loop through pages in index; 199 is only an upper bound, the loop
        # ends at the first page that times out.
        for page in range(1, 200):
            if page == 1:
                url = base_url
            else:
                url = f"{base_url}?filters=%7B%22page%22%3A{page}%7D"
            print(f"Scraping page {page}: {url}")
            try:
                page_products = scrape_traderjoes_page(url)
            except TimeoutException:
                print(f"Timeout encountered on page {page}. Stopping further scraping.")
                break
            all_products.extend(page_products)
            writer.writerows(page_products)
            csvfile.flush()

    print(f"Scraping complete. Data appended to {csv_filename}")

Scraping page 111: https://www.traderjoes.com/home/products/category/food-8?filters=%7B%22page%22%3A111%7D
Scraping page 112: https://www.traderjoes.com/home/products/category/food-8?filters=%7B%22page%22%3A112%7D
Scraping page 113: https://www.traderjoes.com/home/products/category/food-8?filters=%7B%22page%22%3A113%7D
Scraping page 114: https://www.traderjoes.com/home/products/category/food-8?filters=%7B%22page%22%3A114%7D
Scraping page 115: https://www.traderjoes.com/home/products/category/food-8?filters=%7B%22page%22%3A115%7D
Scraping page 116: https://www.traderjoes.com/home/products/category/food-8?filters=%7B%22page%22%3A116%7D
Scraping page 117: https://www.traderjoes.com/home/products/category/food-8?filters=%7B%22page%22%3A117%7D
Scraping page 118: https://www.traderjoes.com/home/products/category/food-8?filters=%7B%22page%22%3A118%7D
Scraping page 119: https://www.traderjoes.com/home/products/category/food-8?filters=%7B%22page%22%3A119%7D
Scraping page 120: https://www.trader

In [2]:
!pip install selenium pandas
import re
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def extract_numeric(value):
    """Extract the first numeric value from a string whenever possible.

    Args:
        value: Text that may contain a number, e.g. "12.5 g".

    Returns:
        The first integer or decimal number found, as a float, or None when
        value is not a string or contains no digits.
    """
    if not isinstance(value, str):
        return None
    # Require at least one digit in the match. A pattern such as "[\d.]+"
    # would latch onto a lone "." (as in "approx. 5g") and hide the real
    # number that follows it.
    match = re.search(r"\d+(?:\.\d+)?|\.\d+", value)
    if match is None:
        return None
    try:
        return float(match.group())
    except ValueError:
        return None

def _first_text_or_empty(driver, css_selector):
    """Return the stripped textContent of the first element matching css_selector, or "" on any failure."""
    try:
        elem = driver.find_element(By.CSS_SELECTOR, css_selector)
        return elem.get_attribute("textContent").strip()
    except Exception:
        return ""


def _keep_smallest(nutrition, nutrient, value):
    """Store value under nutrient, keeping the smaller numeric reading when the nutrient was already recorded."""
    if nutrient not in nutrition:
        nutrition[nutrient] = value
        return
    new_num = extract_numeric(value)
    if new_num is None:
        # A non-numeric duplicate never replaces what is already stored.
        return
    old_num = extract_numeric(nutrition[nutrient])
    # Replace when the stored value was not numeric, or the new one is smaller.
    if old_num is None or new_num < old_num:
        nutrition[nutrient] = value


def scrape_traderjoes_product(url):
    """
    Scrapes a Trader Joe's product page and returns a dictionary with:
      - name, price, package_size, serving_size, calories_per_serving
      - serves_about (text of the first table header containing "serves")
      - each nutrient from the nutrition table as its own key.
    If multiple values are found for the same nutrient, the smallest numeric value is kept.

    Fields that cannot be found are returned as empty strings. A fresh
    headless Chrome session is started for the call and always shut down
    before returning.

    Args:
        url: Address of the product page to load.

    Returns:
        dict mapping field / nutrient names to the text scraped for them.

    Raises:
        Exceptions raised while starting Chrome, loading the page or clicking
        the "Per Serving" tab are not caught here and propagate to the caller.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=chrome_options)
    product_data = {}

    try:
        driver.get(url)

        # Let page load initial scripts
        time.sleep(3)

        # 1) Attempt to select the "Per Serving" tab
        try:
            tab_bar = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div[class^='TabBar_tabBar__nav__']"))
            )
            for btn in tab_bar.find_elements(By.TAG_NAME, "button"):
                # get_attribute may return None; treat that as empty text.
                text_content = (btn.get_attribute("textContent") or "").strip().lower()
                if "per serving" in text_content:
                    # Only click if not already active
                    if "Nav_active__" not in (btn.get_attribute("class") or ""):
                        btn.click()
                        time.sleep(1)
                    break
        except TimeoutException:
            # No tab bar on this page; carry on with whatever is displayed.
            pass

        # 2) Scrape product name
        product_data["name"] = _first_text_or_empty(
            driver, "h1[class^='ProductDetails_main__title__']"
        )

        # 3) Scrape price
        product_data["price"] = _first_text_or_empty(
            driver, "span[class^='ProductPrice_productPrice__price__']"
        )

        # 4) Scrape package size
        product_data["package_size"] = _first_text_or_empty(
            driver, "span[class^='ProductPrice_productPrice__unit__']"
        )

        # 5) Scrape serving size & calories per serving
        serving_size = ""
        calories_per_serving = ""
        try:
            characteristics = driver.find_elements(By.CSS_SELECTOR, "div[class^='Item_characteristics__item__']")
        except Exception:
            characteristics = []
        for block in characteristics:
            try:
                label_div = block.find_element(By.CSS_SELECTOR, "div[class^='Item_characteristics__title__']")
                value_div = block.find_element(By.CSS_SELECTOR, "div[class^='Item_characteristics__text__']")
                label_text = label_div.get_attribute("textContent").strip().lower()
                value_text = value_div.get_attribute("textContent").strip()
            except Exception:
                # Skip malformed blocks. `except Exception` (not a bare
                # except) so a kernel interrupt is not swallowed.
                continue
            if label_text == "serving size":
                serving_size = value_text
            elif label_text == "calories per serving":
                calories_per_serving = value_text
        product_data["serving_size"] = serving_size
        product_data["calories_per_serving"] = calories_per_serving

        # The same table rows feed both step 6 and step 7; fetch them once.
        try:
            table_rows = driver.find_elements(By.CSS_SELECTOR, "tr[class^='Item_table__row__']")
        except Exception:
            table_rows = []

        # 6) Scrape "Serves about" text by scanning table rows for a <th> that has "serves" in it
        serves_about = ""
        try:
            for row in table_rows:
                for th in row.find_elements(By.TAG_NAME, "th"):
                    text_val = th.get_attribute("textContent").strip()
                    # Only rely on the substring "serves"
                    if "serves" in text_val.lower():
                        serves_about = text_val
                        break
                if serves_about:
                    break
        except Exception:
            pass
        product_data["serves_about"] = serves_about

        # 7) Extract nutrition table rows, storing smallest numeric values if duplicates appear
        nutrition = {}
        try:
            for row in table_rows:
                cells = row.find_elements(By.CSS_SELECTOR, "td[class^='Item_table__cell__']")
                if len(cells) >= 2:
                    nutrient = cells[0].get_attribute("textContent").strip()
                    value = cells[1].get_attribute("textContent").strip()
                    if nutrient:
                        _keep_smallest(nutrition, nutrient, value)
        except Exception:
            # Keep whatever rows were parsed before the failure.
            pass

        # Merge nutrition fields into product data
        product_data.update(nutrition)

    finally:
        driver.quit()

    return product_data

if __name__ == "__main__":
    # Reads CSV with product URLs.
    urls_csv = "trader_joes_products.csv"
    urls_df = pd.read_csv(urls_csv)

    # Range of products to scrape (by index)
    # NOTE(review): iloc is 0-based and end-exclusive, so start_index = 1
    # leaves out the first data row of the CSV — confirm this is intended.
    start_index = 1
    end_index = 1754
    subset_df = urls_df.iloc[start_index:end_index]

    # List to store scraped product details.
    details_list = []

    # A kernel interrupt during this long loop should not throw away the
    # products already scraped, so it is caught and the save step still runs.
    interrupted = False
    try:
        for idx, row in subset_df.iterrows():
            url = row["url"]
            print(f"Scraping product at index {idx}: {url}")
            try:
                product_details = scrape_traderjoes_product(url)
                details_list.append(product_details)
            except Exception as e:
                print(f"Error scraping {url}: {e}")
    except KeyboardInterrupt:
        interrupted = True
        print(f"Interrupted after {len(details_list)} product(s); saving what was scraped.")

    # Convert scraped details to a DataFrame.
    new_details_df = pd.DataFrame(details_list)

    # Merge with existing product details (if any), to account for new columns
    details_csv = "NEW_trader_joes_product_details.csv"
    try:
        existing_df = pd.read_csv(details_csv)
        combined_df = pd.concat([existing_df, new_details_df], ignore_index=True, sort=False)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Missing file, or a file with no columns: start from the new rows only.
        combined_df = new_details_df

    if new_details_df.empty:
        # Nothing new to add. Writing here could create a column-less CSV
        # that the next run's read_csv cannot parse.
        print(f"No product details were scraped; {details_csv} left unchanged.")
    else:
        # Write the updated data back to the CSV.
        combined_df.to_csv(details_csv, index=False)
        if interrupted:
            print(f"Scraping interrupted. Partial data appended to {details_csv}")
        else:
            print(f"Scraping complete. Data appended to {details_csv}")


Collecting selenium
  Using cached selenium-4.29.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Using cached trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Using cached trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Using cached sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Using cached outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Using cached wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Using cached selenium-4.29.0-py3-none-any.whl (9.5 MB)
Using cached trio-0.29.0-py3-none-any.whl (492 kB)
Using cached trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Using cached outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Using cached wsproto-1.2.0-py3-none-any.whl (24 kB)
Using cached sortedcontainers-2.4.0-