⚙️ **SETUP**

- Ensure you are running with the `supermarkets` conda environment and that you're up to date. See [README.md](../../README.md) if you haven't set up your environment yet.

    On the command line:

    ```bash
    conda activate supermarkets
    pip install -r requirements.txt
    ```

- On VSCode, select `supermarkets` as the Python interpreter for this notebook and project.

In [1]:
import time
import pandas as pd

from pprint import pprint
from tqdm.notebook import tqdm

from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
# Add the utilscraping module to the path
import sys
sys.path.append('../src/python/')

from utilscraping import safely_get_elements, safely_locate_element, safely_click_element

**CONSTANTS** 

In [3]:
WAITROSE_URL = 'http://www.waitrose.com/ecom/shop/browse/groceries'

# 1. Get Categories of Grocery Products 

In [13]:
driver = webdriver.Firefox()

In [17]:
driver.get(WAITROSE_URL)
driver.execute_script("document.body.style.zoom='30%'")

In [18]:
safely_click_element(driver, '//button[contains(@data-testid, "reject-all")]/span')

safely_click_element(driver, '//button[contains(@data-testid, "expand-button")]/span')

⚠️ **Remember to close the cookies banner manually before running the code below.**

In [20]:
containers_categories = safely_get_elements(driver, "#subcategoryList > li > a")

categories = {
    a_element.get_attribute("aria-label"): a_element.get_attribute("href")
    for a_element in containers_categories
}

from pprint import pprint
pprint(categories)

{'Baby, Child & Parent': 'https://www.waitrose.com/ecom/shop/browse/groceries/baby_child_and_parent',
 'Bakery': 'https://www.waitrose.com/ecom/shop/browse/groceries/bakery',
 'Beer, Wine & Spirits': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits',
 'Best of British': 'https://www.waitrose.com/ecom/shop/browse/groceries/best_of_british',
 'Dietary & Lifestyle': 'https://www.waitrose.com/ecom/shop/browse/groceries/dietary_and_lifestyle',
 'Everyday Value': 'https://www.waitrose.com/ecom/shop/browse/groceries/everyday_value',
 'Food Cupboard': 'https://www.waitrose.com/ecom/shop/browse/groceries/food_cupboard',
 'Fresh & Chilled': 'https://www.waitrose.com/ecom/shop/browse/groceries/fresh_and_chilled',
 'Frozen': 'https://www.waitrose.com/ecom/shop/browse/groceries/frozen',
 'Groceries Offers': 'https://www.waitrose.com/ecom/shop/browse/offers',
 'Home': 'https://www.waitrose.com/ecom/shop/browse/groceries/home',
 'Household': 'https://www.waitrose.com/ecom/sh

# 2. Explore how to get all products from a given category

In [17]:
# Go to single category
driver.get(categories["Summer"])
driver.execute_script("document.body.style.zoom='30%'")

In [18]:
# Click on Load More until you can't anymore
waiting_modifier = 5

load_more_button_selector = "//span[contains(text(), 'Load more...')]"
while safely_locate_element(driver, load_more_button_selector, wait_time=waiting_modifier*3):
    print("Loading more...")
    safely_click_element(driver, load_more_button_selector, wait_time=waiting_modifier*3)

Loading more...
Loading more...
Loading more...
Loading more...
Loading more...


Grab products:

In [19]:
# Specify the xpath to get all <article> elements with data-test-id="product-pod"
products_xpath = "//article[contains(@data-testid, 'product-pod')]"

# Get all the product elements
product_elements = safely_get_elements(driver, products_xpath, selector_type=By.XPATH, wait_time=0)

Retrying to get element... Attempt 1
Retrying to get element... Attempt 2
Failed to get element: Message: 



In [20]:
len(product_elements)

0

Collect attributes for each product:

In [36]:
article_attrs = [
    "data-product-id",
    "data-product-name",
    "data-product-type",
    "data-product-on-offer",
    "data-product-index",
]

Details of each product:

In [53]:
{attr: product_elements[0].get_attribute(attr) for attr in article_attrs}

{'data-testid': 'product-pod',
 'data-product-id': '525635',
 'data-product-name': 'Organix Raspberry & Apple Soft Oaty Bars',
 'data-product-type': 'G',
 'data-product-on-offer': 'false',
 'data-product-index': '1'}

Deeper into the tree:

In [56]:
## Image info

container_img = product_elements[0].find_element(By.XPATH, "//*[contains(@data-testid, 'product-pod-image')]")
img_link = container_img.find_element(By.TAG_NAME, "img").get_attribute("src")
img_link

'https://ecom-su-static-prod.wtrecom.com/images/products/3/LN_525635_BP_3.jpg'

In [71]:
## Product Details

container_details = product_elements[0].find_element(By.XPATH, "//*[contains(@data-testid, 'product-pod-header')]")

dict(product_page=container_details.find_element(By.TAG_NAME, "a").get_attribute("href"),
     product_name=container_details.find_element(By.CSS_SELECTOR, "h2").text,
     product_size=container_details.find_element(By.CSS_SELECTOR, "a > span").text)

# pprint(container_details.get_property("outerHTML"))

{'product_page': 'https://www.waitrose.com/ecom/products/organix-raspberry-apple-soft-oaty-bars/525635-812878-812879',
 'product_name': 'Organix Raspberry & Apple Soft Oaty Bars',
 'product_size': '6x23g'}

In [117]:
## Product price

container_price = product_elements[0].find_element(By.XPATH, "//*[contains(@data-testid, 'product-pod-prices-section')]")

# Extract item price and pricer per unit
{el.text.split("\n")[0]: el.text.split("\n")[1] for el in container_price.find_elements(By.XPATH, "span[p]")}


{'Item price': '£3.15', 'Price per unit': '£2.29/100g'}

# 3. Putting it all together

In [27]:
driver.quit()

In [4]:
article_attrs = [
    "data-product-id",
    "data-product-name",
    "data-product-type",
    "data-product-on-offer",
    "data-product-index",
]

In [5]:
driver = webdriver.Firefox()
driver.get(WAITROSE_URL)
driver.execute_script("document.body.style.zoom='30%'")

# Reject cookies and expand categories
safely_click_element(driver, '//button[contains(@data-testid, "reject-all")]/span')
safely_click_element(driver, '//button[contains(@data-testid, "expand-button")]/span')

# ⚠️ **Remember to close the cookies banner manually before running the code below.**
containers_categories = safely_get_elements(driver, "#subcategoryList > li > a")

categories = {
    a_element.get_attribute("aria-label"): a_element.get_attribute("href")
    for a_element in containers_categories
}

from pprint import pprint
pprint(categories)

{'Baby, Child & Parent': 'https://www.waitrose.com/ecom/shop/browse/groceries/baby_child_and_parent',
 'Bakery': 'https://www.waitrose.com/ecom/shop/browse/groceries/bakery',
 'Beer, Wine & Spirits': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits',
 'Best of British': 'https://www.waitrose.com/ecom/shop/browse/groceries/best_of_british',
 'Dietary & Lifestyle': 'https://www.waitrose.com/ecom/shop/browse/groceries/dietary_and_lifestyle',
 'Everyday Value': 'https://www.waitrose.com/ecom/shop/browse/groceries/everyday_value',
 'Food Cupboard': 'https://www.waitrose.com/ecom/shop/browse/groceries/food_cupboard',
 'Fresh & Chilled': 'https://www.waitrose.com/ecom/shop/browse/groceries/fresh_and_chilled',
 'Frozen': 'https://www.waitrose.com/ecom/shop/browse/groceries/frozen',
 'Groceries Offers': 'https://www.waitrose.com/ecom/shop/browse/offers',
 'Home': 'https://www.waitrose.com/ecom/shop/browse/groceries/home',
 'Household': 'https://www.waitrose.com/ecom/sh

In [11]:
def get_product_info(product_element):

    result = dict()

    # Metadata
    metadata_info = {
        attr: product_element.get_attribute(attr) for attr in article_attrs
    }
    result.update(metadata_info)
    # pprint(result)


    # Image link
    img_xpath = ".//*[contains(@data-testid, 'product-pod-image')]//img"
    # pprint(product_element.find_element(By.XPATH, img_xpath).get_attribute("outerHTML"))
    img_link = product_element.find_element(By.XPATH, img_xpath).get_attribute("src")
    result.update({"image-url": img_link})
    # pprint(result)

    # Product details
    details_xpath = ".//*[contains(@data-testid, 'product-pod-header')]"
    # pprint(product_element.find_element(By.XPATH, details_xpath).get_attribute("outerHTML"))
    div_details = product_element.find_element(By.XPATH, details_xpath)

    prod_details = {
        "product-page": div_details.find_element(By.TAG_NAME, "a").get_attribute("href"),
        "product-name": div_details.find_element(By.CSS_SELECTOR, "h2").text,
        "product-size": div_details.find_element(By.CSS_SELECTOR, "a > span").text
    }
    result.update(prod_details)

    # Price
    price_xpath = ".//*[contains(@data-testid, 'product-pod-prices-section')]"
    container_price = product_element.find_element(By.XPATH, price_xpath)

    # Extract item price and pricer per unit
    price_info = {
        el.text.split("\n")[0]: el.text.split("\n")[1]
        for el in container_price.find_elements(By.XPATH, "span[p]")
        if "\n" in el.text
    }
    if not price_info:
        price_info = {
            "item-price": None,
            "price-per-unit": None
        }

    # Rename 'Item price' to item-price and 'Price per unit' to price-per-unit
    price_info = {k.replace(" ", "-").lower(): v for k, v in price_info.items()}
    result.update(price_info)
    
    # Ofer Description
    if result["data-product-on-offer"] == "true":
        offer_xpath = ".//*[contains(@data-testid, 'product-pod-promotion')]//a/p/span"
        offer_description = product_element.find_element(By.XPATH, offer_xpath).get_attribute("innerText")
    else:
        offer_description = None
    result.update({"offer-description": offer_description})

    return result

# all_products_info = [get_product_info(product) for product in tqdm(product_elements)]

In [14]:
def collect_from_category(driver, category, url, waiting_modifier = 5):
    print(f"Navigating to page related to category: {category}")
    driver.get(url)
    driver.execute_script("document.body.style.zoom='30%'")
    driver.execute_script("arguments[0].scrollIntoView();", driver.find_element(By.TAG_NAME, "footer"))

    # TODO: Make it a default parameter of the function
    load_more_button_selector = "//span[contains(text(), 'Load more...')]"
    loading_times = 0
    while safely_locate_element(driver, load_more_button_selector, wait_time=waiting_modifier*3):
        if loading_times == 0:
            print("  Loading more... |", end="")
        else:
            # Just add a | to the same line:
            print(" |", end="")
        loading_times += 1
        safely_click_element(driver, load_more_button_selector, wait_time=waiting_modifier*3)
        driver.execute_script("arguments[0].scrollIntoView();", driver.find_element(By.TAG_NAME, "footer"))
    print()

    # Get all the product elements
    print("Done loading. Collecting products... 🛒")

    # Specify the xpath to get all <article> elements with data-test-id="product-pod"
    # TODO: Make it a default parameter of the function
    products_xpath = ".//article[contains(@data-testid, 'product-pod')]"

    # Get all the product elements
    product_elements = driver.find_elements(By.XPATH, products_xpath)

    all_products_info = [get_product_info(product) for product in tqdm(product_elements, desc=f"Collecting products from {category}")]
    all_products_info = pd.DataFrame(all_products_info)
    all_products_info["category"] = category

    filename = category.replace(" ", "-").replace(",", "-").replace("&","and").lower()
    all_products_info.to_csv(f"../../data/{filename}.csv", index=False)

    return all_products_info

In [125]:
# Select just two categories for now
# categories = {k: v for k, v in categories.items() if k in ["Pet"]}

In [None]:
df = [
    collect_from_category(driver, category, url)
    for category, url in tqdm(categories.items(), desc="Categories")
    if category == "Groceries Offers"
]

In [15]:
df = pd.concat(df)

In [19]:
df.describe().T

Unnamed: 0,count,unique,top,freq
data-product-id,25418,16055,014520,6
data-product-name,25418,15419,Evian Still Mineral Water,13
data-product-type,25418,3,G,24377
data-product-on-offer,25418,2,false,18217
data-product-index,25408,4189,86,20
image-url,25418,16055,https://ecom-su-static-prod.wtrecom.com/images...,6
product-page,25418,16055,https://www.waitrose.com/ecom/products/duchy-o...,6
product-name,25418,15419,Evian Still Mineral Water,13
product-size,25418,1541,each,867
item-price,25407,774,£2.50,948


In [159]:
df[df['data-product-on-offer'] == 'true']['offer-description']

10        save 30p. Was £1.50
11        save 45p. Was £2.45
12               Add 3 for £2
13               Add 3 for £3
17               Add 2 for £9
                ...          
359             Add 2 for £13
360       save 50p. Was £3.50
365              Add 2 for £9
369             Add 2 for £13
373    save £3.50. Was £18.50
Name: offer-description, Length: 127, dtype: object