In [2]:
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Listing page of the category to scrape (the "cereal-breakfast" category, per the URL path).
# Later cells append pagination query strings such as "?page=2" to this URL.
base_page = "https://www.loblaws.ca/en/food/pantry/cereal-breakfast/c/28183"

In [4]:
# Download the first listing page and locate its product grid.
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
html_get = requests.get(base_page, timeout=10)
# Fail fast on HTTP errors instead of parsing an error page.
html_get.raise_for_status()
html_text = html_get.text
soup = BeautifulSoup(html_text, 'html.parser')

# Container that holds both the product grid and the pagination links.
main_content = soup.find('div', {'data-testid': 'listing-page-container'})
if main_content is None:
    raise RuntimeError(f"No 'listing-page-container' element found on {base_page}")

product_grid = main_content.find('div', {'data-testid': 'product-grid-component'})
if product_grid is None:
    raise RuntimeError(f"No 'product-grid-component' element found on {base_page}")


In [5]:
# Collect the pagination links: anchors inside the listing container that carry
# both an href and an aria-label attribute.
page_links = main_content.find_all('a', href=True, attrs={'aria-label': True})

# De-duplicate the hrefs while keeping the order in which they appear in the page.
# A set would scramble the order, so pages would be fetched (and rows and image
# files numbered) differently on every run.
page_hrefs = list(dict.fromkeys(link['href'] for link in page_links))

# Page 1 is the base page, which has already been downloaded.
page_hrefs = [href for href in page_hrefs if href != '?page=1']

# The hrefs are query strings relative to the listing page (e.g. "?page=2").
additional_page_urls = [base_page + href for href in page_hrefs]

print("Additional page URLs:")
for url in additional_page_urls:
    print(url)

Additional page URLs:
https://www.loblaws.ca/en/food/pantry/cereal-breakfast/c/28183?page=3
https://www.loblaws.ca/en/food/pantry/cereal-breakfast/c/28183?page=6
https://www.loblaws.ca/en/food/pantry/cereal-breakfast/c/28183?page=2
https://www.loblaws.ca/en/food/pantry/cereal-breakfast/c/28183?page=4
https://www.loblaws.ca/en/food/pantry/cereal-breakfast/c/28183?page=5


In [6]:
# Start with the grid of the first page, then add the grid of every further page.
# The loop uses its own variable names so that the first-page globals
# (product_grid, main_content, soup) are not overwritten; otherwise re-running
# this cell would start from the last page's grid instead of page 1.
product_grids = [product_grid]
for page_url in additional_page_urls:
    # The timeout keeps a stalled connection from hanging the kernel.
    page_response = requests.get(page_url, timeout=10)
    # Fail fast on HTTP errors instead of parsing an error page.
    page_response.raise_for_status()
    page_soup = BeautifulSoup(page_response.text, 'html.parser')

    page_content = page_soup.find('div', {'data-testid': 'listing-page-container'})
    if page_content is None:
        raise RuntimeError(f"No 'listing-page-container' element found on {page_url}")

    page_grid = page_content.find('div', {'data-testid': 'product-grid-component'})
    if page_grid is None:
        raise RuntimeError(f"No 'product-grid-component' element found on {page_url}")

    product_grids.append(page_grid)
    time.sleep(0.1)  # Be polite and avoid overwhelming the server

In [7]:
# Helpers that read the text or image of the first matching element inside a product tile
def get_element_by_testid(element, testid):
    """Return the text of the first descendant whose data-testid equals ``testid``.

    When the text contains a ``$`` (price elements), only the part after the
    last ``$`` is returned, with surrounding whitespace stripped. Text without
    a ``$`` is returned unchanged. Returns None when no element matches.
    """
    node = element.find(attrs={'data-testid': testid})
    if not node:
        return None

    text = node.text
    if '$' not in text:
        return text

    # Keep only what follows the last dollar sign, e.g. "2 FOR $5.00" -> "5.00".
    return text.rsplit('$', 1)[-1].strip()

def get_image_src(element):
    """Return the ``src`` of the first <img> inside ``element``.

    Returns None when there is no <img> or the tag has no ``src`` attribute.
    """
    first_img = element.find('img')
    if not first_img:
        return None
    if 'src' not in first_img.attrs:
        return None
    return first_img['src']

def _to_float(price_text):
    """Convert scraped price text to a float; return None for missing or empty text.

    Thousands separators ("1,299.99") are removed first because float() rejects
    them. Only a comma followed by exactly three digits is dropped, so other
    malformed text still raises ValueError instead of being silently misread.
    """
    if not price_text:
        return None
    return float(re.sub(r',(?=\d{3}(?!\d))', '', price_text))


def process_element(el):
    """Extract brand, title, prices, minimum quantity and image URL from one product element.

    Args:
        el: Element searched with ``find`` for descendants carrying the
            data-testid values 'regular-price', 'sale-price',
            'non-members-price', 'was-price', 'product-brand' and
            'product-title', and for its first <img>.

    Returns:
        dict with the keys product_brand, product_title, current_price,
        min_quantity, regular_price, sale_price, was_price, non_members_price
        and product_image. Prices are floats, or None when absent.
        current_price is the sale price when there is one, otherwise the
        regular price.

    Raises:
        ValueError: if a price or the minimum quantity is not numeric.
    """
    regular_price = get_element_by_testid(el, 'regular-price')
    sale_price = get_element_by_testid(el, 'sale-price')

    non_members_price = get_element_by_testid(el, 'non-members-price')
    # The was-price is only looked up when there is no non-members price.
    was_price = None if non_members_price else get_element_by_testid(el, 'was-price')

    product_brand = get_element_by_testid(el, 'product-brand')
    product_title = get_element_by_testid(el, 'product-title')
    product_image = get_image_src(el)

    # Sale text of the form "<price> MIN <quantity>" carries a minimum purchase quantity.
    min_quantity = 1
    if sale_price and ' MIN ' in sale_price:
        sale_details = sale_price.split(' MIN ')
        sale_price, min_quantity = sale_details[0], sale_details[1]

    regular_value = _to_float(regular_price)
    sale_value = _to_float(sale_price)

    details = {
        "product_brand": product_brand,
        "product_title": product_title,
        # "is not None" rather than truthiness so that a 0.0 sale price is kept.
        "current_price": sale_value if sale_value is not None else regular_value,
        "min_quantity": int(min_quantity),
        "regular_price": regular_value,
        "sale_price": sale_value,
        "was_price": _to_float(was_price),
        "non_members_price": _to_float(non_members_price),
        "product_image": product_image
    }

    return details

In [8]:
# Build one record per product tile and collect the records in a DataFrame.
results = []

for grid in product_grids:
    if grid is None:
        continue  # a page on which no product grid was found
    # Each direct child tag of the grid is treated as one product tile.
    # find_all(True, recursive=False) yields only tags; plain iteration over the
    # grid would also yield text nodes, on which .find(attrs=...) fails.
    results.extend(
        process_element(tile) for tile in grid.find_all(True, recursive=False)
    )

# reset_index() exposes the row number as an explicit "index" column.
df = pd.DataFrame(results).reset_index()
df.head()

Unnamed: 0,index,product_brand,product_title,current_price,min_quantity,regular_price,sale_price,was_price,non_members_price,product_image
0,0,General Mills,"Honey Nut Cheerios Breakfast Cereal, Whole Grains",3.0,1,,3.0,4.5,,https://digital.loblaws.ca/PCX/21104047_EA/en/...
1,1,Post,Shreddies Original Cereals,5.49,1,5.49,,,,https://digital.loblaws.ca/PCX/21427214_EA/en/...
2,2,Kelloggs,Rice Krispies Cereal,4.5,2,,4.5,,5.79,https://digital.loblaws.ca/PCX/21450814_EA/en/...
3,3,General Mills,Cheerios Whole Grain Breakfast Cereal Limited ...,3.0,1,,3.0,4.5,,https://digital.loblaws.ca/PCX/21104173_EA/en/...
4,4,Kelloggs,Rice Krispies Original,6.0,1,,6.0,7.99,,https://digital.loblaws.ca/PCX/21449784_EA/en/...


In [9]:
# Save the DataFrame to CSV. index=False omits the pandas row index; the row
# number is still written through the "index" column created by reset_index().
df.to_csv('product_info.csv', index=False)

In [40]:
# Download every product image and save it as <row index>.png, so each file
# can be matched back to its row in df.
image_dir = 'product_images'
os.makedirs(image_dir, exist_ok=True)
for idx, row in df.iterrows():
    image_url = row['product_image']
    # Rows without an image hold None or NaN; NaN is truthy, so require a
    # non-empty string before requesting it.
    if isinstance(image_url, str) and image_url:
        try:
            # The timeout keeps a stalled connection from hanging the kernel.
            image_response = requests.get(image_url, timeout=10)
        except requests.RequestException as exc:
            # One unreachable image must not abort the remaining downloads.
            print(f"Skipping image {idx} ({image_url}): {exc}")
        else:
            if image_response.status_code == 200:
                # NOTE(review): files are always named .png regardless of the
                # format the server returns — confirm this is intended.
                image_path = os.path.join(image_dir, f"{idx}.png")
                with open(image_path, 'wb') as img_file:
                    img_file.write(image_response.content)
            else:
                print(f"Skipping image {idx} ({image_url}): HTTP {image_response.status_code}")
    time.sleep(0.1)  # Be polite to the server