In [1]:
import os
import re
import time
import warnings
import requests
import geocoder
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as soup
from IPython.display import display_html
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
warnings.filterwarnings('ignore')

In [2]:
SHOPRITE_SEARCH_URL = 'https://www.shoprite.co.za/search/all?q='
SHOPRITE_URL = 'https://www.shoprite.co.za'

CHECKERS_SEARCH_URL = 'https://www.checkers.co.za/search/all?q='
CHECKERS_URL = 'https://www.checkers.co.za/'

WOOLWORTHS_SEARCH_URL = 'https://www.woolworths.co.za/cat?Ntt='
WOOLWORTHS_URL = 'https://www.woolworths.co.za/'

STORE_LIST = [SHOPRITE_URL, CHECKERS_URL]

In [3]:
FOOD_BASKET = ['large eggs 6', 'large eggs 18', 'extra large eggs 30', 'table salt 500g',
               'table salt 1kg', 'rice 2kg', 'rice 5kg', 'rice 1kg', 'rice canister 10l', 
              'sugar 500g', 'sugar 1kg', 'sugar 2.5kg', 'sugar 5kg', 'sugar 10kg',
              'flour 500g', 'flour 1kg', 'flour 2.5kg', 'flour 5kg', 'flour 10kg',
              'frozen chicken', 'pork bangers', 'pork rashers', 'pork loin chops',
              'pork braai chops', 'pork chops', 'pork shoulder ribs', 'stewing pork',
              'beef goulash', 'ground beef', 'beef parcel', 'beef brisket', 'coarse salt 500g',
              'fine salt 500g', 'medium salt 500g']

In [4]:
user_basket = np.random.choice(FOOD_BASKET, 5)

In [5]:
CITY = 'cape town'

In [32]:
def choose_location(url, location=CITY):
    
    g = geocoder.ip('me')
    
    if g.country == 'ZA':
        loc = g.city
        
    else:
        loc = location
        
    options = Options()
    options.headless = False
    driver = webdriver.Chrome(options=options)
    driver.implicitly_wait(10)
    try:
        driver.get(url)
    except Exception:
        print('waiting for full page load')
        time.sleep(30)
    
    try:
        main = driver.find_element(By.TAG_NAME, 'main')
        body = driver.find_element(By.TAG_NAME, 'body')
        elem = driver.find_element(By.CLASS_NAME, 'header__top__links')
        elem = driver.find_element(By.CLASS_NAME, 'header__your-store')
        elem.click()
        elem = driver.find_element(By.ID, 'storeFinderInput')
        elem.send_keys(loc)
        elem.send_keys(Keys.RETURN)

        elem = driver.find_elements(By.CLASS_NAME, 'nav-store-your-results')
        stores = []

        for e in elem:
            store = e.find_element(By.TAG_NAME, 'a')
            stores.append(store.get_attribute('innerHTML').strip())

        print(stores)
        SELECTION = input('Which store is nearest to you? >')

        elem = driver.find_element(By.CLASS_NAME, 'header__your-store')

        elem.click()

        elem = driver.find_element(By.ID, 'storeFinderInput')

        elem.clear()
        elem.send_keys(SELECTION)
        elem.send_keys(Keys.RETURN)
        time.sleep(5)
        elem = driver.find_element(By.CLASS_NAME, 'nav-store-your-results')
        elem = elem.find_element(By.CLASS_NAME, 'sl-container')
        button = elem.find_element(By.TAG_NAME, 'button')
        button.click()
        print("setting your region")
        time.sleep(10)
        
        return driver
    
    except Exception:
        
        return driver      

In [7]:
def fetch_html(url, driver):
    # headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
    # response = requests.get(url, verify=False, headers=headers, timeout=10)
    driver.get(url)
    page = soup(driver.page_source, 'html.parser')
    print("html downloaded successfully")

    return page

In [8]:
def extract_product_links(html_page):
  
    product_div = html_page.find('div', {"class": re.compile('(\w+-)*search-landing')})
    content = product_div.find('div', class_='yCmsContentSlot')
    component = content.find('div', class_='yCmsComponent')
    product_wrappers = component.find('div', class_='search-landing__block--products')
    product_row = product_wrappers.find('div', class_='row')
    products = product_row.findAll('div', class_='product-frame')
    
    links = []
    for product in products:
        figure = product.find('figure')
        product_link = figure.find('a', href=True)['href']
        links.append(product_link)
    
    return links

In [9]:
def match_product_url(links_arr, item):
    
    pattern = r'([\w+-/%?]+' + str(item.replace(' ', '\-')) + '[\w+-/%?]+)'

    found = []

    for link in links_arr:
        try:
            match = re.search(pattern, link.lower()).group()
            found.append({link: match})
        except Exception:
            None
    
    return found

In [10]:
def extract_brand(link):
    
    text = list(link.values())[0].split('/')[-3]
    
    try:
        value = re.search(r'(\d+(\-)?\d+)', text).group()
        new_link = re.sub(value, value.replace('-', '.'), text)
        
    except Exception:
        new_link = text
    
    
    return new_link.replace('-', ' ').title()

In [11]:
def get_brands(arr_links):
    
    brands = []
    counter = 1
    
    for item in arr_links:
        brands.append({counter: extract_brand(item)})
        counter+=1
        
    return brands

In [12]:
def evaluate_user_choice(user_choice, links, matching_product_urls, brands, item, store_url, driver):
    
    html = ""
    for brand in brands:
        for key, value in brand.items():
            if eval(user_choice) == key:
                k_found = match_product_url(list(matching_product_urls[0].keys()), item)
                if len(k_found) == 0:
                    k_found = closest_products(links, matching_product_urls)
                url = list(k_found[0].keys())[0]
                k_fetch = fetch_html(store_url+url, driver)
                html = k_fetch
                
    return html

In [13]:
def get_product_details(html):
    product_specs = {}
    
    product_div = html.find('div', class_='pdp')
    product_details = product_div.find('div', class_='pdp__details')
    regex = r'\d+\.\d+'
    price = re.search(regex, product_details.find('div', class_='special-price__price').find('span').get_text().strip().lstrip('R')).group()

    try:
        promo_price = product_details.find('div', class_='special-price__extra').find('span').get_text().strip().lstrip('R')
        promo_condition = product_details.find('span', class_='special-price__extra__text').get_text()
    except Exception:
        promo_price = None
        promo_condition = None
        
    try:
        extra_details = product_div.find('div', 'pdp__extras')
        offer = extra_details.find('div', class_= 'extra-message').find('span', class_='extra-message__title').get_text().strip()
        promo_price_1 = re.search(regex, offer).group()
        promo_duration = extra_details.find('div', class_= 'extra-message').find('span', class_='extra-message__valid').get_text().strip().replace('&nbsp;' '')
    except Exception:
        offer = None
        promo_price_1 = None
        promo_duration = None
    
    if not promo_price and not promo_price_1:
        promo_price = None
        
    elif not promo_price and promo_price_1:
        promo_price = promo_price_1
    
    else:
        promo_price = promo_price
        
    product_desc = product_details.find('div', class_='pdp__description').get_text()
    product_name = product_details.find('h1', class_='pdp__name').get_text()
    product_brand = product_name.split()[0]
    product_image_url = SHOPRITE_URL + product_div.find('div', class_='pdp__image').find('img')['src']

    product_specs['Brand'] = product_brand
    product_specs['Name'] = product_name + "**" if promo_price and (price > promo_price) else product_name
    product_specs['Description'] = product_desc
    product_specs['Regular Price'] = price
    product_specs['Offer'] = offer
    product_specs['Promo Price'] = promo_price
    product_specs['Promo Condition'] = promo_condition
    product_specs['Product Image'] = product_image_url
    
    return product_specs

In [14]:
def total_cost(basket_details):
    
    receipt = {}
    total_cost = 0
    regular_total = 0

    for item in basket_details:
        if item['Promo Price'] and (item['Promo Price'] < item['Regular Price']):
            price = item['Promo Price']
        else:
            price = item['Regular Price']

        total_cost += float(price)
        regular_total += float(item['Regular Price'])
        
        receipt[item['Name']] =  (price, item['Regular Price'], item['Promo Condition'])
    
    receipt['Total'] = ("R " + str(round(total_cost,2)), "R " + str(round(regular_total, 2)), "-")
    receipt['Savings'] = (f"R {round(abs(total_cost - regular_total), 2)}", "-", "-")
    
    return receipt
    

In [15]:
def closest_products(product_links, product_match):
    all_matches = []
    key = product_links
    value = [link.lower() for link in product_links]
    
    for i in range(len(key)):
        all_matches.append(dict(zip([key[i]], [value[i]])))
    product_match = all_matches
    
    return product_match

In [16]:
def evaluate_user_basket(basket, search_url, store_url, driver):
    user_product_details = []

    for item in basket:
        search_result = fetch_html(search_url+item, driver)
        product_links = extract_product_links(search_result)
        product_match = match_product_url(product_links, item)
        if len(product_match) == 0:
            product_match = closest_products(product_links, product_match)
        product_brands = get_brands(product_match)

        print(product_brands)

        user_choice = input('select brand > ')
        html = evaluate_user_choice(user_choice, product_links, product_match, product_brands, item, store_url, driver)
        product_details = get_product_details(html)

        user_product_details.append(product_details)

        basket_cost = total_cost(user_product_details)
        
    return basket_cost

In [17]:
def get_receipt(receipt):
    receipt = pd.DataFrame.from_dict(receipt, orient='index',
                                    columns=['Price', 'Regular Price', 'Condition'])

    return receipt

In [18]:
def compare_stores(stores, user_basket, driver):
    receipts = []
    promos = {}
    for store in stores:
        for name, urls in store.items():
            store_receipt = evaluate_user_basket(basket=user_basket, search_url=urls[0], store_url=urls[1], driver=driver)
            receipt = get_receipt(store_receipt)
            promo_total = receipt.iloc[-2].values[0].strip('R').strip()
            promos[name] = promo_total
            receipt_styler = receipt.style\
                        .set_table_attributes("style='display:inline; margin-right:30px'")\
                        .set_caption(name.upper() + ' RECEIPT')
            
            receipts.append(receipt_styler)
    print("#"*120)
    display_html([tbl._repr_html_() for tbl in receipts], raw=True)
    
    min_total = min(promos.values())
    for store, total in promos.items():
        if total == min_total:
            return (f" WE RECOMMEND SHOPPING FROM {store} ".upper().center(100, "*"))

In [19]:
stores = [{"checkers": (CHECKERS_SEARCH_URL, CHECKERS_URL),
          "shoprite": (SHOPRITE_SEARCH_URL, SHOPRITE_URL)}]

In [20]:
for store in STORE_LIST:
    driver = choose_location(url=store)

['Checkers Foods CF Westlake', 'Checkers Constantia', 'Checkers Table View', 'Checkers Bothasig', 'Checkers Table Bay Mall']


Which store is nearest to you? > checkers table view


In [21]:
compare_stores(stores, user_basket, driver)

html downloaded successfully


AttributeError: 'NoneType' object has no attribute 'find'

## PENDING INCLUSION

In [26]:
driver.get(CHECKERS_SEARCH_URL+'sugar 1kg')

In [27]:
driver.page_source

'<html lang="en"><head>\n    <title>ShieldSquare Captcha</title>\n    <meta charset="utf-8">\n<meta name="viewport" content="width=device-width, initial-scale=1">\n<link rel="icon" href="https://captcha.perfdrive.com/captcha-public/images/favicon.png">\n<link rel="stylesheet" href="https://captcha.perfdrive.com/captcha-public/css/shieldsquare_styles.min.css">\n    <script async="" src="https://cdn.perfdrive.com/aperture/aperture.js"></script><script type="text/javascript">\n\twindow.SSJSInternal = 22377;\n\n    var __uzdbm_1 = "ea19d0ff-6041-b80e-1d79-127942d3ffea";\n    var __uzdbm_2 = "OTQwMTZmMmYtYmtsYi0yZDUzLTBmOWUtZjRmNjc1NGMyODE0JDE5Ny4yMTAuNTIuODg=";\n\n\t(function(w, d, e, u, c, g, a, b){\n\t\tw["SSJSConnectorObj"] = w["SSJSConnectorObj"] || {ss_cid : c, domain_info: g};\n\t\ta = d.createElement(e);\n\t\ta.async = true;\n\t\ta.src = u;\n\t\tb = d.getElementsByTagName(e)[0];\n\t\tb.parentNode.insertBefore(a, b);\n\t})(window,document,"script","https://cdn.perfdrive.com/aperture/

In [None]:
page = soup(d.page_source, 'html.parser')
product_div = page.find('div', {"class": re.compile('(\w+-)*search-landing')})
content = product_div.find('div', class_='yCmsContentSlot')
component = content.find('div', class_='yCmsComponent')
product_wrappers = component.find('div', class_='search-landing__block--products')
product_row = product_wrappers.find('div', class_='row')
products = product_row.findAll('div', class_='product-frame')

links = []
for product in products:
    figure = product.find('figure')
    product_link = figure.find('a', href=True)['href']
    links.append(product_link)

In [352]:
links

['/All-Departments/Food/Food-Cupboard/Cooking-Ingredients/Salt-and-Pepper/Suchet%27s-Iodated-Salt-1kg/p/10138586EA',
 '/All-Departments/Household/Cleaning/Dishwashing/Dishwasher-Salt/Marina-Dishwasher-Salt-1kg/p/10124010EA',
 '/All-Departments/Food/Food-Cupboard/Cooking-Ingredients/Salt-and-Pepper/Seepo-Fine-Salt-1kg/p/10138295EA',
 '/All-Departments/Food/Food-Cupboard/Cooking-Ingredients/Salt-and-Pepper/Cerebos-Iodated-Table-Salt-1kg/p/10140757EA',
 '/All-Departments/Food/Food-Cupboard/Cooking-Ingredients/Salt-and-Pepper/Cerebos-Iodated-Sea-Salt-1KG-Pack/p/10143950EA',
 '/All-Departments/Food/Food-Cupboard/Cooking-Ingredients/Salt-and-Pepper/Checkers-Housebrand-Table-Salt-1kg/p/10157633EA',
 '/c/Royal-Coarse-Dishwasher-Salt-1kg/p/10135686EA',
 '/All-Departments/Food/Food-Cupboard/Cooking-Ingredients/Salt-and-Pepper/Salnova-Iodated-Coarse-Sea-Salt-1kg/p/10132758EA',
 '/All-Departments/Household/Cleaning/Dishwashing/Salgo-Refined-Refined-Coarse-Dishwasher-Salt-1kg/p/10239040EA',
 '/All-

In [38]:
driver.close()

In [39]:
driver = choose_location(url=CHECKERS_URL)

['Checkers Foods CF Westlake', 'Checkers Constantia', 'Checkers Table View', 'Checkers Bothasig', 'Checkers Table Bay Mall']


Which store is nearest to you? > Checkers Table Bay Mall


setting your region


In [40]:
driver.get(CHECKERS_SEARCH_URL+'salt 1kg')

In [42]:
page_soup = soup(driver.page_source, 'html.parser')

In [43]:
items = page_soup.find_all('div', class_='item-product')

In [63]:
products_list = []

for item in items:
    products = {}
    
    price = item.find('div', class_='special-price__price').get_text().strip('\n').strip().lstrip('R')
    name = item.find('h3', class_='item-product__name').get_text().strip()
    availability = item.find('div', class_='js-available-in-store').get_text()
    
    products['Name'] = name
    products['Price'] = price
    products['Availability'] = availability
    
    products_list.append(products)

products_list

[{'Name': "Suchet's Iodated Salt 1kg", 'Price': '19.99', 'Availability': ' '},
 {'Name': 'Marina Dishwasher Salt 1kg', 'Price': '15.99', 'Availability': ' '},
 {'Name': 'Seepo Fine Salt 1kg', 'Price': '11.99', 'Availability': ' '},
 {'Name': 'Cerebos Iodated Table Salt 1kg',
  'Price': '46.99',
  'Availability': ' '},
 {'Name': 'Cerebos Iodated Sea Salt 1KG Pack',
  'Price': '19.99',
  'Availability': ' '},
 {'Name': 'Checkers Housebrand Table Salt 1kg',
  'Price': '11.99',
  'Availability': ' '},
 {'Name': 'Royal Coarse Dishwasher Salt 1kg',
  'Price': '10.99',
  'Availability': ' '},
 {'Name': 'Salnova Iodated Coarse Sea Salt 1kg',
  'Price': '19.99',
  'Availability': ' '},
 {'Name': 'Salgo Refined Refined Coarse Dishwasher Salt 1kg',
  'Price': '19.99',
  'Availability': ' '},
 {'Name': 'Cerebos Iodated Table Salt Pack 1kg',
  'Price': '19.99',
  'Availability': ' '}]