## The crawler below is written to extract data from https://mysupermarketcompare.co.uk/

### I am going to use functional programming for this project
- First of all the necessary libraries will be imported
- Helper functions will be written to make the process easier and in support of refactoring
- A save function will be written to save the file in json
- Pandas will be used to create a DataFrame so as to easily export file to other formats and prepare it for analysis

### Importing necessary libraries

In [26]:
from bs4 import BeautifulSoup as bs
import requests
from datetime import datetime
import re
import json
import pandas as pd

### The get_last_updated() function extracts and cleans the date a particular product was last updated

In [25]:
def get_last_updated(date):
    split = date.get_text().split('n')[1].split(' ')[1:4]
    date = ' '.join(split)
    from datetime import datetime
    py_date = datetime.strptime(date, '%b %d, %Y')
    return datetime.strftime(py_date, '%d-%b-%y')

### The get_url() function extracts and cleans the url of the product

In [24]:
def get_url(url):
    return url.find(class_='fb').get('data-href').split('=')[-1].replace('%3A', ':').replace('%2F', '/')

### The get_price()  function extracts and formats the price of the product from the seller with the  best deal 

In [23]:
def get_price(price):
    value = price.find_all(class_='woocommerce-Price-amount amount')
    for i, j in enumerate(value):
        if i == 1:
            return j.get_text().replace('£', '')
        else:
            return j.get_text().replace('£', '')

### The get_merchant() function extracts all the sellers of a particular product 

In [22]:
def get_merchant(string):
    things = r"asda|boots|sainsbury|morrisons|tesco|superdrug"
    things_re = rf'\w*\s*({things})'
    return re.search(things_re, str(string)).group().split('_')[-1].capitalize()

In [21]:
def get_unit(unit):
    value = unit.find(class_='title_single_area').get_text().replace('&amp', '&')
    pattern = r'(\d{1,3}(ml|mm|mg|cl|g|kg|Pcs|l|cm)+)|(\d+\.\d+\s\D+)'
    if re.search(pattern, value, re.I):
        return re.search(pattern, value, re.IGNORECASE).group()
    else:
        return value.split()[-1]

   ### The get_product_details() function extracts all the product details and save them in a dictionary

In [4]:
def get_product_details(url):
    product_detail = {}
    try:
        url = requests.get(url)
    except Exception as e:
        print(e)
    soup = bs(url.content)
    try:
        product = soup.find(class_='product')
    except Exception:
        print("Could not find 'product'")
    else:
        for index, item in enumerate(product):
            if index == 1:
                product_detail['sku_id'] = 'sku_id'
                try:
                    product_detail['url_link'] = get_url(product)
                except Exception as e:
                    product_detail['url_link'] = None
                try:
                    product_detail['name'] = product.find(class_='title_single_area').get_text()
                except Exception as e:
                    product_details['name'] = None
                try:
                    product_detail['brand'] = product.find(class_='title_single_area').get_text().split()[0]
                except Exception as e:
                    product_detail['brand'] = None
                try:    
                    product_detail['unit'] = get_unit(product)
                except Exception as e:
                    product_detail['unit'] = None
                try:
                    product_detail['category'] = product.find(class_='mr5').get_text()
                except Exception as e:
                    product_detail['category'] = None
                try:
                    product_detail['last_updated'] = get_last_updated(product.find(class_='price_updated'))
                except Exception as e:
                    product_detail['last_updated'] = None
                try:
                    product_detail['price'] = get_price(product)
                except Exception as e:
                    product_detail['price'] = None
                try:
                    product_detail['description'] = product.find(class_='woocommerce-product-details__short-description').get_text()
                except Exception as e:
                    product_detail['description'] = None
                try:
                    product_detail['market_place'] = product.find(class_='compare-domain-text').get_text().split('.')[0]
                except Exception as e:
                    product_detail['market_place'] = None
                try:    
                    merchant_list = product.find_all(class_='table_merchant_list')
                except Exception as e:
                    print(e)
                else:
                    for i, j in enumerate(merchant_list):
                        try:
                            if product_detail['market_place'] == j.find(class_='merchant_thumb').get_text().split('.')[0].strip():
                                continue
                        except Exception as e:
                            print(e)
                        try:
                            product_detail[f'Market::{get_merchant(j)}::url_link'] = j.a.get('href')
                        except Exception:
                            product_detail[f'Market::{get_merchant(j)}::url_link'] = None
                        try:
                            product_detail[f'Market::{get_merchant(j)}::price'] = j.find(class_='val_sim_price').get_text().strip().replace('£', '')
                        except Exception:
                            product_detail[f'Market::{get_merchant(j)}::price'] = None
                        product_detail[f'Market::{get_merchant(j)}::sku_id'] = 'In progress'
    return product_detail

In [29]:
get_product_details('https://mysupermarketcompare.co.uk/product/heinz-classic-barbecue-sauce-665g/')

{'sku_id': 'sku_id',
 'url_link': 'https://mysupermarketcompare.co.uk/product/heinz-classic-barbecue-sauce-665g/',
 'name': 'Heinz Classic Barbecue Sauce 665g',
 'brand': 'Heinz',
 'unit': '665g',
 'category': 'Food Cupboard',
 'last_updated': None,
 'price': '2.00',
 'description': 'Heinz Classic Barbecue 665g',
 'market_place': 'asda',
 'Market::Morrisons::url_link': 'https://groceries.morrisons.com/products/heinz-classic-barbecue-sauce-407480011',
 'Market::Morrisons::price': '2.50',
 'Market::Morrisons::sku_id': 'In progress',
 'Market::Sainsbury::url_link': 'https://www.sainsburys.co.uk/gol-ui/product/bbq-sauce---marinades/heinz-classic-bbq-sauce-720g',
 'Market::Sainsbury::price': '2.50',
 'Market::Sainsbury::sku_id': 'In progress',
 'Market::Tesco::url_link': 'https://www.tesco.com/groceries/en-GB/products/300793051',
 'Market::Tesco::price': '2.50',
 'Market::Tesco::sku_id': 'In progress'}

### The get_num_pages function is a function to get the total number of pages

In [613]:
def get_num_pages(home_url):
    url = requests.get(home_url)
    soup = bs(url.content)
    return int(soup.find_all(class_='page-numbers')[-2].get_text().replace(',', ''))

### The get_all_products() function gets every product from the first page to a user defined length

In [27]:
def get_all_products(url, page_num):
    all_products = []
#     The get_num_pages function will not be used anymore because of performance issues.
#     The length of page will be user defined
#     num_pages = get_num_pages('https://mysupermarketcompare.co.uk/shop/')

    for i in range(2): #User defined length
        page = requests.get(url)
        soup = bs(page.content)
        products = soup.find_all(class_='product')
        links = [item.find(class_='text-clamp').a.get('href') for item in products]
        for link in links:
            all_products.append(get_product_details(link))
        page_num +=1
        url = f'https://mysupermarketcompare.co.uk/shop/page/{page_num}'
    return all_products

In [30]:
product_details = get_all_products('https://mysupermarketcompare.co.uk/shop/page/1', 1)

In [32]:
product_details

[{'sku_id': 'sku_id',
  'url_link': 'https://mysupermarketcompare.co.uk/product/1-marigold-extra-life-gloves-kitchen-medium-medium/',
  'name': '1 Marigold Extra Life Gloves Kitchen Medium medium',
  'brand': '1',
  'unit': 'medium',
  'category': 'Household',
  'last_updated': None,
  'price': '2.35',
  'description': 'Marigold Rubber Gloves MEDIUM',
  'market_place': 'asda',
  'Market::Sainsbury::url_link': 'https://www.sainsburys.co.uk/gol-ui/product/marigold-kitchen-gloves--medium',
  'Market::Sainsbury::price': '2.50',
  'Market::Sainsbury::sku_id': 'In progress',
  'Market::Tesco::url_link': 'https://www.tesco.com/groceries/en-GB/products/250360288',
  'Market::Tesco::price': '2.50',
  'Market::Tesco::sku_id': 'In progress'},
 {'sku_id': 'sku_id',
  'url_link': 'https://mysupermarketcompare.co.uk/product/10-motives-disposable-regular/',
  'name': '10 Motives Disposable Regular 18mg',
  'brand': '10',
  'unit': '18mg',
  'category': 'Home & Ents',
  'last_updated': None,
  'price'

In [31]:
re.search(r'(\d{1,3}(ml|mm|mg|cl|g|kg|Pcs|l|cm)+)|\d+(\.\d+)*\s\D+', '1000 Stories Zinfandel 750ml').group()

'1000 Stories Zinfandel '

### Generate SKU ID for products in their various market place

In [18]:
for index, sku in enumerate(product_details):
    sku['sku_id'] = index + get_sku_id(10000000001)
    if sku.get('Market::Sainsbury::sku_id'):
        sku['Market::Sainsbury::sku_id'] = index + get_sku_id(100000000002)
    if sku.get('Market::Tesco::sku_id'):
        sku['Market::Tesco::sku_id'] = index + get_sku_id(100000000003)
    if sku.get('Market::Asda::sku_id'):
        sku['Market::Asda::sku_id'] = index + get_sku_id(100000000004)
    if sku.get('Market::Morrisons::sku_id'):
        sku['Market::Morrisons::sku_id'] = index + get_sku_id(100000000005)
    if sku.get('Market::Boots::sku_id'):
        sku['Market::Boots::sku_id'] = index + get_sku_id(100000000006)

### A save function to save the scraped data into a file

In [22]:
def save_data(file, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        return json.dump(file, f, ensure_ascii=False, indent=2)

In [33]:
def unit_updated(unit):
#     value = unit.find(class_='title_single_area').get_text()
    units = r'mg|mm|ml|g|kg|l|pcs|cm|m'
    units_pattern = fr'\d{1,3}({units})+'
    words_pattern = r'\d+(\.\d+)?\s\D+'
#     units_value = re.search(units_pattern, unit, re.I).group()
#     words_value = re.search(words_pattern, unit, re.I).group()
    if re.search(units_pattern, unit, re.I).group():  # units_value:
        return re.search('1000', unit, re.I).group()
    elif re.search(words_pattern, unit, re.I).group(): #words_value:
        return re.search(words_pattern, unit, re.I).group()
    else:
        return unit.split()[-1]

In [34]:
unit_updated('1000 Stories Zinfandel 750ml')

AttributeError: 'NoneType' object has no attribute 'group'