### 0. Imports

In [6]:
from bs4 import BeautifulSoup

import requests

import pandas as pd
import numpy as np

import time

# from selenium import webdriver 
# from webdriver_manager.chrome import ChromeDriverManager  
# from selenium.webdriver.common.keys import Keys  
# from selenium.webdriver.support.ui import Select 
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.common.exceptions import NoSuchElementException 


# 1. Introduction to this notebook

In this notebook, the purpose is to outline and guide in the logical process of extracting the data through scraping. The goal is to extract a data from historical data of supermarket product prices, divided by different supermarket chains. 

The main source used for this extraction will be [FACUA](https://super.facua.org/). 

#### Get suppermarkets urls to scrape by surface

During an initial exploration of the main page of FACUA, buttons quickly appear for every supermarket with available data. The goal is to access those hrefs, if possible, or navigate using those buttons, to be driven to their individual pages.






![surfaces.png](../assets/surfaces.png)

Let's try parsing the main html looking for the hrefs inside those buttons.

In [None]:
link = "https://super.facua.org"

response = requests.get(link)

if response.status_code == 200:
    print("Successful connection.")

else:
    print("Connection failed.")

main_soup = BeautifulSoup(response.content, "html.parser")
main_soup

Successful connection.



<!DOCTYPE html>

<html lang="es-ES">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
<meta content="Te ayudamos a comparar y a descubrir cuándo y cuánto suben los productos básicos en los grandes supermercados" name="description"/>
<meta content="super.FACUA.org" name="author"/>
<meta content="iva aceite de oliva" name="Keywords"/>
<meta content="all" name="googlebot"/>
<meta content="index" name="googlebot"/>
<meta content="follow" name="googlebot"/>
<meta content="all" name="robots"/>
<meta content="index" name="robots"/>
<meta content="follow" name="robots"/>
<meta content="index,follow" name="robots"/>
<meta content="es" http-equiv="Content-Language">
<title>FACUA vigila los precios de los alimentos para ti</title>
<!-- Favicon-->
<link href="https://super.facua.org/assets/favicon1.ico" rel="icon" type="image/x-icon"/>
<!-- Bootstrap icons-->
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.5.0/font/b

Looking for the keywords "Precios en __", hrefs are found rather fast. Therefore, let's extract them that way.

In [29]:
supermarket_cards = main_soup.findAll("div",{"class":"card h-100"})

print(f"There are {len(supermarket_cards)} supermarket cards.")


There are 6 supermarket cards.


There are as many supermarket cards in the parsed html as in the visual exploration of the website. Each cards has the individual hrefs for the pages.

In [30]:
supermarket_links = [card.find("a")["href"] for card in supermarket_cards]
supermarket_links

['https://super.facua.org/mercadona/',
 'https://super.facua.org/carrefour/',
 'https://super.facua.org/eroski/',
 'https://super.facua.org/dia/',
 'https://super.facua.org/hipercor/',
 'https://super.facua.org/alcampo/']

Now that we have the link, let's define the process to extract the prices from one supermarket. Then, it will be a matter of replicating it over the remaining 5.

In [34]:
mercadona_link = supermarket_links[0]

response_mercadona = requests.get(mercadona_link)

if response_mercadona.status_code == 200:
    print("Successful connection.")

else:
    print("Connection failed.")

mercadona_soup = BeautifulSoup(response_mercadona.content, "html.parser")

Successful connection.


In [32]:
product_category_cards = mercadona_soup.findAll("div",{"class":"card h-100"})

print(f"There are {len(product_category_cards)} product cards.")

There are 3 product cards.


In [33]:
product_category_names = [card.find("p").text.strip() for card in product_category_cards]

product_category_links = [card.find("a")["href"] for card in product_category_cards]

for name, link in zip(product_category_names, product_category_links):
    print(f"Product category: {name}. Link: {link}")


Product category: Aceite de girasol. Link: https://super.facua.org/mercadona/aceite-de-girasol/
Product category: Aceite de oliva. Link: https://super.facua.org/mercadona/aceite-de-oliva/
Product category: Leche. Link: https://super.facua.org/mercadona/leche/


In [37]:
first_category_link = product_category_links[0]

response_first_category = requests.get(first_category_link)

if response_first_category.status_code == 200:
    print("Successful connection.")

else:
    print("Connection failed.")

first_category_soup = BeautifulSoup(response_first_category.content, "html.parser")

Successful connection.


In [40]:
product_cards = first_category_soup.findAll("div",{"class":"card h-100"})

print(f"There are {len(product_cards)} product cards.\n")

product_names = [card.find("p").text.strip() for card in product_cards]

product_links = [card.find("a")["href"] for card in product_cards]

for name, link in zip(product_names, product_links):
    print(f"Product category: {name}. Link: {link}")



There are 2 product cards.

Product category: Aceite De Girasol Refinado 0,2º Hacendado 1 L.. Link: https://super.facua.org/mercadona/aceite-de-girasol/aceite-de-girasol-refinado-02-hacendado-1-l/
Product category: Aceite De Girasol Refinado 0,2º Hacendado 5 L.. Link: https://super.facua.org/mercadona/aceite-de-girasol/aceite-de-girasol-refinado-02-hacendado-5-l/


In [41]:
first_product_link = product_links[0]

response_first_product = requests.get(first_product_link)

if response_first_product.status_code == 200:
    print("Successful connection.")

else:
    print("Connection failed.")

first_category_soup = BeautifulSoup(response_first_product.content, "html.parser")

Successful connection.


In [47]:
tables = first_category_soup.findAll("table")

print(f"There are {len(tables)} tables.\n")

product_price_table = tables[0]
product_price_table

There are 1 tables.



<table class="table table-striped table-responsive text-center" style="width:100%"><thead><tr><th scope="col">Día</th><th scope="col">Precio (€)</th><th scope="col">Variación</th></tr></thead><tbody><tr><td>12/07/2024</td><td>1,45</td><td>=</td></tr><tr><td>13/07/2024</td><td>1,45</td><td>=</td></tr><tr><td>14/07/2024</td><td>1,45</td><td>=</td></tr><tr><td>15/07/2024</td><td>1,45</td><td>=</td></tr><tr><td>16/07/2024</td><td>1,45</td><td>=</td></tr><tr><td>17/07/2024</td><td>1,45</td><td>=</td></tr><tr><td>18/07/2024</td><td>1,45</td><td>=</td></tr><tr><td>19/07/2024</td><td>1,45</td><td>=</td></tr><tr><td>20/07/2024</td><td>1,45</td><td>=</td></tr><tr><td>21/07/2024</td><td>1,45</td><td>=</td></tr><tr><td>22/07/2024</td><td>1,45</td><td>=</td></tr><tr><td>23/07/2024</td><td>1,45</td><td>=</td></tr><tr><td>24/07/2024</td><td>1,45</td><td>=</td></tr><tr><td>25/07/2024</td><td>1,45</td><td>=</td></tr><tr><td>26/07/2024</td><td>1,45</td><td>=</td></tr><tr><td>27/07/2024</td><td>1,45</td>

In [62]:
product_table_head = [element.text.strip() for element in product_price_table.find("thead").findAll("th")][:2]
product_table_head

['Día', 'Precio (€)']

In [63]:
product_table_body = [[element.text.strip() for element in row.findAll("td")][:2] for row in product_price_table.find("tbody").findAll("tr")]
product_table_body[:5]

[['12/07/2024', '1,45'],
 ['13/07/2024', '1,45'],
 ['14/07/2024', '1,45'],
 ['15/07/2024', '1,45'],
 ['16/07/2024', '1,45']]

In [64]:
pd.DataFrame(product_table_body)

Unnamed: 0,0,1
0,12/07/2024,145
1,13/07/2024,145
2,14/07/2024,145
3,15/07/2024,145
4,16/07/2024,145
...,...,...
101,21/10/2024,148
102,22/10/2024,148
103,23/10/2024,148
104,24/10/2024,148


If the structure repeats along all products, the extraction will follow this pattern as a whole. First, let's define functions from bottom to top level.

Extract product table

In [67]:
def extract_table_from_link(link, supermarket_name,category_name, product_name):

    # make request
    response = requests.get(link)

    # check response
    if response.status_code == 200:
        print("Successful connection.")

    else:
        print("Connection failed.")

    # parse html
    product_data_soup = BeautifulSoup(response.content, "html.parser")

    # extract table header and body
    table_head_list = [element.text.strip() for element in product_data_soup.find("thead").findAll("th")][:2]

    table_body_list = [[element.text.strip() for element in row.findAll("td")][:2] for row in product_data_soup.find("tbody").findAll("tr")]

    # convert to dataframe and return
    extracted_table_df = pd.DataFrame(table_body_list, columns=table_head_list)
    extracted_table_df[["product_name","category_name","supermarket_name"]] = product_name, category_name, supermarket_name

    return extracted_table_df

- Extract product names and links


In [None]:
def extract_productnames_links(link, supermarket_name, category_name):

    # make request
    response = requests.get(link)

    # check response
    if response.status_code == 200:
        print("Successful connection.")

    else:
        print("Connection failed.")

    # parse html
    products_soup = BeautifulSoup(response.content, "html.parser")

    product_cards = products_soup.findAll("div",{"class":"card h-100"})

    product_names = [card.find("p").text.strip() for card in product_cards]

    product_links = [card.find("a")["href"] for card in product_cards]

    supermarket_name_repeat = supermarket_name * len(product_links)

    category_name_repeat = category_name * len(product_links)

    return supermarket_name_repeat, category_name_repeat, product_names, product_links

In [None]:
def extract_categorynames_links(link, supermarket_name):

    # make request
    response = requests.get(link)

    # check response
    if response.status_code == 200:
        print("Successful connection.")

    else:
        print("Connection failed.")

    # parse html
    categories_soup = BeautifulSoup(response.content, "html.parser")

    category_cards = categories_soup.findAll("div",{"class":"card h-100"})

    category_names = [card.find("p").text.strip() for card in category_cards]

    category_links = [card.find("a")["href"] for card in category_cards]

    supermarket_name_repeat = supermarket_name * len(category_links)

    return supermarket_name_repeat, category_names, category_links

In [None]:
def extract_supermarkets(link):

    # make request
    response = requests.get(link)

    # check response
    if response.status_code == 200:
        print("Successful connection.")

    else:
        print("Connection failed.")

    # parse html
    main_soup = BeautifulSoup(response.content, "html.parser")

    supermarket_cards = main_soup.findAll("div",{"class":"card h-100"})

    supermarket_names = [card.find("p").text.strip() for card in supermarket_cards]

    supermarket_links = [card.find("a")["href"] for card in supermarket_cards]

    return supermarket_names, supermarket_links


- Extract product caegories names and links
- Extract supermarket names and links

In [None]:
# link = "https://super.facua.org"

# response = requests.get(link)

# if response.status_code == 200:
#     print("Successful connection.")

# else:
#     print("Connection failed.")

# main_soup = BeautifulSoup(response.content, "html.parser")
# main_soup

# supermarket_cards = main_soup.findAll("div",{"class":"card h-100"})

# supermarket_names = [card.find("p").text.strip() for card in supermarket_cards]

# supermarket_links = [card.find("a")["href"] for card in supermarket_cards]


# for supermarket_name, supermarket_link in zip(supermarket_names,supermarket_links):

#     mercadona_link = supermarket_links[0]

#     response_mercadona = requests.get(mercadona_link)

#     if response_mercadona.status_code == 200:
#         print("Successful connection.")

#     else:
#         print("Connection failed.")

#     mercadona_soup = BeautifulSoup(response_mercadona.content, "html.parser")