### 0. Imports

In [1]:
# data manipulation
import pandas as pd
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns


# append parent folder to path
import sys 
sys.path.append("..")

# import database connection support functions
from src.support.data_load_support import connect_to_database, connect_and_query, alter_update_query
from src.support.data_visualization_support import plot_bar_labels

# import env variables
import os
from dotenv import load_dotenv
load_dotenv()
# Database credentials are taken from environment variables instead of being
# hard-coded in the notebook; os.getenv yields None for a variable that is unset.
database_credentials = {
    field: os.getenv(env_var)
    for field, env_var in (
        ("username", "DB_USERNAME"),
        ("password", "DB_PASSWORD"),
    )
}


# 1. Introduction to this notebook

# 2. Analysis

- Análisis de Datos: Debes realizar los siguientes análisis utilizando Python y Pandas:

    - Comparación de Precios entre Supermercados: Determinar qué supermercados ofrecen los precios más bajos y cuáles son más caros para cada producto.

Add latest price column to supermarkets_products table

In [None]:
# Create the latest_price column in supermarkets_products.
# IF NOT EXISTS keeps this cell re-runnable: a plain ADD COLUMN fails on the
# second execution (e.g. after Restart & Run All) because the column is
# already there.
# NOTE(review): ADD COLUMN IF NOT EXISTS is PostgreSQL syntax (9.6+) — confirm
# the target engine before relying on it.
alter_query = """
ALTER TABLE supermarkets_products 
ADD COLUMN IF NOT EXISTS latest_price NUMERIC;
"""

alter_update_query("comparativa_supermercados",database_credentials,alter_query)

# Fill in the new latest_price column with the most recent price of each product.
#
# How the statement works:
#   * most_recent_date: every row of `prices`, annotated with the maximum date
#     found for its supermarket_product_id (window function, no rows dropped).
#   * latest_price: keeps only the rows whose date equals that maximum.
#   * UPDATE ... FROM: copies price_amount into supermarkets_products.latest_price
#     for the matching supermarket_product_id.
#
# Rows of supermarkets_products with no entry in `prices` are not matched and
# therefore keep whatever latest_price they already had.
# NOTE(review): if a product has several price rows on its most recent date,
# more than one row matches and the query itself does not determine which
# price_amount is written — confirm (supermarket_product_id, date) is unique.
update_query = """
WITH most_recent_date AS (
			SELECT supermarket_product_id , price_amount,date, MAX(date) OVER(PARTITION BY supermarket_product_id) AS most_recent
			FROM prices p),

latest_price AS (SELECT supermarket_product_id, price_amount, date
FROM most_recent_date
WHERE date = most_recent)

UPDATE supermarkets_products s
SET latest_price = l.price_amount
FROM latest_price l
WHERE l.supermarket_product_id = s.supermarket_product_id;
"""

alter_update_query("comparativa_supermercados",database_credentials, update_query)

How do prices in the main product categories compare across supermarkets?

In [None]:
# Average unit price and number of products per supermarket and unit of
# measure, cheapest first. The unit price is the latest price divided by
# quantity * volume_weight.
#
# - NULLIF turns a zero quantity * volume_weight into NULL, so one bad row is
#   skipped by AVG instead of aborting the whole query with a division-by-zero
#   error. COUNT(*) still counts that row.
# - NOT LIKE '%otras%' already excludes the exact value 'otras', so the
#   separate != 'otras' predicate was redundant and has been dropped.
# - units is qualified with its table alias to avoid ambiguity.
query = """
SELECT s2.supermarket_name, p.units, AVG(sp.latest_price / NULLIF(p.quantity * p.volume_weight, 0)) AS price_per_liter, COUNT(*) AS n_products
FROM products p 
INNER JOIN subcategories s 
	ON p.subcategory_id = s.subcategory_id 
INNER JOIN categories c 
	ON s.category_id = c.category_id 
INNER JOIN supermarkets_products sp 
	ON p.product_id = sp.product_id
INNER JOIN supermarkets s2
	ON sp.supermarket_id = s2.supermarket_id 
WHERE p.units IS NOT NULL 
AND p.units != 'g'
AND s.subcategory_name NOT LIKE '%otras%' -- helps distinguish not true category products, like milk hair masks or tuna in olive oil
GROUP BY s2.supermarket_name, p.units
ORDER BY price_per_liter;
"""

supermarket_average = connect_and_query("comparativa_supermercados",database_credentials, query, columns="query")
supermarket_average

Here we can see that, in general, Mercadona has the lowest price per liter. This is probably strongly influenced by the fact that it is the only supermarket that sells almost exclusively its own white-label brand.

In [None]:
# One row per product offer: supermarket, unit of measure and unit price
# (latest price divided by quantity * volume_weight), cheapest first.
#
# - NULLIF turns a zero quantity * volume_weight into NULL, so one bad row gets
#   a NULL unit price instead of aborting the whole query with a
#   division-by-zero error.
# - NOT LIKE '%otras%' already excludes the exact value 'otras', so the
#   separate != 'otras' predicate was redundant and has been dropped.
# - units is qualified with its table alias to avoid ambiguity.
query = """
SELECT s2.supermarket_name, p.units, sp.latest_price / NULLIF(p.quantity * p.volume_weight, 0) AS price_per_liter
FROM products p 
INNER JOIN subcategories s 
	ON p.subcategory_id = s.subcategory_id 
INNER JOIN categories c 
	ON s.category_id = c.category_id 
INNER JOIN supermarkets_products sp 
	ON p.product_id = sp.product_id
INNER JOIN supermarkets s2
	ON sp.supermarket_id = s2.supermarket_id 
WHERE p.units IS NOT NULL 
AND p.units != 'g'
AND s.subcategory_name NOT LIKE '%otras%' -- helps distinguish not true category products, like milk hair masks or tuna in olive oil
ORDER BY price_per_liter;
"""

price_per_category_plot_data = connect_and_query("comparativa_supermercados",database_credentials, query, columns="query")
price_per_category_plot_data

In [None]:
# Both bar plots use the same per-product query; the white-label variant only
# adds a join to brands and a brand filter. Building both from one template
# keeps them from drifting apart.
#
# - NULLIF turns a zero quantity * volume_weight into NULL, so one bad row gets
#   a NULL unit price instead of aborting the whole query with a
#   division-by-zero error.
# - NOT LIKE '%otras%' already excludes the exact value 'otras', so the
#   separate != 'otras' predicate was redundant and has been dropped.
# - units is qualified with its table alias to avoid ambiguity.
price_per_liter_query_template = """
SELECT s2.supermarket_name, sp.latest_price / NULLIF(p.quantity * p.volume_weight, 0) AS price_per_liter
FROM products p 
INNER JOIN subcategories s 
	ON p.subcategory_id = s.subcategory_id 
INNER JOIN categories c 
	ON s.category_id = c.category_id 
INNER JOIN supermarkets_products sp 
	ON p.product_id = sp.product_id
INNER JOIN supermarkets s2
	ON sp.supermarket_id = s2.supermarket_id 
{extra_joins}
WHERE p.units IS NOT NULL 
AND p.units != 'g'
AND s.subcategory_name NOT LIKE '%otras%' -- helps distinguish not true category products, like milk hair masks or tuna in olive oil
{extra_filters}
ORDER BY price_per_liter;
"""

# every product, regardless of brand
query1 = price_per_liter_query_template.format(extra_joins="", extra_filters="")

price_per_category_plot_data = connect_and_query("comparativa_supermercados",database_credentials, query1, columns="query")

# only products whose brand is one of the listed supermarket own-brand names
query2 = price_per_liter_query_template.format(
    extra_joins="INNER JOIN brands b\n\tON b.brand_id = p.brand_id",
    extra_filters="AND b.brand_name IN ('alcampo','eroski','hacendado','carrefour','dia','hipercor','el corte ingles')",
)

price_per_category_white_label_plot_data = connect_and_query("comparativa_supermercados",database_credentials, query2, columns="query")

# Two stacked bar charts of the mean price per liter per supermarket:
# top = all products, bottom = white-label products only.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 14))

fig.suptitle("Average price per liter of product per supermarket.", fontsize=20)

# supermarket_average is grouped by (supermarket, units), so a supermarket name
# can appear more than once. Keep the first occurrence of each so that every
# supermarket gets exactly one bar, in the order of that table.
# assumes supermarket_average["supermarket_name"] is a pandas Series — TODO confirm
bar_order = supermarket_average["supermarket_name"].drop_duplicates()

panels = (
    (ax1, price_per_category_plot_data, "All products"),
    (ax2, price_per_category_white_label_plot_data, "White-label products only"),
)

# Drawing both panels in one loop guarantees they get identical styling. The
# original set the axis labels on ax1 twice, leaving the second panel unlabeled.
for ax, plot_data, panel_title in panels:
    sns.barplot(data=plot_data,
                x="supermarket_name",
                y="price_per_liter",
                ax=ax,
                order=bar_order,
                estimator="mean")

    # adding data labels
    plot_bar_labels(ax)

    ax.set_title(panel_title, fontsize=16)
    ax.set_ylabel("Price per liter (€)", fontsize=14)
    ax.set_xlabel("Supermarket")

plt.tight_layout()
plt.show()


    - Análisis de la Evolución de Precios: Estudiar cómo han cambiado los precios de los productos a lo largo del tiempo en distintos supermercados.

    - Detección de Anomalías: Identificar subidas o bajadas de precios inusuales que podrían señalar prácticas abusivas o promociones.

    - Análisis de la Dispersión de Precios: Evaluar la variabilidad de los precios de un mismo producto en diferentes supermercados.

    - Comparación de Precios Promedio: Calcular y comparar los precios promedio de cada producto en diferentes supermercados.


- Visualización de datos: Generar gráficos y visualizaciones que presenten de manera clara y comprensible los resultados del análisis.