# 03 — Insights & Business Findings
_Date: 2025-09-13_

This notebook consolidates **key KPIs** and **business insights** based on the
tables created in `olist.db`. No charts here — only numbers and concise tables.


In [10]:
import sqlite3, pandas as pd, os
from pathlib import Path

# Location of the SQLite database; set the OLIST_DB_PATH environment variable to override.
DB_PATH = os.environ.get('OLIST_DB_PATH', 'C:/Users/nazar/OneDrive/Documentos/data/olist.db')
assert os.path.exists(DB_PATH), f"Database not found: {DB_PATH}. Build it first with src/data_loader.py."

def sql(q: str, params=None) -> pd.DataFrame:
    """Run a SQL query against the database at ``DB_PATH``.

    Args:
        q: SQL text to execute.
        params: Optional bind parameters, forwarded to ``pandas.read_sql_query``.

    Returns:
        A DataFrame holding the query result.
    """
    from contextlib import closing  # local import keeps this cell self-contained

    # A sqlite3 connection used as a context manager only commits/rolls back the
    # transaction; it does NOT close the connection. ``closing`` releases the
    # handle after every call, while the inner ``conn`` keeps the transaction scope.
    with closing(sqlite3.connect(DB_PATH)) as conn, conn:
        return pd.read_sql_query(q, conn, params=params)

print('Using DB:', DB_PATH)


Using DB: C:/Users/nazar/OneDrive/Documentos/data/olist.db


In [11]:
import sqlite3
from contextlib import closing

# Sanity check: list the tables present in the database.
# ``closing`` guarantees the connection is released once the query has run;
# a bare ``sqlite3.connect`` would stay open for the lifetime of the kernel.
with closing(sqlite3.connect(DB_PATH)) as conn:
    tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
print("Tablas disponibles:", tables)





Tablas disponibles: [('customers',), ('geolocation',), ('order_items',), ('order_payments',), ('order_reviews',), ('orders',), ('products',), ('sellers',), ('product_category_name_translation',)]


In [14]:
import os
import sqlite3
from contextlib import closing

# Keep honouring the OLIST_DB_PATH override here: re-assigning a hardcoded path
# would silently discard the value chosen in the configuration cell.
DB_PATH = os.environ.get('OLIST_DB_PATH', r'C:/Users/nazar/OneDrive/Documentos/data/olist.db')

# List the available tables, closing the connection as soon as the query is done.
with closing(sqlite3.connect(DB_PATH)) as conn:
    tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
print("Tablas disponibles:", tables)

Tablas disponibles: [('customers',), ('geolocation',), ('order_items',), ('order_payments',), ('order_reviews',), ('orders',), ('products',), ('sellers',), ('product_category_name_translation',)]


In [18]:
# --- Core counts and KPIs ---
# Order volume: all orders, plus how many ended as 'delivered' or 'canceled'.
q_counts = '''
WITH base AS (
  SELECT order_id, order_status FROM orders
)
SELECT 
  COUNT(*)                         AS total_orders,
  SUM(CASE WHEN order_status='delivered' THEN 1 ELSE 0 END) AS delivered_orders,
  SUM(CASE WHEN order_status='canceled'  THEN 1 ELSE 0 END) AS canceled_orders
FROM base;
'''
counts = sql(q_counts).iloc[0]
total_orders = int(counts['total_orders'])
delivered_orders = int(counts['delivered_orders'])
canceled_orders = int(counts['canceled_orders'])

# Revenue: sum of payment values attached to delivered orders.
q_rev = '''
SELECT ROUND(SUM(p.payment_value),2) AS revenue
FROM order_payments p
JOIN orders o USING(order_id)
WHERE o.order_status='delivered';
'''
# ``or 0`` covers the case where SUM yields no value (no delivered payments).
revenue = float(sql(q_rev)['revenue'].iloc[0] or 0)

# Ratios fall back to None when their denominator is zero.
AOV = round(revenue / delivered_orders, 2) if delivered_orders else None
delivered_rate = round(delivered_orders / total_orders, 4) if total_orders else None
cancellation_rate = round(canceled_orders / total_orders, 4) if total_orders else None

# Repeat purchase: customers with more than one delivered order.
# NOTE(review): the grouping key is orders.customer_id. If that id is issued per
# order rather than per person, this rate will be ~0 and a stable customer
# identifier should be used instead -- confirm against the customers table.
q_repeat = '''
WITH delivered AS (
    SELECT customer_id, order_id
    FROM orders
    WHERE order_status = 'delivered'
),
counts AS (
    SELECT customer_id, COUNT(*) AS n_orders
    FROM delivered
    GROUP BY customer_id
)
SELECT
    SUM(CASE WHEN n_orders > 1 THEN 1 ELSE 0 END) AS repeat_customers,
    COUNT(*)                                     AS customers_with_orders
FROM counts;
'''

rep = sql(q_repeat).iloc[0]
repeat_customers = int(rep['repeat_customers'])
customers_with_orders = int(rep['customers_with_orders'])
repeat_rate = round(repeat_customers / customers_with_orders, 4) if customers_with_orders else None

# One-row KPI summary. It is stored as ``kpis`` because that is the name the HTML
# export cell looks up, and it is displayed so this cell actually shows its numbers.
kpis = pd.DataFrame([{
    'Total Revenue ($)': revenue,
    'AOV ($)': AOV,
    'Delivered Rate (%)': round((delivered_rate or 0)*100, 2),
    'Cancellation Rate (%)': round((cancellation_rate or 0)*100, 2),
    'Repeat Purchase Rate (%)': round((repeat_rate or 0)*100, 2)
}])
kpis


In [20]:
# Payment mix for delivered orders: number of payment rows, revenue, and revenue
# per distinct order, broken down by payment type.
q_payment_mix = '''
WITH delivered AS (
  SELECT order_id FROM orders WHERE order_status='delivered'
)
SELECT 
  p.payment_type,
  COUNT(*)                               AS payments,
  ROUND(SUM(p.payment_value), 2)         AS revenue,
  ROUND(SUM(p.payment_value) / COUNT(DISTINCT p.order_id), 2) AS AOV
FROM order_payments p
JOIN delivered d ON d.order_id = p.order_id
GROUP BY 1
ORDER BY revenue DESC;
'''
# Keep the result under the name the HTML export cell looks up, instead of
# discarding it after display.
payment_mix = sql(q_payment_mix)
payment_mix


Unnamed: 0,payment_type,payments,revenue,AOV
0,credit_card,74586,12101094.88,162.86
1,boleto,19191,2769932.58,144.33
2,voucher,5493,343013.19,93.24
3,debit_card,1486,208421.12,140.35


In [21]:
# Top 10 product categories among delivered orders, ranked by summed item price.
q_top_categories = '''
SELECT 
  p.product_category_name AS category,
  ROUND(SUM(oi.price), 2) AS revenue,
  COUNT(DISTINCT oi.order_id) AS orders
FROM order_items oi
JOIN products p   ON p.product_id = oi.product_id
JOIN orders   o   ON o.order_id   = oi.order_id
WHERE o.order_status='delivered'
GROUP BY 1
ORDER BY revenue DESC
LIMIT 10;
'''
# Keep the result under the name the HTML export cell looks up, instead of
# discarding it after display.
top_categories = sql(q_top_categories)
top_categories


Unnamed: 0,category,revenue,orders
0,beleza_saude,1233131.72,8647
1,relogios_presentes,1166176.98,5495
2,cama_mesa_banho,1023434.76,9272
3,esporte_lazer,954852.55,7530
4,informatica_acessorios,888724.61,6530
5,moveis_decoracao,711927.69,6307
6,utilidades_domesticas,615628.69,5743
7,cool_stuff,610204.1,3559
8,automotivo,578966.65,3810
9,brinquedos,471286.48,3804


In [22]:
# Summed item price and distinct order count per customer state, for delivered
# orders only; limited to the 20 highest-revenue states.
q_rev_by_state = '''
SELECT 
  c.customer_state AS state,
  ROUND(SUM(oi.price), 2) AS revenue,
  COUNT(DISTINCT oi.order_id) AS orders
FROM order_items oi
JOIN orders o    ON o.order_id    = oi.order_id
JOIN customers c ON c.customer_id = o.customer_id
WHERE o.order_status='delivered'
GROUP BY 1
ORDER BY revenue DESC
LIMIT 20;
'''
# Keep the result under the name the HTML export cell looks up, instead of
# discarding it after display.
rev_by_state = sql(q_rev_by_state)
rev_by_state


Unnamed: 0,state,revenue,orders
0,SP,5067633.16,40501
1,RJ,1759651.13,12350
2,MG,1552481.83,11354
3,RS,728897.47,5345
4,PR,666063.51,4923
5,SC,507012.13,3546
6,BA,493584.14,3256
7,DF,296498.41,2080
8,GO,282836.7,1957
9,ES,268643.45,1995


In [24]:
# Average purchase-to-delivery time (in days) and average review score for
# delivered orders.
# NOTE(review): the LEFT JOIN yields one row per (order, review) pair. If an order
# can have several reviews, its delivery_days is counted once per review in the
# average -- confirm review multiplicity before relying on avg_delivery_days.
q_delivery_reviews = '''
WITH delivered AS (
    SELECT
        o.order_id,
        julianday(o.order_delivered_customer_date) - julianday(o.order_purchase_timestamp) AS delivery_days
    FROM orders o
    WHERE o.order_status = 'delivered'
)
SELECT
    ROUND(AVG(delivery_days), 2) AS avg_delivery_days,
    ROUND(AVG(r.review_score), 2) AS avg_review_score
FROM delivered d
LEFT JOIN order_reviews r ON r.order_id = d.order_id;
'''

# Keep the result under the name the HTML export cell looks up, instead of
# discarding it after display.
delivery_reviews = sql(q_delivery_reviews)
delivery_reviews


Unnamed: 0,avg_delivery_days,avg_review_score
0,12.56,4.16


## Export to HTML
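This cell saves a clean HTML report, `03_insights_report.html`. It reuses the `kpis`, `payment_mix`, `top_categories`, `rev_by_state` and `delivery_reviews` variables when all of them already exist in the kernel; if any of them is missing (e.g., after a fresh kernel), it recomputes all of them from the database.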

In [36]:
from pathlib import Path
import os, sqlite3, pandas as pd

# Destination of the HTML report. Like the database path, it can be overridden
# through an environment variable (OLIST_REPORT_PATH); the default is unchanged.
out = Path(os.environ.get('OLIST_REPORT_PATH', 'C:/Users/nazar/OneDrive/Documentos/data/03_insights_report.html'))
# Create the target folder up front so the later write cannot fail on a missing directory.
out.parent.mkdir(parents=True, exist_ok=True)

def safe_tbl(df):
    """Return ``df`` rendered as an HTML table, or a placeholder for non-DataFrames.

    DataFrames are rendered without the index, with ``border=0`` and the CSS class
    ``table``; any other value yields the ``<em>no data</em>`` marker.
    """
    if not isinstance(df, pd.DataFrame):
        return '<em>no data</em>'
    return df.to_html(index=False, border=0, classes='table')

# If KPIs/tables aren't defined (e.g., after fresh kernel), recompute quickly
_REPORT_INPUTS = ('kpis', 'payment_mix', 'top_categories', 'rev_by_state', 'delivery_reviews')
if any(name not in globals() for name in _REPORT_INPUTS):
    from contextlib import closing  # local import: this cell must work on a fresh kernel

    DB_PATH = os.environ.get('OLIST_DB_PATH', 'C:/Users/nazar/OneDrive/Documentos/data/olist.db')
    assert os.path.exists(DB_PATH), f"Database not found: {DB_PATH}"

    def sql(q, params=None):
        """Run ``q`` against the database at ``DB_PATH`` and return a DataFrame.

        ``params`` is optional and forwarded to ``pandas.read_sql_query`` so this
        definition stays call-compatible with the notebook's main ``sql`` helper.
        """
        # A sqlite3 connection used as a context manager does not close itself;
        # ``closing`` releases it, the inner ``conn`` keeps the transaction scope.
        with closing(sqlite3.connect(DB_PATH)) as conn, conn:
            return pd.read_sql_query(q, conn, params=params)

    # KPIs recompute
    counts = sql('''
        WITH base AS (SELECT order_id, order_status FROM orders)
        SELECT COUNT(*) AS total_orders,
               SUM(CASE WHEN order_status='delivered' THEN 1 ELSE 0 END) AS delivered_orders,
               SUM(CASE WHEN order_status='canceled' THEN 1 ELSE 0 END) AS canceled_orders
        FROM base;''').iloc[0]
    total_orders = int(counts['total_orders'])
    delivered_orders = int(counts['delivered_orders'])
    canceled_orders = int(counts['canceled_orders'])

    # Revenue is the sum of payment values on delivered orders; ``or 0`` covers an empty result.
    revenue = float(sql("""
        SELECT ROUND(SUM(p.payment_value),2) AS revenue
        FROM order_payments p JOIN orders o USING(order_id)
        WHERE o.order_status='delivered';
    """)['revenue'].iloc[0] or 0)

    # Ratios fall back to None when their denominator is zero.
    AOV = round(revenue / delivered_orders, 2) if delivered_orders else None
    delivered_rate = round(delivered_orders / total_orders, 4) if total_orders else None
    cancellation_rate = round(canceled_orders / total_orders, 4) if total_orders else None

    # Repeat customers: customer_ids with at least two delivered orders.
    rep = sql('''
        WITH delivered AS (
          SELECT o.customer_id, o.order_id FROM orders o WHERE o.order_status='delivered')
        SELECT COUNT(DISTINCT CASE WHEN cnt>=2 THEN customer_id END) AS repeat_customers,
               COUNT(DISTINCT customer_id) AS customers_with_orders
        FROM (SELECT customer_id, COUNT(*) AS cnt FROM delivered GROUP BY 1);
    ''').iloc[0]
    repeat_customers = int(rep['repeat_customers'])
    customers_with_orders = int(rep['customers_with_orders'])
    repeat_rate = round(repeat_customers / customers_with_orders, 4) if customers_with_orders else None

    # One-row KPI table; rates are converted from fractions to percentages.
    kpis = pd.DataFrame([{
        'Total Revenue ($)': revenue,
        'AOV ($)': AOV,
        'Delivered Rate (%)': round((delivered_rate or 0)*100, 2),
        'Cancellation Rate (%)': round((cancellation_rate or 0)*100, 2),
        'Repeat Purchase Rate (%)': round((repeat_rate or 0)*100, 2)
    }])

    # Payment rows, revenue and revenue per distinct order, by payment type.
    payment_mix = sql('''
        WITH d AS (SELECT order_id FROM orders WHERE order_status='delivered')
        SELECT p.payment_type, COUNT(*) AS order_payments, ROUND(SUM(p.payment_value),2) AS revenue,
               ROUND(SUM(p.payment_value) / COUNT(DISTINCT p.order_id),2) AS AOV
        FROM order_payments p JOIN d ON d.order_id=p.order_id
        GROUP BY 1 ORDER BY revenue DESC;''')

    # Top 10 categories by summed item price on delivered orders.
    top_categories = sql('''
        SELECT p.product_category_name AS category, ROUND(SUM(oi.price),2) AS revenue,
               COUNT(DISTINCT oi.order_id) AS orders
        FROM order_items oi JOIN products p ON p.product_id=oi.product_id
        JOIN orders o ON o.order_id=oi.order_id
        WHERE o.order_status='delivered'
        GROUP BY 1 ORDER BY revenue DESC LIMIT 10;''')

    # Summed item price and distinct orders per customer state (top 20).
    rev_by_state = sql('''
        SELECT c.customer_state AS state, ROUND(SUM(oi.price),2) AS revenue,
               COUNT(DISTINCT oi.order_id) AS orders
        FROM order_items oi JOIN orders o ON o.order_id=oi.order_id
        JOIN customers c ON c.customer_id=o.customer_id
        WHERE o.order_status='delivered'
        GROUP BY 1 ORDER BY revenue DESC LIMIT 20;''')

    # Average purchase-to-delivery days and average review score for delivered orders.
    delivery_reviews = sql('''
        WITH d AS (
          SELECT o.order_id,
                 julianday(o.order_delivered_customer_date) - julianday(o.order_purchase_timestamp) AS delivery_days
          FROM orders o WHERE o.order_status='delivered')
        SELECT ROUND(AVG(delivery_days),2) AS avg_delivery_days,
               ROUND(AVG(r.review_score),2) AS avg_review_score
        FROM d LEFT JOIN order_reviews r ON r.order_id=d.order_id;''')

# Report body: one <h2> heading followed by its table, in display order.
report_sections = (
    ('KPIs', kpis),
    ('Payment Mix', payment_mix),
    ('Top Categories by Revenue', top_categories),
    ('Revenue by Customer State', rev_by_state),
    ('Delivery Performance & Reviews', delivery_reviews),
)
sections_html = '\n'.join(
    f'<h2>{heading}</h2>{safe_tbl(frame)}' for heading, frame in report_sections
)

# Inline stylesheet shared by every table in the report.
page_css = (
    'body{font-family:Arial,sans-serif;margin:24px} '
    '.table{border-collapse:collapse;width:100%} '
    '.table th,.table td{border:1px solid #ddd;padding:8px} '
    '.table th{background:#f6f6f6;text-align:left}'
)

# Assemble the page line by line; the empty first and last entries reproduce
# the leading and trailing newline of the document.
html = '\n'.join([
    '',
    '<!doctype html>',
    "<html><head><meta charset='utf-8'><title>03 — Insights Report</title>",
    f'<style>{page_css}</style>',
    '</head><body>',
    '<h1>03 — Insights & Business Findings</h1>',
    sections_html,
    '</body></html>',
    '',
])

# Persist the report and echo its path as the cell output.
out.write_text(html, encoding='utf-8')
out

WindowsPath('C:/Users/nazar/OneDrive/Documentos/data/03_insights_report.html')