In [22]:
%pip install -q pandas sqlalchemy psycopg2-binary python-dotenv pyarrow tqdm

Note: you may need to restart the kernel to use updated packages.


In [23]:
import pandas as pd
from sqlalchemy import create_engine
from tqdm import tqdm
from pathlib import Path
print("OK imports")

OK imports


In [None]:
# 1) Detectar automaticamente a raiz que contém "olist-ecommerce-pipeline"
CWD = Path.cwd()
PROJECT_ROOT = None
for candidate in [CWD, *CWD.parents]:
    if (candidate / "olist-ecommerce-pipeline").exists():
        PROJECT_ROOT = candidate
        break

if PROJECT_ROOT is None:
    raise FileNotFoundError(
        f'Não achei a pasta "olist-ecommerce-pipeline" a partir de {CWD}. '
        f'Abra o notebook a partir do repositório ou mova este .ipynb para dentro dele.'
    )

# 2) Recalcular caminhos com base na raiz correta
RAW_DIR = PROJECT_ROOT / "olist-ecommerce-pipeline" / "data" / "raw"
DDL_PATH = PROJECT_ROOT / "olist-ecommerce-pipeline" / "Silver" / "Olist-script-postgres.sql"

print("CWD:", CWD)
print("PROJECT_ROOT:", PROJECT_ROOT)
print("RAW_DIR:", RAW_DIR)
print("DDL_PATH:", DDL_PATH)

# 3) Validar que CSVs existem e sao kegiveis
assert RAW_DIR.exists(), f"Pasta Bronze (raw) não encontrada: {RAW_DIR}"

# 4) Checagem mínima: tentar ler 5 linhas do orders
orders_csv = RAW_DIR / "olist_orders_dataset.csv"
assert orders_csv.exists(), f"Arquivo esperado não encontrado: {orders_csv}"
display(pd.read_csv(orders_csv, nrows=5).head())
print("✅ Bronze encontrada e legível.")


CWD: C:\Users\letic\Documents\UnB\Sistema de Bancos de Dados 2\brazilian_e-commerce_analysis
PROJECT_ROOT: C:\Users\letic\Documents\UnB\Sistema de Bancos de Dados 2\brazilian_e-commerce_analysis
RAW_DIR: C:\Users\letic\Documents\UnB\Sistema de Bancos de Dados 2\brazilian_e-commerce_analysis\olist-ecommerce-pipeline\data\raw
DDL_PATH: C:\Users\letic\Documents\UnB\Sistema de Bancos de Dados 2\brazilian_e-commerce_analysis\olist-ecommerce-pipeline\Silver\Olist-script-postgres.sql


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00


✅ Bronze encontrada e legível.


## DB Config & Connection Test

- Lê variáveis do `.env` (se existir) ou usa defaults locais.
- Testa conexão, cria o schema `silver` (se não existir) e ajusta `search_path`.
- Não falha o notebook se o Postgres não estiver no ar apenas avisa.


In [25]:
import os
from pathlib import Path
from contextlib import suppress
from sqlalchemy import create_engine, text

# 1) Carregar .env se existir (opcional)
ENV_PATH = PROJECT_ROOT / ".env"
if ENV_PATH.exists():
    # carregamento leve do .env (sem dependências)
    with open(ENV_PATH, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            k, v = line.split("=", 1)
            os.environ.setdefault(k.strip(), v.strip())

# 2) Variáveis de conexão (use as que o Levi definir; estas são defaults comuns)
DB_HOST   = os.getenv("PGHOST", "localhost")
DB_PORT   = os.getenv("PGPORT", "5432")
DB_NAME   = os.getenv("PGDATABASE", "olist")
DB_USER   = os.getenv("PGUSER", "postgres")
DB_PASS   = os.getenv("PGPASSWORD", "postgres")
DB_SCHEMA = os.getenv("PGSCHEMA", "silver")   # camada Silver

print("DB_HOST:", DB_HOST, "| DB_PORT:", DB_PORT, "| DB_NAME:", DB_NAME, "| SCHEMA:", DB_SCHEMA)

# 3) Criar engine SQLAlchemy
db_url = f"postgresql+psycopg2://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(db_url, pool_pre_ping=True, future=True)

# 4) Teste de conexão e criação do schema
try:
    with engine.begin() as conn:
        row = conn.exec_driver_sql("select current_database(), current_schema(), version();").fetchone()
        print("Conectado! ->", row)

        # cria schema silver se não existir e seta o search_path
        conn.exec_driver_sql(f'CREATE SCHEMA IF NOT EXISTS "{DB_SCHEMA}";')
        conn.exec_driver_sql(f'SET search_path TO "{DB_SCHEMA}", public;')

        # checar DDL
        if DDL_PATH.exists():
            print(f"DDL localizado em: {DDL_PATH} (tamanho ~{DDL_PATH.stat().st_size} bytes)")
        else:
            print("[AVISO] DDL não encontrado ainda — ok, seguimos com ETL e executamos depois.")

except Exception as e:
    print("\n[AVISO] Não foi possível conectar ao Postgres agora.")
    print("→ Motivos comuns: container não iniciado ou credenciais/DB diferentes.")
    print("→ Quando o Levi subir o docker-compose, esta célula deve funcionar.")
    print("Detalhe do erro:", repr(e))


DB_HOST: localhost | DB_PORT: 5434 | DB_NAME: olist | SCHEMA: silver
Conectado! -> ('olist', 'public', 'PostgreSQL 16.10 (Debian 16.10-1.pgdg13+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 14.2.0-19) 14.2.0, 64-bit')
DDL localizado em: C:\Users\letic\Documents\UnB\Sistema de Bancos de Dados 2\brazilian_e-commerce_analysis\olist-ecommerce-pipeline\Silver\Olist-script-postgres.sql (tamanho ~3081 bytes)


## Extract — Ler dados da Bronze (Kaggle Olist)

- Carrega todos os CSVs de `olist-ecommerce-pipeline/data/raw`.
- Define dtypes explícitos e faz *parse* de datas.
- Exibe *shape*, tipos e amostra de linhas para validação.


In [26]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm

# ---------- helpers ----------
def load_csv(filename: str, dtypes=None, parse_dates=None):
    """Carrega um CSV da Bronze, padroniza nomes das colunas (lower) e retorna DataFrame."""
    path = RAW_DIR / filename
    if not path.exists():
        raise FileNotFoundError(f"CSV não encontrado: {path}")
    df = pd.read_csv(
        path,
        dtype=dtypes or {},
        parse_dates=parse_dates or [],
        keep_default_na=True,
        encoding="utf-8",
        infer_datetime_format=True,
        low_memory=False,
    )
    df.columns = [c.strip().lower() for c in df.columns]
    return df

# dtypes por arquivo (usando tipos que aceitam NA quando necessário)
Int = "Int64"  # inteiro com suporte a NA
Str = "string"

DTYPES = {
    "olist_orders_dataset.csv": {
        "order_id": Str,
        "customer_id": Str,
        "order_status": Str,
        # timestamps lidos via parse_dates
    },
    "olist_order_items_dataset.csv": {
        "order_id": Str,
        "order_item_id": Int,
        "product_id": Str,
        "seller_id": Str,
        "price": "float64",
        "freight_value": "float64",
        # shipping_limit_date via parse_dates
    },
    "olist_order_payments_dataset.csv": {
        "order_id": Str,
        "payment_sequential": Int,
        "payment_type": Str,
        "payment_installments": Int,
        "payment_value": "float64",
    },
    "olist_order_reviews_dataset.csv": {
        "review_id": Str,
        "order_id": Str,
        "review_score": Int,
        "review_comment_title": Str,
        "review_comment_message": Str,
        # creation/answer via parse_dates
    },
    "olist_products_dataset.csv": {
        "product_id": Str,
        "product_category_name": Str,
        "product_name_lenght": Int,
        "product_description_lenght": Int,
        "product_photos_qty": Int,
        "product_weight_g": Int,
        "product_length_cm": Int,
        "product_height_cm": Int,
        "product_width_cm": Int,
    },
    "olist_sellers_dataset.csv": {
        "seller_id": Str,
        "seller_zip_code_prefix": Int,
        "seller_city": Str,
        "seller_state": Str,
    },
    "olist_customers_dataset.csv": {
        "customer_id": Str,
        "customer_unique_id": Str,
        "customer_zip_code_prefix": Int,
        "customer_city": Str,
        "customer_state": Str,
    },
    "olist_geolocation_dataset.csv": {
        "geolocation_zip_code_prefix": Int,
        "geolocation_lat": "float64",
        "geolocation_lng": "float64",
        "geolocation_city": Str,
        "geolocation_state": Str,
    },
    "product_category_name_translation.csv": {
        "product_category_name": Str,
        "product_category_name_english": Str,
    },
}

PARSE_DATES = {
    "olist_orders_dataset.csv": [
        "order_purchase_timestamp",
        "order_approved_at",
        "order_delivered_carrier_date",
        "order_delivered_customer_date",
        "order_estimated_delivery_date",
    ],
    "olist_order_items_dataset.csv": [
        "shipping_limit_date",
    ],
    "olist_order_reviews_dataset.csv": [
        "review_creation_date",
        "review_answer_timestamp",
    ],
}

CSV_LIST = [
    "olist_customers_dataset.csv",
    "olist_geolocation_dataset.csv",
    "olist_orders_dataset.csv",
    "olist_order_items_dataset.csv",
    "olist_order_payments_dataset.csv",
    "olist_order_reviews_dataset.csv",
    "olist_products_dataset.csv",
    "olist_sellers_dataset.csv",
    "product_category_name_translation.csv",
]

# ---------- load all ----------
dfs = {}
for name in tqdm(CSV_LIST, desc="Lendo Bronze"):
    dfs[name] = load_csv(
        name,
        dtypes=DTYPES.get(name),
        parse_dates=PARSE_DATES.get(name),
    )

# aliases práticos (snake_case)
customers   = dfs["olist_customers_dataset.csv"]
geos        = dfs["olist_geolocation_dataset.csv"]
orders      = dfs["olist_orders_dataset.csv"]
items       = dfs["olist_order_items_dataset.csv"]
payments    = dfs["olist_order_payments_dataset.csv"]
reviews     = dfs["olist_order_reviews_dataset.csv"]
products    = dfs["olist_products_dataset.csv"]
sellers     = dfs["olist_sellers_dataset.csv"]
prod_trans  = dfs["product_category_name_translation.csv"]

# ---------- quick summary ----------
print("\n# Shapes")
for k, v in dfs.items():
    print(f"{k:40s} -> {v.shape}")

print("\n# Dtypes (orders)")
print(orders.dtypes)

print("\n# Amostras")
display(orders.head(3))
display(items.head(3))
display(payments.head(3))

# NAs por coluna (visão geral) — útil para planejar a transformação
def na_overview(df, name):
    s = df.isna().sum()
    if (s > 0).any():
        print(f"\nNA overview — {name}")
        print(s[s > 0].sort_values(ascending=False).head(12))

na_overview(orders, "orders")
na_overview(items, "items")
na_overview(products, "products")
na_overview(reviews, "reviews")


  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
Lendo Bronze: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:03<00:00,  2.42it/s]


# Shapes
olist_customers_dataset.csv              -> (99441, 5)
olist_geolocation_dataset.csv            -> (1000163, 5)
olist_orders_dataset.csv                 -> (99441, 8)
olist_order_items_dataset.csv            -> (112650, 7)
olist_order_payments_dataset.csv         -> (103886, 5)
olist_order_reviews_dataset.csv          -> (99224, 7)
olist_products_dataset.csv               -> (32951, 9)
olist_sellers_dataset.csv                -> (3095, 4)
product_category_name_translation.csv    -> (71, 2)

# Dtypes (orders)
order_id                         string[python]
customer_id                      string[python]
order_status                     string[python]
order_purchase_timestamp         datetime64[ns]
order_approved_at                datetime64[ns]
order_delivered_carrier_date     datetime64[ns]
order_delivered_customer_date    datetime64[ns]
order_estimated_delivery_date    datetime64[ns]
dtype: object

# Amostras





Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71



NA overview — orders
order_delivered_customer_date    2965
order_delivered_carrier_date     1783
order_approved_at                 160
dtype: int64

NA overview — products
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2
dtype: int64

NA overview — reviews
review_comment_title      87656
review_comment_message    58247
dtype: int64


## Transform — Normalização para a Camada Silver

- Padroniza strings, remove nulos críticos e duplicidades.
- Enriquece `products` com tradução de categoria.
- Garante integridade referencial: `items`, `payments` e `reviews` só com `order_id` válido; `items` só com `product_id`/`seller_id` válidos.
- Agrega/geolocalização deduplicada por CEP prefixo.
- Deriva campos úteis em `orders` (datas e métricas de entrega).
- Cria dataframes finais: `silver_customers`, `silver_orders`, `silver_order_items`, `silver_products`, `silver_sellers`, `silver_payments`, `silver_reviews`, `silver_geolocation`.


In [27]:
import pandas as pd

# ---------- helpers ----------
def norm_str(df, cols):
    for c in cols:
        if c in df.columns:
            df[c] = (
                df[c]
                .astype("string")
                .str.strip()
            )

def drop_nulls(df, cols, name):
    before = len(df)
    df2 = df.dropna(subset=[c for c in cols if c in df.columns])
    removed = before - len(df2)
    if removed:
        print(f"[{name}] removidas {removed} linhas por nulos em {cols}")
    return df2

def drop_dups(df, keys, name):
    before = len(df)
    df2 = df.drop_duplicates(subset=keys, keep="first")
    removed = before - len(df2)
    if removed:
        print(f"[{name}] removidas {removed} duplicatas por {keys}")
    return df2

# ---------- Customers ----------
silver_customers = customers.copy()
norm_str(silver_customers, ["customer_id","customer_unique_id","customer_city","customer_state"])
silver_customers = drop_nulls(silver_customers, ["customer_id"], "customers")
silver_customers = drop_dups(silver_customers, ["customer_id"], "customers")

# ---------- Sellers ----------
silver_sellers = sellers.copy()
norm_str(silver_sellers, ["seller_id","seller_city","seller_state"])
silver_sellers = drop_nulls(silver_sellers, ["seller_id"], "sellers")
silver_sellers = drop_dups(silver_sellers, ["seller_id"], "sellers")

# ---------- Products (+ tradução da categoria) ----------
silver_products = products.copy()
norm_str(silver_products, ["product_id","product_category_name"])
silver_products = silver_products.merge(
    prod_trans, on="product_category_name", how="left"
)
# renomear para um nome mais amigável na Silver
silver_products = silver_products.rename(
    columns={"product_category_name_english": "product_category_en"}
)
silver_products = drop_nulls(silver_products, ["product_id"], "products")
silver_products = drop_dups(silver_products, ["product_id"], "products")

# ---------- Geolocation (1 linha por CEP prefixo) ----------
silver_geolocation = geos.copy()
silver_geolocation = drop_nulls(silver_geolocation, ["geolocation_zip_code_prefix"], "geolocation")
# manter primeira ocorrência por CEP prefixo (estratégia simples e estável)
silver_geolocation = drop_dups(silver_geolocation, ["geolocation_zip_code_prefix"], "geolocation")

# ---------- Orders (derivados de entrega) ----------
silver_orders = orders.copy()
norm_str(silver_orders, ["order_id","customer_id","order_status"])
silver_orders = drop_nulls(silver_orders, ["order_id","customer_id"], "orders")
silver_orders = drop_dups(silver_orders, ["order_id"], "orders")

# derivados úteis
silver_orders["order_purchase_date"] = silver_orders["order_purchase_timestamp"].dt.date
silver_orders["delivery_time_days"] = (
    silver_orders["order_delivered_customer_date"] - silver_orders["order_purchase_timestamp"]
).dt.days.astype("Int64")

silver_orders["delivery_delay_days"] = (
    silver_orders["order_delivered_customer_date"] - silver_orders["order_estimated_delivery_date"]
).dt.days.astype("Int64")

silver_orders["delivered_late"] = (
    (silver_orders["order_delivered_customer_date"] > silver_orders["order_estimated_delivery_date"])
).astype("Int64")

# ---------- Items (FKs válidas) ----------
valid_orders = set(silver_orders["order_id"])
valid_products = set(silver_products["product_id"])
valid_sellers  = set(silver_sellers["seller_id"])

silver_order_items = items.copy()
norm_str(silver_order_items, ["order_id","product_id","seller_id"])
silver_order_items = drop_nulls(silver_order_items, ["order_id","order_item_id","product_id","seller_id"], "order_items")
silver_order_items = silver_order_items[
    silver_order_items["order_id"].isin(valid_orders)
    & silver_order_items["product_id"].isin(valid_products)
    & silver_order_items["seller_id"].isin(valid_sellers)
].copy()
silver_order_items = drop_dups(silver_order_items, ["order_id","order_item_id"], "order_items")

# ---------- Payments (só orders válidos) ----------
silver_payments = payments.copy()
norm_str(silver_payments, ["order_id","payment_type"])
silver_payments = drop_nulls(silver_payments, ["order_id"], "payments")
silver_payments = silver_payments[silver_payments["order_id"].isin(valid_orders)].copy()

# ---------- Reviews (só orders válidos) ----------
silver_reviews = reviews.copy()
norm_str(silver_reviews, ["review_id","order_id"])
silver_reviews = drop_nulls(silver_reviews, ["review_id","order_id"], "reviews")
silver_reviews = silver_reviews[silver_reviews["order_id"].isin(valid_orders)].copy()
silver_reviews = drop_dups(silver_reviews, ["review_id"], "reviews")

# ---------- sanity check final ----------
summary = {
    "silver_customers": silver_customers.shape,
    "silver_sellers": silver_sellers.shape,
    "silver_products": silver_products.shape,
    "silver_geolocation": silver_geolocation.shape,
    "silver_orders": silver_orders.shape,
    "silver_order_items": silver_order_items.shape,
    "silver_payments": silver_payments.shape,
    "silver_reviews": silver_reviews.shape,
}
print("# Shapes (Silver)")
for k, v in summary.items():
    print(f"{k:20s} -> {v}")

# amostras
display(silver_orders.head(3))
display(silver_order_items.head(3))
display(silver_products.head(3))


[geolocation] removidas 981148 duplicatas por ['geolocation_zip_code_prefix']
[reviews] removidas 814 duplicatas por ['review_id']
# Shapes (Silver)
silver_customers     -> (99441, 5)
silver_sellers       -> (3095, 4)
silver_products      -> (32951, 10)
silver_geolocation   -> (19015, 5)
silver_orders        -> (99441, 12)
silver_order_items   -> (112650, 7)
silver_payments      -> (103886, 5)
silver_reviews       -> (98410, 7)


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,order_purchase_date,delivery_time_days,delivery_delay_days,delivered_late
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,2017-10-02,8,-8,0
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,2018-07-24,13,-6,0
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,2018-08-08,9,-18,0


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87


Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_en
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40,287,1,225,16,10,14,perfumery
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44,276,1,1000,30,18,20,art
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46,250,1,154,18,9,15,sports_leisure


# Conectar e aplicar o DDL

In [28]:
# 2.6.1 — Conectar no Postgres e aplicar o DDL (Silver)

import os
from pathlib import Path
from sqlalchemy import create_engine

# Reusa PROJECT_ROOT, DDL_PATH, etc. já definidos antes
ENV_PATH = PROJECT_ROOT / ".env"
if ENV_PATH.exists():
    for line in ENV_PATH.read_text(encoding="utf-8").splitlines():
        line = line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        k, v = line.split("=", 1)
        os.environ.setdefault(k.strip(), v.strip())

DB_HOST   = os.getenv("PGHOST", "localhost")
DB_PORT   = os.getenv("PGPORT", "5432")
DB_NAME   = os.getenv("PGDATABASE", "olist")
DB_USER   = os.getenv("PGUSER", "postgres")
DB_PASS   = os.getenv("PGPASSWORD", "postgres")
DB_SCHEMA = os.getenv("PGSCHEMA", "silver")   # camada Silver

db_url = f"postgresql+psycopg2://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(db_url, pool_pre_ping=True, future=True)

print("→ Tentando conectar…")
try:
    with engine.begin() as conn:
        row = conn.exec_driver_sql("select current_database(), current_schema(), version();").fetchone()
        print("Conectado! ->", row)

        # Garante o schema Silver e seta o search_path
        conn.exec_driver_sql(f'CREATE SCHEMA IF NOT EXISTS "{DB_SCHEMA}";')
        conn.exec_driver_sql(f'SET search_path TO "{DB_SCHEMA}", public;')

        # Executa o DDL do Pablo (se existir)
        if DDL_PATH.exists():
            sql = DDL_PATH.read_text(encoding="utf-8").strip()
            print(f"→ Executando DDL de {DDL_PATH.name} (tamanho ~{DDL_PATH.stat().st_size} bytes)…")
            conn.exec_driver_sql(sql)
            print("✅ DDL aplicado.")
        else:
            print("⚠️ DDL não encontrado — seguiremos com to_sql para criar as tabelas.")

except Exception as e:
    print("\n[AVISO] Não foi possível conectar ao Postgres agora.")
    print("→ Quando o Levi subir o docker-compose, esta célula passa a funcionar sem mudar nada.")
    print("Detalhe do erro:", repr(e))


→ Tentando conectar…
Conectado! -> ('olist', 'public', 'PostgreSQL 16.10 (Debian 16.10-1.pgdg13+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 14.2.0-19) 14.2.0, 64-bit')
→ Executando DDL de Olist-script-postgres.sql (tamanho ~3081 bytes)…

[AVISO] Não foi possível conectar ao Postgres agora.
→ Quando o Levi subir o docker-compose, esta célula passa a funcionar sem mudar nada.
Detalhe do erro: ProgrammingError('(psycopg2.errors.DuplicateTable) relation "idx_fatoitenspedido_product_id" already exists\n')


## Load para Silver (to_sql)

In [29]:
# 🔁 2.6.2 — Load para Silver (to_sql) — CORRIGIDO

import pandas as pd

expected_vars = ["customers","geos","orders","items","payments","reviews","products","sellers","prod_trans"]
missing = [v for v in expected_vars if v not in globals()]
assert not missing, f"Dataframes faltando da etapa Extract: {missing}"

def to_silver(df: pd.DataFrame, table_name: str):
    try:
        with engine.begin() as conn:
            # Não use conn.connection aqui! Passe o próprio `conn` (SQLAlchemy Connection)
            df.to_sql(
                name=table_name,
                con=conn,                # 👈 o pulo do gato
                schema=DB_SCHEMA,        # garante criar/carregar no schema "silver"
                if_exists="append",      # 1ª carga cria a tabela; reexecuções fazem append
                index=False,
                method="multi",
                chunksize=10_000
            )
            print(f"✅ Carregado: {table_name} -> {len(df):,} linhas")
    except Exception as e:
        print(f"❌ Falha ao carregar {table_name}: {e}")

tables_map = {
    "customers": customers,
    "geolocation": geos,
    "orders": orders,
    "order_items": items,
    "order_payments": payments,
    "order_reviews": reviews,
    "products": products,
    "sellers": sellers,
    "product_category_name_translation": prod_trans,
}

for tname, df in tables_map.items():
    to_silver(df, tname)


✅ Carregado: customers -> 99,441 linhas
✅ Carregado: geolocation -> 1,000,163 linhas
✅ Carregado: orders -> 99,441 linhas
✅ Carregado: order_items -> 112,650 linhas
✅ Carregado: order_payments -> 103,886 linhas
✅ Carregado: order_reviews -> 99,224 linhas
✅ Carregado: products -> 32,951 linhas
✅ Carregado: sellers -> 3,095 linhas
✅ Carregado: product_category_name_translation -> 71 linhas


## 2.6.3 — Índices Silver

In [30]:
# 2.6.3 — Criar índices estratégicos

index_sql = f"""
SET search_path TO "{DB_SCHEMA}", public;

-- Chaves de busca frequentes
CREATE INDEX IF NOT EXISTS idx_orders_order_id        ON orders(order_id);
CREATE INDEX IF NOT EXISTS idx_orders_customer_id     ON orders(customer_id);

CREATE INDEX IF NOT EXISTS idx_items_order_id         ON order_items(order_id);
CREATE INDEX IF NOT EXISTS idx_items_product_id       ON order_items(product_id);
CREATE INDEX IF NOT EXISTS idx_items_seller_id        ON order_items(seller_id);

CREATE INDEX IF NOT EXISTS idx_payments_order_id      ON order_payments(order_id);
CREATE INDEX IF NOT EXISTS idx_reviews_order_id       ON order_reviews(order_id);

CREATE INDEX IF NOT EXISTS idx_products_product_id    ON products(product_id);
CREATE INDEX IF NOT EXISTS idx_products_category_name ON products(product_category_name);

CREATE INDEX IF NOT EXISTS idx_customers_customer_id  ON customers(customer_id);
CREATE INDEX IF NOT EXISTS idx_sellers_seller_id      ON sellers(seller_id);
"""

try:
    with engine.begin() as conn:
        conn.exec_driver_sql(index_sql)
        print("✅ Índices criados/garantidos.")
except Exception as e:
    print("❌ Erro criando índices:", e)


✅ Índices criados/garantidos.


In [31]:
# 2.6.4 — Validações: contagens e amostras

from sqlalchemy import text
import pandas as pd

checks = [
    ("customers",            "customer_id"),
    ("orders",               "order_id"),
    ("order_items",          "order_id"),
    ("order_payments",       "order_id"),
    ("order_reviews",        "order_id"),
    ("products",             "product_id"),
    ("sellers",              "seller_id"),
    ("geolocation",          None),
    ("product_category_name_translation", None),
]

try:
    with engine.begin() as conn:
        conn.exec_driver_sql(f'SET search_path TO "{DB_SCHEMA}", public;')
        for table, pk in checks:
            cnt = conn.exec_driver_sql(f'SELECT COUNT(*) FROM "{table}";').scalar_one()
            print(f'🔎 {table:<35} -> {cnt:,} linhas')
            # mostra 5 linhas
            q = f'SELECT * FROM "{table}" LIMIT 5;'
            df = pd.read_sql_query(text(q), conn)
            display(df)
except Exception as e:
    print("⚠️ Não consegui validar agora (provável DB indisponível). Detalhe:", e)


🔎 customers                           -> 198,882 linhas


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


🔎 orders                              -> 198,882 linhas


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26


🔎 order_items                         -> 225,300 linhas


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.9,18.14


🔎 order_payments                      -> 207,772 linhas


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45


🔎 order_reviews                       -> 198,448 linhas


Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01,2018-03-02 10:26:53


🔎 products                            -> 65,902 linhas


Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40,287,1,225,16,10,14
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44,276,1,1000,30,18,20
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46,250,1,154,18,9,15
3,cef67bcfe19066a932b7673e239eb23d,bebes,27,261,1,371,26,4,26
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37,402,4,625,20,17,13


🔎 sellers                             -> 6,190 linhas


Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP


🔎 geolocation                         -> 2,000,326 linhas


Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP


🔎 product_category_name_translation   -> 142 linhas


Unnamed: 0,product_category_name,product_category_name_english
0,beleza_saude,health_beauty
1,informatica_acessorios,computers_accessories
2,automotivo,auto
3,cama_mesa_banho,bed_bath_table
4,moveis_decoracao,furniture_decor


In [32]:
# 8A.1 — Conexão (reusa .env com PGPORT=5434)
import os
from sqlalchemy import create_engine, text

DB_HOST   = os.getenv("PGHOST", "localhost")
DB_PORT   = os.getenv("PGPORT", "5434")  # <- porta nova
DB_NAME   = os.getenv("PGDATABASE", "olist")
DB_USER   = os.getenv("PGUSER", "postgres")
DB_PASS   = os.getenv("PGPASSWORD", "postgres")

engine = create_engine(
    f"postgresql+psycopg2://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}",
    future=True, pool_pre_ping=True
)

def exec_sql(sql: str):
    with engine.begin() as conn:
        conn.exec_driver_sql(sql)
    print("OK")


In [33]:
# 8A.2 — Criar schema gold e views
views_sql = """
CREATE SCHEMA IF NOT EXISTS gold;

-- 1) Receita por pedido (base para agregações)
CREATE OR REPLACE VIEW gold.vw_revenue_per_order AS
SELECT op.order_id,
       SUM(op.payment_value) AS revenue
FROM   silver.order_payments op
GROUP  BY op.order_id;

-- 2) Séries mensais: pedidos, receita bruta e ticket médio
CREATE OR REPLACE VIEW gold.vw_orders_monthly AS
SELECT DATE_TRUNC('month', o.order_purchase_timestamp)::date AS month,
       COUNT(*)                                   AS orders_count,
       SUM(COALESCE(r.revenue,0))                 AS gross_revenue,
       AVG(COALESCE(r.revenue,0))                 AS aov
FROM   silver.orders o
LEFT   JOIN gold.vw_revenue_per_order r USING (order_id)
GROUP  BY 1
ORDER  BY 1;

-- 3) Vendas por categoria (usa preço dos itens como proxy de receita)
CREATE OR REPLACE VIEW gold.vw_category_sales AS
SELECT COALESCE(t.product_category_name_english, p.product_category_name) AS category,
       SUM(oi.price)        AS sales_amount,
       SUM(oi.freight_value) AS freight_amount,
       COUNT(DISTINCT oi.order_id) AS orders
FROM   silver.order_items oi
JOIN   silver.products p USING (product_id)
LEFT   JOIN silver.product_category_name_translation t
       ON t.product_category_name = p.product_category_name
GROUP  BY 1
ORDER  BY sales_amount DESC;

-- 4) Mix de pagamento por mês
CREATE OR REPLACE VIEW gold.vw_payment_mix_monthly AS
SELECT DATE_TRUNC('month', o.order_purchase_timestamp)::date AS month,
       op.payment_type,
       SUM(op.payment_value) AS payment_value,
       COUNT(*)              AS payments
FROM   silver.orders o
JOIN   silver.order_payments op USING (order_id)
GROUP  BY 1,2
ORDER  BY 1,2;

-- 5) SLA de entrega (pontualidade mensal)
CREATE OR REPLACE VIEW gold.vw_delivery_sla AS
SELECT o.order_id,
       o.order_purchase_timestamp::date   AS purchase_date,
       o.order_estimated_delivery_date::date AS estimated_date,
       o.order_delivered_customer_date::date AS delivered_date,
       (o.order_delivered_customer_date - o.order_purchase_timestamp)       AS actual_lead_time,
       (o.order_estimated_delivery_date - o.order_purchase_timestamp)       AS estimated_lead_time,
       CASE WHEN o.order_delivered_customer_date IS NOT NULL
                 AND o.order_delivered_customer_date::date <= o.order_estimated_delivery_date::date
            THEN 1 ELSE 0 END AS on_time
FROM   silver.orders o
WHERE  o.order_status IN ('delivered','shipped','invoiced','processing');

CREATE OR REPLACE VIEW gold.vw_delivery_sla_monthly AS
SELECT DATE_TRUNC('month', purchase_date)::date AS month,
       COUNT(*)                                 AS delivered_orders,
       SUM(on_time)::int                        AS on_time_orders,
       ROUND(100.0*SUM(on_time)/NULLIF(COUNT(*),0), 2) AS on_time_rate
FROM   gold.vw_delivery_sla
WHERE  delivered_date IS NOT NULL
GROUP  BY 1
ORDER  BY 1;
"""
exec_sql(views_sql)


OK
