# **ETL da raw para a silver**

Este notebook realiza o ETL, um processo de três etapas — Extrair, Transformar e Carregar — usado para integrar dados de diferentes fontes em um único data warehouse.
Essa metodologia combina dados, limpando-os e organizando-os para análise, relatórios e tomada de decisões de negócios.




A célula abaixo instala as bibliotecas Python necessárias (como Pandas para dados e SQLAlchemy para banco de dados) no ambiente do notebook.

In [107]:
%pip install -q pandas sqlalchemy psycopg2-binary python-dotenv pyarrow tqdm

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


#### Importando as bibliotecas necessárias

In [108]:
import os
from pathlib import Path
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine, text
from tqdm import tqdm

A célula abaixo encontra a pasta raiz do projeto buscando por `data_layer`, define os caminhos importantes (raw, sql) e verifica se os dados brutos (a pasta raw e o arquivo de pedidos) existem e podem ser lidos, exibindo as 5 primeiras linhas como prova.

In [109]:
# 1) Locate the repository root: the first directory, starting at the current
#    working directory and walking up through its parents, that contains a
#    "data_layer" folder.
CWD = Path.cwd()
PROJECT_ROOT = next(
    (
        candidate
        for candidate in [CWD, *CWD.parents]
        if (candidate / "data_layer").exists()
    ),
    None,
)

if PROJECT_ROOT is None:
    raise FileNotFoundError(
        f'N√£o achei a pasta "data_layer" a partir de {CWD}. '
        f'Abra o notebook a partir do reposit√≥rio ou mova este .ipynb para dentro dele.'
    )

# 2) Derive the working paths from the detected root.
RAW_DIR = PROJECT_ROOT / "data_layer" / "raw"
DDL_PATH = PROJECT_ROOT / "data_layer" / "silver" / "DDL.sql"

print("CWD:", CWD)
print("PROJECT_ROOT:", PROJECT_ROOT)
print("RAW_DIR:", RAW_DIR)
print("DDL_PATH:", DDL_PATH)

# 3) Validate that the Bronze (raw) folder exists.
#    Explicit raises are used instead of `assert` so the checks still run when
#    Python is started with -O, and so they match the FileNotFoundError above.
if not RAW_DIR.exists():
    raise FileNotFoundError(f"Pasta Bronze (raw) n√£o encontrada: {RAW_DIR}")

# 4) Minimal readability check: read the first 5 rows of the orders file.
orders_csv = RAW_DIR / "olist_orders_dataset.csv"
if not orders_csv.exists():
    raise FileNotFoundError(f"Arquivo esperado n√£o encontrado: {orders_csv}")
# nrows=5 already limits the frame to five rows, so no extra .head() is needed.
display(pd.read_csv(orders_csv, nrows=5))
print("‚úÖ Bronze encontrada e leg√≠vel.")


CWD: c:\Users\06440799169\Documents\sbd2\github\brazilian_e-commerce_analysis\Transfomer
PROJECT_ROOT: c:\Users\06440799169\Documents\sbd2\github\brazilian_e-commerce_analysis
RAW_DIR: c:\Users\06440799169\Documents\sbd2\github\brazilian_e-commerce_analysis\data_layer\raw
DDL_PATH: c:\Users\06440799169\Documents\sbd2\github\brazilian_e-commerce_analysis\data_layer\silver\DDL.sql


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00


‚úÖ Bronze encontrada e leg√≠vel.


# EXTRACT — Ler dados da Bronze

- Carrega todos os CSVs de `data_layer/raw`.
- Define dtypes explícitos e faz *parse* de datas.
- Exibe *shape*, tipos e amostra de linhas para validação.


### DB Config & Connection Test

_Lê variáveis do `.env` (se existir) ou usa defaults locais.
Testa a conexão, cria o schema `silver` (se não existir) e ajusta o `search_path`.
Não falha o notebook se o Postgres não estiver no ar; apenas avisa._

In [110]:
# 1) Load .env if present (optional).
ENV_PATH = PROJECT_ROOT / ".env"
if ENV_PATH.exists():
    # Lightweight KEY=VALUE parser (no extra dependency). Blank lines,
    # "#" comments and lines without "=" are skipped.
    with open(ENV_PATH, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            k, v = line.split("=", 1)
            # setdefault: a variable already set in the environment wins over .env.
            os.environ.setdefault(k.strip(), v.strip())

# 2) Connection settings, read from the environment with local defaults.
# NOTE(review): the DDL cell (In [123]) defaults PGPORT to "5435" while this
# cell defaults to "5432" - confirm which port the database actually uses.
DB_HOST   = os.getenv("PGHOST", "localhost")
DB_PORT   = os.getenv("PGPORT", "5432")
DB_NAME   = os.getenv("PGDATABASE", "olist")
DB_USER   = os.getenv("PGUSER", "postgres")
DB_PASS   = os.getenv("PGPASSWORD", "postgres")
DB_SCHEMA = os.getenv("PGSCHEMA", "silver")   # Silver layer

print("DB_HOST:", DB_HOST, "| DB_PORT:", DB_PORT, "| DB_NAME:", DB_NAME, "| SCHEMA:", DB_SCHEMA)

# 3) Build the SQLAlchemy engine.
#    User and password are percent-encoded so that characters such as "@",
#    ":" or "/" in the credentials cannot corrupt the URL.
db_url = (
    f"postgresql+psycopg2://{quote(DB_USER, safe='')}:{quote(DB_PASS, safe='')}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)
engine = create_engine(db_url, pool_pre_ping=True, future=True)

# 4) Connection test and schema creation. This is deliberately best-effort:
#    any failure is reported and the notebook keeps going.
try:
    with engine.begin() as conn:
        row = conn.exec_driver_sql("select current_database(), current_schema(), version();").fetchone()
        print("Conectado! ->", row)

        # Create the Silver schema if missing and set the search_path.
        # NOTE(review): SET presumably affects only this connection's session,
        # so other pooled connections may not inherit it - confirm.
        conn.exec_driver_sql(f'CREATE SCHEMA IF NOT EXISTS "{DB_SCHEMA}";')
        conn.exec_driver_sql(f'SET search_path TO "{DB_SCHEMA}", public;')

        # Only report whether the DDL file is present; it is not executed here.
        if DDL_PATH.exists():
            print(f"DDL localizado em: {DDL_PATH} (tamanho ~{DDL_PATH.stat().st_size} bytes)")
        else:
            print("[AVISO] DDL n√£o encontrado ainda ‚Äî ok, seguimos com ETL e executamos depois.")

except Exception as e:
    print("\n[AVISO] N√£o foi poss√≠vel conectar ao Postgres agora.")
    print("‚Üí Motivos comuns: container n√£o iniciado ou credenciais/DB diferentes.")
    print("‚Üí Quando o Levi subir o docker-compose, esta c√©lula deve funcionar.")
    print("Detalhe do erro:", repr(e))

DB_HOST: localhost | DB_PORT: 5432 | DB_NAME: olist | SCHEMA: silver

[AVISO] N√£o foi poss√≠vel conectar ao Postgres agora.
‚Üí Motivos comuns: container n√£o iniciado ou credenciais/DB diferentes.
‚Üí Quando o Levi subir o docker-compose, esta c√©lula deve funcionar.
Detalhe do erro: OperationalError('(psycopg2.OperationalError) connection to server at "localhost" (::1), port 5432 failed: Connection refused (0x0000274D/10061)\n\tIs the server running on that host and accepting TCP/IP connections?\nconnection to server at "localhost" (127.0.0.1), port 5432 failed: Connection refused (0x0000274D/10061)\n\tIs the server running on that host and accepting TCP/IP connections?\n')


A célula abaixo define uma função que verifica a existência de um arquivo CSV e carrega seu conteúdo da pasta bruta (Bronze) para a memória, aplicando tipagens específicas; em seguida, executa um loop para extrair todos os arquivos listados em `CSV_LIST`, criando DataFrames em variáveis de acesso rápido.

In [111]:
def load_csv(filename: str, dtypes=None, parse_dates=None):
    """Read one Bronze CSV from RAW_DIR into a DataFrame.

    Column names are returned exactly as they appear in the file; this
    version does not normalize them.

    Args:
        filename: File name relative to RAW_DIR.
        dtypes: Optional column -> dtype mapping forwarded to pandas.
        parse_dates: Optional list of columns to parse as datetimes.

    Returns:
        The loaded DataFrame.

    Raises:
        FileNotFoundError: If the file does not exist under RAW_DIR.
    """
    csv_path = RAW_DIR / filename
    if csv_path.exists():
        # infer_datetime_format is intentionally not passed here.
        read_options = dict(
            dtype=dtypes or {},
            parse_dates=parse_dates or [],
            keep_default_na=True,
            encoding="utf-8",
            low_memory=False,
        )
        return pd.read_csv(csv_path, **read_options)
    raise FileNotFoundError(f"CSV n√£o encontrado: {csv_path}")

# ---------- load all ----------
# Files to extract from Bronze. CSV_LIST was referenced by this cell but never
# defined in the notebook, so a fresh kernel failed with NameError. The nine
# names below are exactly the keys the aliases further down require.
CSV_LIST = [
    "olist_customers_dataset.csv",
    "olist_geolocation_dataset.csv",
    "olist_orders_dataset.csv",
    "olist_order_items_dataset.csv",
    "olist_order_payments_dataset.csv",
    "olist_order_reviews_dataset.csv",
    "olist_products_dataset.csv",
    "olist_sellers_dataset.csv",
    "product_category_name_translation.csv",
]

# NOTE: DTYPES and PARSE_DATES are defined in a later cell (In [113]); that
# cell must have been executed before this one.
dfs = {}
# Read every listed file, keyed by its file name.
for name in tqdm(CSV_LIST, desc="Lendo Bronze"):
    dfs[name] = load_csv(
        name,
        dtypes=DTYPES.get(name),
        parse_dates=PARSE_DATES.get(name),
    )

# Short aliases for the extracted DataFrames.
customers   = dfs["olist_customers_dataset.csv"]
geos        = dfs["olist_geolocation_dataset.csv"]
orders      = dfs["olist_orders_dataset.csv"]
items       = dfs["olist_order_items_dataset.csv"]
payments    = dfs["olist_order_payments_dataset.csv"]
reviews     = dfs["olist_order_reviews_dataset.csv"]
products    = dfs["olist_products_dataset.csv"]
sellers     = dfs["olist_sellers_dataset.csv"]
prod_trans  = dfs["product_category_name_translation.csv"]

Lendo Bronze: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9/9 [00:02<00:00,  3.39it/s]


# TRANSFORM — Normalização para a Camada Silver

- Padroniza strings, remove nulos críticos e duplicidades.
- Enriquece `products` com tradução de categoria.
- Garante integridade referencial: `items`, `payments` e `reviews` só com `order_id` válido; `items` só com `product_id`/`seller_id` válidos.
- Deduplica a geolocalização por prefixo de CEP.
- Deriva campos úteis em `orders` (datas e métricas de entrega).
- Cria dataframes finais: `silver_customers`, `silver_orders`, `silver_order_items`, `silver_products`, `silver_sellers`, `silver_payments`, `silver_reviews`, `silver_geolocation`.


As células abaixo definem a função `load_csv` para leitura e padronização e configuram os dicionários `DTYPES` e `PARSE_DATES`, que especificam os tipos de dados e as colunas de data esperados para cada arquivo CSV, preparando as regras de transformação.

In [112]:
def load_csv(filename: str, dtypes=None, parse_dates=None):
    """Load a Bronze CSV from RAW_DIR and normalize its column names.

    Column names are stripped of surrounding whitespace and lower-cased.

    Args:
        filename: File name relative to RAW_DIR.
        dtypes: Optional column -> dtype mapping forwarded to pandas.
        parse_dates: Optional list of columns to parse as datetimes.

    Returns:
        The loaded DataFrame with normalized column names.

    Raises:
        FileNotFoundError: If the file does not exist under RAW_DIR.
    """
    path = RAW_DIR / filename
    if not path.exists():
        raise FileNotFoundError(f"CSV n√£o encontrado: {path}")
    # infer_datetime_format is deprecated in pandas 2.x and only emits a
    # warning there, so it is no longer passed; this matches the earlier
    # load_csv definition, which dropped it on purpose.
    df = pd.read_csv(
        path,
        dtype=dtypes or {},
        parse_dates=parse_dates or [],
        keep_default_na=True,
        encoding="utf-8",
        low_memory=False,
    )
    df.columns = [c.strip().lower() for c in df.columns]
    return df


In [113]:
# Per-file dtypes, using pandas dtypes that can hold missing values.
Int = "Int64"  # nullable integer
Str = "string"

# File name -> {column: dtype}. Datetime columns are not listed here; they
# are handled through PARSE_DATES below.
DTYPES = {
    "olist_orders_dataset.csv": dict(
        order_id=Str,
        customer_id=Str,
        order_status=Str,
    ),
    "olist_order_items_dataset.csv": dict(
        order_id=Str,
        order_item_id=Int,
        product_id=Str,
        seller_id=Str,
        price="float64",
        freight_value="float64",
    ),
    "olist_order_payments_dataset.csv": dict(
        order_id=Str,
        payment_sequential=Int,
        payment_type=Str,
        payment_installments=Int,
        payment_value="float64",
    ),
    "olist_order_reviews_dataset.csv": dict(
        review_id=Str,
        order_id=Str,
        review_score=Int,
        review_comment_title=Str,
        review_comment_message=Str,
    ),
    "olist_products_dataset.csv": dict(
        product_id=Str,
        product_category_name=Str,
        product_name_lenght=Int,  # spelled "lenght" in the source header
        product_description_lenght=Int,
        product_photos_qty=Int,
        product_weight_g=Int,
        product_length_cm=Int,
        product_height_cm=Int,
        product_width_cm=Int,
    ),
    "olist_sellers_dataset.csv": dict(
        seller_id=Str,
        seller_zip_code_prefix=Int,
        seller_city=Str,
        seller_state=Str,
    ),
    "olist_customers_dataset.csv": dict(
        customer_id=Str,
        customer_unique_id=Str,
        customer_zip_code_prefix=Int,
        customer_city=Str,
        customer_state=Str,
    ),
    "olist_geolocation_dataset.csv": dict(
        geolocation_zip_code_prefix=Int,
        geolocation_lat="float64",
        geolocation_lng="float64",
        geolocation_city=Str,
        geolocation_state=Str,
    ),
    "product_category_name_translation.csv": dict(
        product_category_name=Str,
        product_category_name_english=Str,
    ),
}

# File name -> columns to parse as datetimes. Files without an entry have
# no date columns to parse.
PARSE_DATES = {
    "olist_orders_dataset.csv": [
        "order_purchase_timestamp",
        "order_approved_at",
        "order_delivered_carrier_date",
        "order_delivered_customer_date",
        "order_estimated_delivery_date",
    ],
    "olist_order_items_dataset.csv": ["shipping_limit_date"],
    "olist_order_reviews_dataset.csv": [
        "review_creation_date",
        "review_answer_timestamp",
    ],
}


A célula abaixo exibe o número de linhas e colunas (shape) de todos os DataFrames carregados e imprime os tipos de dados (dtypes) do DataFrame de orders, fornecendo uma visão rápida da estrutura.

In [114]:
# ---------- quick summary: shapes ----------
# One line per loaded file: the name left-aligned in a 40-character column,
# followed by the DataFrame's (rows, columns) shape.
print(
    "\n# Shapes",
    *(f"{csv_name:40s} -> {frame.shape}" for csv_name, frame in dfs.items()),
    sep="\n",
)

# Column dtypes of the orders frame.
print("\n# Dtypes (orders)", orders.dtypes, sep="\n")


# Shapes
olist_customers_dataset.csv              -> (99441, 5)
olist_geolocation_dataset.csv            -> (1000163, 5)
olist_orders_dataset.csv                 -> (99441, 8)
olist_order_items_dataset.csv            -> (112650, 7)
olist_order_payments_dataset.csv         -> (103886, 5)
olist_order_reviews_dataset.csv          -> (99224, 7)
olist_products_dataset.csv               -> (32951, 9)
olist_sellers_dataset.csv                -> (3095, 4)
product_category_name_translation.csv    -> (71, 2)

# Dtypes (orders)
order_id                         string[python]
customer_id                      string[python]
order_status                     string[python]
order_purchase_timestamp         datetime64[ns]
order_approved_at                datetime64[ns]
order_delivered_carrier_date     datetime64[ns]
order_delivered_customer_date    datetime64[ns]
order_estimated_delivery_date    datetime64[ns]
dtype: object


Exibe as três primeiras linhas (`head(3)`) de alguns DataFrames principais (orders, items, payments) para uma inspeção visual rápida e validação dos dados.

In [115]:
# ---------- quick summary: samples ----------
# Show the first three rows of the main raw frames, in this order.
print("\n# Amostras")
for _sample_frame in (orders, items, payments):
    display(_sample_frame.head(3))


# Amostras


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71


Define e executa a função `na_overview` para calcular e imprimir a contagem de valores ausentes (NAs) nas colunas dos DataFrames principais, identificando onde a limpeza de dados (Transformação) será necessária.

In [116]:
# Missing values per column (overview) - useful for planning the transform step.
def na_overview(df, name):
    """Print the columns of *df* that contain missing values.

    Nothing is printed when no column has missing values. Otherwise a header
    with *name* is printed, followed by at most the 12 columns with the
    highest missing-value counts, in descending order.

    Args:
        df: DataFrame to inspect.
        name: Label used in the printed header.
    """
    missing_per_column = df.isna().sum()
    columns_with_gaps = missing_per_column[missing_per_column > 0]
    if columns_with_gaps.empty:
        return
    print(f"\nNA overview ‚Äî {name}")
    print(columns_with_gaps.sort_values(ascending=False).head(12))

# Report missing values for the main raw frames, in this order.
for _frame, _label in (
    (orders, "orders"),
    (items, "items"),
    (products, "products"),
    (reviews, "reviews"),
):
    na_overview(_frame, _label)


NA overview ‚Äî orders
order_delivered_customer_date    2965
order_delivered_carrier_date     1783
order_approved_at                 160
dtype: int64

NA overview ‚Äî products
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2
dtype: int64

NA overview ‚Äî reviews
review_comment_title      87656
review_comment_message    58247
dtype: int64


### Funções Auxiliares de Limpeza (Transformação)
Define três funções auxiliares essenciais para a fase de Transformação: `norm_str` (padronização de texto), `drop_nulls` (remoção de linhas com valores nulos) e `drop_dups` (remoção de linhas duplicadas, reportando a contagem de linhas afetadas).

In [117]:
def norm_str(df, cols):
    """Convert the given columns to pandas "string" dtype and strip whitespace.

    The DataFrame is modified in place and nothing is returned. Names in
    *cols* that are not columns of *df* are ignored.

    Args:
        df: DataFrame to modify.
        cols: Column names to normalize.
    """
    present = [col for col in cols if col in df.columns]
    for col in present:
        as_text = df[col].astype("string")
        df[col] = as_text.str.strip()

def drop_nulls(df, cols, name):
    """Return *df* without the rows that have a null in any of *cols*.

    Names in *cols* that are not columns of *df* are ignored for filtering.
    When rows are removed, a message with the count is printed; it shows
    *cols* as passed in.

    Args:
        df: Input DataFrame (not modified).
        cols: Columns that must be non-null.
        name: Label used in the printed message.

    Returns:
        The filtered DataFrame.
    """
    required = [col for col in cols if col in df.columns]
    kept = df.dropna(subset=required)
    dropped_count = len(df) - len(kept)
    if dropped_count:
        print(f"[{name}] removidas {dropped_count} linhas por nulos em {cols}")
    return kept

def drop_dups(df, keys, name):
    """Return *df* with duplicate rows removed, keeping the first occurrence.

    Rows are duplicates when they share the same values in *keys*. When rows
    are removed, a message with the count is printed.

    Args:
        df: Input DataFrame (not modified).
        keys: Columns that identify a row.
        name: Label used in the printed message.

    Returns:
        The de-duplicated DataFrame.
    """
    unique_rows = df.drop_duplicates(subset=keys, keep="first")
    n_removed = len(df) - len(unique_rows)
    if n_removed > 0:
        print(f"[{name}] removidas {n_removed} duplicatas por {keys}")
    return unique_rows


### Limpeza e Transformação das Dimensões Mestre
Aplica as funções de limpeza aos DataFrames de customers, sellers e geolocation (tabelas mestre/dimensão), garantindo que as chaves primárias não contenham nulos ou duplicatas, e realiza o merge para traduzir a categoria de products.

In [118]:
# ---------- Customers ----------
# Work on a copy so the raw frame stays untouched; normalize text columns,
# then drop rows with a null key and keep the first row per key.
silver_customers = customers.copy()
norm_str(silver_customers, ["customer_id","customer_unique_id","customer_city","customer_state"])
silver_customers = drop_dups(
    drop_nulls(silver_customers, ["customer_id"], "customers"),
    ["customer_id"],
    "customers",
)

# ---------- Sellers ----------
silver_sellers = sellers.copy()
norm_str(silver_sellers, ["seller_id","seller_city","seller_state"])
silver_sellers = drop_dups(
    drop_nulls(silver_sellers, ["seller_id"], "sellers"),
    ["seller_id"],
    "sellers",
)

# ---------- Geolocation ----------
# One row is kept per zip-code prefix (the first occurrence); text columns
# are not normalized for this table.
silver_geolocation = drop_dups(
    drop_nulls(geos.copy(), ["geolocation_zip_code_prefix"], "geolocation"),
    ["geolocation_zip_code_prefix"],
    "geolocation",
)

# ---------- Products (+ category translation) ----------
# Left join keeps every product; products whose category has no match in the
# translation table get a null product_category_en.
silver_products = products.copy()
norm_str(silver_products, ["product_id","product_category_name"])
silver_products = silver_products.merge(
    prod_trans, on="product_category_name", how="left"
).rename(columns={"product_category_name_english": "product_category_en"})
silver_products = drop_dups(
    drop_nulls(silver_products, ["product_id"], "products"),
    ["product_id"],
    "products",
)

[geolocation] removidas 981148 duplicatas por ['geolocation_zip_code_prefix']


### Transformação e Derivação de Pedidos (Orders)
Limpa a tabela de orders e calcula métricas importantes de tempo de entrega, como o tempo total de delivery, o atraso em relação à estimativa e uma flag binária para indicar se o pedido foi entregue com atraso.

In [119]:
# ---------- Orders (cleaning and delivery-derived fields) ----------
silver_orders = orders.copy()
norm_str(silver_orders, ["order_id","customer_id","order_status"])
silver_orders = drop_nulls(silver_orders, ["order_id","customer_id"], "orders")
silver_orders = drop_dups(silver_orders, ["order_id"], "orders")

_purchased_at = silver_orders["order_purchase_timestamp"]
_delivered_at = silver_orders["order_delivered_customer_date"]
_estimated_at = silver_orders["order_estimated_delivery_date"]

# Calendar date of the purchase.
silver_orders["order_purchase_date"] = _purchased_at.dt.date

# Whole days between purchase and delivery; <NA> when either date is missing.
silver_orders["delivery_time_days"] = (
    _delivered_at - _purchased_at
).dt.days.astype("Int64")

# Whole days between the estimated and the actual delivery (negative = early).
# `.dt.days` is the day component of the difference, so partial days are
# rounded down: a delivery about 7.1 days early is reported as -8.
silver_orders["delivery_delay_days"] = (
    _delivered_at - _estimated_at
).dt.days.astype("Int64")

# 1 when delivered after the estimate, 0 when delivered on or before it.
# A comparison against a missing date is False, which used to mark orders
# with no delivery date as "on time" (0). Those rows are now <NA>, like the
# two day-count columns above.
_comparable = _delivered_at.notna() & _estimated_at.notna()
silver_orders["delivered_late"] = (
    (_delivered_at > _estimated_at).astype("Int64").where(_comparable)
)

### Integridade Referencial e Limpeza de Transações
Garante a integridade dos dados ao filtrar as tabelas transacionais (items, payments, reviews), removendo registros que não possuem chaves válidas nas tabelas mestre recém-limpas (Silver), e aplica a limpeza final de nulos e duplicatas nessas tabelas.

In [120]:
# ---------- Items, payments, reviews (referential integrity) ----------
# Keys that survived the cleaning of the Silver master tables.
valid_orders = set(silver_orders["order_id"])
valid_products = set(silver_products["product_id"])
valid_sellers  = set(silver_sellers["seller_id"])

# Items: keep only rows whose order, product and seller all exist in Silver,
# then keep the first row per (order_id, order_item_id).
silver_order_items = items.copy()
norm_str(silver_order_items, ["order_id","product_id","seller_id"])
silver_order_items = drop_nulls(silver_order_items, ["order_id","order_item_id","product_id","seller_id"], "order_items")
_order_known = silver_order_items["order_id"].isin(valid_orders)
_product_known = silver_order_items["product_id"].isin(valid_products)
_seller_known = silver_order_items["seller_id"].isin(valid_sellers)
silver_order_items = drop_dups(
    silver_order_items.loc[_order_known & _product_known & _seller_known].copy(),
    ["order_id","order_item_id"],
    "order_items",
)

# Payments: keep only rows that reference a known order (no de-duplication).
silver_payments = payments.copy()
norm_str(silver_payments, ["order_id","payment_type"])
silver_payments = drop_nulls(silver_payments, ["order_id"], "payments")
_payment_order_known = silver_payments["order_id"].isin(valid_orders)
silver_payments = silver_payments.loc[_payment_order_known].copy()

# Reviews: keep only rows that reference a known order, one row per review_id.
# NOTE(review): de-duplicating on review_id alone removed 814 rows in the
# recorded run - confirm review_id (not review_id + order_id) is the intended key.
silver_reviews = reviews.copy()
norm_str(silver_reviews, ["review_id","order_id"])
silver_reviews = drop_nulls(silver_reviews, ["review_id","order_id"], "reviews")
_review_order_known = silver_reviews["order_id"].isin(valid_orders)
silver_reviews = drop_dups(
    silver_reviews.loc[_review_order_known].copy(),
    ["review_id"],
    "reviews",
)

[reviews] removidas 814 duplicatas por ['review_id']


### Sanity Check Final da Camada Silver
Exibe um resumo final da estrutura (shape) e amostras dos DataFrames recém-criados e limpos da camada Silver, confirmando que a Transformação foi concluída com sucesso e os dados estão prontos para a fase de Carregamento.

In [121]:
# Map each Silver table name to its (rows, columns) shape.
_silver_frames = {
    "silver_customers": silver_customers,
    "silver_sellers": silver_sellers,
    "silver_products": silver_products,
    "silver_geolocation": silver_geolocation,
    "silver_orders": silver_orders,
    "silver_order_items": silver_order_items,
    "silver_payments": silver_payments,
    "silver_reviews": silver_reviews,
}
summary = {label: frame.shape for label, frame in _silver_frames.items()}

# Turn the mapping into a small table: one row per Silver table.
summary_df = pd.DataFrame(
    list(summary.values()),
    index=list(summary),
    columns=["Linhas", "Colunas"],
)

# Title, then the summary table.
print("‚úÖ Resumo de Shapes da Camada Silver:")
display(summary_df)

‚úÖ Resumo de Shapes da Camada Silver:


Unnamed: 0,Linhas,Colunas
silver_customers,99441,5
silver_sellers,3095,4
silver_products,32951,10
silver_geolocation,19015,5
silver_orders,99441,12
silver_order_items,112650,7
silver_payments,103886,5
silver_reviews,98410,7


In [122]:
# Show the first three rows of the main Silver tables, each under its heading.
print("\n--- Amostras da Camada Silver ---")

for _heading, _silver_frame in (
    ("silver_orders (Pedidos):", silver_orders),
    ("silver_order_items (Itens de Pedido):", silver_order_items),
    ("silver_products (Produtos):", silver_products),
):
    print(f"\n{_heading}")
    display(_silver_frame.head(3))


--- Amostras da Camada Silver ---

silver_orders (Pedidos):


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,order_purchase_date,delivery_time_days,delivery_delay_days,delivered_late
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,2017-10-02,8,-8,0
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,2018-07-24,13,-6,0
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,2018-08-08,9,-18,0



silver_order_items (Itens de Pedido):


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87



silver_products (Produtos):


Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_en
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40,287,1,225,16,10,14,perfumery
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44,276,1,1000,30,18,20,art
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46,250,1,154,18,9,15,sports_leisure


# LOAD


## Conectar e aplicar o DDL

Esta célula configura a conexão com o banco de dados PostgreSQL lendo credenciais de um arquivo `.env` ou usando defaults, tenta se conectar, garante que o esquema `silver` exista e, se o arquivo DDL for encontrado, executa os comandos SQL para criar a estrutura de tabelas, preparando o banco para receber os dados limpos.

In [123]:
# 2.6.1 - Connect to Postgres and apply the Silver DDL.
# Reuses PROJECT_ROOT and DDL_PATH defined earlier in the notebook.
ENV_PATH = PROJECT_ROOT / ".env"
if ENV_PATH.exists():
    # Lightweight KEY=VALUE parser; blank lines, "#" comments and lines
    # without "=" are skipped. Variables already set in the environment win.
    for line in ENV_PATH.read_text(encoding="utf-8").splitlines():
        line = line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        k, v = line.split("=", 1)
        os.environ.setdefault(k.strip(), v.strip())

# NOTE(review): PGPORT defaults to "5435" here but to "5432" in the earlier
# connection-test cell (In [110]) - confirm which port is correct.
DB_HOST   = os.getenv("PGHOST", "localhost")
DB_PORT   = os.getenv("PGPORT", "5435")
DB_NAME   = os.getenv("PGDATABASE", "olist")
DB_USER   = os.getenv("PGUSER", "postgres")
DB_PASS   = os.getenv("PGPASSWORD", "postgres")
DB_SCHEMA = os.getenv("PGSCHEMA", "silver")   # Silver layer

# User and password are percent-encoded so that characters such as "@", ":"
# or "/" in the credentials cannot corrupt the URL.
db_url = (
    f"postgresql+psycopg2://{quote(DB_USER, safe='')}:{quote(DB_PASS, safe='')}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)
engine = create_engine(db_url, pool_pre_ping=True, future=True)

print("‚Üí Tentando conectar‚Ä¶")
try:
    with engine.begin() as conn:
        row = conn.exec_driver_sql("select current_database(), current_schema(), version();").fetchone()
        print("Conectado! ->", row)

        # Ensure the Silver schema exists and set the search_path.
        conn.exec_driver_sql(f'CREATE SCHEMA IF NOT EXISTS "{DB_SCHEMA}";')
        conn.exec_driver_sql(f'SET search_path TO "{DB_SCHEMA}", public;')

        # Run the DDL script when the file exists.
        if DDL_PATH.exists():
            sql = DDL_PATH.read_text(encoding="utf-8").strip()
            if sql:
                print(f"‚Üí Executando DDL de {DDL_PATH.name} (tamanho ~{DDL_PATH.stat().st_size} bytes)‚Ä¶")
                conn.exec_driver_sql(sql)
                print("‚úÖ DDL aplicado.")
            else:
                # An empty script is not sent to the driver.
                print("[AVISO] Arquivo DDL vazio; nada a executar.")
        else:
            print("‚ö†Ô∏è DDL n√£o encontrado ‚Äî seguiremos com to_sql para criar as tabelas.")

except Exception as e:
    # Best-effort: the notebook keeps going. The error detail is printed
    # because a failure here may come from the DDL as well as the connection.
    print("\n[AVISO] N√£o foi poss√≠vel conectar ao Postgres agora.")
    print("Detalhe do erro:", repr(e))


‚Üí Tentando conectar‚Ä¶

[AVISO] N√£o foi poss√≠vel conectar ao Postgres agora.


In [124]:
# 2.5 - Export: treated_dataset.csv (Silver)

# Silver directory (the same folder that holds the DDL).
SILVER_DIR = PROJECT_ROOT / "data_layer" / "silver"
SILVER_DIR.mkdir(parents=True, exist_ok=True)

# Total payment value per order.
payments_agg = silver_payments.groupby("order_id", as_index=False).agg(
    total_payment_value=("payment_value", "sum")
)

# Lookup tables joined onto the item rows, in order, each with its join key.
_lookups = [
    (
        silver_orders[[
            "order_id","customer_id","order_status","order_purchase_timestamp",
            "order_purchase_date","delivery_time_days","delivery_delay_days","delivered_late"
        ]],
        "order_id",
    ),
    (
        silver_products[[
            "product_id","product_category_name","product_category_en",
            "product_weight_g","product_length_cm","product_height_cm","product_width_cm","product_photos_qty"
        ]],
        "product_id",
    ),
    (
        silver_customers[[
            "customer_id","customer_city","customer_state","customer_zip_code_prefix"
        ]],
        "customer_id",
    ),
    (
        silver_sellers[[
            "seller_id","seller_city","seller_state","seller_zip_code_prefix"
        ]],
        "seller_id",
    ),
    # Order-level total: the same value is repeated on every item row of an order.
    (payments_agg, "order_id"),
]

# Build the row-per-item "treated" dataset with successive left joins.
treated = silver_order_items
for _right, _key in _lookups:
    treated = treated.merge(_right, on=_key, how="left")

# Preferred column order for the export.
cols = [
    # order / time
    "order_id","order_status","order_purchase_timestamp","order_purchase_date",
    "delivery_time_days","delivery_delay_days","delivered_late",
    # customer
    "customer_id","customer_city","customer_state","customer_zip_code_prefix",
    # item
    "order_item_id","product_id","seller_id","price","freight_value","shipping_limit_date",
    # product
    "product_category_en","product_category_name","product_photos_qty",
    "product_weight_g","product_length_cm","product_height_cm","product_width_cm",
    # seller
    "seller_city","seller_state","seller_zip_code_prefix",
    # payments
    "total_payment_value",
]
# Columns listed above but absent from the joined frame are skipped.
_present = [c for c in cols if c in treated.columns]
treated = treated.loc[:, _present].copy()

# Save the CSV into the Silver directory.
treated_path = SILVER_DIR / "silver_treated_dataset.csv"
treated.to_csv(treated_path, index=False, encoding="utf-8")
print(f"‚úÖ silver_treated_dataset.csv salvo em: {treated_path}  ({len(treated):,} linhas)")


‚úÖ silver_treated_dataset.csv salvo em: c:\Users\06440799169\Documents\sbd2\github\brazilian_e-commerce_analysis\data_layer\silver\silver_treated_dataset.csv  (112,650 linhas)


## Load para Silver (to_sql)

In [130]:
# üîÅ 2.6.2 ‚Äî Load para Silver (to_sql) ‚Äî CORRIGIDO

expected_vars = ["customers","geos","orders","items","payments","reviews","products","sellers","prod_trans"]
missing = [v for v in expected_vars if v not in globals()]
assert not missing, f"Dataframes faltando da etapa Extract: {missing}"

def to_silver(df: pd.DataFrame, table_name: str) -> bool:
    """Append ``df`` to ``table_name`` in the ``DB_SCHEMA`` schema.

    The insert runs inside one ``engine.begin()`` block. Success is reported
    only after that block has exited without raising, so the message is never
    printed for a load that failed while the transaction was being finalized.

    This is a best-effort load: any exception is caught and printed instead of
    raised, so one failing table does not abort the caller's loop. Rows are
    appended (``if_exists="append"``), so calling this again for the same
    table adds the rows again.

    Args:
        df: Rows to insert; the DataFrame index is not written.
        table_name: Target table name inside ``DB_SCHEMA``.

    Returns:
        True if the load completed, False if any step raised.
    """
    try:
        with engine.begin() as conn:
            # Pass the SQLAlchemy Connection itself (not conn.connection).
            df.to_sql(
                name=table_name,
                con=conn,
                schema=DB_SCHEMA,
                if_exists="append",
                index=False,
                method="multi",      # multi-row INSERT statements
                chunksize=10_000,    # rows per batch
            )
    except Exception as e:
        print(f"‚ùå Falha ao carregar {table_name}: {e}")
        return False

    # Reached only when the `with` block exited cleanly.
    print(f"‚úÖ Carregado: {table_name} -> {len(df):,} linhas")
    return True

# Maps each target table name in the Silver schema to the DataFrame that
# feeds it. The loop below iterates in this insertion order.
tables_map = {
    "customers": customers,
    "geolocation": geos,
    "orders": orders,
    "order_items": items,
    "order_payments": payments,
    "order_reviews": reviews,
    "products": products,
    "sellers": sellers,
    "product_category_name_translation": prod_trans,
}

# Load every table in turn. to_silver catches and prints its own errors, so a
# failure on one table does not stop the remaining loads.
for tname, df in tables_map.items():
    to_silver(df, tname)


‚ö†Ô∏è DB n√£o dispon√≠vel ‚Äî salvo em CSV de fallback: c:\Users\06440799169\Documents\sbd2\github\brazilian_e-commerce_analysis\data_layer\silver\customers_backup.csv (99,441 linhas). Erro original: (psycopg2.OperationalError) connection to server at "localhost" (::1), port 5434 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?
connection to server at "localhost" (127.0.0.1), port 5434 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?

(Background on this error at: https://sqlalche.me/e/20/e3q8)
‚ö†Ô∏è DB n√£o dispon√≠vel ‚Äî salvo em CSV de fallback: c:\Users\06440799169\Documents\sbd2\github\brazilian_e-commerce_analysis\data_layer\silver\geolocation_backup.csv (1,000,163 linhas). Erro original: (psycopg2.OperationalError) connection to server at "localhost" (::1), port 5434 failed: Connection refused (0x0000274D/10061)
	Is the server running on that h

‚ö†Ô∏è DB n√£o dispon√≠vel ‚Äî salvo em CSV de fallback: c:\Users\06440799169\Documents\sbd2\github\brazilian_e-commerce_analysis\data_layer\silver\customers_backup.csv (99,441 linhas). Erro original: (psycopg2.OperationalError) connection to server at "localhost" (::1), port 5434 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?
connection to server at "localhost" (127.0.0.1), port 5434 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?

(Background on this error at: https://sqlalche.me/e/20/e3q8)
‚ö†Ô∏è DB n√£o dispon√≠vel ‚Äî salvo em CSV de fallback: c:\Users\06440799169\Documents\sbd2\github\brazilian_e-commerce_analysis\data_layer\silver\geolocation_backup.csv (1,000,163 linhas). Erro original: (psycopg2.OperationalError) connection to server at "localhost" (::1), port 5434 failed: Connection refused (0x0000274D/10061)
	Is the server running on that h

KeyboardInterrupt: 

## 2.6.3 — Índices Silver

In [126]:
# 2.6.3 - Create strategic indexes

# One multi-statement script. It first points search_path at DB_SCHEMA, so the
# unqualified table names below resolve through it, then issues
# CREATE INDEX IF NOT EXISTS on the id / join columns of each table.
index_sql = f"""
SET search_path TO "{DB_SCHEMA}", public;

-- Chaves de busca frequentes
CREATE INDEX IF NOT EXISTS idx_orders_order_id        ON orders(order_id);
CREATE INDEX IF NOT EXISTS idx_orders_customer_id     ON orders(customer_id);

CREATE INDEX IF NOT EXISTS idx_items_order_id         ON order_items(order_id);
CREATE INDEX IF NOT EXISTS idx_items_product_id       ON order_items(product_id);
CREATE INDEX IF NOT EXISTS idx_items_seller_id        ON order_items(seller_id);

CREATE INDEX IF NOT EXISTS idx_payments_order_id      ON order_payments(order_id);
CREATE INDEX IF NOT EXISTS idx_reviews_order_id       ON order_reviews(order_id);

CREATE INDEX IF NOT EXISTS idx_products_product_id    ON products(product_id);
CREATE INDEX IF NOT EXISTS idx_products_category_name ON products(product_category_name);

CREATE INDEX IF NOT EXISTS idx_customers_customer_id  ON customers(customer_id);
CREATE INDEX IF NOT EXISTS idx_sellers_seller_id      ON sellers(seller_id);
"""

# The whole script is sent in a single exec_driver_sql call inside one
# engine.begin() block. Any exception is caught and printed, so this cell
# reports the problem instead of raising.
try:
    with engine.begin() as conn:
        conn.exec_driver_sql(index_sql)
        print("‚úÖ √çndices criados/garantidos.")
except Exception as e:
    print("‚ùå Erro criando √≠ndices:", e)


‚ùå Erro criando √≠ndices: (psycopg2.OperationalError) connection to server at "localhost" (::1), port 5435 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?
connection to server at "localhost" (127.0.0.1), port 5435 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?

(Background on this error at: https://sqlalche.me/e/20/e3q8)


In [127]:
# 2.6.4 - Validations: row counts and samples

# (table name, key column) pairs to inspect; None where no key column is given.
# NOTE(review): the second element is unpacked as `pk` in the loop below but is
# never used - confirm whether a key check (duplicates / nulls) was intended.
checks = [
    ("customers",            "customer_id"),
    ("orders",               "order_id"),
    ("order_items",          "order_id"),
    ("order_payments",       "order_id"),
    ("order_reviews",        "order_id"),
    ("products",             "product_id"),
    ("sellers",              "seller_id"),
    ("geolocation",          None),
    ("product_category_name_translation", None),
]

# All queries share one connection so the search_path set first applies to
# every following statement. Any exception is caught and printed, which ends
# the loop at the first failing table.
try:
    with engine.begin() as conn:
        conn.exec_driver_sql(f'SET search_path TO "{DB_SCHEMA}", public;')
        for table, pk in checks:
            # Total row count for the table.
            cnt = conn.exec_driver_sql(f'SELECT COUNT(*) FROM "{table}";').scalar_one()
            print(f'üîé {table:<35} -> {cnt:,} linhas')
            # show 5 rows
            q = f'SELECT * FROM "{table}" LIMIT 5;'
            df = pd.read_sql_query(text(q), conn)
            # `display` is not imported in this cell; presumably the
            # IPython/Jupyter built-in - verify if run outside a notebook.
            display(df)
except Exception as e:
    print("‚ö†Ô∏è N√£o consegui validar agora (prov√°vel DB indispon√≠vel). Detalhe:", e)


‚ö†Ô∏è N√£o consegui validar agora (prov√°vel DB indispon√≠vel). Detalhe: (psycopg2.OperationalError) connection to server at "localhost" (::1), port 5435 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?
connection to server at "localhost" (127.0.0.1), port 5435 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?

(Background on this error at: https://sqlalche.me/e/20/e3q8)


In [128]:
# 8A.1 - Connection (reuses .env with PGPORT=5434)

# Local import keeps this cell self-contained. URL.create escapes special
# characters in the credentials (e.g. "@", ":", "/", "%"), which would corrupt
# a URL assembled by string interpolation.
from sqlalchemy.engine import URL

# Connection settings come from the PG* environment variables, each with a
# local-development default.
DB_HOST   = os.getenv("PGHOST", "localhost")
DB_PORT   = os.getenv("PGPORT", "5434")  # <- new port
DB_NAME   = os.getenv("PGDATABASE", "olist")
DB_USER   = os.getenv("PGUSER", "postgres")
DB_PASS   = os.getenv("PGPASSWORD", "postgres")

engine = create_engine(
    URL.create(
        drivername="postgresql+psycopg2",
        username=DB_USER,
        password=DB_PASS,
        host=DB_HOST,
        # DB_PORT stays a string for other cells; an empty value means
        # "no explicit port", matching how an empty port in a URL string parses.
        port=int(DB_PORT) if DB_PORT else None,
        database=DB_NAME,
    ),
    # pool_pre_ping tests a pooled connection before handing it out.
    future=True, pool_pre_ping=True
)

def exec_sql(sql: str) -> None:
    """Run ``sql`` through the module-level ``engine`` and print "OK".

    The text is passed unchanged to ``Connection.exec_driver_sql`` inside an
    ``engine.begin()`` block. "OK" is printed once that block has exited
    without raising; any exception propagates to the caller.

    Args:
        sql: SQL text to execute.
    """
    with engine.begin() as connection:
        connection.exec_driver_sql(sql)
    print("OK")


In [129]:
# 8A.2 - Create the gold schema and views

# One multi-statement script that creates the `gold` schema (if missing) and
# (re)creates these views over the silver tables:
#   gold.vw_revenue_per_order    - payment_value summed per order_id
#   gold.vw_orders_monthly       - per purchase month: order count, revenue, average
#   gold.vw_category_sales       - item price / freight totals per product category
#   gold.vw_payment_mix_monthly  - payment totals per purchase month and payment_type
#   gold.vw_delivery_sla         - per-order lead times and an on_time flag (1/0)
#   gold.vw_delivery_sla_monthly - monthly on-time counts and rate, built on vw_delivery_sla
# NOTE(review): the source schema is hard-coded as `silver` here, while other
# cells resolve tables through DB_SCHEMA - confirm the two always match.
views_sql = """
CREATE SCHEMA IF NOT EXISTS gold;

-- 1) Receita por pedido (base para agrega√ß√µes)
CREATE OR REPLACE VIEW gold.vw_revenue_per_order AS
SELECT op.order_id,
       SUM(op.payment_value) AS revenue
FROM   silver.order_payments op
GROUP  BY op.order_id;

-- 2) S√©ries mensais: pedidos, receita bruta e ticket m√©dio
CREATE OR REPLACE VIEW gold.vw_orders_monthly AS
SELECT DATE_TRUNC('month', o.order_purchase_timestamp)::date AS month,
       COUNT(*)                                   AS orders_count,
       SUM(COALESCE(r.revenue,0))                 AS gross_revenue,
       AVG(COALESCE(r.revenue,0))                 AS aov
FROM   silver.orders o
LEFT   JOIN gold.vw_revenue_per_order r USING (order_id)
GROUP  BY 1
ORDER  BY 1;

-- 3) Vendas por categoria (usa pre√ßo dos itens como proxy de receita)
CREATE OR REPLACE VIEW gold.vw_category_sales AS
SELECT COALESCE(t.product_category_name_english, p.product_category_name) AS category,
       SUM(oi.price)        AS sales_amount,
       SUM(oi.freight_value) AS freight_amount,
       COUNT(DISTINCT oi.order_id) AS orders
FROM   silver.order_items oi
JOIN   silver.products p USING (product_id)
LEFT   JOIN silver.product_category_name_translation t
       ON t.product_category_name = p.product_category_name
GROUP  BY 1
ORDER  BY sales_amount DESC;

-- 4) Mix de pagamento por m√™s
CREATE OR REPLACE VIEW gold.vw_payment_mix_monthly AS
SELECT DATE_TRUNC('month', o.order_purchase_timestamp)::date AS month,
       op.payment_type,
       SUM(op.payment_value) AS payment_value,
       COUNT(*)              AS payments
FROM   silver.orders o
JOIN   silver.order_payments op USING (order_id)
GROUP  BY 1,2
ORDER  BY 1,2;

-- 5) SLA de entrega (pontualidade mensal)
CREATE OR REPLACE VIEW gold.vw_delivery_sla AS
SELECT o.order_id,
       o.order_purchase_timestamp::date   AS purchase_date,
       o.order_estimated_delivery_date::date AS estimated_date,
       o.order_delivered_customer_date::date AS delivered_date,
       (o.order_delivered_customer_date - o.order_purchase_timestamp)       AS actual_lead_time,
       (o.order_estimated_delivery_date - o.order_purchase_timestamp)       AS estimated_lead_time,
       CASE WHEN o.order_delivered_customer_date IS NOT NULL
                 AND o.order_delivered_customer_date::date <= o.order_estimated_delivery_date::date
            THEN 1 ELSE 0 END AS on_time
FROM   silver.orders o
WHERE  o.order_status IN ('delivered','shipped','invoiced','processing');

CREATE OR REPLACE VIEW gold.vw_delivery_sla_monthly AS
SELECT DATE_TRUNC('month', purchase_date)::date AS month,
       COUNT(*)                                 AS delivered_orders,
       SUM(on_time)::int                        AS on_time_orders,
       ROUND(100.0*SUM(on_time)/NULLIF(COUNT(*),0), 2) AS on_time_rate
FROM   gold.vw_delivery_sla
WHERE  delivered_date IS NOT NULL
GROUP  BY 1
ORDER  BY 1;
"""
# Unlike the index/validation cells, this call is not wrapped in try/except:
# an execution error propagates and stops the cell.
exec_sql(views_sql)


OperationalError: (psycopg2.OperationalError) connection to server at "localhost" (::1), port 5434 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?
connection to server at "localhost" (127.0.0.1), port 5434 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?

(Background on this error at: https://sqlalche.me/e/20/e3q8)