# 02 – Modelagem Baseline

## 1. Setup
- Import das bibliotecas
- Conexão ao Postgres e leitura das tabelas Silver


In [4]:
# Ajusta o path para apontar à raiz do projeto (/app) dentro do container
import sys, os
sys.path.insert(0, os.path.abspath('..'))


In [5]:
import pandas as pd
import sqlalchemy as sa
from src.utils.config import settings

# criação da engine
engine = sa.create_engine(
    f"postgresql+psycopg://{settings.PG_USER}:"
    f"{settings.PG_PASSWORD}@{settings.PG_HOST}:"
    f"{settings.PG_PORT}/{settings.PG_DB}"
)

# leitura das tabelas Silver
df_orders   = pd.read_sql("SELECT * FROM silver.orders_raw", engine)
df_payments = pd.read_sql("SELECT order_id, payment_value, freight_value FROM silver.order_items_raw JOIN silver.order_payments_raw USING(order_id)", engine)
df_reviews  = pd.read_sql("SELECT order_id, review_score FROM silver.order_reviews_raw", engine)

print("orders  :", df_orders.shape)
print("payments:", df_payments.shape)
print("reviews :", df_reviews.shape)


orders  : (99441, 8)
payments: (0, 3)
reviews : (0, 2)


## 2. Preparação dos dados
- Merge das tabelas
- Cálculo de `delay_days`
- Criação do target `y`
- Cálculo de `lead_time_est`
- Extração de `X` e `y`


In [6]:
# Merge orders + payments + reviews
df = (
    df_orders
    .merge(df_payments[['order_id', 'payment_value', 'freight_value']], on='order_id', how='left')
    .merge(df_reviews[['order_id', 'review_score']], on='order_id', how='left')
)

# delay_days (real – estimado) em dias
df['delay_days'] = (
    pd.to_datetime(df['order_delivered_customer_date']) -
    pd.to_datetime(df['order_estimated_delivery_date'])
).dt.total_seconds() / (24*3600)

# target binário: 1 = atrasado, 0 = no prazo ou antecipado
df['y'] = (df['delay_days'] > 0).astype(int)

# lead_time_est (estimado – compra) em dias
df['lead_time_est'] = (
    pd.to_datetime(df['order_estimated_delivery_date']) -
    pd.to_datetime(df['order_purchase_timestamp'])
).dt.total_seconds() / (24*3600)

# features e target
features = ['payment_value', 'freight_value', 'lead_time_est']
X = df[features].fillna(0)
y = df['y']

print("X:", X.shape, "y:", y.shape, "Prop. atrasos:", round(y.mean(), 3))

X: (99441, 3) y: (99441,) Prop. atrasos: 0.079


  X = df[features].fillna(0)


## 3. Split treino/teste
- train_test_split (80% treino, 20% teste)

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Treino:", X_train.shape, y_train.value_counts(normalize=True).to_dict())
print("Teste :", X_test.shape, y_test.value_counts(normalize=True).to_dict())


Treino: (79552, 3) {0: 0.9212841914722446, 1: 0.07871580852775543}
Teste : (19889, 3) {0: 0.9213132887525768, 1: 0.0786867112474232}


## 4. Baseline com LogisticRegression
- Treinamento e avaliação com classification_report e AUC

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# treinando o modelo
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# previsões
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:,1]

# métricas
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", round(roc_auc_score(y_test, y_proba), 4))


Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96     18324
           1       0.00      0.00      0.00      1565

    accuracy                           0.92     19889
   macro avg       0.46      0.50      0.48     19889
weighted avg       0.85      0.92      0.88     19889

ROC AUC Score: 0.5477


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 5. Conclusão Inicial
- Exibir resultados obtidos e comentar:
  - Acurácia / Precision / Recall
  - AUC
- Próximos passos: feature engineering avançado, tuning, deploy
