In [1]:
!pip install torch pytorch-lightning pytorch-forecasting pandas numpy requests yfinance matplotlib




[notice] A new release of pip is available: 24.0 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import warnings
import ssl
import requests
import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt

# Bibliotecas de Deep Learning
import torch
import pytorch_lightning as pl
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer, Baseline, QuantileLoss
from pytorch_forecasting.data import GroupNormalizer

# ==============================================================================
# CORPORATE ENVIRONMENT SETUP (SSL BYPASS)
# ==============================================================================
# Silence every warning, then make the ssl module's default HTTPS context the
# unverified one so downloads (BCB / Yahoo) work in this environment.
# NOTE(review): this turns off certificate verification for the whole process
# and hides all warnings -- keep it limited to trusted networks.
warnings.filterwarnings("ignore")
if hasattr(ssl, "_create_unverified_context"):
    # Only patch when this Python build exposes the unverified-context factory.
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# Fix the global random seed (42) for reproducibility; the cell output echoes the seed value.
pl.seed_everything(42)

  from tqdm.autonotebook import tqdm
Seed set to 42


42

# Carga e Tratamento dos Dados Granulares (Painel)

In [None]:
# ==============================================================================
# 1. INITIAL LOAD AND CLEANING (Parquet)
# ==============================================================================
# Builds `df`: retail counts per LINE / SEGMENTATION / YEAR / MONTH.
# If `df_retail_transformed` is already defined in the kernel it is reused;
# otherwise the same aggregation is rebuilt from the two parquet files
# (per the original note, these are saved by main.ipynb).

parquet_retail = "df_retail.parquet"
parquet_dim = "df_dim_vehicle.parquet"

# At notebook top level locals() is globals(); globals() states the intent.
if 'df_retail_transformed' in globals():
    df = df_retail_transformed.copy()
elif os.path.exists(parquet_retail) and os.path.exists(parquet_dim):
    df_retail = pd.read_parquet(parquet_retail)
    df_dim_vehicle = pd.read_parquet(parquet_dim)

    # Keep only movement id 14 (presumably the retail-sale movement -- confirm).
    # .copy() so the column assignment below does not write into a slice.
    df_retail = df_retail[df_retail["MOVIMENT_ID"] == 14].copy()
    df_retail['DATE_TIME'] = pd.to_datetime(df_retail['DATE_TIME'])

    # One row per chassis: the most recent record wins.
    df_retail_deduplicated = (
        df_retail
        .sort_values(['CHASSIS', 'DATE_TIME'])
        .drop_duplicates(['CHASSIS'], keep='last')
    )

    # Attach vehicle attributes; chassis without a dimension row are dropped.
    df_retail_enrich = df_retail_deduplicated.merge(
        df_dim_vehicle,
        left_on="CHASSIS",
        right_on="CHASSIS_NBR",
        how="inner"
    )

    df_retail_enrich['YEAR'] = df_retail_enrich['DATE_TIME'].dt.year
    df_retail_enrich['MONTH'] = df_retail_enrich['DATE_TIME'].dt.month

    if 'SEGMENTATION' not in df_retail_enrich.columns:
        df_retail_enrich['SEGMENTATION'] = 'UNKNOWN'

    # groupby() drops rows whose key is null by default, so null categories
    # must become 'UNKNOWN' *before* counting or those sales are silently lost.
    for category_col in ("LINE", "SEGMENTATION"):
        df_retail_enrich[category_col] = (
            df_retail_enrich[category_col]
            .astype(object)
            .fillna('UNKNOWN')
            .astype(str)
        )

    df = (
        df_retail_enrich
        .groupby(["LINE", "SEGMENTATION", "YEAR", "MONTH"])
        .size()
        .reset_index(name='retail_total')
    )
else:
    raise FileNotFoundError("Parquet n√£o encontrado. Salve df_retail.parquet e df_dim_vehicle.parquet primeiro.")

# Build a calendar date (day fixed to 1) when the frame only carries YEAR / MONTH
if 'Date' not in df.columns:
    df['day'] = 1
    df['Date'] = pd.to_datetime(
        dict(year=df['YEAR'], month=df['MONTH'], day=df['day'])
    )

# Monthly bucket: timestamp of the first day of each month
month_period = df['Date'].dt.to_period('M')
df['date_month'] = month_period.dt.to_timestamp()

# ==============================================================================
# 2. PANEL CONSTRUCTION (PANEL DATA)
# ==============================================================================
# Columns whose combination identifies one time series
groups = ['LINE', 'SEGMENTATION']

# Null categories become the literal 'UNKNOWN'; everything is cast to str
# (the original note: TFT does not accept NaN in categorical columns)
df[groups] = df[groups].fillna('UNKNOWN').astype(str)

# Aggregate: total sales per month + line + segmentation
print("üî® Agregando dados em Painel Granular...")
panel_keys = ['date_month'] + groups
df_panel = (
    df.groupby(panel_keys)['retail_total']
    .sum()
    .reset_index(name='sales_qty')
)

# ==============================================================================
# 3. GAP FILLING (ZERO SALES)
# ==============================================================================
# Every series must have one row per month; months with no recorded sale are
# materialised as rows with sales_qty = 0.
if df_panel.empty:
    # min()/max() of an empty column are NaT and date_range would fail obscurely.
    raise ValueError("Painel vazio: nenhum registro de venda para agregar.")

all_dates = pd.date_range(start=df_panel['date_month'].min(), end=df_panel['date_month'].max(), freq='MS')
group_frame = df_panel[groups].drop_duplicates().reset_index(drop=True)
unique_groups = group_frame.values.tolist()

# Cartesian product (every month x every series) built directly in pandas,
# instead of a Python list of (date, [group values]) pairs that is unpacked later.
df_skeleton = pd.DataFrame({'date_month': all_dates}).merge(group_frame, how='cross')

data = pd.merge(df_skeleton, df_panel, on=['date_month'] + groups, how='left')
data['sales_qty'] = data['sales_qty'].fillna(0)

# Invariants: a complete, duplicate-free month x series grid.
assert len(data) == len(all_dates) * len(unique_groups), "painel incompleto"
assert not data.duplicated(['date_month'] + groups).any(), "chaves duplicadas no painel"

print(f"‚úÖ Painel Criado! Linhas totais: {len(data)}")
print(f"Exemplo de S√©ries: {len(unique_groups)} combina√ß√µes de Linha/Segmento.")
data.head()

‚ö†Ô∏è Usando DF de exemplo (Certifique-se de carregar seus dados reais antes!)


NameError: name 'df_retail_transformed' is not defined

Enriquecimento Macro (Selic, Dólar, Soja)

In [None]:
# ==============================================================================
# FUN√á√ïES DE DOWNLOAD (MACROECONOMIA)
# ==============================================================================
def get_bcb_data_bypass(code, name, start_date, timeout=30):
    """Download one BCB SGS series and return it as a monthly-mean DataFrame.

    Args:
        code: Numeric SGS series code, interpolated into the request URL.
        name: Column name given to the series in the returned frame.
        start_date: Earliest date kept (anything accepted by a ``.loc`` slice
            on a DatetimeIndex, e.g. ``'2020-01-01'``).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        DataFrame indexed by month start ('MS') with a single column ``name``
        holding the monthly mean. Best effort: on any download or parsing
        failure an empty DataFrame is returned and the reason is printed.
    """
    url = f'http://api.bcb.gov.br/dados/serie/bcdata.sgs.{code}/dados?formato=json'
    headers = {'User-Agent': 'Mozilla/5.0 Chrome/58.0.3029.110'}
    try:
        # verify=False is deliberate (corporate SSL bypass used by this notebook).
        response = requests.get(url, headers=headers, verify=False, timeout=timeout)
        if response.status_code != 200:
            print(f"BCB serie {code} ({name}): HTTP {response.status_code}")
            return pd.DataFrame()

        df = pd.DataFrame(response.json())
        df['data'] = pd.to_datetime(df['data'], dayfirst=True)
        df = df.set_index('data').sort_index().loc[start_date:]
        # The JSON payload carries the values as strings; without this cast the
        # monthly mean cannot be computed.
        df = df.apply(pd.to_numeric, errors='coerce')
        return df.resample('MS').mean().rename(columns={df.columns[0]: name})
    except (requests.RequestException, ValueError, KeyError, TypeError, IndexError) as exc:
        # Keep the best-effort contract, but say why the series is missing.
        print(f"BCB serie {code} ({name}) indisponivel: {exc!r}")
    return pd.DataFrame()

# Download window starts at the first month present in the sales panel
start_date = data['date_month'].min().strftime('%Y-%m-%d')
print(f"‚è≥ Baixando dados macro a partir de {start_date}...")

df_selic = get_bcb_data_bypass(432, 'Selic', start_date)
df_dolar = get_bcb_data_bypass(1, 'Dolar', start_date)

# Soybean futures (Yahoo), aggregated to monthly means
try:
    soy = yf.download('ZS=F', start=start_date, progress=False)
    if soy.empty:
        # An empty result would otherwise flow through as an all-NaN feature.
        raise ValueError("yfinance returned no rows for ZS=F")
    # Column layout differs between yfinance versions (flat vs. MultiIndex)
    if isinstance(soy.columns, pd.MultiIndex):
        col = soy['Adj Close'] if 'Adj Close' in soy.columns else soy['Close']
    else:
        col = soy[['Adj Close']] if 'Adj Close' in soy.columns else soy[['Close']]
    df_soja = col.resample('MS').mean()
    df_soja.columns = ['Soja_Price']
    if getattr(df_soja.index, 'tz', None) is not None:
        # The other monthly indexes are tz-naive; align before joining.
        df_soja.index = df_soja.index.tz_localize(None)
except Exception as exc:  # best effort, but no longer traps KeyboardInterrupt
    print("‚ö†Ô∏è Yahoo falhou. Usando mock para soja.")
    print(f"   Motivo: {exc!r}")
    df_soja = pd.DataFrame(index=pd.date_range(start=start_date, periods=len(df_selic)+12, freq='MS'))
    df_soja['Soja_Price'] = 30.0 # Mock (constant placeholder price)

# Combine the macro series on their monthly index (outer join)
macro_sources = {'Selic': df_selic, 'Dolar': df_dolar, 'Soja_Price': df_soja}
missing_macro = [label for label, frame in macro_sources.items() if frame.empty]
if missing_macro:
    # The later TimeSeriesDataSet cell references these columns by name.
    print(f"Series macro indisponiveis (colunas ausentes): {missing_macro}")
df_macro = pd.concat([frame for frame in macro_sources.values() if not frame.empty], axis=1)

# Fill gaps along the calendar BEFORE merging. Filling the merged panel after
# sorting by series lets the last months of one series leak into the leading
# gaps of the next one.
panel_months = pd.DatetimeIndex(data['date_month'].unique())
df_macro = df_macro.reindex(df_macro.index.union(panel_months)).ffill().bfill()

# Drop macro columns left by a previous run so the cell can be re-executed
# without producing Selic_x / Selic_y duplicates.
data = data.drop(columns=df_macro.columns.intersection(data.columns))

# Final merge into the main dataset (months are matched against the macro index)
data = pd.merge(data, df_macro, left_on='date_month', right_index=True, how='left')
data = data.sort_values(['LINE', 'SEGMENTATION', 'date_month'])
print("‚úÖ Dados Macro Integrados.")

Preparação para o Transformer (TimeSeriesDataSet)

In [None]:
# ==============================================================================
# DATASET PREPARATION FOR PYTORCH FORECASTING
# ==============================================================================
# 1. Integer time index: months elapsed since the earliest month in the panel
absolute_month = data['date_month'].dt.year * 12 + data['date_month'].dt.month
data['time_idx'] = absolute_month - absolute_month.min()

# 2. Extra features
data['month_str'] = data['date_month'].dt.month.astype(str)  # month as a categorical (seasonality)
data['log_sales'] = np.log1p(data['sales_qty'])  # log(1 + sales); not referenced by the dataset below

# 3. Window configuration
max_prediction_length = 6   # forecast horizon, in months
max_encoder_length = 24     # look-back window, in months
training_cutoff = data["time_idx"].max() - max_prediction_length

# 4. TimeSeriesDataSet definition
# Training rows: everything up to the cutoff (the last horizon is held out).
train_frame = data.loc[data["time_idx"] <= training_cutoff]

# Real-valued inputs not known ahead of time
unknown_reals = ["sales_qty", "Selic", "Dolar", "Soja_Price"]

# Per-series target scaling with a softplus transformation
target_scaler = GroupNormalizer(groups=groups, transformation="softplus")

training = TimeSeriesDataSet(
    train_frame,
    time_idx="time_idx",
    target="sales_qty",
    group_ids=groups,  # one series per LINE / SEGMENTATION pair
    min_encoder_length=max_encoder_length // 2,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=groups,
    time_varying_known_categoricals=["month_str"],  # future months are known in advance
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_reals=unknown_reals,
    target_normalizer=target_scaler,
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

# Dataloaders
batch_size = 64  # lower to 32 if memory runs out
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)

# Validation reuses the training dataset's configuration on the full frame
validation = TimeSeriesDataSet.from_dataset(
    training,
    data,
    predict=True,
    stop_randomization=True,
)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=0)

print("‚úÖ TimeSeriesDataSet criado com sucesso!")

Treinamento (Training Loop)

In [None]:
# ==============================================================================
# MODEL TRAINING (TFT)
# ==============================================================================
# NOTE(review): pytorch-forecasting >= 1.0 is reported to build its models on
# `lightning.pytorch`; if trainer.fit() rejects the model as "not a
# LightningModule", Trainer and callbacks must come from that package instead
# of `pytorch_lightning` -- confirm against the installed versions.
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

# Both callbacks watch the validation loss (lower is better)
checkpoint_callback = ModelCheckpoint(monitor="val_loss", mode="min")
early_stop_callback = EarlyStopping(
    monitor="val_loss",
    mode="min",
    min_delta=1e-4,
    patience=5,
    verbose=False,
)

# Trainer setup
trainer_options = dict(
    max_epochs=30,
    accelerator="auto",      # let Lightning pick the device
    gradient_clip_val=0.1,   # gradient clipping threshold
    callbacks=[checkpoint_callback, early_stop_callback],
    enable_model_summary=True,
)
trainer = pl.Trainer(**trainer_options)

# TFT hyper-parameters; the architecture is derived from the training dataset
tft_options = dict(
    learning_rate=0.03,
    hidden_size=16,
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=8,
    output_size=7,           # one output per quantile
    loss=QuantileLoss(),
    reduce_on_plateau_patience=4,
)
tft = TemporalFusionTransformer.from_dataset(training, **tft_options)

print("üß† Iniciando treinamento do TFT...")
trainer.fit(tft, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

# Reload the weights of the checkpoint with the best validation loss
best_model_path = checkpoint_callback.best_model_path
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)
print("‚úÖ Treinamento conclu√≠do. Melhor modelo carregado.")

Visualização e Interpretabilidade

In [None]:
# ==============================================================================
# EVALUATION AND FORECAST
# ==============================================================================
# Raw predictions (plus the model inputs) for the validation dataloader
raw_predictions = best_tft.predict(val_dataloader, mode="raw", return_x=True)

import random

# Number of predicted samples = first dimension of the prediction tensor.
# len(raw_predictions.output) would count the fields of the output tuple,
# not the series, so it must not be used to draw an index.
n_samples = raw_predictions.output.prediction.shape[0]

# Up to 3 distinct examples (sampling without replacement avoids repeated plots)
example_indices = random.sample(range(n_samples), k=min(3, n_samples))

for i, idx in enumerate(example_indices):
    fig, ax = plt.subplots(figsize=(12, 5))
    best_tft.plot_prediction(raw_predictions.x, raw_predictions.output, idx=idx, add_loss_to_title=True, ax=ax)
    plt.title(f"Previs√£o Probabil√≠stica - Exemplo {i+1} (Intervalo de Confian√ßa)")
    plt.show()
    plt.close(fig)  # release the figure once it has been rendered

# ==============================================================================
# INTERPRETABILITY (Variable Importance)
# ==============================================================================
# Aggregate (sum) the interpretation outputs over all samples and plot them
interpretation = best_tft.interpret_output(raw_predictions.output, reduction="sum")
best_tft.plot_interpretation(interpretation)