In [1]:
import pandas as pd
import numpy as np
import duckdb

from sklearn.linear_model import LinearRegression

import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

import mlflow
from mlflow.models import infer_signature

In [None]:
# Caminho do banco
db_path = "../../data/duckdb/database.duckdb"

# Conexão com o banco DuckDB
con = duckdb.connect(db_path)

# Define raiz do projeto (ajuste conforme seu ambiente)
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")


# Carrega os dados da camada bronze
df = con.execute("""
    SELECT * FROM feature.previsao_consumo
""").df()

# Treinamento de Modelo

In [3]:
def calcular_metricas(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    nrmse = rmse / np.mean(y_true)
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, nrmse, r2

def treinar_modelos_xgb_por_grupo_cv(df, features):
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', inplace=True)

    resultados = []

    grupos = df.groupby(['cluster', 'region'])

    for (cluster, region), grupo_df in grupos:
        print(f"\nTreinando modelo para Cluster {cluster} - Região {region}")

        # Define experimento
        mlflow.set_experiment(f"clusterizacao_cliente_{region}_{cluster}")
        model_name = f"clusterizacao_cliente_{region}_{cluster}"

        X = grupo_df[features]
        y = grupo_df['consumption_kwh']
        tscv = TimeSeriesSplit(n_splits=5)

        maes, rmses, nrmses, r2s = [], [], [], []

        for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            model = xgb.XGBRegressor(
                n_estimators=100,
                learning_rate=0.05,
                max_depth=3,
                random_state=42,
                n_jobs=-1
            )

            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)

            mae, rmse, nrmse, r2 = calcular_metricas(y_val, y_pred)
            maes.append(mae)
            rmses.append(rmse)
            nrmses.append(nrmse)
            r2s.append(r2)

        with mlflow.start_run() as run:
            # Salvar o modelo final com todos os dados
            modelo_final = xgb.XGBRegressor(
                n_estimators=100,
                learning_rate=0.05,
                max_depth=3,
                random_state=42,
                n_jobs=-1
            )

            modelo_final.fit(X, y)

            mae = np.mean(maes)
            rmse = np.mean(rmses)
            nrmse = np.mean(nrmses)
            r2 = np.mean(r2s)

            # Log no MLflow
            mlflow.log_params({
                  'n_estimators': 100
                , 'learning_rate': 0.05
                , 'max_depth': 3
                , 'random_state': 42
                , 'n_jobs': -1
            })

            mlflow.log_metrics({
                  'mae': mae
                , 'rmse': rmse
                , 'nrmse': nrmse
                , 'r2': r2
            })

            # Inferir a assinatura do modelo
            signature = infer_signature(X, y)

            # Log do modelo com registro direto
            mlflow.sklearn.log_model(
                sk_model=model,
                artifact_path="model",
                signature=signature,
                input_example=X[:5],  # usar amostra pequena
                registered_model_name=model_name
            )

        resultados.append({
            'cluster': cluster,
            'region': region,
            'mae': np.mean(maes),
            'rmse': np.mean(rmses),
            'nrmse': np.mean(nrmses),
            'r2': np.mean(r2s)
        })

    return pd.DataFrame(resultados)


In [4]:
# Excluir colunas que não são features
features = [col for col in df.columns if col not in ['client_id', 'date', 'consumption_kwh', 'cluster', 'region','humidity','temperature']]

# Treinar modelos
resultados = treinar_modelos_xgb_por_grupo_cv(df, features)

# Exibir resultados
print(resultados.sort_values(by='nrmse'))



Treinando modelo para Cluster 0 - Região Leste




Registered model 'clusterizacao_cliente_Leste_0' already exists. Creating a new version of this model...
2025/07/29 20:55:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: clusterizacao_cliente_Leste_0, version 3


Created version '3' of model 'clusterizacao_cliente_Leste_0'.


🏃 View run wise-turtle-894 at: http://127.0.0.1:8080/#/experiments/403428675149349317/runs/fb89a7eba53a4304a448e6439cd515a2
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/403428675149349317

Treinando modelo para Cluster 0 - Região Norte




Registered model 'clusterizacao_cliente_Norte_0' already exists. Creating a new version of this model...
2025/07/29 20:55:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: clusterizacao_cliente_Norte_0, version 3


Created version '3' of model 'clusterizacao_cliente_Norte_0'.


🏃 View run whimsical-doe-403 at: http://127.0.0.1:8080/#/experiments/915273099384433581/runs/06ff5db2af6d4c5195dbd78495dc59d8
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/915273099384433581

Treinando modelo para Cluster 1 - Região Centro




Registered model 'clusterizacao_cliente_Centro_1' already exists. Creating a new version of this model...
2025/07/29 20:56:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: clusterizacao_cliente_Centro_1, version 3


Created version '3' of model 'clusterizacao_cliente_Centro_1'.


🏃 View run fortunate-shrew-141 at: http://127.0.0.1:8080/#/experiments/300217457269050798/runs/281e20bcc8c2487bbd4a3740fa82affd
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/300217457269050798

Treinando modelo para Cluster 1 - Região Oeste




Registered model 'clusterizacao_cliente_Oeste_1' already exists. Creating a new version of this model...
2025/07/29 20:56:10 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: clusterizacao_cliente_Oeste_1, version 3


Created version '3' of model 'clusterizacao_cliente_Oeste_1'.


🏃 View run traveling-moth-426 at: http://127.0.0.1:8080/#/experiments/813322908940726570/runs/9793aa62e7894fa882a1f3a8ecc7d0c4
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/813322908940726570

Treinando modelo para Cluster 2 - Região Leste




Registered model 'clusterizacao_cliente_Leste_2' already exists. Creating a new version of this model...
2025/07/29 20:56:19 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: clusterizacao_cliente_Leste_2, version 3


Created version '3' of model 'clusterizacao_cliente_Leste_2'.


🏃 View run colorful-slug-736 at: http://127.0.0.1:8080/#/experiments/912641830259411455/runs/610bf7a38af94341942cf5a4858fe012
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/912641830259411455

Treinando modelo para Cluster 2 - Região Norte




Registered model 'clusterizacao_cliente_Norte_2' already exists. Creating a new version of this model...
2025/07/29 20:56:28 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: clusterizacao_cliente_Norte_2, version 3


Created version '3' of model 'clusterizacao_cliente_Norte_2'.


🏃 View run awesome-seal-257 at: http://127.0.0.1:8080/#/experiments/759063953876893680/runs/294ded3807d24d9e8e1f1772b68dfac4
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/759063953876893680

Treinando modelo para Cluster 3 - Região Sul




Registered model 'clusterizacao_cliente_Sul_3' already exists. Creating a new version of this model...
2025/07/29 20:56:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: clusterizacao_cliente_Sul_3, version 3


Created version '3' of model 'clusterizacao_cliente_Sul_3'.


🏃 View run angry-bug-403 at: http://127.0.0.1:8080/#/experiments/899589497225334334/runs/c2e49da122dd4f42af6e912cc002c03b
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/899589497225334334

Treinando modelo para Cluster 4 - Região Centro




Registered model 'clusterizacao_cliente_Centro_4' already exists. Creating a new version of this model...
2025/07/29 20:56:45 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: clusterizacao_cliente_Centro_4, version 3


Created version '3' of model 'clusterizacao_cliente_Centro_4'.


🏃 View run likeable-shrike-723 at: http://127.0.0.1:8080/#/experiments/756877896251165727/runs/e22d32b82c3940d7b601671eb517b6c8
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/756877896251165727

Treinando modelo para Cluster 4 - Região Oeste




Registered model 'clusterizacao_cliente_Oeste_4' already exists. Creating a new version of this model...
2025/07/29 20:56:54 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: clusterizacao_cliente_Oeste_4, version 3


Created version '3' of model 'clusterizacao_cliente_Oeste_4'.


🏃 View run indecisive-shoat-544 at: http://127.0.0.1:8080/#/experiments/227549993923931794/runs/68156dda1e1a44258509f1931dcb52bf
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/227549993923931794
   cluster  region       mae      rmse     nrmse        r2
4        2   Leste  1.861938  2.313929  0.127730  0.117633
3        1   Oeste  1.875786  2.331130  0.128934 -0.045105
2        1  Centro  1.933846  2.381785  0.136966  0.003101
5        2   Norte  1.931608  2.411411  0.141453  0.150073
6        3     Sul  1.929341  2.424028  0.165003  0.571880
7        4  Centro  1.858881  2.342252  0.179350  0.244100
8        4   Oeste  1.896679  2.409500  0.183050  0.273973
0        0   Leste  1.885971  2.347685  0.199208  0.277638
1        0   Norte  1.858294  2.321190  0.199500  0.255714
