In [1]:
import pandas as pd
import numpy as np
import duckdb

from sklearn.linear_model import LinearRegression

import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

import mlflow
from mlflow.models import infer_signature

In [2]:
# Path to the DuckDB database file (relative to the notebook's working directory)
db_path = "../../data/duckdb/database.duckdb"

# Open the DuckDB connection; later cells reuse `con` and the last cell closes it
con = duckdb.connect(db_path)

# Point MLflow at the tracking server (here a local server on port 8080; adjust for your environment)
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")


# Load the full feature.previsao_consumo table into a pandas DataFrame
df = con.execute("""
    SELECT * FROM feature.previsao_consumo
""").df()

# Treinamento de Modelo

In [3]:
def calcular_metricas(y_true, y_pred):
    """Compute the regression error metrics for one set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        A tuple ``(mae, rmse, nrmse, r2)`` where ``nrmse`` is the RMSE divided
        by the mean of ``y_true``.
    """
    erro_absoluto = mean_absolute_error(y_true, y_pred)
    erro_quadratico = root_mean_squared_error(y_true, y_pred)
    return (
        erro_absoluto,
        erro_quadratico,
        # Normalise by the mean of the observed values so that groups with
        # different magnitudes can be compared.
        erro_quadratico / np.mean(y_true),
        r2_score(y_true, y_pred),
    )

def _avaliar_cv_temporal(X, y, params, n_splits):
    """Cross-validate an XGBoost regressor on time-ordered folds.

    Args:
        X: Feature DataFrame whose rows are already in chronological order.
        y: Target Series aligned with ``X``.
        params: Keyword arguments passed to ``xgb.XGBRegressor``.
        n_splits: Number of folds for ``TimeSeriesSplit``.

    Returns:
        Dict with the fold-averaged ``mae``, ``rmse``, ``nrmse`` and ``r2``.
    """
    divisor = TimeSeriesSplit(n_splits=n_splits)
    metricas_por_fold = []

    for train_idx, val_idx in divisor.split(X):
        modelo_fold = xgb.XGBRegressor(**params)
        modelo_fold.fit(X.iloc[train_idx], y.iloc[train_idx])
        y_pred = modelo_fold.predict(X.iloc[val_idx])
        metricas_por_fold.append(calcular_metricas(y.iloc[val_idx], y_pred))

    # Transpose the per-fold tuples into one sequence per metric and average each.
    nomes = ('mae', 'rmse', 'nrmse', 'r2')
    return {
        nome: float(np.mean(valores))
        for nome, valores in zip(nomes, zip(*metricas_por_fold))
    }


def treinar_modelos_xgb_por_grupo_cv(df, features):
    """Train, evaluate and register one XGBoost model per (cluster, region) group.

    For every group the function:
      1. evaluates the model with a 5-fold ``TimeSeriesSplit``;
      2. refits a final model on all rows of the group;
      3. logs hyperparameters, fold-averaged metrics and the final model to
         MLflow, using ``previsao_cliente_{region}_{cluster}`` both as the
         experiment name and as the registered model name.

    Args:
        df: DataFrame containing at least the columns ``date``, ``cluster``,
            ``region``, ``consumption_kwh`` and every name in ``features``.
            The input is not modified.
        features: List of column names used as model inputs.

    Returns:
        DataFrame with one row per group and the columns ``cluster``,
        ``region``, ``mae``, ``rmse``, ``nrmse`` and ``r2`` (fold averages).
        The columns are present even when ``df`` has no rows.
    """
    # Single source of truth for the hyperparameters: the same dict builds the
    # fold models, the final model and the MLflow parameter log.
    params = {
        'n_estimators': 100,
        'learning_rate': 0.05,
        'max_depth': 3,
        'random_state': 42,
        'n_jobs': -1,
    }
    n_splits = 5
    colunas_resultado = ['cluster', 'region', 'mae', 'rmse', 'nrmse', 'r2']

    dados = df.copy()
    dados['date'] = pd.to_datetime(dados['date'])
    # TimeSeriesSplit cuts folds by row position, so rows must be in
    # chronological order for the validation folds to lie after the training
    # folds. A stable sort keeps the input order among rows sharing a date.
    dados = dados.set_index('date').sort_index(kind='mergesort')

    resultados = []

    for (cluster, region), grupo_df in dados.groupby(['cluster', 'region']):
        print(f"\nTreinando modelo para Cluster {cluster} - Região {region}")

        # One experiment and one registered model per group, sharing a name.
        model_name = f"previsao_cliente_{region}_{cluster}"
        mlflow.set_experiment(model_name)

        X = grupo_df[features]
        y = grupo_df['consumption_kwh']

        metricas = _avaliar_cv_temporal(X, y, params, n_splits)

        with mlflow.start_run():
            # The final model is trained on every row of the group.
            modelo_final = xgb.XGBRegressor(**params)
            modelo_final.fit(X, y)

            mlflow.log_params(params)
            mlflow.log_metrics(metricas)

            signature = infer_signature(X, y)

            # Register the model fitted on all data, not the model left over
            # from the last cross-validation fold.
            mlflow.sklearn.log_model(
                sk_model=modelo_final,
                artifact_path="model",
                signature=signature,
                input_example=X.head(5),  # small sample only
                registered_model_name=model_name,
            )

        resultados.append({'cluster': cluster, 'region': region, **metricas})

    return pd.DataFrame(resultados, columns=colunas_resultado)


In [4]:
# Columns that are not model inputs: the client id, the date, the target and
# the grouping keys. 'humidity' and 'temperature' are left out as well
# (NOTE(review): presumably a deliberate modelling choice — confirm).
colunas_excluidas = {
    'client_id', 'date', 'consumption_kwh', 'cluster', 'region', 'humidity', 'temperature',
}
features = [nome for nome in df.columns if nome not in colunas_excluidas]

# Train one model per (cluster, region) group and collect the metrics
resultados = treinar_modelos_xgb_por_grupo_cv(df, features)

# Show the groups ordered from lowest to highest NRMSE
print(resultados.sort_values(by='nrmse'))



Treinando modelo para Cluster 0 - Região Leste




Registered model 'previsao_cliente_Leste_0' already exists. Creating a new version of this model...
2025/07/30 16:23:07 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: previsao_cliente_Leste_0, version 2


Created version '2' of model 'previsao_cliente_Leste_0'.


🏃 View run worried-crow-251 at: http://127.0.0.1:8080/#/experiments/129153900224731881/runs/5812ad595d8b4c9a80e3f3e1baee34ed
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/129153900224731881

Treinando modelo para Cluster 0 - Região Norte




Registered model 'previsao_cliente_Norte_0' already exists. Creating a new version of this model...
2025/07/30 16:23:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: previsao_cliente_Norte_0, version 2


Created version '2' of model 'previsao_cliente_Norte_0'.


🏃 View run puzzled-panda-402 at: http://127.0.0.1:8080/#/experiments/959656413999781364/runs/794b3e1b73bc4c568c6620fe6e73e903
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/959656413999781364

Treinando modelo para Cluster 1 - Região Centro




Registered model 'previsao_cliente_Centro_1' already exists. Creating a new version of this model...
2025/07/30 16:23:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: previsao_cliente_Centro_1, version 2


Created version '2' of model 'previsao_cliente_Centro_1'.


🏃 View run resilient-doe-238 at: http://127.0.0.1:8080/#/experiments/395457158260064073/runs/943aa0910cbc4b5b84e251b858e0cfc8
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/395457158260064073

Treinando modelo para Cluster 1 - Região Oeste




Registered model 'previsao_cliente_Oeste_1' already exists. Creating a new version of this model...
2025/07/30 16:23:26 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: previsao_cliente_Oeste_1, version 2


Created version '2' of model 'previsao_cliente_Oeste_1'.


🏃 View run enthused-mule-198 at: http://127.0.0.1:8080/#/experiments/265871612882140850/runs/a5a751e59bca49ef8481f4e2d288b51a
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/265871612882140850

Treinando modelo para Cluster 2 - Região Leste




Registered model 'previsao_cliente_Leste_2' already exists. Creating a new version of this model...
2025/07/30 16:23:33 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: previsao_cliente_Leste_2, version 2


Created version '2' of model 'previsao_cliente_Leste_2'.


🏃 View run able-whale-670 at: http://127.0.0.1:8080/#/experiments/316847630448806260/runs/af14b199b46042e9a6e54c9208844347
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/316847630448806260

Treinando modelo para Cluster 2 - Região Norte




Registered model 'previsao_cliente_Norte_2' already exists. Creating a new version of this model...
2025/07/30 16:23:39 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: previsao_cliente_Norte_2, version 2


Created version '2' of model 'previsao_cliente_Norte_2'.


🏃 View run unequaled-tern-969 at: http://127.0.0.1:8080/#/experiments/727146017675202939/runs/c278b0322b63432ba5ecbe6c74503b0a
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/727146017675202939

Treinando modelo para Cluster 3 - Região Sul




Registered model 'previsao_cliente_Sul_3' already exists. Creating a new version of this model...
2025/07/30 16:23:45 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: previsao_cliente_Sul_3, version 2


Created version '2' of model 'previsao_cliente_Sul_3'.


🏃 View run indecisive-worm-912 at: http://127.0.0.1:8080/#/experiments/734325788788891594/runs/e8fb7a9bf079483998057f69f56083c5
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/734325788788891594

Treinando modelo para Cluster 4 - Região Centro




Registered model 'previsao_cliente_Centro_4' already exists. Creating a new version of this model...
2025/07/30 16:23:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: previsao_cliente_Centro_4, version 2


Created version '2' of model 'previsao_cliente_Centro_4'.


🏃 View run puzzled-squid-45 at: http://127.0.0.1:8080/#/experiments/255249810038885859/runs/b2dacd85dc9d4139959b96757cf4371b
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/255249810038885859

Treinando modelo para Cluster 4 - Região Oeste




Registered model 'previsao_cliente_Oeste_4' already exists. Creating a new version of this model...
2025/07/30 16:23:58 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: previsao_cliente_Oeste_4, version 2


Created version '2' of model 'previsao_cliente_Oeste_4'.


🏃 View run orderly-grub-102 at: http://127.0.0.1:8080/#/experiments/469050175248156224/runs/a095ab8d17314db6a451ad504085f36c
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/469050175248156224
   cluster  region       mae      rmse     nrmse        r2
4        2   Leste  1.861938  2.313929  0.127730  0.117633
3        1   Oeste  1.875786  2.331130  0.128934 -0.045105
2        1  Centro  1.933846  2.381785  0.136966  0.003101
5        2   Norte  1.931608  2.411411  0.141453  0.150073
6        3     Sul  1.929341  2.424028  0.165003  0.571880
7        4  Centro  1.858881  2.342252  0.179350  0.244100
8        4   Oeste  1.896679  2.409500  0.183050  0.273973
0        0   Leste  1.885971  2.347685  0.199208  0.277638
1        0   Norte  1.858294  2.321190  0.199500  0.255714


In [5]:
# Create the metrics table if it does not exist yet, so re-running this cell is harmless.
# NOTE(review): assumes the `output` schema already exists in the database — confirm.
con.execute("""
CREATE TABLE IF NOT EXISTS output.previsao_consumo_metricas (
    cluster INTEGER,
    region VARCHAR,
    mae DOUBLE,
    rmse DOUBLE,
    nrmse DOUBLE,
    r2 DOUBLE,
)
""")

<duckdb.duckdb.DuckDBPyConnection at 0x15d567855f0>

In [6]:
# Remove every existing row so the table only holds the metrics written by this run
con.execute("DELETE FROM output.previsao_consumo_metricas")

<duckdb.duckdb.DuckDBPyConnection at 0x15d567855f0>

In [7]:
# Register the `resultados` DataFrame on the connection under the name "resultados"
# so it can be queried from SQL
con.register("resultados", resultados)

<duckdb.duckdb.DuckDBPyConnection at 0x15d567855f0>

In [8]:
# Copy the metrics into output.previsao_consumo_metricas. Columns are listed
# explicitly on both sides so the mapping does not depend on the column order
# of the DataFrame (a positional SELECT * would silently mix up the metrics if
# that order ever changed).
con.execute("""
    INSERT INTO output.previsao_consumo_metricas (cluster, region, mae, rmse, nrmse, r2)
    SELECT cluster, region, mae, rmse, nrmse, r2
    FROM resultados
""")

<duckdb.duckdb.DuckDBPyConnection at 0x15d567855f0>

In [9]:
# Close the DuckDB connection opened at the top of the notebook
con.close()