In [1]:
import pandas as pd
import numpy as np
import duckdb

from sklearn.linear_model import LinearRegression

import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

import mlflow
from mlflow.models import infer_signature

In [2]:
# Caminho do banco
db_path = "../../data/duckdb/database.duckdb"

# Conexão com o banco DuckDB
con = duckdb.connect(db_path)

# Define raiz do projeto (ajuste conforme seu ambiente)
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")


# Carrega os dados da camada bronze
df = con.execute("""
    SELECT * FROM feature.previsao_consumo
""").df()

# Treinamento de Modelo

In [3]:
def calcular_metricas(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    nrmse = rmse / np.mean(y_true)
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, nrmse, r2

def treinar_modelos_xgb_por_grupo_cv(df, features):
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', inplace=True)

    resultados = []

    grupos = df.groupby(['cluster', 'region'])

    for (cluster, region), grupo_df in grupos:
        print(f"\nTreinando modelo para Cluster {cluster} - Região {region}")

        # Define experimento
        mlflow.set_experiment(f"previsao_cliente_{region}_{cluster}")
        model_name = f"previsao_cliente_{region}_{cluster}"

        X = grupo_df[features]
        y = grupo_df['consumption_kwh']
        tscv = TimeSeriesSplit(n_splits=5)

        maes, rmses, nrmses, r2s = [], [], [], []

        for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            model = xgb.XGBRegressor(
                n_estimators=100,
                learning_rate=0.05,
                max_depth=3,
                random_state=42,
                n_jobs=-1
            )

            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)

            mae, rmse, nrmse, r2 = calcular_metricas(y_val, y_pred)
            maes.append(mae)
            rmses.append(rmse)
            nrmses.append(nrmse)
            r2s.append(r2)

        with mlflow.start_run() as run:
            # Salvar o modelo final com todos os dados
            modelo_final = xgb.XGBRegressor(
                n_estimators=100,
                learning_rate=0.05,
                max_depth=3,
                random_state=42,
                n_jobs=-1
            )

            modelo_final.fit(X, y)

            mae = np.mean(maes)
            rmse = np.mean(rmses)
            nrmse = np.mean(nrmses)
            r2 = np.mean(r2s)

            # Log no MLflow
            mlflow.log_params({
                  'n_estimators': 100
                , 'learning_rate': 0.05
                , 'max_depth': 3
                , 'random_state': 42
                , 'n_jobs': -1
            })

            mlflow.log_metrics({
                  'mae': mae
                , 'rmse': rmse
                , 'nrmse': nrmse
                , 'r2': r2
            })

            # Inferir a assinatura do modelo
            signature = infer_signature(X, y)

            # Log do modelo com registro direto
            mlflow.sklearn.log_model(
                sk_model=model,
                artifact_path="model",
                signature=signature,
                input_example=X[:5],  # usar amostra pequena
                registered_model_name=model_name
            )

        resultados.append({
            'cluster': cluster,
            'region': region,
            'mae': np.mean(maes),
            'rmse': np.mean(rmses),
            'nrmse': np.mean(nrmses),
            'r2': np.mean(r2s)
        })

    return pd.DataFrame(resultados)


In [4]:
# Excluir colunas que não são features
features = [col for col in df.columns if col not in ['client_id', 'date', 'consumption_kwh', 'cluster', 'region','humidity','temperature']]

# Treinar modelos
resultados = treinar_modelos_xgb_por_grupo_cv(df, features)

# Exibir resultados
print(resultados.sort_values(by='nrmse'))


2025/07/30 13:39:26 INFO mlflow.tracking.fluent: Experiment with name 'previsao_cliente_Leste_0' does not exist. Creating a new experiment.



Treinando modelo para Cluster 0 - Região Leste




Successfully registered model 'previsao_cliente_Leste_0'.
2025/07/30 13:39:31 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: previsao_cliente_Leste_0, version 1


Created version '1' of model 'previsao_cliente_Leste_0'.


2025/07/30 13:39:32 INFO mlflow.tracking.fluent: Experiment with name 'previsao_cliente_Norte_0' does not exist. Creating a new experiment.


🏃 View run shivering-robin-649 at: http://127.0.0.1:8080/#/experiments/129153900224731881/runs/f9024ece413848e78c1eff6d1382fba5
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/129153900224731881

Treinando modelo para Cluster 0 - Região Norte




Successfully registered model 'previsao_cliente_Norte_0'.
2025/07/30 13:39:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: previsao_cliente_Norte_0, version 1


Created version '1' of model 'previsao_cliente_Norte_0'.


2025/07/30 13:39:36 INFO mlflow.tracking.fluent: Experiment with name 'previsao_cliente_Centro_1' does not exist. Creating a new experiment.


🏃 View run nosy-carp-386 at: http://127.0.0.1:8080/#/experiments/959656413999781364/runs/00534c2a4a734783ae3f402452db42f4
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/959656413999781364

Treinando modelo para Cluster 1 - Região Centro




Successfully registered model 'previsao_cliente_Centro_1'.
2025/07/30 13:39:41 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: previsao_cliente_Centro_1, version 1


Created version '1' of model 'previsao_cliente_Centro_1'.


2025/07/30 13:39:41 INFO mlflow.tracking.fluent: Experiment with name 'previsao_cliente_Oeste_1' does not exist. Creating a new experiment.


🏃 View run persistent-horse-416 at: http://127.0.0.1:8080/#/experiments/395457158260064073/runs/e9bba81c8b104a7989ef98dd8d9cf851
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/395457158260064073

Treinando modelo para Cluster 1 - Região Oeste




Successfully registered model 'previsao_cliente_Oeste_1'.
2025/07/30 13:39:45 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: previsao_cliente_Oeste_1, version 1


Created version '1' of model 'previsao_cliente_Oeste_1'.


2025/07/30 13:39:45 INFO mlflow.tracking.fluent: Experiment with name 'previsao_cliente_Leste_2' does not exist. Creating a new experiment.


🏃 View run caring-hog-344 at: http://127.0.0.1:8080/#/experiments/265871612882140850/runs/4b0be97a40c74a38bc9ee5079e3aeb5a
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/265871612882140850

Treinando modelo para Cluster 2 - Região Leste




Successfully registered model 'previsao_cliente_Leste_2'.
2025/07/30 13:39:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: previsao_cliente_Leste_2, version 1


Created version '1' of model 'previsao_cliente_Leste_2'.


2025/07/30 13:39:50 INFO mlflow.tracking.fluent: Experiment with name 'previsao_cliente_Norte_2' does not exist. Creating a new experiment.


🏃 View run legendary-seal-262 at: http://127.0.0.1:8080/#/experiments/316847630448806260/runs/5a02844cf6994664b2254975163c9697
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/316847630448806260

Treinando modelo para Cluster 2 - Região Norte




Successfully registered model 'previsao_cliente_Norte_2'.
2025/07/30 13:39:54 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: previsao_cliente_Norte_2, version 1


Created version '1' of model 'previsao_cliente_Norte_2'.


2025/07/30 13:39:54 INFO mlflow.tracking.fluent: Experiment with name 'previsao_cliente_Sul_3' does not exist. Creating a new experiment.


🏃 View run persistent-cod-473 at: http://127.0.0.1:8080/#/experiments/727146017675202939/runs/d52f0b9e5907412c87d538eca0691a11
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/727146017675202939

Treinando modelo para Cluster 3 - Região Sul




Successfully registered model 'previsao_cliente_Sul_3'.
2025/07/30 13:39:59 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: previsao_cliente_Sul_3, version 1


Created version '1' of model 'previsao_cliente_Sul_3'.


2025/07/30 13:39:59 INFO mlflow.tracking.fluent: Experiment with name 'previsao_cliente_Centro_4' does not exist. Creating a new experiment.


🏃 View run treasured-chimp-85 at: http://127.0.0.1:8080/#/experiments/734325788788891594/runs/f1d4e9c9a0f843e18db6bc33d1e214bf
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/734325788788891594

Treinando modelo para Cluster 4 - Região Centro




Successfully registered model 'previsao_cliente_Centro_4'.
2025/07/30 13:40:04 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: previsao_cliente_Centro_4, version 1


Created version '1' of model 'previsao_cliente_Centro_4'.


2025/07/30 13:40:04 INFO mlflow.tracking.fluent: Experiment with name 'previsao_cliente_Oeste_4' does not exist. Creating a new experiment.


🏃 View run victorious-shrew-192 at: http://127.0.0.1:8080/#/experiments/255249810038885859/runs/c51fa6ae5f4945ce8ac182d7cd6c5644
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/255249810038885859

Treinando modelo para Cluster 4 - Região Oeste




Successfully registered model 'previsao_cliente_Oeste_4'.
2025/07/30 13:40:08 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: previsao_cliente_Oeste_4, version 1


Created version '1' of model 'previsao_cliente_Oeste_4'.


🏃 View run upbeat-rat-498 at: http://127.0.0.1:8080/#/experiments/469050175248156224/runs/8313b7dfa372473e86a506bb491da073
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/469050175248156224
   cluster  region       mae      rmse     nrmse        r2
4        2   Leste  1.861938  2.313929  0.127730  0.117633
3        1   Oeste  1.875786  2.331130  0.128934 -0.045105
2        1  Centro  1.933846  2.381785  0.136966  0.003101
5        2   Norte  1.931608  2.411411  0.141453  0.150073
6        3     Sul  1.929341  2.424028  0.165003  0.571880
7        4  Centro  1.858881  2.342252  0.179350  0.244100
8        4   Oeste  1.896679  2.409500  0.183050  0.273973
0        0   Leste  1.885971  2.347685  0.199208  0.277638
1        0   Norte  1.858294  2.321190  0.199500  0.255714


In [5]:
con.execute("""
CREATE TABLE IF NOT EXISTS output.previsao_consumo_metricas (
    cluster INTEGER,
    region VARCHAR,
    mae DOUBLE,
    rmse DOUBLE,
    nrmse DOUBLE,
    r2 DOUBLE,
)
""")

<duckdb.duckdb.DuckDBPyConnection at 0x26113b75a70>

In [6]:
# Limpa dados se as tabelas já existirem
con.execute("DELETE FROM output.previsao_consumo_metricas")

<duckdb.duckdb.DuckDBPyConnection at 0x26113b75a70>

In [7]:
# Registra como tabelas temporárias
con.register("resultados", resultados)

<duckdb.duckdb.DuckDBPyConnection at 0x26113b75a70>

In [8]:
# Insere os dados nas tabelas gold
con.execute("INSERT INTO output.previsao_consumo_metricas SELECT * FROM resultados")

<duckdb.duckdb.DuckDBPyConnection at 0x26113b75a70>

In [9]:
con.close()