In [59]:
import pandas as pd
import numpy as np
import duckdb

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import matplotlib.pyplot as plt

In [60]:
# Caminho do banco
db_path = "../../../data/duckdb/database.duckdb"

# Conex√£o com o banco DuckDB
con = duckdb.connect(db_path)

# Carrega os dados da camada bronze
df = con.execute("SELECT * FROM gold.consumo_geral").df()

# Feature Engineering

In [61]:
regions = df['region'].unique().tolist()

# Codifica√ß√£o da vari√°vel categ√≥rica
df = pd.get_dummies(df, columns=['region'], prefix='', prefix_sep='')

df[regions] = df[regions].astype(int)

# Treinamento de Modelo

In [62]:
# üîπ Separar vari√°veis
X = df.drop(['consumption_kwh','client_id'], axis=1).set_index('date')
y = df['consumption_kwh']

In [65]:
# Modelos e hiperpar√¢metros
models = {
    'XGBoost': {
        'model': XGBRegressor(objective='reg:squarederror', random_state=42),
        'params': {
            'n_estimators': [50, 100],
            'max_depth': [3, 5],
            'learning_rate': [0.05, 0.1]
        }
    },
    'RandomForest': {
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [3, 5]
        }
    },
    'LightGBM': {
        'model': LGBMRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'learning_rate': [0.05, 0.1],
            'num_leaves': [31, 50]
        }
    }
}

In [None]:
# üîπ Avalia√ß√£o dos modelos
results = []

for name, config in models.items():
    
    grid = GridSearchCV(config['model'], config['params'], cv=5, scoring='neg_root_mean_squared_error')
    grid.fit(X, y)
    
    best_rmse = -grid.best_score_
    best_params = grid.best_params_
    
    results.append((name, round(best_rmse, 2), best_params))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 515
[LightGBM] [Info] Number of data points in the train set: 13680, number of used features: 6
[LightGBM] [Info] Start training from score 14.786420
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000062 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 519
[LightGBM] [Info] Number of data points in the train set: 13680, number of used features: 7
[LightGBM] [Info] Start training from score 14.826436
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000060 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough,

In [67]:
# üîπ Exibir resultados
for name, rmse, params in results:
    print(f"{name} ‚Üí RMSE: {rmse} | Par√¢metros: {params}")

XGBoost ‚Üí RMSE: 3.72 | Par√¢metros: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}
RandomForest ‚Üí RMSE: 3.72 | Par√¢metros: {'max_depth': 3, 'n_estimators': 100}
LightGBM ‚Üí RMSE: 3.72 | Par√¢metros: {'learning_rate': 0.05, 'n_estimators': 100, 'num_leaves': 31}
