In [11]:
import pandas as pd
import numpy as np
import duckdb

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score,make_scorer, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold,cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
import lightgbm as lgb

import mlflow
import mlflow.sklearn

import joblib
import os
import shutil


# Carregamento dos dados

In [12]:
# Caminho do banco
db_path = "../../data/duckdb/database.duckdb"

# Conexão com o banco DuckDB
con = duckdb.connect(db_path)

# Carrega os dados da camada bronze
clientes_df = con.execute("SELECT * FROM silver.clientes").df()
consumo_df = con.execute("SELECT * FROM silver.consumo").df()

df = consumo_df.merge(clientes_df, on='client_id', how='inner')

Unnamed: 0,client_id,date,consumption_kwh,region
0,C0000,2023-01-01,18.64,Norte
1,C0000,2023-01-02,16.63,Norte
2,C0000,2023-01-03,18.11,Norte
3,C0000,2023-01-04,18.25,Norte
4,C0000,2023-01-05,19.81,Norte
5,C0000,2023-01-06,15.87,Norte
6,C0000,2023-01-07,20.3,Norte
7,C0000,2023-01-08,19.35,Norte
8,C0000,2023-01-09,18.3,Norte
9,C0000,2023-01-10,13.34,Norte


# Feature Engineering

In [13]:
# Feature engineering por client_id
class TemporalFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    # é necessário implementar os métodos fit para que o objeto possa ser usado em um pipeline
    def fit(self, X, y=None):
        return self

    # o método transform define como os dados serão transformados
    def transform(self, X):

        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input must be a pandas DataFrame")
        
        df = X.copy()

        # Garantir que a coluna 'date' esteja no formato datetime
        df["date"] = pd.to_datetime(df["date"])
        
        def _compute(group):
            # Garantir a série temporal ordenada
            ts = group.sort_values("date")
            

            vals = ts["consumption_kwh"].values
            times = ts["date"].astype(np.int64).values.reshape(-1, 1)
            
            # Calcular estatísticas básicas
            stats = {
                "mean": vals.mean(),
                "median": np.median(vals),
                "std": vals.std(ddof=0),
                "max": vals.max(),
                "min": vals.min(),
            }
            
            # A regressão linear modelará a relação entre tempo e consumo de energia. O objetivo é entender como o consumo está evoluindo ao longo do tempo.
            lr = LinearRegression().fit(times, vals)

            # O slope (coeficiente angular da reta) é a taxa de variação do consumo em relação ao tempo:
            # slope > 0 → tendência de crescimento
            # slope < 0 → tendência de queda
            # slope = 0 → consumo constante ao longo do tempo
            
            stats["slope"] = lr.coef_[0]
            
            return pd.Series(stats)
        
        features = df.groupby("client_id").apply(_compute).reset_index()
        return features

In [14]:
# Extrair features
fe = TemporalFeatureExtractor()
feat_df = fe.transform(df)

feat_df = feat_df.merge(clientes_df[["client_id", "region"]], on="client_id", how="inner")

  features = df.groupby("client_id").apply(_compute).reset_index()


# Divisão de dados para treino e teste

In [15]:
train_df = feat_df[feat_df['region'] != 'Desconhecida']

proportion_list = []

for region in train_df['region'].unique():
    region_df = train_df[train_df['region'] == region]
    
    # cada região deve ter pelo 15 amostras, isso deixara os dados balanceados
    proportion = region_df.sample(n=15, random_state=42)
    proportion_list.append(proportion)

train_df = pd.concat(proportion_list, ignore_index=True).reset_index(drop=True)

In [16]:
X_train = train_df.drop(columns=['client_id', 'region'])
y_train = train_df['region']

# Codificar y
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train = label_encoder.transform(y_train)

# Treinamento de modelos

In [17]:
# Pipeline base
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier())  # Placeholder que será substituído
])

# Scoring personalizado (F1-Score)
custom_scorer = make_scorer(f1_score, average="weighted")

# Validação cruzada estratificada
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Grade de parâmetros
param_grid = [
    {
        "clf": [RandomForestClassifier()],
        "clf__n_estimators": [100, 200],
        "clf__max_depth": [None, 10]
    },
    {
        "clf": [xgb.XGBClassifier(eval_metric="logloss", use_label_encoder=False)],
        "clf__n_estimators": [100, 200],
        "clf__max_depth": [3, 6]
    },
    {
        "clf": [lgb.LGBMClassifier()],
        "clf__n_estimators": [100, 200],
        "clf__num_leaves": [31, 63]
    },
    {
        "clf": [SVC()],
        "clf__C": [0.1, 1, 10],
        "clf__kernel": ["linear", "rbf"]
    }
]

# GridSearch com validação cruzada
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=custom_scorer,
    cv=cv,
    verbose=2,
    n_jobs=-1
)

In [18]:
# Define o caminho onde os dados da run serão armazenados
mlflow.set_tracking_uri("../../mlruns")

# Iniciar experimento
mlflow.set_experiment("modelo_classificacao_pipeline")

# Diretório onde será salvo
local_path = "../../models/region_classificacao/local_model"

# Verifica se a pasta existe
if os.path.exists(local_path) and os.path.isdir(local_path):
    # Exclui a pasta com segurança
    shutil.rmtree(local_path)
    

with mlflow.start_run():

    # Executa o grid_search
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Predições com validação cruzada
    y_pred = cross_val_predict(best_model, X_train, y_train, cv=cv)
    y_proba = cross_val_predict(best_model, X_train, y_train, cv=cv, method="predict_proba")

    # Cálculo das métricas
    acc = accuracy_score(y_train, y_pred)
    prec = precision_score(y_train, y_pred, average="weighted")
    rec = recall_score(y_train, y_pred, average="weighted")
    f1 = f1_score(y_train, y_pred, average="weighted")
    roc = roc_auc_score(y_train, y_proba, multi_class="ovr", average="weighted")
    cm = confusion_matrix(y_train, y_pred)

    # Log no MLflow
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metrics({
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1_score": f1,
        "roc_auc": roc
    })

    # Log do modelo
    mlflow.sklearn.log_model(best_model, "best_model")
    mlflow.sklearn.save_model(best_model, path=local_path)

    # Log da matriz de confusão como artefato
    np.savetxt("confusion_matrix.csv", cm, delimiter=",")
    mlflow.log_artifact("confusion_matrix.csv")

    # Log do LabelEncoder
    label_path = "../../models/region_classificacao/label_encoder_classificador"
    os.makedirs(label_path, exist_ok=True)
    joblib.dump(label_encoder, f"{label_path}/label_encoder.pkl")
    mlflow.log_artifact(f"{label_path}/label_encoder.pkl", artifact_path="label_encoder")

    # Print das métricas
    print("📊 Métricas de avaliação:")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"ROC-AUC: {roc:.4f}")
    print("Confusion Matrix:")

2025/07/26 19:32:09 INFO mlflow.tracking.fluent: Experiment with name 'modelo_classificacao_pipeline' does not exist. Creating a new experiment.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


📊 Métricas de avaliação:
Accuracy: 0.2533
Precision: 0.2654
Recall: 0.2533
F1-Score: 0.2554
ROC-AUC: 0.5491
Confusion Matrix:
[[2 3 6 2 2]
 [5 4 1 1 4]
 [3 2 3 1 6]
 [4 3 3 4 1]
 [4 2 1 2 6]]


| Métrica             | Descrição                                                       |
|---------------------|-----------------------------------------------------------------|
| Acurácia            | Proporção de previsões corretas sobre o total                   |
| Precisão            | % de verdadeiros positivos sobre todos os positivos previstos   |
| Recall (Sensibilidade) | % de verdadeiros positivos sobre todos os reais positivos    |
| F1-Score            | Média harmônica entre precisão e recall                         |
| AUC-ROC             | Área sob a curva ROC, avalia separabilidade entre classes       |
