In [1]:
import pandas as pd
import duckdb

from sklearn.preprocessing import StandardScaler

In [2]:
# Relative path to the DuckDB database file
db_path = "../../../data/duckdb/database.duckdb"

# Open the connection used by every later cell
con = duckdb.connect(database=db_path)

# Load the gold-layer consumption table into a pandas DataFrame
gold_query = "SELECT * FROM gold.consumo_geral"
df = con.execute(gold_query).fetchdf()

# Feature Engineering

In [3]:
# Build one row per client: average consumption, temperature and humidity,
# plus the first region value found for that client.
# Named aggregation yields the final column names directly, so no positional
# renaming of the aggregated columns is needed afterwards.
input_df = (
    df.groupby('client_id')
    .agg(
        mean_consumption=('consumption_kwh', 'mean'),
        mean_temperature=('temperature', 'mean'),
        mean_humidity=('humidity', 'mean'),
        region=('region', 'first'),
    )
    .reset_index()
)

input_df.head()

Unnamed: 0,client_id,mean_consumption,mean_temperature,mean_humidity,region
0,C0000,16.252611,24.993203,60.174688,Norte
1,C0001,18.926167,24.993203,60.174688,Norte
2,C0002,18.624611,25.067315,59.734148,Sul
3,C0003,15.284,24.993203,60.174688,Norte
4,C0004,14.4465,25.17983,60.064383,Oeste


In [4]:
# One-hot encode the region column (no prefix).
#
# The dummy columns are pinned to the region columns declared by the
# feature.clusterizacao_cliente table. Without this, the set of columns depends
# on which regions happen to be present in the data: a missing region would
# produce fewer columns than the table expects, and a new region would produce
# an extra or misaligned column when the frame is loaded positionally.
REGION_COLUMNS = ['Centro', 'Leste', 'Norte', 'Oeste', 'Sul']

# Every column except the one being encoded keeps its place at the front.
base_columns = [col for col in input_df.columns if col != 'region']

input_df = pd.get_dummies(
    input_df,
    columns=['region'],
    prefix='',
    prefix_sep='',
    dtype=float,  # numeric indicators, matching the DOUBLE columns of the table
)

# Fail early, with a clear message, if the data contains a region the table
# has no column for, instead of silently dropping or misplacing it.
unexpected_regions = sorted(
    set(input_df.columns) - set(base_columns) - set(REGION_COLUMNS)
)
if unexpected_regions:
    raise ValueError(
        f"Unexpected region value(s) with no matching table column: {unexpected_regions}"
    )

# Fixed column order; regions absent from the data become all-zero columns.
input_df = input_df.reindex(columns=base_columns + REGION_COLUMNS, fill_value=0.0)

In [5]:
# Standardize every feature column; client_id is an identifier and is left out
features = input_df.loc[:, input_df.columns != 'client_id']
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Rebuild the frame: client_id first, then the scaled features under their
# original column names. The scaler returns a plain array, so the id column's
# index is reset to line up row-by-row with the new 0..n-1 index.
client_ids = input_df[['client_id']].reset_index(drop=True)
scaled_frame = pd.DataFrame(features_scaled, columns=features.columns)
input_df = pd.concat([client_ids, scaled_frame], axis=1)

# Salvar dados de Feature

In [12]:
# Make sure the target schema exists so this cell also works on a database
# where the `feature` schema has not been created yet.
con.execute("CREATE SCHEMA IF NOT EXISTS feature")

# Create the per-client clustering feature table if it is not there yet:
# the client id, three averaged measurements and one column per region.
con.execute("""
CREATE TABLE IF NOT EXISTS feature.clusterizacao_cliente (
    client_id VARCHAR,
    mean_consumption DOUBLE,
    mean_temperature DOUBLE,
    mean_humidity DOUBLE,
    Centro DOUBLE,
    Leste DOUBLE,
    Norte DOUBLE,
    Oeste DOUBLE,
    Sul DOUBLE
)
""")

<duckdb.duckdb.DuckDBPyConnection at 0x183d594cbf0>

In [13]:
# Empty the feature table so the load below replaces any rows from a previous run
clear_sql = "DELETE FROM feature.clusterizacao_cliente"
con.execute(clear_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x183d594cbf0>

In [14]:
# Expose the pandas DataFrame to DuckDB under the name "input_df" so it can be
# referenced from SQL
view_name = "input_df"
con.register(view_name, input_df)

<duckdb.duckdb.DuckDBPyConnection at 0x183d594cbf0>

In [15]:
# Load the prepared features into the feature table.
# Columns are listed explicitly on both sides, so the load does not depend on
# the DataFrame's column order matching the table, and a missing column fails
# with a clear error instead of values landing in the wrong column.
con.execute("""
INSERT INTO feature.clusterizacao_cliente (
    client_id,
    mean_consumption,
    mean_temperature,
    mean_humidity,
    Centro,
    Leste,
    Norte,
    Oeste,
    Sul
)
SELECT
    client_id,
    mean_consumption,
    mean_temperature,
    mean_humidity,
    Centro,
    Leste,
    Norte,
    Oeste,
    Sul
FROM input_df
""")

<duckdb.duckdb.DuckDBPyConnection at 0x183d594cbf0>

In [16]:
# Close the DuckDB connection now that the feature table has been written
con.close()