In [1]:
import pandas as pd
import duckdb

from sklearn.preprocessing import StandardScaler

In [2]:
# Location of the DuckDB database file, relative to this notebook
db_path = "../../data/duckdb/database.duckdb"

# Open the DuckDB connection; the cells below reuse `con` for every query
con = duckdb.connect(database=db_path)

# Read the whole gold-layer consumption table into a pandas DataFrame
consumption_query = "SELECT * FROM gold.consumo_geral"
df = con.execute(consumption_query).df()

# Feature Engineering

In [3]:
# Build one row per client holding the mean of each measurement.
# Named aggregation assigns the final column names directly, so no
# separate column-renaming step is needed afterwards.
per_client_means = df.groupby('client_id').agg(
    mean_consumption=('consumption_kwh', 'mean'),
    mean_temperature=('temperature', 'mean'),
    mean_humidity=('humidity', 'mean'),
)

# Move client_id from the index back into a regular column
input_df = per_client_means.reset_index()

input_df.head()

Unnamed: 0,client_id,mean_consumption,mean_temperature,mean_humidity
0,C0000,16.252611,24.993203,60.174688
1,C0001,18.926167,24.993203,60.174688
2,C0002,18.624611,25.067315,59.734148
3,C0003,15.284,24.993203,60.174688
4,C0004,14.4465,25.17983,60.064383


In [4]:
# Standardize the numeric features; client_id is only an identifier,
# so it is kept out of the scaler's input
features = input_df.drop('client_id', axis=1)
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Rebuild the frame: client_id first, followed by the scaled columns.
# fit_transform returns a plain array, so the column names are re-attached here.
client_ids = input_df[['client_id']].reset_index(drop=True)
scaled_frame = pd.DataFrame(features_scaled, columns=features.columns)
input_df = pd.concat([client_ids, scaled_frame], axis=1)

# Salvar dados de Feature

In [5]:
# Make sure the target schema exists so this cell also runs against a fresh
# database; without it the CREATE TABLE below fails when `feature` is missing.
con.execute("CREATE SCHEMA IF NOT EXISTS feature")

# (Re)create the per-client feature table. CREATE OR REPLACE drops any existing
# table of the same name, so the table is empty after this statement.
con.execute("""
CREATE OR REPLACE TABLE feature.clusterizacao_cliente (
    client_id VARCHAR,
    mean_consumption DOUBLE,
    mean_temperature DOUBLE,
    mean_humidity DOUBLE
)
""")

<duckdb.duckdb.DuckDBPyConnection at 0x1aac70cc730>

In [6]:
# Remove any rows already present in the feature table before it is reloaded.
# NOTE(review): the preceding cell recreates this table with CREATE OR REPLACE,
# so on a top-to-bottom run this DELETE looks redundant — confirm before removing.
clear_features_sql = "DELETE FROM feature.clusterizacao_cliente"
con.execute(clear_features_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x1aac70cc730>

In [7]:
# Register the pandas DataFrame with the connection under the name "input_df"
# so that SQL statements can refer to it by that name
features_view_name = "input_df"
con.register(features_view_name, input_df)

<duckdb.duckdb.DuckDBPyConnection at 0x1aac70cc730>

In [8]:
# Load the scaled per-client features into the feature table.
# Columns are listed explicitly on both sides so the insert does not depend on
# the DataFrame's column order matching the table definition (SELECT * maps
# columns by position and would silently misplace values if the order changed).
con.execute("""
INSERT INTO feature.clusterizacao_cliente
    (client_id, mean_consumption, mean_temperature, mean_humidity)
SELECT client_id, mean_consumption, mean_temperature, mean_humidity
FROM input_df
""")

<duckdb.duckdb.DuckDBPyConnection at 0x1aac70cc730>

In [9]:
# Close the DuckDB connection opened at the start of the notebook
con.close()