## Feature engineering

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans 
from sklearn.pipeline import Pipeline
from src.data_processing import (
    CustomerAggregator, FeatureEngineer, SimpleImputerTransformer,
    QuantileBinner, WoEEncoder, create_proxy_target
)

# -------------------------
# Load raw data
# -------------------------
# The path is relative, so it resolves against the current working
# directory at the time this cell is executed.
raw_data_path = "../data/raw/data.csv"
df = pd.read_csv(raw_data_path)

# -------------------------
# Task 4: Create proxy target
# -------------------------
# create_proxy_target returns a pair: the table joined onto df below, and a
# cluster summary that is not used again in this cell.
proxy_target, cluster_summary = create_proxy_target(df)

# Left join on CustomerId: every row of df is kept, and rows whose
# CustomerId has no match in proxy_target get NaN in the joined columns.
# NOTE(review): 'is_high_risk' is presumably supplied by proxy_target;
# confirm it can never be NaN here before it is used as the target.
df = pd.merge(df, proxy_target, how="left", on="CustomerId")
y = df["is_high_risk"]

# -------------------------
# Task 3: Feature Engineering Pipeline
# -------------------------
# Steps run in the order listed; each one receives the output of the
# previous step.
feature_steps = [
    ("aggregate", CustomerAggregator()),
    ("engineer", FeatureEngineer()),
    ("impute", SimpleImputerTransformer(strategy="median")),
    ("bin", QuantileBinner(n_bins=5)),
    ("woe", WoEEncoder()),
]
feature_pipeline = Pipeline(steps=feature_steps)

# Fit every step and transform df in a single pass, with y passed along as
# the target.
# NOTE(review): y has one entry per row of df. If the 'aggregate' step
# changes the number of rows, later steps that use y may see X and y out of
# alignment -- confirm against src.data_processing.
X = feature_pipeline.fit_transform(df, y=y)

# Preview the first rows of the transformed output (assumes X is a
# DataFrame-like object exposing .head()).
X.head()
