# Feature Engineering: Customer-Level Feature Creation

This notebook performs feature engineering on the raw transaction data.
It loads the raw data, applies feature engineering transformations,
and visualizes key aspects of the engineered features.

In [None]:
# Feature Engineering: Customer-Level Feature Creation
#
# This notebook performs feature engineering on the raw transaction data.
# It loads the raw data, applies feature engineering transformations,
# and visualizes key aspects of the engineered features.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path

# Ensure src is importable (relative to notebook location)
sys.path.append(str((Path('..') / 'src').resolve()))
from data_processing import DataLoader
from feature_engineering import feature_engineering_pipeline, WoEIVCalculator

# 1. Load raw transaction data
# The CSV is addressed relative to the notebook's working directory.
data_path = Path('..', 'data', 'raw', 'data.csv')
if data_path.exists():
    loader = DataLoader(str(data_path))
    raw_df = loader.load_data()
else:
    # Fail early, reporting the absolute location that was checked.
    raise FileNotFoundError(f"Raw data file not found: {data_path.resolve()}")

print(f"Raw data path: {data_path.resolve()}")
print(f"Raw data shape: {raw_df.shape}")
display(raw_df.head())

# Check for missing values: per-column NaN counts, keeping only the columns
# that actually contain missing entries, largest count first.
null_counts = raw_df.isna().sum()
missing = null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)
if len(missing) > 0:
    print("Columns with missing values (count):")
    display(missing)

print('Columns and dtypes:')
display(raw_df.dtypes)


In [None]:

## Aggregate, Temporal, Encoded Feature Creation & WoE/IV integration

# 2. Run feature engineering pipeline
# Only keep categorical columns that exist in raw_df
auto_cats = [col for col in ['ProductCategory', 'ChannelId', 'ProviderId'] if col in raw_df.columns]
print("Categorical columns used:", auto_cats)

# The pipeline returns the engineered feature table and a table describing
# each feature (both are written out as CSV below).
feat_df, feat_desc = feature_engineering_pipeline(
    raw_df, categorical_cols=auto_cats
)

print("Engineered features shape:", feat_df.shape)
display(feat_df.head(10))

# Save for downstream steps
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists (e.g. on a fresh checkout) before writing.
Path("../data/processed").mkdir(parents=True, exist_ok=True)
feat_df.to_csv("../data/processed/features.csv", index=False)
feat_desc.to_csv("../data/processed/features_description.csv", index=False)
print("Features saved to ../data/processed/features.csv")
print("Feature descriptions saved to ../data/processed/features_description.csv")


In [None]:

## Feature Description Table

print("Feature Description Table:")
display(feat_desc)

## WoE/IV Feature Analysis
# WoE features are identified purely by the "_woe" column-name suffix.
woe_cols = [col for col in feat_df.columns if col.endswith("_woe")]
if woe_cols:
    print("WoE Features created:", woe_cols)
    # Show the customer key next to the WoE values, but only when it is
    # actually present as a column (avoids a KeyError otherwise).
    id_cols = [c for c in ['CustomerId'] if c in feat_df.columns]
    display(feat_df[woe_cols + id_cols].head())
    # Show Information Value (IV) if present in feature descriptions.
    # na=False keeps the mask strictly boolean when 'Feature' has missing
    # entries; a NaN in the mask would make the row selection raise.
    iv_rows = feat_desc[feat_desc['Feature'].str.endswith("_woe", na=False)]
    if not iv_rows.empty:
        display(iv_rows[['Feature', 'Description']])
    # Plot distribution of WoE features, one stacked panel per feature.
    # squeeze=False always yields a 2-D axes array, so a single feature needs
    # no special-casing.
    fig, axes = plt.subplots(
        len(woe_cols), 1, figsize=(8, 3*len(woe_cols)), squeeze=False
    )
    for ax, col in zip(axes.ravel(), woe_cols):
        sns.histplot(feat_df[col], bins=20, ax=ax)
        ax.set_title(f'WoE Feature: {col}')
    plt.tight_layout()
    plt.show()
else:
    print("No WoE features found to plot.")


In [None]:

## Numeric Feature Distributions: Before and After Transformations


def _plot_histogram_row(df, cols, titles, color, suptitle):
    """Draw one histogram (with KDE) per column, side by side in a single row.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    cols : non-empty list of column names in ``df``; one panel per column.
    titles : panel titles, aligned with ``cols``.
    color : matplotlib color used for every histogram.
    suptitle : title placed above the whole figure.

    Returns
    -------
    (fig, axes) : the created figure and the 1-D array of its axes.
    """
    # The figure grows with the number of panels but never drops below 16 in.
    fig_width = max(16, 12 * len(cols))
    fig_height = 8
    # squeeze=False always returns a 2-D array, so one column is not a special case.
    fig, axes = plt.subplots(
        1, len(cols), figsize=(fig_width, fig_height), squeeze=False
    )
    axes = axes[0]
    for ax, col, title in zip(axes, cols, titles):
        # Missing values are dropped before plotting.
        sns.histplot(df[col].dropna(), bins=40, kde=True, ax=ax, color=color)
        ax.set_title(title, fontsize=20)
        ax.set_xlabel('Value', fontsize=16)
        ax.set_ylabel('Count', fontsize=16)
        ax.tick_params(axis='both', which='major', labelsize=14)
    fig.suptitle(suptitle, fontsize=24, y=1.03)
    # Leave the top 5% of the figure free of subplots for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()
    return fig, axes


# BEFORE transformation: All raw numeric features
# Columns whose (lower-cased) name starts with 'id', 'date' or 'time' are skipped.
num_cols_before = [
    col for col in raw_df.select_dtypes(include=[float, int]).columns
    if not col.lower().startswith(('id', 'date', 'time'))
]

if not num_cols_before:
    print('No numeric feature columns found in raw data.')
else:
    fig_before, axes_before = _plot_histogram_row(
        raw_df,
        num_cols_before,
        [f"{col} (Raw)" for col in num_cols_before],
        'tab:blue',
        "Numeric Feature Distributions (Raw Data)",
    )

# AFTER transformation: All engineered numeric features (log/std)
# A name containing '_log_std' necessarily contains '_log', so testing for
# '_log' and '_std' selects exactly the same columns as testing all three.
num_cols_after = [
    col for col in feat_df.select_dtypes(include=[float, int]).columns
    if not col.lower().startswith('customerid') and
       ('_log' in col or '_std' in col)
]
if not num_cols_after:
    print('No transformed numeric columns found in engineered features.')
else:
    fig_after, axes_after = _plot_histogram_row(
        feat_df,
        num_cols_after,
        # Only the combined '_log_std' marker is stripped from panel titles.
        [f"{col.replace('_log_std','')} (Transformed)" for col in num_cols_after],
        'tab:green',
        "Numeric Feature Distributions (Transformed)",
    )


In [None]:

## Calendar (Day/Month/Year) temporal feature check
# Temporal features are identified by 'Day_', 'Month_' or 'Year_' appearing in
# the column name.
temporal_cols = [c for c in feat_df.columns if any(k in c for k in ['Day_', 'Month_', 'Year_'])]
print("Extracted temporal features:", temporal_cols)
if temporal_cols:
    # Preview the features next to the customer key, but only when that key
    # is actually present as a column (avoids a KeyError otherwise).
    id_cols = [c for c in ['CustomerId'] if c in feat_df.columns]
    display(feat_df[temporal_cols + id_cols].head())
    # squeeze=False always yields a 2-D axes array, so a single feature needs
    # no special-casing.
    fig, axes = plt.subplots(
        1, len(temporal_cols), figsize=(5*len(temporal_cols), 4), squeeze=False
    )
    for ax, col in zip(axes.ravel(), temporal_cols):
        sns.histplot(feat_df[col], bins=20, ax=ax)
        ax.set_title(f"Temporal: {col}")
    plt.tight_layout()
    plt.show()
else:
    # Nothing matched: say so instead of displaying an id-only preview.
    print("No temporal features found to plot.")


In [None]:

## Boxplots to Visualize Outliers

# One vertical boxplot per raw numeric column selected in the distribution
# section (num_cols_before), laid out in a single row.
if num_cols_before:
    n_panels = len(num_cols_before)
    fig, axes = plt.subplots(1, n_panels, figsize=(5*n_panels, 4))
    # With one panel matplotlib returns a bare Axes; wrap it so it can be iterated.
    axes = [axes] if n_panels == 1 else axes
    for ax, col in zip(axes, num_cols_before):
        sns.boxplot(y=raw_df[col], ax=ax, color='tab:orange')
        ax.set_title(f"{col} Boxplot (Raw)")
    plt.tight_layout()
    plt.show()


In [None]:

## Correlation Heatmap of Engineered Numeric Features

# All numeric engineered columns except the customer key.
numeric_frame = feat_df.select_dtypes(include='number')
numeric_cols = numeric_frame.columns.difference(['CustomerId'])
if len(numeric_cols) <= 1:
    # The heatmap is only drawn when at least two columns are available.
    print("Not enough numeric columns for correlation heatmap.")
else:
    plt.figure(figsize=(10,8))
    corr = numeric_frame[numeric_cols].corr()
    # Diverging palette centred on zero correlation; cell annotations are off.
    sns.heatmap(corr, cmap='coolwarm', center=0, annot=False)
    plt.title("Correlation Heatmap of Engineered Features")
    plt.tight_layout()
    plt.show()

## Barplots: Encoded Categorical Features

# Collect every engineered column whose name mentions one of the source
# categorical columns; the customer key is never included.
cat_encode_cols = []
for c in feat_df.columns:
    if c == 'CustomerId':
        continue
    if any(k in c for k in ['ProductCategory', 'ChannelId', 'ProviderId']):
        cat_encode_cols.append(c)

if not cat_encode_cols:
    print("No encoded categorical features found to plot.")
else:
    n_panels = len(cat_encode_cols)
    fig, axes = plt.subplots(n_panels, 1, figsize=(8, 2*n_panels))
    # With one panel matplotlib returns a bare Axes; wrap it so it can be indexed.
    axes = [axes] if n_panels == 1 else axes
    for idx, col in enumerate(cat_encode_cols):
        # Each feature is drawn as a 20-bin histogram on its own stacked panel.
        sns.histplot(feat_df[col], bins=20, ax=axes[idx])
        axes[idx].set_title(f'Encoded Feature: {col}')
    plt.tight_layout()
    plt.show()


In [None]:

# Summary

# Closing recap; the first entry starts with a newline so the recap is
# preceded by a blank line in the cell output.
summary_lines = (
    "\nFeatures engineered and saved: ../data/processed/features.csv",
    "Feature documentation saved: ../data/processed/features_description.csv",
    "Relevant plots and tables provided for review/audit.",
)
print(*summary_lines, sep="\n")