In [None]:
# Feature Engineering Notebook for Credit Scoring

import sys
from pathlib import Path

sys.path.append("../src")
import pandas as pd
from data_processing import DataLoader
from feature_engineering import build_pipeline, FeatureEngineer

# 1. Load Data
# Location of the raw transaction file, relative to this notebook (adjust if needed).
data_path = "../data/raw/data.csv"

# Build the loader first, then read the raw dataframe from it.
loader = DataLoader(data_path)
df = loader.load_data()

# Quick sanity check: dimensions plus a preview of the first rows.
print(f"Raw data shape: {df.shape}")
display(df.head())


In [None]:

# 2. Setup and Run Feature Engineering Pipeline
categorical_cols = ['ProductCategory', 'ChannelId', 'CountryCode']

# All pipeline options gathered in one mapping so they are easy to review and tweak.
pipeline_options = {
    'customer_id_col': 'CustomerId',
    'time_col': 'TransactionStartTime',
    'amount_col': 'Amount',
    'value_col': 'Value',
    'encode_type': 'onehot',       # alternative: "label"
    'scaler_type': 'standard',     # alternative: "minmax"
    'impute_strategy': 'mean',     # alternatives: "median", "zero"
    'categorical_cols': categorical_cols,
}
pipe = build_pipeline(**pipeline_options)

# Fit the pipeline on the raw dataframe and produce the engineered feature table.
features_df = pipe.fit_transform(df)
print(f"Engineered features shape: {features_df.shape}")
display(features_df.head())


### WoE auto-mapping
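The following code cell automatically creates a binary target column `is_high_risk` from `FraudResult` when the target column is not already present in the raw data and `FraudResult` is.
Mapping: each value is converted to a lower-case string; values containing any of the keywords 'fraud', 'fraudulent', 'chargeback', 'blocked', 'suspicious', 'true', 'yes', or '1' are labeled 1 (high risk), everything else (including missing values) is labeled 0.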
If you prefer a different mapping, change the `_map_fraud` function in the following code cell.


In [None]:

# 3. (Optional) WoE
# Compute WoE on the raw rows, aggregate per customer, and merge into features_df.
# WoE runs on the raw dataframe `df` because it holds the per-transaction categorical
# columns and the binary target column (default 'is_high_risk').
# Set `target_col` to the name of your target column if different.
target_col = 'is_high_risk'


def _map_fraud(v):
    """Map one raw `FraudResult` value to a binary high-risk flag.

    Returns 1 when the lower-cased string form of `v` contains any of the
    keywords below, otherwise 0. Missing values map to 0.

    NOTE(review): matching is by substring, so values such as '10' or '0.1'
    also match the '1' keyword -- confirm the column only holds values like 0/1
    or adjust the keyword list.
    """
    s = str(v).lower() if pd.notna(v) else ''
    keywords = ['fraud', 'fraudulent', 'chargeback', 'blocked', 'suspicious', 'true', 'yes', '1']
    return int(any(k in s for k in keywords))


# --- Step 1: make sure a target column exists -------------------------------------
if target_col not in df.columns:
    # If a label column isn't present, try to auto-create one from common label columns
    if 'FraudResult' in df.columns:
        print("Detected 'FraudResult' column - creating 'is_high_risk' using a suggested mapping.")
        df['is_high_risk'] = df['FraudResult'].apply(_map_fraud)
        target_col = 'is_high_risk'
        print("Created 'is_high_risk' from 'FraudResult' (1 = high risk).")
    else:
        print(f"Target column '{target_col}' not found in raw dataframe 'df'.")
        print('Available columns:')
        print(df.columns.tolist())
        print('  - set `target_col` to an existing column name, or')
        print('  - create a binary target column (example below) and re-run this cell')
        # Example: create a demo target column based on Amount (uncomment to use)
        # df['is_high_risk'] = (df['Amount'] > df['Amount'].median()).astype(int)

# --- Step 2: compute WoE and merge it into the customer-level features -------------
# Checked separately (not as an `else:` of step 1) so that WoE also runs in the same
# execution that created the target column; the cell works in a single pass.
if target_col in df.columns:
    # Reuse an existing `fe` if one is defined; otherwise take it from the pipeline,
    # and as a last resort build a fresh FeatureEngineer.
    if 'fe' not in globals():
        fe = None
        if 'pipe' in globals():
            try:
                fe = pipe.named_steps.get('feature_engineer', None)
            except Exception:
                # Deliberate best-effort lookup: any failure falls through to the
                # explicit FeatureEngineer construction below.
                fe = None

        if fe is None:
            fe = FeatureEngineer(
                customer_id_col='CustomerId',
                time_col='TransactionStartTime',
                amount_col='Amount',
                value_col='Value',
                encode_type='onehot',
                scaler_type='standard',
                impute_strategy='mean',
                categorical_cols=categorical_cols,
                woe_cols=['ProductCategory', 'ChannelId']
            )

    # Run WoE on a copy of the raw transaction-level dataframe
    raw_with_woe = fe.add_weight_of_evidence(df.copy(), target_col=target_col)

    # Columns that exist only after the WoE transformation
    orig_cols = set(df.columns)
    transformed_cols = [c for c in raw_with_woe.columns if c not in orig_cols]

    if not transformed_cols:
        # Fallback: WoE may replace the original categorical columns in place, so look
        # for the configured woe_cols in the result (`or []` guards a None attribute).
        configured_woe_cols = getattr(fe, "woe_cols", None) or []
        transformed_cols = [c for c in configured_woe_cols if c in raw_with_woe.columns]

    if not transformed_cols:
        print("No WoE-derived columns detected in the transformed dataframe. Nothing to merge.")
    else:
        # Aggregate the row-level WoE values to one row per customer using the mean.
        # The mean does not depend on row order, so no sorting is needed (sorting by the
        # time column here would fail because that column is not part of the selection).
        # Swap .mean() for another aggregator if preferred.
        cust_woe = (
            raw_with_woe[[fe.customer_id_col] + transformed_cols]
            .groupby(fe.customer_id_col)
            .mean()
            .reset_index()
        )

        # Ensure customer id column exists in features_df before merging
        if fe.customer_id_col not in features_df.columns:
            raise KeyError(
                f"Customer id column '{fe.customer_id_col}' not found in engineered features (features_df)."
            )

        # Keep the cell idempotent: if it was already run, features_df still carries the
        # WoE columns from the previous merge. Drop them first so the merge replaces them
        # instead of producing suffixed duplicates (`<col>_x` / `<col>_y`).
        stale_cols = [c for c in transformed_cols if c in features_df.columns]
        if stale_cols:
            print('Replacing existing columns before merge:', stale_cols)

        # Merge customer-level WoE features into the engineered features dataframe
        features_df = (
            features_df
            .drop(columns=stale_cols)
            .merge(cust_woe, on=fe.customer_id_col, how='left')
        )

        # Output results
        print(f'Engineered features shape after merge: {features_df.shape}')
        print('Added WoE columns:', transformed_cols)
        display(features_df.head())


In [None]:

# 4. Save the processed dataframe for later modeling
save_path = "../data/processed/engineered_features.csv"

# DataFrame.to_csv does not create missing folders, so make sure the output directory
# exists first (no-op when it is already there).
Path(save_path).parent.mkdir(parents=True, exist_ok=True)

features_df.to_csv(save_path, index=False)
print(f"Processed features saved to {save_path}")