## Feature engineering

In [7]:
import sys
import os

# Add project root to sys.path
sys.path.append(os.path.abspath(os.path.join("..")))  

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans 
from sklearn.pipeline import Pipeline
from src.data_processing import (
    CustomerAggregator, FeatureEngineer, SimpleImputerTransformer,
    QuantileBinner, WoEEncoder, create_proxy_target, create_feature_pipeline
)

# --- Load raw data ---------------------------------------------------
df = pd.read_csv("../data/raw/data.csv")

# --- Task 4: Create proxy target -------------------------------------
# create_proxy_target returns two objects; the first is left-joined onto df
# by CustomerId, so every row of df is kept. The label column is then read
# from the merged frame (presumably supplied by proxy_target -- confirm in
# src.data_processing).
proxy_target, cluster_summary = create_proxy_target(df)
df = pd.merge(df, proxy_target, how="left", on="CustomerId")
y = df["is_high_risk"]

# -------------------------
# Task 3: Feature Engineering Pipeline
# -------------------------
# The pipeline comes from the project helper. Only this pipeline is ever
# fitted, so no hand-assembled Pipeline is built here: it would be overwritten
# before use and mislead readers about which steps actually run.
feature_pipeline = create_feature_pipeline(df)

# Fit on the labelled frame and transform it in one pass; y is forwarded to
# the pipeline steps during fitting.
X_transformed = feature_pipeline.fit_transform(df, y=y)

# The 'preprocess' step is kept so the output column names can be recovered
# from its fitted transformers.
preprocessor = feature_pipeline.named_steps['preprocess']

# Rebuild the column labels of the transformed matrix from the fitted
# preprocessor, in the order its transformers are listed.
feature_names = []

for name, transformer, cols in preprocessor.transformers_:
    if name == 'num':
        # Numeric columns are labelled with their input column names.
        feature_names.extend(cols)

    elif name == 'cat':
        if len(cols) == 0:
            # No categorical columns were selected, so this transformer adds
            # no output columns (and its encoder may never have been fitted).
            continue
        ohe = transformer.named_steps['encoder']
        feature_names.extend(ohe.get_feature_names_out(cols))

# Fail early with a readable message if a transformer's output is not covered
# by the branches above, instead of relying on pandas' shape error.
n_output_cols = X_transformed.shape[1]
if len(feature_names) != n_output_cols:
    raise ValueError(
        f"Recovered {len(feature_names)} feature names but the transformed "
        f"matrix has {n_output_cols} columns."
    )

X = pd.DataFrame(
    X_transformed,
    columns=feature_names
)

# Check first few rows
X.head()

numeric_cols: ['Total_Transaction_Amount', 'Average_Transaction_Amount', 'Transaction_Count', 'Std_Transaction_Amount', 'Transaction_Recency', 'Avg_Amount_By_Category', 'Count_By_FraudResult', 'Night_Transactions', 'Amount_CV', 'Dormant_Flag', 'Night_Txn_Ratio', 'Log_Total_Amount']
categorical_cols: []


Unnamed: 0,Total_Transaction_Amount,Average_Transaction_Amount,Transaction_Count,Std_Transaction_Amount,Transaction_Recency,Avg_Amount_By_Category,Count_By_FraudResult,Night_Transactions,Amount_CV,Dormant_Flag,Night_Txn_Ratio,Log_Total_Amount
0,10000.0,10000.0,1.0,0.0,83.0,10000.0,0.0,0.0,0.0,0.0,0.0,9.21044
1,10000.0,10000.0,1.0,0.0,83.0,10000.0,0.0,0.0,0.0,0.0,0.0,9.21044
2,30400.0,6080.0,5.0,4100.243895,89.0,6080.0,0.0,0.0,0.674271,0.0,0.0,10.322231
3,4775.0,434.090909,11.0,518.805446,25.0,434.090909,0.0,2.0,1.192407,0.0,0.181818,8.471359
4,32000.0,5333.333333,6.0,3945.461528,11.0,5333.333333,0.0,0.0,0.739635,0.0,0.0,10.373522


In [8]:
# Show the last five rows of the feature matrix (tail() defaults to n=5).
X.tail()

Unnamed: 0,Total_Transaction_Amount,Average_Transaction_Amount,Transaction_Count,Std_Transaction_Amount,Transaction_Recency,Avg_Amount_By_Category,Count_By_FraudResult,Night_Transactions,Amount_CV,Dormant_Flag,Night_Txn_Ratio,Log_Total_Amount
3737,32000.0,5333.333333,6.0,4033.19559,4.0,5333.333333,0.0,0.0,0.756082,0.0,0.0,10.373522
3738,32000.0,6400.0,5.0,3781.53408,25.0,6400.0,0.0,0.0,0.590772,0.0,0.0,10.373522
3739,614077.0,6079.970297,101.0,14537.733039,0.0,6079.970297,0.0,3.0,2.390693,0.0,0.029703,13.327877
3740,151000.0,8882.352941,17.0,2619.216317,67.0,8882.352941,0.0,0.0,0.294845,0.0,0.0,11.925042
3741,163000.0,7409.090909,22.0,3168.431953,0.0,7409.090909,0.0,1.0,0.427583,0.0,0.045455,12.001512


## Features in the credit data


In [9]:
# List every feature name, one per line.
for col in X.columns.tolist():
    print(col)


Total_Transaction_Amount
Average_Transaction_Amount
Transaction_Count
Std_Transaction_Amount
Transaction_Recency
Avg_Amount_By_Category
Count_By_FraudResult
Night_Transactions
Amount_CV
Dormant_Flag
Night_Txn_Ratio
Log_Total_Amount


In [10]:
# Peek at the first five target labels.
y.head(n=5)

0    0
1    0
2    1
3    0
4    0
Name: is_high_risk, dtype: int64

In [11]:
import mlflow

# Set experiment
# Makes "Credit_Risk_Scoring" the active MLflow experiment for runs started below.
mlflow.set_experiment("Credit_Risk_Scoring")

# Open a run, capture its ID, and print it. Nothing else is logged inside this
# block (no parameters, metrics, or artifacts).
# NOTE(review): this looks like a tracking smoke check; each execution appears
# to leave an otherwise empty run in the experiment -- confirm that is intended.
with mlflow.start_run() as run:
    run_id = run.info.run_id
    print("RUN_ID:", run_id)

RUN_ID: c4397c60cb47482797c6a8b2353c4bf3


In [5]:
import os
import sys

sys.path.append(os.path.abspath(os.path.join("..")))

import mlflow
import pandas as pd
from mlflow.tracking import MlflowClient

from src.train import (
    split_data,
    train_logistic_regression,
    train_decision_tree
)

# ============================
# SAFETY CHECK
# ============================
# Show the feature-matrix shape and class balance, then stop immediately if
# features and labels do not have the same number of rows.
print("X shape:", X.shape)
print("y distribution:\n", y.value_counts())

assert len(X) == len(y), "X and y row mismatch!"

# ============================
# Set MLflow Experiment
# ============================
mlflow.set_experiment("Credit_Risk_Scoring")

# ============================
# Train/Test Split
# ============================
# 20% of the rows are held out; the seed is fixed for repeatable splits.
X_train, X_test, y_train, y_test = split_data(X, y, test_size=0.2, random_state=42)

# ============================
# Train Models
# ============================
# Record when this training session starts (epoch milliseconds, the unit used
# by MLflow run start times). Registration below only considers runs created
# from this point on, so a stale run from an earlier session can never be
# registered in place of a model trained here.
import time

session_start_ms = int(time.time() * 1000)

lr_model, lr_metrics = train_logistic_regression(
    X_train, y_train, X_test, y_test
)

dt_model, dt_metrics = train_decision_tree(
    X_train, y_train, X_test, y_test
)

print("Logistic Regression metrics:", lr_metrics)
print("Decision Tree metrics:", dt_metrics)

# ============================
# Select BEST model by F1 score
# ============================
# Ties go to logistic regression.
if lr_metrics["f1"] >= dt_metrics["f1"]:
    best_model_name = "Logistic_Regression"
    best_score = lr_metrics["f1"]
else:
    best_model_name = "Decision_Tree"
    best_score = dt_metrics["f1"]

print(f"Best Model: {best_model_name}")
print(f"Best F1 Score: {best_score:.4f}")

# ============================
# Register BEST model in MLflow
# ============================
client = MlflowClient()
experiment = client.get_experiment_by_name("Credit_Risk_Scoring")

runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.f1 DESC"]
)

# Keep only runs that logged f1 AND were started during this training session.
# NOTE(review): assumes the train_* helpers log an "f1" metric in their own
# MLflow runs and store the model under the "model" artifact path -- confirm
# in src.train.
session_runs = [
    r for r in runs
    if "f1" in r.data.metrics
    and r.info.start_time is not None
    and r.info.start_time >= session_start_ms
]

if not session_runs:
    raise ValueError(
        "No MLflow runs from this training session found with F1 metric."
    )

best_run = max(
    session_runs,
    key=lambda r: r.data.metrics["f1"]
)

print("Registering Run ID:", best_run.info.run_id)
print("F1 Score:", best_run.data.metrics["f1"])

mlflow.register_model(
    model_uri=f"runs:/{best_run.info.run_id}/model",
    name="Credit_Risk_Model"
)


X shape: (3742, 12)
y distribution:
 is_high_risk
0    2316
1    1426
Name: count, dtype: int64


Registered model 'Credit_Risk_Model' already exists. Creating a new version of this model...


Logistic Regression metrics: {'accuracy': 0.9572763684913218, 'precision': 0.9347079037800687, 'recall': 0.9543859649122807, 'f1': 0.9444444444444444, 'roc_auc': 0.9941545674531156}
Decision Tree metrics: {'accuracy': 0.9919893190921228, 'precision': 0.99644128113879, 'recall': 0.9824561403508771, 'f1': 0.9893992932862191, 'roc_auc': 0.990150483968542}
Best Model: Decision_Tree
Best F1 Score: 0.9894
Registering Run ID: dfebdf235bdc4f2db69bb272a2495b06
F1 Score: 0.9893992932862191


Created version '8' of model 'Credit_Risk_Model'.


<ModelVersion: aliases=[], creation_timestamp=1765874607681, current_stage='None', deployment_job_state=None, description=None, last_updated_timestamp=1765874607681, metrics=None, model_id=None, name='Credit_Risk_Model', params=None, run_id='dfebdf235bdc4f2db69bb272a2495b06', run_link=None, source='models:/m-1038f925d33b4318975dda7db62898c3', status='READY', status_message=None, tags={}, user_id=None, version=8>