In [2]:
import sys
import os

# Make the project root (the parent of the notebook's working directory)
# importable so that the `src` package can be resolved.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.proxy_target_engineering import ProxyTargetEngineer
from src.data_processing import ModelReadyDataProcessor


In [2]:
import pandas as pd

# Load the raw transaction data.
df = pd.read_csv("../data/raw/data.csv")

# Configure the RFM-based proxy target engineer: which columns identify the
# customer, the transaction timestamp and the amount, plus the clustering
# settings (3 clusters, fixed seed for reproducibility).
proxy_engineer = ProxyTargetEngineer(
    customer_col="CustomerId",
    date_col="TransactionStartTime",
    amount_col="Amount",
    n_clusters=3,
    random_state=42,
)
proxy_engineer.fit(df)

# Sanity check: per-cluster averages of the three RFM metrics.
rfm_by_cluster = proxy_engineer.rfm_df_.groupby("cluster")
rfm_by_cluster[["Recency", "Frequency", "Monetary"]].mean()

Unnamed: 0_level_0,Recency,Frequency,Monetary
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,61.877279,7.720196,81720.68
1,12.726566,34.8,272574.1
2,29.0,4091.0,-104900000.0


In [4]:
# Identify high-risk cluster explicitly.
# NOTE: cluster ids are arbitrary labels. 0 was picked from the cluster summary
# (highest mean Recency, low mean Frequency); re-check that table whenever the
# data, n_clusters or random_state change.
HIGH_RISK_CLUSTER = 0

# Fail fast if the chosen id is not one of the fitted cluster labels;
# otherwise every customer would silently be labelled 0.
assert (proxy_engineer.rfm_df_["cluster"] == HIGH_RISK_CLUSTER).any(), (
    f"Cluster {HIGH_RISK_CLUSTER} not present in rfm_df_['cluster']"
)

# Create proxy target (1 = customer belongs to the high-risk cluster)
proxy_engineer.rfm_df_["is_high_risk"] = (
    proxy_engineer.rfm_df_["cluster"] == HIGH_RISK_CLUSTER
).astype(int)

# customer-level proxy target
customer_target = proxy_engineer.rfm_df_[["CustomerId", "is_high_risk"]]

# Merge at transaction-level.
# - Dropping any existing label first keeps this cell idempotent: re-running it
#   would otherwise produce is_high_risk_x / is_high_risk_y columns.
# - validate="many_to_one" raises if a customer appears more than once in the
#   customer-level table, which would duplicate transactions.
df = df.drop(columns=["is_high_risk"], errors="ignore").merge(
    customer_target,
    on="CustomerId",
    how="left",
    validate="many_to_one",
)

# A left join leaves NaN for customers without a cluster; surface that here
# rather than at the later integer cast.
assert df["is_high_risk"].notna().all(), "Some transactions have no proxy label"

# Sanity check
df[["CustomerId", "is_high_risk"]].head()


Unnamed: 0,CustomerId,is_high_risk
0,CustomerId_4406,0
1,CustomerId_4406,0
2,CustomerId_4683,1
3,CustomerId_988,0
4,CustomerId_988,0


In [5]:
# Separate the proxy target from the remaining columns.
y = df["is_high_risk"].astype(int)        # proxy target as integers
X_raw = df.drop(columns="is_high_risk")   # every other column, used as raw features


In [6]:
# Build the processor with its default settings.
processor = ModelReadyDataProcessor()

# Fit on the raw features together with the proxy target, then transform
# the same feature frame.
# NOTE(review): x_processed holds whatever fit() returns and is not used in
# the visible cells — confirm whether anything else relies on it.
x_processed = processor.fit(X_raw, y)
X_model_ready = processor.transform(X_raw)

# Persist the transformed frame (the processor chooses the output location).
processor.save_processed(X_model_ready)

# Preview the model-ready features.
X_model_ready.head()

✅ Saved to D:\AI mastery\credit-risk-model\data\processed\processed.csv


Unnamed: 0,Value,total_transaction_amount,average_transaction_amount,transaction_count,std_transaction_amount,transaction_hour,transaction_day,transaction_month,transaction_year,CurrencyCode_UGX,...,ProviderId_ProviderId_3,ProviderId_ProviderId_4,ProviderId_ProviderId_5,ProviderId_ProviderId_6,PricingStrategy_0,PricingStrategy_1,PricingStrategy_2,PricingStrategy_4,ProductCategory_woe,ChannelId_woe
0,-0.072291,0.170118,-0.067623,-0.311831,-0.167016,2,15,11,2018,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.15115,-0.021516
1,-0.080251,0.170118,-0.067623,-0.311831,-0.167016,2,15,11,2018,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.113799,0.054063
2,-0.076352,0.165122,-0.072568,-0.444993,-0.201209,2,15,11,2018,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.15115,-0.021516
3,0.096648,0.175567,-0.008155,-0.40402,-0.008243,3,15,11,2018,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.253605,-0.021516
4,-0.075183,0.175567,-0.008155,-0.40402,-0.008243,3,15,11,2018,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.113799,0.054063
