# Import modules

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


# Import the data-processing module

In [2]:
import importlib.util
import sys
import os
# Project root defaults to the original workstation location; set the
# BATI_BANK_ROOT environment variable to run the notebook from another checkout.
project_root = os.environ.get(
    "BATI_BANK_ROOT", r'C:\Users\user\Desktop\Project\Bati_Bank'
)
module_path = os.path.join(project_root, "src", "data_processing.py")

# Load src/data_processing.py directly from its file path and expose it under
# the module name "data_loader".
spec = importlib.util.spec_from_file_location("data_loader", module_path)
if spec is None or spec.loader is None:
    # spec_from_file_location returns None when no loader matches the path;
    # fail with a clear message instead of an AttributeError on the next line.
    raise ImportError(f"Cannot build an import spec for {module_path!r}")
data_loader = importlib.util.module_from_spec(spec)
spec.loader.exec_module(data_loader)

# now you can access your function
load_data = data_loader.load_data

# test
print(load_data)

<function load_data at 0x0000017FFCECCF40>


# Load data

In [3]:
# Load data/raw/loan.csv through the project's load_data helper. The project
# root defaults to the original workstation location; set the BATI_BANK_ROOT
# environment variable to point at another checkout.
raw_data_path = os.path.join(
    os.environ.get("BATI_BANK_ROOT", r"C:\Users\user\Desktop\Project\Bati_Bank"),
    "data",
    "raw",
    "loan.csv",
)
df = load_data(raw_data_path)

# Prepare Date Column & Snapshot Date

In [4]:
# Parse the transaction timestamps into pandas datetimes; the column is
# overwritten with the parsed values.
transaction_times = pd.to_datetime(df["TransactionStartTime"])
df["TransactionStartTime"] = transaction_times

# Reference point for recency: one day past the most recent transaction.
one_day = pd.Timedelta(days=1)
snapshot_date = df["TransactionStartTime"].max() + one_day


# Calculate RFM Metrics

In [5]:
def _days_since_last(timestamps):
    """Return whole days between snapshot_date and the group's latest timestamp."""
    return (snapshot_date - timestamps.max()).days


# Output column -> (source column, aggregation), evaluated once per customer.
# Monetary is the signed sum of Amount, so it can come out negative.
rfm_aggregations = {
    "Recency": ("TransactionStartTime", _days_since_last),
    "Frequency": ("TransactionId", "count"),
    "Monetary": ("Amount", "sum"),
}

per_customer = df.groupby("CustomerId")
rfm = per_customer.agg(**rfm_aggregations).reset_index()

rfm.head()


Unnamed: 0,CustomerId,Recency,Frequency,Monetary
0,CustomerId_1,84,1,-10000.0
1,CustomerId_10,84,1,-10000.0
2,CustomerId_1001,90,5,20000.0
3,CustomerId_1002,26,11,4225.0
4,CustomerId_1003,12,6,20000.0


# Scale RFM Features (Required for K-Means)

In [6]:
# Standardise the three RFM columns so that no single one dominates the
# distance computation in the clustering step.
rfm_features = rfm[["Recency", "Frequency", "Monetary"]]

scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm_features)


# K-Means Clustering (3 Clusters)

In [7]:
# Three customer segments; a fixed seed and 10 initialisations keep the
# cluster labels stable between runs.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit on the scaled features and store each customer's cluster label.
cluster_labels = kmeans.fit_predict(rfm_scaled)
rfm["cluster"] = cluster_labels


# Identify High-Risk (Least Engaged) Cluster

In [8]:
# Mean Recency / Frequency / Monetary per cluster, indexed by cluster label.
rfm_by_cluster = rfm.groupby("cluster")
cluster_summary = rfm_by_cluster[["Recency", "Frequency", "Monetary"]].mean()

cluster_summary


Unnamed: 0_level_0,Recency,Frequency,Monetary
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,61.877279,7.720196,81720.68
1,12.726566,34.8,272574.1
2,29.0,4091.0,-104900000.0


# High-Risk Cluster Characteristics

- High Recency (many days since the last transaction)
- Low Frequency (few transactions)
- Low Monetary (low total transaction amount)

In [10]:
# The least engaged cluster is the one that looks worst on all three RFM axes
# taken together: longest recency, lowest frequency, lowest monetary value.
# Selecting on Frequency alone ignores the other two. Clusters are ranked on
# each axis so the very different scales do not matter (higher rank = riskier),
# and the ranks are then summed.
risk_rank = pd.DataFrame({
    "Recency": cluster_summary["Recency"].rank(ascending=True),
    "Frequency": cluster_summary["Frequency"].rank(ascending=False),
    "Monetary": cluster_summary["Monetary"].rank(ascending=False),
})
risk_score = risk_rank.sum(axis=1)

# Ties on the combined score fall back to the lowest mean Frequency.
top_candidates = risk_score[risk_score == risk_score.max()].index
high_risk_cluster = cluster_summary.loc[top_candidates, "Frequency"].idxmin()


# Create Binary Proxy Target

In [11]:
# Binary proxy target: 1 for customers in the high-risk cluster, 0 otherwise.
in_high_risk_cluster = rfm["cluster"] == high_risk_cluster
rfm["is_high_risk"] = in_high_risk_cluster.astype(int)

# Class balance of the proxy target.
rfm["is_high_risk"].value_counts()


is_high_risk
0    2316
1    1426
Name: count, dtype: int64

# Merge Target Back into Main Dataset

In [12]:
# Attach each customer's is_high_risk flag to every one of their transactions.
# Any is_high_risk column left over from a previous run of this cell is dropped
# first; otherwise the merge would emit is_high_risk_x / is_high_risk_y and any
# later selection of "is_high_risk" would raise a KeyError.
df = df.drop(columns=["is_high_risk"], errors="ignore").merge(
    rfm[["CustomerId", "is_high_risk"]],
    on="CustomerId",
    how="left",
    validate="many_to_one",  # rfm must hold exactly one row per CustomerId
)


# Result

In [13]:
# Spot-check: first rows of the customer id next to the merged target flag.
preview_columns = ["CustomerId", "is_high_risk"]
df[preview_columns].head()


Unnamed: 0,CustomerId,is_high_risk
0,CustomerId_4406,0
1,CustomerId_4406,0
2,CustomerId_4683,1
3,CustomerId_988,0
4,CustomerId_988,0
