# 1. Calculate RFM Metrics

In [1]:
import pandas as pd
import numpy as np

# Read the raw transaction log. TransactionStartTime is parsed to datetime on load
# so that the recency arithmetic below operates on Timestamps.
df = pd.read_csv('../data/raw/data.csv', parse_dates=['TransactionStartTime'])

# Reference point for recency: one day past the most recent transaction in the data,
# so the most recently active customer ends up with Recency == 1 rather than 0.
latest_transaction = df['TransactionStartTime'].max()
snapshot_date = latest_transaction + pd.Timedelta(days=1)


def _days_since_last(timestamps):
    """Return whole days between snapshot_date and the latest timestamp in the group."""
    return (snapshot_date - timestamps.max()).days


# One row per customer:
#   Recency   - days from the customer's last transaction to snapshot_date
#   Frequency - number of (non-null) TransactionId values
#   Monetary  - net sum of Amount
# Named aggregation labels each output column directly, so no positional rename is needed.
rfm = (
    df.groupby('CustomerId')
    .agg(
        Recency=('TransactionStartTime', _days_since_last),
        Frequency=('TransactionId', 'count'),
        Monetary=('Amount', 'sum'),
    )
    .reset_index()
)

# Use the magnitude of the net amount so Monetary is never negative.
# NOTE(review): this is abs() of the *net* sum, so a customer whose debits and credits
# cancel out gets a small Monetary value - presumably negative amounts are
# credits/refunds; confirm this is the intended definition.
rfm['Monetary'] = rfm['Monetary'].abs()


# 2. Scale RFM Features

In [2]:
from sklearn.preprocessing import StandardScaler

# Standardise the three RFM columns so that no single metric dominates the
# distance computation in the clustering step purely because of its scale.
rfm_features = rfm[['Recency', 'Frequency', 'Monetary']]
scaler = StandardScaler()
# Fit the per-column statistics, then apply them to the same data.
rfm_scaled = scaler.fit(rfm_features).transform(rfm_features)


# 3. Cluster with K-Means

In [3]:
from sklearn.cluster import KMeans

# Partition customers into 3 segments with K-Means on the scaled RFM matrix.
# random_state fixes the centroid initialisation so reruns give the same labels.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto'
# in version 1.4 (and emits a FutureWarning on 1.2-1.3 when it is left unset), so
# without it the resulting segments would depend on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Store each customer's cluster label next to their raw RFM values.
rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)


# 4. Identify the High-Risk Cluster

In [4]:
# Per-cluster averages of the raw (unscaled) RFM metrics, printed for inspection.
cluster_profile = rfm.groupby('Cluster')[['Recency', 'Frequency', 'Monetary']].mean().reset_index()
print(cluster_profile)

# The high-risk cluster is the least engaged one: highest mean Recency first,
# with ties broken by lowest mean Frequency and then lowest mean Monetary.
ranked_clusters = cluster_profile.sort_values(
    by=['Recency', 'Frequency', 'Monetary'],
    ascending=[False, True, True],
)

# Select the 'Cluster' column *before* taking the first row. Pulling the whole row
# out first mixes the integer label with the float means and upcasts the label to
# a float (e.g. 0.0); this keeps it a plain integer cluster id.
high_risk_cluster = int(ranked_clusters['Cluster'].iloc[0])


   Cluster    Recency    Frequency      Monetary
0        0  61.859846     7.726699  8.289314e+04
1        1  29.000000  4091.000000  1.049000e+08
2        2  12.716076    34.807692  2.813162e+05


# 5. Create is_high_risk Column

In [5]:
# Binary proxy target: 1 for customers in the high-risk cluster, 0 for everyone else.
in_high_risk_cluster = rfm['Cluster'] == high_risk_cluster
rfm['is_high_risk'] = in_high_risk_cluster.astype(int)


# 6. Merge Back to Main Dataset

In [6]:
# Customer-level lookup holding just the join key and the proxy label.
proxy_target = rfm[['CustomerId', 'is_high_risk']]

# Attach the customer-level label to every transaction row. A left join keeps all
# rows of df, including any whose CustomerId has no entry in proxy_target.
df_final = df.merge(right=proxy_target, how='left', on='CustomerId')

# Rows without a match come back as NaN from the left join; label them 0
# (not high-risk) and restore the integer dtype that the NaNs would otherwise prevent.
risk_flag = df_final['is_high_risk'].fillna(0)
df_final['is_high_risk'] = risk_flag.astype(int)
