In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [3]:
# Step 1 - read the feature-engineered transactions written by the previous
# notebook, parsing the transaction timestamp column as datetimes.
df = pd.read_csv(
    '../data/processed/featured_data.csv',
    parse_dates=['TransactionStartTime'],
)

# Step 2 - reference point for recency: one day after the newest transaction,
# so the most recently active customer gets a Recency of at least 1 day
# instead of 0.
snapshot_date = df['TransactionStartTime'].max() + pd.Timedelta('1D')

# Step 3 - one row per customer with the three RFM metrics.
# Named aggregation labels each output column directly, so no positional
# column rename is needed afterwards.
rfm = (
    df.groupby('CustomerId')
    .agg(
        # Recency: whole days between the snapshot and the customer's last transaction
        Recency=('TransactionStartTime', lambda ts: (snapshot_date - ts.max()).days),
        # Frequency: number of (non-null) transaction ids for the customer
        Frequency=('TransactionId', 'count'),
        # Monetary: signed sum of Amount (can be negative, as the sample output shows)
        Monetary=('Amount', 'sum'),
    )
    .reset_index()
)

print("RFM table sample:")
print(rfm.head())

RFM table sample:
        CustomerId  Recency  Frequency  Monetary
0     CustomerId_1       84          1  -10000.0
1    CustomerId_10       84          1  -10000.0
2  CustomerId_1001       90          5   20000.0
3  CustomerId_1002       26         11    4225.0
4  CustomerId_1003       12          6   20000.0


In [4]:
# 1. Pre-process: standardise the RFM features (zero mean, unit variance) so
#    that no single metric dominates the distances K-Means works with.
rfm_features = ['Recency', 'Frequency', 'Monetary']
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[rfm_features])

# 2. Run K-Means clustering (3 groups).
#    random_state pins the centroid initialisation so the labels are
#    reproducible across runs; n_init=10 keeps the best of 10 initialisations.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)

# Fit once and take the fitted labels directly. The previous version assigned
# `fit_transform(...).argmin(axis=1)` and then immediately overwrote it with
# `kmeans.labels_`, so the first assignment was dead code.
rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)

# 3. Summarise each cluster by its mean RFM values (in original, unscaled
#    units) so the "High-Risk" cluster can be identified downstream.
# NOTE(review): in the saved output one cluster has a mean Frequency of ~4091
# and a mean Monetary of ~-1.05e8, which looks like it is driven by a very
# small number of extreme customers - consider handling skew/outliers before
# scaling. TODO confirm against cluster sizes.
cluster_stats = rfm.groupby('Cluster')[rfm_features].mean().reset_index()

print("\nCluster Statistics (Mean values):")
print(cluster_stats)


Cluster Statistics (Mean values):
   Cluster    Recency    Frequency      Monetary
0        0  61.877279     7.720196  8.172068e+04
1        1  12.726566    34.800000  2.725741e+05
2        2  29.000000  4091.000000 -1.049000e+08


In [5]:
# --- LOGIC TO IDENTIFY HIGH RISK ---
# The high-risk cluster is the one with the largest mean Recency in
# `cluster_stats` (first such row if there is a tie).
stalest_row = cluster_stats['Recency'].idxmax()
high_risk_cluster = cluster_stats.at[stalest_row, 'Cluster']

# Binary target: 1 for customers in the high-risk cluster, 0 for everyone else.
rfm['is_high_risk'] = rfm['Cluster'].eq(high_risk_cluster).astype(int)

print(f"\nCluster {high_risk_cluster} identified as High-Risk.")
print(f"Count of High-Risk labels: {rfm['is_high_risk'].sum()} out of {len(rfm)}")


Cluster 0 identified as High-Risk.
Count of High-Risk labels: 1426 out of 3742


In [6]:
# Step 1 - attach each customer's label to every one of their transactions.
# A left join keeps all rows of the transaction-level frame `df`.
risk_labels = rfm[['CustomerId', 'is_high_risk']]
df_final = pd.merge(df, risk_labels, how='left', on='CustomerId')

# Step 2 - persist the labelled transactions (without the index column).
df_final.to_csv('../data/processed/labeled_data.csv', index=False)

print("\nTarget variable integrated successfully. File saved as 'labeled_data.csv'.")


Target variable integrated successfully. File saved as 'labeled_data.csv'.
