In [6]:
# Import necessary packages
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [7]:
# Read the customer-level feature table from the raw data folder.
df = pd.read_csv(filepath_or_buffer="../data/raw/featured.csv")

# Show the first five rows as a quick check of the columns and values.
df.head(n=5)


Unnamed: 0,TotalTransactionAmount,AvgTransactionAmount,TransactionCount,StdTransactionAmount,CurrencyCode_UGX,ProductCategory_airtime,ProductCategory_data_bundles,ProductCategory_financial_services,ProductCategory_movies,ProductCategory_other,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,CustomerId,AnyFraud
0,-0.066891,-0.153364,-0.253459,0.0,0.0,2.073016,-0.228416,-1.904511,-0.099954,-0.01635,-0.106712,-0.052922,-0.172187,-0.230127,-0.12953,3.137802,-2.80444,-0.147303,CustomerId_1,0
1,-0.066891,-0.153364,-0.253459,0.0,0.0,2.073016,-0.228416,-1.904511,-0.099954,-0.01635,-0.106712,-0.052922,-0.172187,-0.230127,-0.12953,3.137802,-2.80444,-0.147303,CustomerId_10,0
2,-0.055849,-0.06987,-0.212186,-0.105976,0.0,-0.039359,-0.228416,0.268985,-0.099954,-0.01635,-0.106712,-0.052922,-0.172187,-0.230127,-0.12953,0.480118,-0.342663,-0.147303,CustomerId_1001,0
3,-0.061655,-0.091435,-0.150278,-0.168036,0.0,0.152675,-0.228416,0.071395,-0.099954,-0.01635,-0.106712,-0.052922,-0.172187,-0.230127,-0.12953,1.124405,-0.939458,-0.147303,CustomerId_1002,0
4,-0.055849,-0.073846,-0.201868,-0.111444,0.0,0.312703,-0.228416,-0.093264,-0.099954,-0.01635,-0.106712,-0.052922,-0.172187,-0.230127,-0.12953,0.18482,-0.069133,-0.147303,CustomerId_1003,0


In [None]:
# Build an RFM-style frame: one identifier column plus three numeric features.
# The mapping below translates the source feature names into the RFM
# vocabulary that the later cells rely on.
# NOTE(review): 'Recency' is AvgTransactionAmount used as a stand-in; it is
# not a time-since-last-transaction measure.
rfm_column_map = {
    'TotalTransactionAmount': 'Monetary',
    'TransactionCount': 'Frequency',
    'AvgTransactionAmount': 'Recency',
}

# Take an independent copy of the selected columns, then rename them in a
# single chained expression (no in-place mutation).
rfm_features = (
    df[['CustomerId', 'TotalTransactionAmount', 'TransactionCount', 'AvgTransactionAmount']]
    .copy()
    .rename(columns=rfm_column_map)
)

# Preview the renamed frame.
rfm_features.head()

Unnamed: 0,CustomerId,Monetary,Frequency,Recency
0,CustomerId_1,-0.066891,-0.253459,-0.153364
1,CustomerId_10,-0.066891,-0.253459,-0.153364
2,CustomerId_1001,-0.055849,-0.212186,-0.06987
3,CustomerId_1002,-0.061655,-0.150278,-0.091435
4,CustomerId_1003,-0.055849,-0.201868,-0.073846


In [13]:
# Select only the numeric RFM features for clustering
rfm_numeric = rfm_features[['Recency', 'Frequency', 'Monetary']]

# Standardise the three features, keeping the column names and the row index
# so that labels can be aligned back to rfm_features.
scaler = StandardScaler()
rfm_scaled = pd.DataFrame(
    scaler.fit_transform(rfm_numeric),
    columns=rfm_numeric.columns,
    index=rfm_numeric.index,
)

# Fit KMeans on the feature columns only.
# n_init is pinned explicitly: the library default differs between
# scikit-learn releases, so leaving it implicit makes the labels depend on the
# installed version even with a fixed random_state.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(rfm_scaled)

# Store the labels on both frames (same row order, so positional assignment
# is equivalent to index alignment here).
rfm_scaled['cluster'] = cluster_labels
rfm_features['cluster'] = cluster_labels

# NOTE(review): in the recorded run the cluster sizes were 3740 / 1 / 1, i.e.
# two clusters contain a single customer each. Consider treating extreme
# values before clustering if a more balanced segmentation is needed.
print(rfm_features['cluster'].value_counts())


cluster
0    3740
2       1
1       1
Name: count, dtype: int64


In [14]:
# Recency, Frequency and Monetary are already columns of rfm_features (they
# were created when the frame was built), so no re-assignment is needed here.

# Mean of each RFM feature per cluster, used to profile the clusters.
rfm_columns = ['Recency', 'Frequency', 'Monetary']
cluster_summary = rfm_features.groupby('cluster')[rfm_columns].mean()
print(cluster_summary)


           Recency  Frequency   Monetary
cluster                                 
0        -0.013626  -0.011165   0.004432
1        -0.246649  41.947752 -38.672798
2        51.206316  -0.191550  22.098748


In [15]:
# Treat the cluster with the largest mean 'Recency' as the high-risk cluster.
# NOTE(review): 'Recency' is a proxy built from AvgTransactionAmount, and in
# the recorded run this rule flags exactly one customer as high-risk; confirm
# that this selection rule matches the intended definition of risk.
high_risk_cluster = cluster_summary['Recency'].idxmax()

# Binary target: 1 for members of the high-risk cluster, 0 for everyone else.
in_high_risk_cluster = rfm_features['cluster'].eq(high_risk_cluster)
rfm_features['is_high_risk'] = in_high_risk_cluster.astype(int)

# Report how many customers fall on each side of the label.
risk_counts = rfm_features['is_high_risk'].value_counts()
print(risk_counts)


is_high_risk
0    3741
1       1
Name: count, dtype: int64


In [None]:
# Attach the is_high_risk label to the main dataset, matching on CustomerId.
# validate="many_to_one" makes pandas raise a MergeError if CustomerId is
# duplicated in the label frame; without it such duplicates would silently
# multiply rows in the merged result.
processed_df = df.merge(
    rfm_features[['CustomerId', 'is_high_risk']],
    on='CustomerId',
    how='left',
    validate='many_to_one',
)

# A left join leaves NaN for any CustomerId without a label; those rows are
# treated as not high-risk (0) and the column is cast back to integer.
processed_df['is_high_risk'] = processed_df['is_high_risk'].fillna(0).astype(int)

# Check the label distribution after the merge, then preview the result.
print(processed_df['is_high_risk'].value_counts())
processed_df.head()


is_high_risk
0    3741
1       1
Name: count, dtype: int64


Unnamed: 0,TotalTransactionAmount,AvgTransactionAmount,TransactionCount,StdTransactionAmount,CurrencyCode_UGX,ProductCategory_airtime,ProductCategory_data_bundles,ProductCategory_financial_services,ProductCategory_movies,ProductCategory_other,...,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,CustomerId,AnyFraud,is_high_risk
0,-0.066891,-0.153364,-0.253459,0.0,0.0,2.073016,-0.228416,-1.904511,-0.099954,-0.01635,...,-0.052922,-0.172187,-0.230127,-0.12953,3.137802,-2.80444,-0.147303,CustomerId_1,0,0
1,-0.066891,-0.153364,-0.253459,0.0,0.0,2.073016,-0.228416,-1.904511,-0.099954,-0.01635,...,-0.052922,-0.172187,-0.230127,-0.12953,3.137802,-2.80444,-0.147303,CustomerId_10,0,0
2,-0.055849,-0.06987,-0.212186,-0.105976,0.0,-0.039359,-0.228416,0.268985,-0.099954,-0.01635,...,-0.052922,-0.172187,-0.230127,-0.12953,0.480118,-0.342663,-0.147303,CustomerId_1001,0,0
3,-0.061655,-0.091435,-0.150278,-0.168036,0.0,0.152675,-0.228416,0.071395,-0.099954,-0.01635,...,-0.052922,-0.172187,-0.230127,-0.12953,1.124405,-0.939458,-0.147303,CustomerId_1002,0,0
4,-0.055849,-0.073846,-0.201868,-0.111444,0.0,0.312703,-0.228416,-0.093264,-0.099954,-0.01635,...,-0.052922,-0.172187,-0.230127,-0.12953,0.18482,-0.069133,-0.147303,CustomerId_1003,0,0


In [17]:
# Write the labelled dataset to disk without the index column.
# The output folder is created first because to_csv does not create missing
# parent directories and would otherwise fail on a fresh checkout.
output_path = Path("../data/processed/proxy_target.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
processed_df.to_csv(output_path, index=False)