In [None]:
# Proxy Target Variable Engineering — RFM & Clustering


import sys
sys.path.append("../src")
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from data_processing import DataLoader
from proxy_target import ProxyTargetEngineer

# 1. Load data
# DataLoader comes from the project's ../src package (added to sys.path
# above); the object it returns is used as a DataFrame by the cells below.
raw_data_path = "../data/raw/data.csv"
df = DataLoader(raw_data_path).load_data()
print("Dataset shape:", df.shape)
# Trailing expression: rendered as the cell output when run in a notebook.
df.head()





# 2. Calculate and label RFM proxy targets
# ProxyTargetEngineer is project code; the settings are collected in one
# mapping so the configuration can be read (and tweaked) at a glance.
engineer_config = dict(
    customer_id_col='CustomerId',
    date_col='TransactionStartTime',
    amount_col='Amount',
    n_clusters=3,
    random_state=42,
    scale_rfm=True,
)
engineer = ProxyTargetEngineer(**engineer_config)

# reset_index() turns the result's index into an ordinary column
# (presumably CustomerId, which later cells read as a column -- verify
# against ProxyTargetEngineer.fit_transform).
rfm_with_clusters = engineer.fit_transform(df).reset_index()
rfm_with_clusters.head()




## 📊 **Visualization: RFM Feature Distributions and Clusters**


# 3. Visualize Recency, Frequency, Monetary distributions
# One panel per RFM feature, drawn left to right: (column, colour, title).
histogram_specs = (
    ('Recency', 'tab:blue', 'Recency Distribution'),
    ('Frequency', 'tab:orange', 'Frequency Distribution'),
    ('Monetary', 'tab:green', 'Monetary Distribution'),
)
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for panel, (feature, colour, panel_title) in zip(axes, histogram_specs):
    # Histogram with a KDE curve overlaid for the feature's distribution.
    sns.histplot(rfm_with_clusters[feature], kde=True, ax=panel, color=colour)
    panel.set_title(panel_title)
plt.suptitle("RFM Histograms", fontsize=16)
plt.tight_layout()
plt.show()



# 4. Visualize RFM clustering (pairplot and 3D scatter)
# One colour per cluster, shared by both figures so that a given cluster is
# drawn in the same colour in the pairplot and in the 3D view.
palette = sns.color_palette("Set2", rfm_with_clusters['cluster'].nunique())
sns.pairplot(
    rfm_with_clusters,
    vars=['Recency', 'Frequency', 'Monetary'],
    hue='cluster',
    palette=palette,
    plot_kws={'alpha': 0.6}
)
# y > 1 places the title above the grid instead of over the top row.
plt.suptitle("RFM K-Means Clusters", y=1.02, fontsize=15)
plt.show()

# Imported only for its side effect: matplotlib releases before 3.2 register
# the '3d' projection when this module is imported. The name itself is unused.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')
# Draw each cluster as its own labelled series. A single scatter call with
# c=<labels> and cmap='Set2' stretches the labels over the whole colormap,
# which yields colours that differ from the pairplot and gives the reader no
# legend to tell the clusters apart. groupby iterates the labels in sorted
# order, which is the order in which the palette colours are assigned to a
# numeric hue in the pairplot.
for cluster_colour, (cluster_label, members) in zip(
    palette, rfm_with_clusters.groupby('cluster')
):
    ax.scatter(
        members['Recency'], members['Frequency'], members['Monetary'],
        color=cluster_colour, s=60, alpha=0.7, label=f"Cluster {cluster_label}"
    )
ax.set_xlabel('Recency')
ax.set_ylabel('Frequency')
ax.set_zlabel('Monetary')
ax.legend(title='cluster')
plt.title("3D RFM Cluster View")
plt.tight_layout()
plt.show()



# 5. Cluster centers and summary
print("K-Means cluster centers (scaled):")
display(engineer.cluster_centers_)

# Per-cluster profile: mean/median of each RFM feature, the share of
# customers flagged high-risk, and the cluster size. Named aggregation
# yields the flat "<column>_<statistic>" headers directly.
summary = rfm_with_clusters.groupby('cluster').agg(
    Recency_mean=('Recency', 'mean'),
    Recency_median=('Recency', 'median'),
    Frequency_mean=('Frequency', 'mean'),
    Frequency_median=('Frequency', 'median'),
    Monetary_mean=('Monetary', 'mean'),
    Monetary_median=('Monetary', 'median'),
    is_high_risk_mean=('is_high_risk', 'mean'),
    n_customers=('CustomerId', 'count'),
)
display(summary)



# 6. Compare high-risk/other groups
# Boolean mask of customers carrying the proxy high-risk label.
high_risk = rfm_with_clusters['is_high_risk'] == 1
print(f"High risk group: {high_risk.sum()} out of {len(rfm_with_clusters)} ({high_risk.mean():.1%}) customers.")

# Descriptive statistics of the RFM features, once per label value.
rfm_features = ['Recency', 'Frequency', 'Monetary']
for heading, flag in (('High-Risk', 1), ('Not High-Risk', 0)):
    in_group = rfm_with_clusters['is_high_risk'] == flag
    feature_stats = rfm_with_clusters.loc[in_group, rfm_features].describe()
    print(f"\n{heading} Customers' RFM Feature Summary:")
    # Transposed so each feature is a row and each statistic a column.
    display(feature_stats.T)




## 📑 **Distribution of Proxy Label**


# Bar chart of how many customers fall into each proxy-label class.
# The bars are coloured through `hue`: seaborn 0.13 deprecates passing
# `palette` without `hue` and announces its removal. dodge=False keeps one
# full-width bar per class on seaborn versions that would otherwise split
# the bars by hue level.
count_ax = sns.countplot(
    data=rfm_with_clusters,
    x="is_high_risk",
    hue="is_high_risk",
    palette='Set2',
    dodge=False,
)
# The hue legend only repeats the x-axis labels; drop it when one was drawn.
hue_legend = count_ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()
plt.title("Count of Customers by Proxy High-Risk Label")
plt.xlabel("Is High Risk?")
plt.ylabel("Number of Customers")
# Categorical positions 0 and 1 correspond to label values 0 and 1.
plt.xticks([0, 1], ["No", "Yes"])
plt.tight_layout()
plt.show()





# 7. Save labels for downstream ML
from pathlib import Path

# Single definition of the output location, relative to the notebook folder.
labels_path = Path("../data/processed/rfm_labels.csv")
# to_csv does not create missing folders; make sure the target exists so a
# fresh checkout without data/processed does not fail at the last step.
labels_path.parent.mkdir(parents=True, exist_ok=True)
rfm_with_clusters[['CustomerId', 'is_high_risk']].to_csv(labels_path, index=False)
# as_posix() keeps the message identical on every platform.
print(f"Saved: {labels_path.as_posix()}")