In [None]:

# RFM Proxy Target Engineering with Visualization

import sys
sys.path.append("../src")
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from data_processing import DataLoader
from rfm_labeling import RFMLabeler

# 1. Read the raw transaction table from disk and preview its first rows
raw_data_loader = DataLoader("../data/raw/data.csv")
df = raw_data_loader.load_data()
print(df.head())

# 2. Build the RFM table and attach the cluster / high-risk proxy labels
labeler_settings = {
    'customer_id_col': 'CustomerId',
    'date_col': 'TransactionStartTime',
    'amount_col': 'Amount',
    'n_clusters': 3,
    'random_state': 42,
}
rfm_labeler = RFMLabeler(**labeler_settings)

# reset_index() turns the returned index into a regular column
# (presumably CustomerId, which later sections read as a column -- confirm
# against RFMLabeler.fit_predict).
rfm_with_labels = rfm_labeler.fit_predict(df).reset_index()
print(rfm_with_labels.head())

# 3. Histogram (with KDE overlay) of every RFM feature, one panel per feature
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Feature -> bar colour; dict order fixes the left-to-right panel order.
feature_colors = {
    'Recency': 'tab:blue',
    'Frequency': 'tab:orange',
    'Monetary': 'tab:green',
}
for panel, (feature, shade) in zip(axes, feature_colors.items()):
    sns.histplot(rfm_with_labels[feature], kde=True, ax=panel, color=shade)
    panel.set_title(f'{feature} Distribution')

plt.suptitle("Histograms of RFM Features", fontsize=16)
plt.tight_layout()
plt.show()

# 4. Cluster plots, part one: pairwise 2D scatter grid of the RFM features
# One "Set2" colour per distinct cluster label found in the data.
n_clusters_found = rfm_with_labels['cluster'].nunique()
palette = sns.color_palette("Set2", n_clusters_found)

pairplot_options = dict(
    vars=['Recency', 'Frequency', 'Monetary'],
    hue='cluster',
    palette=palette,
    plot_kws={'alpha': 0.7},
)
sns.pairplot(rfm_with_labels, **pairplot_options)

# y > 1 lifts the title above the top row of panels.
plt.suptitle("RFM Clusters by K-Means", fontsize=16, y=1.03)
plt.show()

# Cluster plots, part two: 3D scatter of customers in RFM space, coloured by cluster.
from matplotlib.colors import ListedColormap
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401  (registers the '3d' projection on matplotlib < 3.2)

# Passing cmap='Set2' directly stretches the cluster labels over the whole
# 8-colour map, so the colours would not match the pairplot (which uses the
# first N "Set2" colours). A ListedColormap with exactly one colour per
# cluster keeps cluster colours identical across figures.
# NOTE(review): assumes cluster labels are consecutive integers starting at 0.
cluster_cmap = ListedColormap(
    sns.color_palette("Set2", rfm_with_labels['cluster'].nunique())
)

fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')
sc = ax.scatter(
    rfm_with_labels['Recency'],
    rfm_with_labels['Frequency'],
    rfm_with_labels['Monetary'],
    c=rfm_with_labels['cluster'],
    cmap=cluster_cmap,
    s=60, alpha=0.8
)
ax.set_xlabel('Recency')
ax.set_ylabel('Frequency')
ax.set_zlabel('Monetary')
ax.set_title("RFM Customers Clustered (3D View)")
# legend_elements() yields one handle/label pair per distinct colour value.
ax.legend(*sc.legend_elements(), title="Cluster")
plt.tight_layout()
plt.show()

# 5. Optional: UMAP Visualization (for large datasets; requires umap-learn)
# Only the import is guarded: an ImportError raised later (e.g. by a lazy
# import inside UMAP while fitting) must not be reported as "not installed".
try:
    import umap
except ImportError:
    print("UMAP not installed. Skipping UMAP visualization. To enable, run: pip install umap-learn")
else:
    from matplotlib.colors import ListedColormap

    # One "Set2" colour per cluster so colours match the pairplot palette
    # instead of being spread over the full 8-colour map.
    # NOTE(review): assumes cluster labels are consecutive integers starting at 0.
    cluster_ids = sorted(rfm_with_labels['cluster'].dropna().unique())
    umap_cmap = ListedColormap(sns.color_palette("Set2", len(cluster_ids)))

    reducer = umap.UMAP(random_state=42)
    rfm_features = rfm_with_labels[['Recency', 'Frequency', 'Monetary']].values
    embedding = reducer.fit_transform(rfm_features)

    plt.figure(figsize=(8, 6))
    points = plt.scatter(
        embedding[:, 0], embedding[:, 1],
        c=rfm_with_labels['cluster'], cmap=umap_cmap, alpha=0.6
    )
    plt.title("UMAP Projection of RFM Clusters")
    plt.xlabel("UMAP-1")
    plt.ylabel("UMAP-2")
    # Ticks come from the labels actually present rather than a fixed [0, 1, 2].
    plt.colorbar(points, ticks=cluster_ids)
    plt.show()

# 6. Per-cluster profile: mean R/F/M, share of high-risk customers, cluster size
summary = (
    rfm_with_labels
    .groupby('cluster')
    .agg(
        Recency=('Recency', 'mean'),
        Frequency=('Frequency', 'mean'),
        Monetary=('Monetary', 'mean'),
        pct_high_risk=('is_high_risk', 'mean'),
        n_customers=('CustomerId', 'count'),
    )
)
# Express the high-risk share as a percentage instead of a 0-1 fraction.
summary['pct_high_risk'] *= 100
display(summary)

print("Cluster with highest average Recency (least recent activity) is labeled as high risk.")

# 7. Overall high-risk share, then descriptive stats for each risk group
high_risk_count = rfm_with_labels['is_high_risk'].sum()
total = len(rfm_with_labels)
print(f"\nHigh risk proxy label assigned to {high_risk_count} out of {total} customers "
      f"({high_risk_count / total:.1%})")

# (heading, is_high_risk value) pairs, reported in this order.
rfm_columns = ['Recency', 'Frequency', 'Monetary']
risk_groups = (
    ("\nFeature summary of high-risk group:", 1),
    ("\nFeature summary of remaining (not high-risk) customers:", 0),
)
for heading, risk_flag in risk_groups:
    print(heading)
    in_group = rfm_with_labels['is_high_risk'] == risk_flag
    # Transposed so that each feature is a row and each statistic a column.
    display(rfm_with_labels.loc[in_group, rfm_columns].describe().T)

# 8. Save is_high_risk labels for integration
from pathlib import Path

labels_path = Path("../data/processed/rfm_labels.csv")
# to_csv does not create missing folders; on a fresh checkout the processed
# data directory may not exist yet, so create it (no-op if already present).
labels_path.parent.mkdir(parents=True, exist_ok=True)
rfm_with_labels[['CustomerId', 'is_high_risk']].to_csv(labels_path, index=False)


In [None]:
from ydata_profiling import ProfileReport

# Automated profiling report (full interactive HTML)
from pathlib import Path

report_path = "../reports/rfm_risk_profile_report.html"

profile = ProfileReport(
    rfm_with_labels,
    title="RFM Customer Segmentation & High Risk Labeling Report",
    explorative=True,
    # Each correlation is configured through a settings dict with a
    # "calculate" flag (the documented schema), not a bare boolean.
    correlations={
        "pearson": {"calculate": True},
        "spearman": {"calculate": True},
        "kendall": {"calculate": True},
        "phi_k": {"calculate": False},
        "cramers": {"calculate": False},
    },
    sortby="is_high_risk"
)
# Make sure the output folder exists before writing (no-op if already present).
Path(report_path).parent.mkdir(parents=True, exist_ok=True)
profile.to_file(report_path)
print(f"RFM report generated: {report_path}")