In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Create a function for RFM calculation
def calculate_rfm(merged):
    # Recency: Days since last purchase
    recency = merged.groupby('customer_id')['order_purchase_timestamp'].max().reset_index()
    recency['recency'] = (pd.Timestamp.now() - recency['order_purchase_timestamp']).dt.days
    
    # Frequency: Total orders per customer
    frequency = merged.groupby('customer_id')['order_id'].nunique().reset_index()
    frequency.rename(columns={'order_id': 'frequency'}, inplace=True)
    
    # Monetary: Total spending per customer
    monetary = merged.groupby('customer_id')['price'].sum().reset_index()
    monetary.rename(columns={'price': 'monetary'}, inplace=True)
    
    # Merge RFM
    rfm = pd.merge(recency, frequency, on='customer_id')
    rfm = pd.merge(rfm, monetary, on='customer_id')
    
    return rfm

# Function to handle segmentation with error handling
def segment_customers(rfm):
    # Assign scores (1-5, 5=best) with error handling
    try:
        rfm['recency_score'] = pd.qcut(rfm['recency'], 5, labels=[5, 4, 3, 2, 1])
    except ValueError:
        # Alternative: use pd.cut with custom bins if data doesn't have enough variation
        recency_bins = [0, rfm['recency'].quantile(0.2), rfm['recency'].quantile(0.4),
                      rfm['recency'].quantile(0.6), rfm['recency'].quantile(0.8), float('inf')]
        rfm['recency_score'] = pd.cut(rfm['recency'], bins=recency_bins, labels=[5, 4, 3, 2, 1], include_lowest=True)
    
    try:
        rfm['frequency_score'] = pd.qcut(rfm['frequency'], 5, labels=[1, 2, 3, 4, 5])
    except ValueError:
        # If all frequency values are the same or very little variation
        if rfm['frequency'].nunique() == 1:
            # If all values are the same
            rfm['frequency_score'] = 3  # Middle value
        else:
            # Create manual bins
            freq_bins = [0, 1, 2, 3, 5, float('inf')]
            rfm['frequency_score'] = pd.cut(rfm['frequency'], bins=freq_bins, labels=[1, 2, 3, 4, 5], include_lowest=True)
    
    try:
        rfm['monetary_score'] = pd.qcut(rfm['monetary'], 5, labels=[1, 2, 3, 4, 5])
    except ValueError:
        # Alternative: use pd.cut with custom bins
        monetary_bins = [0, rfm['monetary'].quantile(0.2), rfm['monetary'].quantile(0.4),
                       rfm['monetary'].quantile(0.6), rfm['monetary'].quantile(0.8), float('inf')]
        rfm['monetary_score'] = pd.cut(rfm['monetary'], bins=monetary_bins, labels=[1, 2, 3, 4, 5], include_lowest=True)
    
    # Combine scores
    rfm['rfm_score'] = rfm['recency_score'].astype(str) + rfm['frequency_score'].astype(str) + rfm['monetary_score'].astype(str)
    
    # Define segments
    segment_map = {
        r'[4-5][4-5][4-5]': 'Champions',
        r'[3-5][3-5][3-5]': 'Loyal Customers',
        r'[2-3][2-3][2-3]': 'Potential Loyalists',
        r'[1-2][1-2][1-2]': 'At Risk'
    }
    rfm['segment'] = rfm['rfm_score'].replace(segment_map, regex=True)
    
    return rfm

# Load your merged data (uncomment and adjust path as needed)
merged = pd.read_parquet(r"E:\OneDrive\Desktop\Customer Retention Analysis\data\processed\merged_data.parquet")
# Convert timestamp to datetime if needed
merged['order_purchase_timestamp'] = pd.to_datetime(merged['order_purchase_timestamp'])

# Calculate RFM metrics
rfm = calculate_rfm(merged)

# Apply segmentation
rfm = segment_customers(rfm)

# Display RFM results
print(rfm.head())

# Analyze segment distribution
segment_counts = rfm['segment'].value_counts()
print(segment_counts)

# Save the results
rfm.to_csv(r"E:\OneDrive\Desktop\Customer Retention Analysis\data\processed\rfm_data.csv", index=False)

                        customer_id order_purchase_timestamp  recency  \
0  00012a2ce6f8dcda20d059ce98491703      2017-11-14 16:08:26     2670   
1  000161a058600d5901f007fab4c27140      2017-07-16 09:40:32     2791   
2  0001fd6190edaaf884bcaf3d49edf079      2017-02-28 11:06:43     2929   
3  0002414f95344307404f0ace7a26f1d5      2017-08-16 13:09:20     2760   
4  000379cdec625522490c315e70c7a9fb      2018-04-02 13:42:17     2531   

   frequency  monetary recency_score  frequency_score monetary_score  \
0          1     89.80             2                3              3   
1          1     54.90             1                3              2   
2          1    179.99             1                3              5   
3          1    149.90             2                3              4   
4          1     93.00             4                3              3   

  rfm_score              segment  
0       233  Potential Loyalists  
1       132                  132  
2       135            