# Customer Segmentation Analysis

This notebook demonstrates customer segmentation using RFM (Recency, Frequency, Monetary) analysis - a fundamental technique for understanding customer behavior and targeting marketing efforts.

## What You'll Learn
- Building RFM features from transaction data
- Scoring and segmenting customers
- Visualizing customer segments
- K-means clustering as an alternative approach
- Deriving actionable marketing strategies per segment

## Business Context
Not all customers are equal. Understanding which customers are most valuable, which are at risk, and which need different engagement strategies allows for more efficient resource allocation.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import warnings

# Silence library warnings so the rendered cell output stays readable
warnings.filterwarnings('ignore')

# Plot appearance shared by every figure in the notebook
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 11,
})

print("Libraries loaded successfully!")

## 1. Data Loading & Preparation

In [None]:
# Read the raw transaction extract, then report its size and column names
df = pd.read_csv('../data/samples/revenue_sample.csv')

print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    sep="\n",
)
df.head()

In [None]:
# Data preparation
# Locate source columns by a case-insensitive substring match on the header.
date_cols = [col for col in df.columns if 'date' in col.lower()]
amount_cols = [col for col in df.columns if 'amount' in col.lower() or 'revenue' in col.lower()]

# Derive each standard column independently. Requiring BOTH a date and an
# amount column before using either one would throw away real data whenever
# the file only has one of them.
if date_cols:
    df['transaction_date'] = pd.to_datetime(df[date_cols[0]])
if amount_cols:
    # Unparseable amounts become NaN instead of raising.
    df['amount'] = pd.to_numeric(df[amount_cols[0]], errors='coerce')

# Anything still missing is filled with synthetic values. The generator is
# seeded once (and only when a fallback is actually needed): re-seeding with
# the same value before every column would restart the same random stream, so
# the synthetic columns would not be independent draws.
missing_cols = [col for col in ('customer_id', 'transaction_date', 'amount')
                if col not in df.columns]
if missing_cols:
    np.random.seed(42)

# Create customer IDs if not present
if 'customer_id' in missing_cols:
    n_customers = 150
    df['customer_id'] = [f'CUST_{np.random.randint(1, n_customers+1):04d}' for _ in range(len(df))]

# Synthetic transaction dates spread over 365 days starting 2024-01-01
if 'transaction_date' in missing_cols:
    start_date = datetime(2024, 1, 1)
    df['transaction_date'] = [start_date + timedelta(days=np.random.randint(0, 365)) for _ in range(len(df))]

# Synthetic order amounts drawn from an exponential distribution (scale 200)
if 'amount' in missing_cols:
    df['amount'] = np.random.exponential(scale=200, size=len(df))

print(f"\nUnique customers: {df['customer_id'].nunique()}")
print(f"Total transactions: {len(df)}")
print(f"Date range: {df['transaction_date'].min()} to {df['transaction_date'].max()}")

## 2. RFM Feature Engineering

**RFM Analysis** evaluates customers based on three metrics:
- **Recency (R)**: How recently did the customer make a purchase?
- **Frequency (F)**: How often do they purchase?
- **Monetary (M)**: How much do they spend?

In [None]:
# Reference point for recency: one day after the latest transaction in the data
analysis_date = df['transaction_date'].max() + timedelta(days=1)
print(f"Analysis date: {analysis_date}")


def _days_since_last_purchase(purchase_dates):
    """Whole days between the analysis date and the latest date in the group."""
    return (analysis_date - purchase_dates.max()).days


# Per-customer aggregation: source column -> how it is reduced
rfm_aggregations = {
    'transaction_date': _days_since_last_purchase,  # Recency
    'customer_id': 'count',                         # Frequency (rows per customer)
    'amount': 'sum',                                # Monetary
}

# Source column -> RFM metric name
rfm_column_names = {
    'transaction_date': 'recency',
    'customer_id': 'frequency',
    'amount': 'monetary',
}

# One row per customer, with customer_id restored as a regular column
rfm = (
    df.groupby('customer_id')
    .agg(rfm_aggregations)
    .rename(columns=rfm_column_names)
    .reset_index()
)

print("\nRFM Summary Statistics:")
print(rfm.describe().round(2))
rfm.head(10)

In [None]:
# Histogram of each RFM metric with its median marked by a dashed red line
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# (metric column, fill colour, edge colour, median label template, x label, title)
histogram_panels = [
    ('recency', 'steelblue', 'navy', 'Median: {:.0f} days',
     'Days Since Last Purchase', 'Recency Distribution'),
    ('frequency', 'forestgreen', 'darkgreen', 'Median: {:.0f} orders',
     'Number of Orders', 'Frequency Distribution'),
    ('monetary', 'coral', 'darkred', 'Median: ${:,.0f}',
     'Total Spend ($)', 'Monetary Distribution'),
]

for ax, (metric, fill, edge, label_template, x_label, title) in zip(axes, histogram_panels):
    median_value = rfm[metric].median()
    ax.hist(rfm[metric], bins=30, color=fill, edgecolor=edge, alpha=0.8)
    ax.axvline(x=median_value, color='red', linestyle='--',
               label=label_template.format(median_value))
    ax.set_xlabel(x_label)
    ax.set_ylabel('Number of Customers')
    ax.set_title(title, fontweight='bold')
    ax.legend()

plt.tight_layout()
plt.savefig('../docs/visualizations/rfm_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

## 3. RFM Scoring

We'll score each customer 1-5 on each metric, where 5 is the best.

In [None]:
# Create RFM scores using quintiles (1 = worst, 5 = best).
# Every metric is ranked with method='first' before binning: ranks are unique,
# so qcut always finds five distinct bin edges. Binning the raw recency values
# directly raises "Bin edges must be unique" when many customers share the
# same recency.
# Note: For recency, lower is better, so the labels are reversed.
rfm['r_score'] = pd.qcut(rfm['recency'].rank(method='first'), q=5, labels=[5, 4, 3, 2, 1]).astype(int)
rfm['f_score'] = pd.qcut(rfm['frequency'].rank(method='first'), q=5, labels=[1, 2, 3, 4, 5]).astype(int)
rfm['m_score'] = pd.qcut(rfm['monetary'].rank(method='first'), q=5, labels=[1, 2, 3, 4, 5]).astype(int)

# Create RFM score string (e.g. "545") and total (sum of the three scores)
rfm['rfm_score'] = rfm['r_score'].astype(str) + rfm['f_score'].astype(str) + rfm['m_score'].astype(str)
rfm['rfm_total'] = rfm['r_score'] + rfm['f_score'] + rfm['m_score']

print("RFM Scores Summary:")
print(f"  R-Score range: {rfm['r_score'].min()} - {rfm['r_score'].max()}")
print(f"  F-Score range: {rfm['f_score'].min()} - {rfm['f_score'].max()}")
print(f"  M-Score range: {rfm['m_score'].min()} - {rfm['m_score'].max()}")
print(f"  Total Score range: {rfm['rfm_total'].min()} - {rfm['rfm_total'].max()}")

rfm.head(10)

## 4. Customer Segmentation

Based on RFM scores, we assign customers to meaningful business segments.

In [None]:
def assign_segment(row):
    """
    Assign a customer segment label from a row's RFM scores.

    Rules are evaluated top to bottom and the first match wins, so the narrow
    high-value/inactive rules ('At Risk', "Can't Lose") are checked before the
    broader 'Loyal Customers', 'Need Attention' and 'About to Sleep' rules
    that would otherwise match the same rows first and make them unreachable.

    Segments:
    - Champions: Best customers (high R, F, M)
    - At Risk: High-value customers going dormant
    - Can't Lose: Were great, now inactive
    - Loyal Customers: High frequency, good monetary
    - Potential Loyalists: Recent, moderate frequency
    - New Customers: Very recent, low frequency
    - Promising: Recent, low F & M
    - Need Attention: Above average but slipping
    - About to Sleep: Below average, at risk
    - Hibernating: Low across all metrics
    - Others: Anything not matched by the rules above

    Args:
        row: Mapping-like object (for example a DataFrame row) that provides
            'r_score', 'f_score' and 'm_score'.

    Returns:
        str: The name of the first segment whose rule matches.
    """
    r, f, m = row['r_score'], row['f_score'], row['m_score']

    if r >= 4 and f >= 4 and m >= 4:
        return 'Champions'
    # Inactive high-value customers: must precede 'Loyal Customers', whose
    # rule ignores recency and would claim these rows.
    elif r <= 2 and f >= 4 and m >= 4:
        return 'At Risk'
    elif r <= 2 and f >= 3 and m >= 4:
        return "Can't Lose"
    elif f >= 4 and m >= 3:
        return 'Loyal Customers'
    elif r >= 4 and 2 <= f <= 4:
        return 'Potential Loyalists'
    elif r >= 4 and f <= 2:
        return 'New Customers'
    elif r >= 3 and f <= 2 and m <= 2:
        return 'Promising'
    elif 2 <= r <= 3 and 2 <= f <= 3:
        return 'Need Attention'
    elif r <= 2 and f >= 2:
        return 'About to Sleep'
    elif r <= 2 and f <= 2:
        return 'Hibernating'
    else:
        return 'Others'

# Label every customer row with its segment
rfm['segment'] = rfm.apply(assign_segment, axis=1)

# Customers per segment, largest first, with each segment's share of the base
segment_counts = rfm['segment'].value_counts()
print("Customer Segment Distribution:")
for segment_name, segment_size in segment_counts.items():
    share_pct = segment_size / len(rfm) * 100
    print(f"  {segment_name}: {segment_size} ({share_pct:.1f}%)")

In [None]:
# Segment sizes as a horizontal bar chart (left) and a pie chart (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# One Set3 colour per segment, shared by both charts
colors = plt.cm.Set3(np.linspace(0, 1, len(segment_counts)))
segment_labels = segment_counts.index
segment_sizes = segment_counts.values
customer_total = len(rfm)

# Bar chart
bars = ax1.barh(segment_labels, segment_sizes, color=colors)
ax1.set_xlabel('Number of Customers')
ax1.set_title('Customer Segment Distribution', fontweight='bold')

# Annotate each bar with "count (share%)" just past its right edge
for bar, val in zip(bars, segment_sizes):
    bar_middle = bar.get_y() + bar.get_height()/2
    ax1.text(val + 1, bar_middle,
             f'{val} ({val/customer_total*100:.1f}%)', va='center')

# Pie chart
ax2.pie(segment_sizes, labels=segment_labels, autopct='%1.1f%%',
        colors=colors, startangle=90)
ax2.set_title('Segment Share', fontweight='bold')

plt.tight_layout()
plt.savefig('../docs/visualizations/rfm_segments.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Segment Profile Analysis

In [None]:
# Per-segment profile: size, average R/F/M values and total revenue
segment_stats = (
    rfm.groupby('segment')
    .agg(
        customer_count=('customer_id', 'count'),
        avg_recency=('recency', 'mean'),
        avg_frequency=('frequency', 'mean'),
        avg_monetary=('monetary', 'mean'),
        total_revenue=('monetary', 'sum'),
    )
    .round(2)
)

# Each segment's percentage of the revenue summed over all segments
revenue_pct = segment_stats['total_revenue'] / segment_stats['total_revenue'].sum() * 100
segment_stats['revenue_share'] = revenue_pct.round(1)

# Highest-revenue segments first
segment_stats = segment_stats.sort_values('total_revenue', ascending=False)

print("Segment Profile Summary:")
print(segment_stats.to_string())

In [None]:
# Mean R/F/M score per segment, rows ordered like segment_stats (by revenue)
score_columns = ['r_score', 'f_score', 'm_score']
segment_rfm_avg = (
    rfm.groupby('segment')[score_columns]
    .mean()
    .loc[segment_stats.index]
)

fig, ax = plt.subplots(figsize=(10, 8))

# Colour scale is pinned to the 1-5 range so colours are comparable across runs
sns.heatmap(
    segment_rfm_avg,
    annot=True,
    fmt='.2f',
    cmap='RdYlGn',
    vmin=1,
    vmax=5,
    ax=ax,
    cbar_kws={'label': 'Average Score'},
)

ax.set_title('Average RFM Scores by Segment\n(Sorted by Revenue)', fontsize=14, fontweight='bold')
ax.set_xlabel('RFM Dimension')
ax.set_ylabel('Segment')

plt.tight_layout()
plt.show()

## 6. RFM Visualization (2D Pairwise Views)

In [None]:
# 2D scatter plots showing RFM relationships (one panel per metric pair)
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Color by segment: one fixed tab10 colour per segment, passed explicitly to
# every scatter call so a segment looks the same in all three panels.
segments = rfm['segment'].unique()
segment_colors = {
    segment: tuple(rgba)
    for segment, rgba in zip(segments, plt.cm.tab10(np.linspace(0, 1, len(segments))))
}

# (x column, y column, x label, y label, title, scale marker size by monetary?)
scatter_panels = [
    ('recency', 'frequency', 'Recency (days)', 'Frequency (orders)',
     'Recency vs Frequency', True),
    ('frequency', 'monetary', 'Frequency (orders)', 'Monetary ($)',
     'Frequency vs Monetary', False),
    ('recency', 'monetary', 'Recency (days)', 'Monetary ($)',
     'Recency vs Monetary', False),
]

for ax, (x_col, y_col, x_label, y_label, title, size_by_monetary) in zip(axes, scatter_panels):
    for segment in segments:
        mask = rfm['segment'] == segment
        scatter_kwargs = {'label': segment, 'alpha': 0.6, 'color': segment_colors[segment]}
        if size_by_monetary:
            # Only the first panel encodes spend as marker area (monetary / 50)
            scatter_kwargs['s'] = rfm.loc[mask, 'monetary'] / 50
        ax.scatter(rfm.loc[mask, x_col], rfm.loc[mask, y_col], **scatter_kwargs)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title, fontweight='bold')

# A single legend, placed outside the right-most panel
axes[2].legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8)

plt.tight_layout()
plt.show()

## 7. K-Means Clustering (Alternative Approach)

In [None]:
# Clustering features: log1p of each metric to reduce skew, then standardized
rfm_normalized = np.log1p(rfm[['recency', 'frequency', 'monetary']])

scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm_normalized)

# Elbow method: fit K-means for K = 2..10 and record each fit's inertia
K_range = range(2, 11)
inertias = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(rfm_scaled).inertia_
    for k in K_range
]

# Plot elbow curve
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(K_range, inertias, 'bo-', linewidth=2, markersize=8)
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Inertia (Within-cluster sum of squares)')
ax.set_title('Elbow Method for Optimal K', fontweight='bold')
ax.grid(True, alpha=0.3)

# The marked K is a fixed value (4); it is not computed from the curve
ax.axvline(x=4, color='red', linestyle='--', label='Suggested K=4')
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Apply K-means with optimal K
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
rfm['cluster'] = kmeans.fit_predict(rfm_scaled)

# Analyze clusters: size plus mean recency/frequency/monetary/RFM total
cluster_stats = rfm.groupby('cluster').agg({
    'customer_id': 'count',
    'recency': 'mean',
    'frequency': 'mean',
    'monetary': 'mean',
    'rfm_total': 'mean'
}).round(2)

cluster_stats.columns = ['count', 'avg_recency', 'avg_frequency', 'avg_monetary', 'avg_rfm_score']

# Assign cluster names based on characteristics.
# The clusters with the highest / lowest mean RFM total are the two extremes.
cluster_names = {
    cluster_stats['avg_rfm_score'].idxmax(): 'High Value',
    cluster_stats['avg_rfm_score'].idxmin(): 'Low Value',
}

# Every other cluster is a "medium" tier, ordered by mean spend (highest first).
# All of them receive a name, so changing optimal_k never leaves a cluster
# with a missing label in cluster_stats['cluster_name'].
remaining = sorted(
    (c for c in cluster_stats.index if c not in cluster_names),
    key=lambda c: cluster_stats.loc[c, 'avg_monetary'],
    reverse=True,
)
if len(remaining) == 1:
    cluster_names[remaining[0]] = 'Medium Value'
elif len(remaining) == 2:
    cluster_names[remaining[0]] = 'Medium-High Value'
    cluster_names[remaining[1]] = 'Medium-Low Value'
else:
    for position, cluster_id in enumerate(remaining, start=1):
        cluster_names[cluster_id] = f'Medium Value {position}'

cluster_stats['cluster_name'] = cluster_stats.index.map(cluster_names)

print("K-Means Cluster Analysis:")
print(cluster_stats.to_string())

In [None]:
# Cluster membership scatter (left) and normalized cluster profiles (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Cluster scatter plot
scatter = ax1.scatter(rfm['frequency'], rfm['monetary'],
                      c=rfm['cluster'], cmap='viridis', alpha=0.6, s=50)
ax1.set_xlabel('Frequency')
ax1.set_ylabel('Monetary ($)')
ax1.set_title('K-Means Clusters (Frequency vs Monetary)', fontweight='bold')
plt.colorbar(scatter, ax=ax1, label='Cluster')

# Grouped bars: one group per cluster, one bar per metric
x = np.arange(len(cluster_stats))
width = 0.25

# Min-max scale each metric across clusters so the bars share a 0-1 axis
metrics_norm = cluster_stats[['avg_recency', 'avg_frequency', 'avg_monetary']].copy()
metric_min = metrics_norm.min()
metric_max = metrics_norm.max()
metrics_norm = (metrics_norm - metric_min) / (metric_max - metric_min)
# Invert recency (lower is better) so taller always means better
metrics_norm['avg_recency'] = 1 - metrics_norm['avg_recency']

# (horizontal offset within the group, metric column, legend label, colour)
bar_specs = [
    (-width, 'avg_recency', 'Recency (inverted)', 'steelblue'),
    (0, 'avg_frequency', 'Frequency', 'forestgreen'),
    (width, 'avg_monetary', 'Monetary', 'coral'),
]
for offset, metric, legend_label, bar_color in bar_specs:
    ax2.bar(x + offset, metrics_norm[metric], width, label=legend_label, color=bar_color)

ax2.set_xlabel('Cluster')
ax2.set_ylabel('Normalized Score')
ax2.set_title('Cluster Profiles (Normalized)', fontweight='bold')
ax2.set_xticks(x)
ax2.set_xticklabels(cluster_stats['cluster_name'])
ax2.legend()

plt.tight_layout()
plt.show()

## 8. Segment-Specific Strategies

In [None]:
# Marketing playbook per segment, written as one row per segment:
# segment -> (description, action, channel, priority)
strategy_fields = ('description', 'action', 'channel', 'priority')
strategy_rows = {
    'Champions': (
        'Best customers - recent, frequent, high spenders',
        'Loyalty programs, early access to new products, referral programs',
        'VIP email, personal outreach',
        'High - Retain and maximize',
    ),
    'Loyal Customers': (
        'Consistent buyers, strong monetary value',
        'Upsell premium products, request reviews, referral incentives',
        'Email, loyalty app notifications',
        'High - Increase frequency',
    ),
    'Potential Loyalists': (
        'Recent buyers with growth potential',
        'Onboarding sequences, product recommendations, membership offers',
        'Email, retargeting ads',
        'High - Convert to loyal',
    ),
    'New Customers': (
        'Recently acquired, limited history',
        'Welcome series, first-purchase discounts, educational content',
        'Email, social media',
        'Medium - Nurture relationship',
    ),
    'Promising': (
        'Recent but low engagement',
        'Limited-time offers, category exploration campaigns',
        'Email, push notifications',
        'Medium - Increase engagement',
    ),
    'Need Attention': (
        'Above average but showing decline',
        'Reactivation campaigns, personalized offers, feedback surveys',
        'Email, SMS',
        'Medium - Prevent churn',
    ),
    'About to Sleep': (
        'Below average, at risk of churning',
        'Win-back campaigns, special discounts, new product announcements',
        'Email, retargeting',
        'Medium - Re-engage',
    ),
    'At Risk': (
        'High-value customers going dormant',
        'Urgent win-back, personal outreach, exclusive offers',
        'Phone, personalized email',
        'Critical - Immediate action',
    ),
    "Can't Lose": (
        'Were champions, now inactive',
        'Aggressive win-back, understand why they left, premium incentives',
        'Phone call, direct mail',
        'Critical - High value at stake',
    ),
    'Hibernating': (
        'Low engagement across all metrics',
        'Re-engagement email series, last chance offers',
        'Email only (low cost)',
        'Low - Consider sunsetting',
    ),
    'Others': (
        'Mixed signals',
        'Standard marketing, monitor behavior',
        'Email',
        'Low',
    ),
}

# Expand each row into a dict keyed by field name
strategies = {
    segment_name: dict(zip(strategy_fields, values))
    for segment_name, values in strategy_rows.items()
}

# Print the playbook for the segments present in the data, largest first
divider = "="*70
print(divider)
print("SEGMENT MARKETING STRATEGIES")
print(divider)

for segment in segment_counts.index:
    s = strategies.get(segment)
    if s is None:
        # No playbook entry for this label; skip it
        continue

    count = segment_counts[segment]
    pct = count / len(rfm) * 100

    print(f"\n{segment} ({count} customers, {pct:.1f}%)")
    print(f"  Description: {s['description']}")
    print(f"  Priority: {s['priority']}")
    print(f"  Action: {s['action']}")
    print(f"  Channel: {s['channel']}")

## 9. Key Insights & Recommendations

In [None]:
banner = "="*60
print(banner)
print("CUSTOMER SEGMENTATION - KEY FINDINGS")
print(banner)

# 1. Revenue concentration: the three segments with the highest total revenue
print("\n1. REVENUE CONCENTRATION")
top_segments = segment_stats.head(3)
top_revenue_share = top_segments['revenue_share'].sum()
print(f"   Top 3 segments generate {top_revenue_share:.1f}% of revenue")
for segment_name, stats in top_segments.iterrows():
    print(f"   - {segment_name}: ${stats['total_revenue']:,.0f} ({stats['revenue_share']:.1f}%)")

# 2. Historical spend of customers in the at-risk segments
at_risk_segments = ['At Risk', "Can't Lose", 'About to Sleep']
is_at_risk = rfm['segment'].isin(at_risk_segments)
at_risk_customers = rfm[is_at_risk]
at_risk_value = at_risk_customers['monetary'].sum()
print("\n2. AT-RISK REVENUE")
print(f"   {len(at_risk_customers)} customers at risk")
print(f"   ${at_risk_value:,.0f} in historical value at stake")

# 3. Number of customers in the growth segments
growth_segments = ['Potential Loyalists', 'Promising', 'New Customers']
is_growth = rfm['segment'].isin(growth_segments)
growth_customers = rfm[is_growth]
print("\n3. GROWTH OPPORTUNITY")
print(f"   {len(growth_customers)} customers with growth potential")
print("   Focus on converting to Loyal/Champions")

# 4. Pareto check: revenue share of the top 20% of customers by spend
rfm_sorted = rfm.sort_values('monetary', ascending=False)
top_20_pct = int(len(rfm) * 0.2)
top_20_revenue = rfm_sorted.head(top_20_pct)['monetary'].sum()
total_revenue = rfm['monetary'].sum()
print("\n4. PARETO ANALYSIS")
print(f"   Top 20% of customers = {top_20_revenue/total_revenue*100:.1f}% of revenue")

# 5. Fixed list of recommended priorities
print("\n5. ACTION PRIORITIES")
for priority_line in (
    "   1. CRITICAL: Re-engage 'At Risk' and 'Can't Lose' segments",
    "   2. HIGH: Nurture 'Potential Loyalists' to become 'Champions'",
    "   3. MEDIUM: Improve 'New Customers' onboarding",
    "   4. LOW: Consider sunsetting 'Hibernating' segment",
):
    print(priority_line)

print("\n" + banner)

In [None]:
# Columns written to the export: identifiers, raw metrics, scores and segment
export_columns = [
    'customer_id',
    'recency', 'frequency', 'monetary',
    'r_score', 'f_score', 'm_score',
    'rfm_score', 'segment',
]
export_df = rfm[export_columns]

# Write without the DataFrame index so the CSV holds only the columns above
export_df.to_csv('../data/samples/customer_segments.csv', index=False)
print("Customer segments exported to data/samples/customer_segments.csv")

# Preview of the exported rows
export_df.head(10)