# Customer Segmentation Analysis

This notebook performs comprehensive customer segmentation using:
- **RFM Analysis**: Recency, Frequency, Monetary value segmentation
- **K-Means Clustering**: Machine learning-based customer grouping
- **Behavioral Analysis**: Purchase pattern identification

## Business Objective
Identify distinct customer segments to personalize marketing strategies and improve customer lifetime value.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Machine learning libraries
# Note: Install these in your local environment:
# pip install scikit-learn
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# Set style
# Apply the 'seaborn-v0_8' Matplotlib style sheet to every figure drawn below.
# NOTE(review): this style name is presumably only present in newer Matplotlib
# releases - confirm against the version pinned for this notebook.
plt.style.use('seaborn-v0_8')
# Make seaborn's "husl" palette the default colour cycle for later plots.
sns.set_palette("husl")

In [None]:
# Generate sample customer transaction data
def generate_customer_data(seed=42, end_date=None, lookback_days=730):
    """
    Generate synthetic customer transaction data

    Three behavioural segments are simulated (High_Value, Regular and
    Occasional). Every customer receives a random number of purchases placed
    at random moments inside the observation window, each with a random
    amount drawn uniformly from the segment's amount range.

    Parameters
    ----------
    seed : int, default 42
        Seed passed to ``np.random.seed`` so repeated calls give the same data.
    end_date : datetime, optional
        End of the observation window. Defaults to ``datetime.now()``.
    lookback_days : int, default 730
        Length of the observation window in days (two years by default).

    Returns
    -------
    pandas.DataFrame
        One row per transaction with the columns ``customer_id``,
        ``transaction_date``, ``amount`` and ``true_segment``.
    """
    np.random.seed(seed)

    # Resolve the window once so all customers share the same reference
    # period; asking for the current time per customer makes it drift.
    if end_date is None:
        end_date = datetime.now()
    start_date = end_date - timedelta(days=lookback_days)

    # Customer segments with different behaviors
    segments = {
        'High_Value': {'n': 200, 'freq_range': (10, 50), 'amount_range': (100, 500)},
        'Regular': {'n': 500, 'freq_range': (3, 15), 'amount_range': (25, 150)},
        'Occasional': {'n': 300, 'freq_range': (1, 5), 'amount_range': (10, 80)}
    }

    transactions = []
    customer_id = 1

    for segment, params in segments.items():
        freq_low, freq_high = params['freq_range']
        amount_low, amount_high = params['amount_range']

        for _ in range(params['n']):
            # Number of transactions for this customer
            # (np.random.randint excludes the upper bound)
            n_trans = np.random.randint(freq_low, freq_high)

            # Purchases land at random moments inside the window. Evenly
            # spaced dates would pin every multi-purchase customer's latest
            # purchase to end_date and leave Recency with almost no variation.
            day_offsets = np.sort(np.random.uniform(0, lookback_days, size=n_trans))
            amounts = np.random.uniform(amount_low, amount_high, size=n_trans)

            for day_offset, amount in zip(day_offsets, amounts):
                transactions.append({
                    'customer_id': customer_id,
                    'transaction_date': start_date + timedelta(days=float(day_offset)),
                    'amount': round(float(amount), 2),
                    'true_segment': segment
                })

            customer_id += 1

    return pd.DataFrame(transactions)

# Generate data
# Build the synthetic transaction table that the RFM cell below consumes.
transactions = generate_customer_data()
print(f"Generated {len(transactions)} transactions for {transactions['customer_id'].nunique()} customers")
# Last expression of the cell, so the notebook renders the first five rows.
transactions.head()

## RFM Analysis

In [None]:
# Calculate RFM metrics
def calculate_rfm(df, customer_col='customer_id', date_col='transaction_date', amount_col='amount'):
    """
    Build the per-customer Recency / Frequency / Monetary table

    Recency is the number of whole days between the snapshot date (one day
    after the latest transaction in ``df``) and the customer's last
    transaction. Frequency is the customer's transaction count and Monetary
    is the sum of their amounts, rounded to two decimals.

    Returns a DataFrame with ``customer_col`` followed by the columns
    ``Recency``, ``Frequency`` and ``Monetary``.
    """
    # Snapshot sits one day past the newest transaction, so recency is >= 1
    snapshot_date = df[date_col].max() + timedelta(days=1)

    per_customer = df.groupby(customer_col)
    rfm = per_customer.agg(
        Recency=(date_col, lambda dates: (snapshot_date - dates.max()).days),
        Frequency=(amount_col, 'count'),
        Monetary=(amount_col, 'sum'),
    )

    return rfm.round(2).reset_index()

# Calculate RFM
rfm_data = calculate_rfm(transactions)

# RFM Scoring (1-5 scale)
# Every metric is ranked before binning. pd.qcut raises "Bin edges must be
# unique" when many customers share one value, and rank(method='first')
# breaks such ties by row order so the five quintile edges are always distinct.
# Recency labels run 5 -> 1 because a smaller recency (a more recent purchase)
# should earn the higher score.
rfm_data['R_Score'] = pd.qcut(rfm_data['Recency'].rank(method='first'), 5, labels=[5,4,3,2,1])
rfm_data['F_Score'] = pd.qcut(rfm_data['Frequency'].rank(method='first'), 5, labels=[1,2,3,4,5])
rfm_data['M_Score'] = pd.qcut(rfm_data['Monetary'].rank(method='first'), 5, labels=[1,2,3,4,5])

# Combine RFM scores into a three-character code such as "545"
rfm_data['RFM_Score'] = rfm_data['R_Score'].astype(str) + rfm_data['F_Score'].astype(str) + rfm_data['M_Score'].astype(str)

print("RFM Analysis Results:")
print(rfm_data.describe())

# RFM Segmentation
def segment_customers(row):
    """
    Map one customer's R/F/M scores to a named segment

    ``row`` must expose ``R_Score``, ``F_Score`` and ``M_Score`` values that
    ``int()`` accepts. Rules are evaluated top to bottom and the first match
    wins; a customer matching no rule is labelled 'Others'.
    """
    recency = int(row['R_Score'])
    frequency = int(row['F_Score'])
    monetary = int(row['M_Score'])

    # Ordered (label, matched) pairs - order matters because rules overlap
    rules = (
        ('Champions', recency >= 4 and frequency >= 4 and monetary >= 4),
        ('Loyal Customers', recency >= 3 and frequency >= 3 and monetary >= 3),
        ('New Customers', recency >= 4 and frequency <= 2),
        ('Potential Loyalists', recency >= 3 and frequency >= 3 and monetary <= 2),
        ('At Risk', recency <= 2 and frequency >= 3),
        ('Lost Customers', recency <= 2 and frequency <= 2),
    )

    return next((label for label, matched in rules if matched), 'Others')

# Label every customer row with its rule-based segment
rfm_data['Segment'] = rfm_data.apply(segment_customers, axis=1)

# Segment analysis: size and average R/F/M per segment, rounded to 2 decimals
segment_summary = (
    rfm_data.groupby('Segment')
    .agg(
        Count=('customer_id', 'count'),
        Avg_Recency=('Recency', 'mean'),
        Avg_Frequency=('Frequency', 'mean'),
        Avg_Monetary=('Monetary', 'mean'),
    )
    .round(2)
)

print("\nCustomer Segment Summary:")
print(segment_summary)

## K-Means Clustering

In [None]:
# Prepare data for clustering: the three raw RFM metrics per customer
cluster_features = rfm_data[['Recency', 'Frequency', 'Monetary']].copy()

# Scale the features (StandardScaler: zero mean, unit variance per column)
scaler = StandardScaler()
cluster_features_scaled = scaler.fit_transform(cluster_features)

# Fit one model per candidate k and record both diagnostics
k_range = range(2, 11)
inertias = []
silhouette_scores = []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(cluster_features_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(cluster_features_scaled, kmeans.labels_))

# Plot elbow curve (left) and silhouette scores (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

diagnostic_panels = [
    (ax1, inertias, 'bo-', 'Inertia', 'Elbow Method for Optimal k'),
    (ax2, silhouette_scores, 'ro-', 'Silhouette Score', 'Silhouette Score for Different k'),
]
for panel, series, line_style, y_label, panel_title in diagnostic_panels:
    panel.plot(k_range, series, line_style)
    panel.set_xlabel('Number of Clusters (k)')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.grid(True)

plt.tight_layout()
plt.show()

# Use optimal number of clusters (let's say k=5)
# k is fixed here rather than derived from the diagnostics computed above.
optimal_k = 5
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
cluster_labels = kmeans_final.fit_predict(cluster_features_scaled)

# Add cluster labels to RFM data
rfm_data['Cluster'] = cluster_labels

# Analyze clusters: size and average R/F/M per cluster, rounded to 2 decimals
cluster_summary = (
    rfm_data.groupby('Cluster')
    .agg(
        Count=('customer_id', 'count'),
        Avg_Recency=('Recency', 'mean'),
        Avg_Frequency=('Frequency', 'mean'),
        Avg_Monetary=('Monetary', 'mean'),
    )
    .round(2)
)

print(f"\nK-Means Clustering Results (k={optimal_k}):")
print(cluster_summary)

In [None]:
# Visualize clusters on a 2 x 3 grid of panels
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Top row: distribution of each RFM metric, one box per cluster
for metric, metric_ax in zip(['Recency', 'Frequency', 'Monetary'], axes[0]):
    sns.boxplot(data=rfm_data, x='Cluster', y=metric, ax=metric_ax)
    metric_ax.set_title(f'{metric} by Cluster')

# Bottom left: customers projected onto the first two principal components
pca = PCA(n_components=2)
cluster_features_pca = pca.fit_transform(cluster_features_scaled)

pca_ax = axes[1, 0]
scatter = pca_ax.scatter(
    cluster_features_pca[:, 0],
    cluster_features_pca[:, 1],
    c=cluster_labels,
    cmap='viridis',
    alpha=0.6,
)
pca_ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
pca_ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
pca_ax.set_title('Customer Clusters (PCA Visualization)')
plt.colorbar(scatter, ax=pca_ax)

# Bottom middle: share of customers in each cluster
pie_ax = axes[1, 1]
cluster_counts = rfm_data['Cluster'].value_counts().sort_index()
pie_labels = [f'Cluster {i}' for i in cluster_counts.index]
pie_ax.pie(cluster_counts.values, labels=pie_labels, autopct='%1.1f%%', startangle=90)
pie_ax.set_title('Customer Distribution by Cluster')

# Bottom right: average monetary value per cluster
bar_ax = axes[1, 2]
avg_monetary = rfm_data.groupby('Cluster')['Monetary'].mean()
bar_positions = range(len(avg_monetary))
bar_ax.bar(bar_positions, avg_monetary.values)
bar_ax.set_xlabel('Cluster')
bar_ax.set_ylabel('Average Monetary Value ($)')
bar_ax.set_title('Average Monetary Value by Cluster')
bar_ax.set_xticks(bar_positions)

plt.tight_layout()
plt.show()

## Cluster Profiling and Business Insights

In [None]:
# Create detailed cluster profiles
def profile_clusters(rfm_df):
    """
    Summarise every cluster as a row of display-ready strings

    ``rfm_df`` needs the columns ``Cluster``, ``Recency``, ``Frequency`` and
    ``Monetary``. The result has one row per cluster (index ``Cluster_<label>``,
    in ascending label order) holding its size, its share of all customers,
    the formatted R/F/M averages, its total revenue and its share of revenue.
    """
    def describe(members):
        # Format the metrics of a single cluster's rows
        cluster_revenue = members['Monetary'].sum()
        return {
            'Size': len(members),
            'Percentage': f"{len(members)/len(rfm_df)*100:.1f}%",
            'Avg_Recency': f"{members['Recency'].mean():.1f} days",
            'Avg_Frequency': f"{members['Frequency'].mean():.1f} orders",
            'Avg_Monetary': f"${members['Monetary'].mean():.2f}",
            'Total_Revenue': f"${cluster_revenue:.2f}",
            'Revenue_Percentage': f"{cluster_revenue/rfm_df['Monetary'].sum()*100:.1f}%",
        }

    profiles = {
        f'Cluster_{label}': describe(rfm_df[rfm_df['Cluster'] == label])
        for label in sorted(rfm_df['Cluster'].unique())
    }

    # Clusters become rows, metrics become columns
    return pd.DataFrame(profiles).T

# Generate cluster profiles
cluster_profiles = profile_clusters(rfm_data)
print("Detailed Cluster Profiles:")
print(cluster_profiles)

# Define business-friendly cluster names
# KMeans label numbers carry no meaning of their own, so each name is assigned
# from the cluster's average RFM values instead of being tied to a fixed label
# number. Rules are applied in order, each picking among the clusters that are
# still unnamed; whatever remains is treated as the regular middle group.
cluster_means = rfm_data.groupby('Cluster')[['Recency', 'Frequency', 'Monetary']].mean()
naming_rules = [
    ('High Value', 'Monetary', 'idxmax'),         # highest average spend
    ('At Risk', 'Recency', 'idxmax'),             # longest time since last purchase
    ('Loyal Advocates', 'Frequency', 'idxmax'),   # most frequent buyers
    ('Price Conscious', 'Monetary', 'idxmin'),    # lowest average spend
]

unnamed_clusters = list(cluster_means.index)
cluster_names = {}
for name, metric, picker in naming_rules:
    if not unnamed_clusters:
        break
    chosen = getattr(cluster_means.loc[unnamed_clusters, metric], picker)()
    cluster_names[int(chosen)] = name
    unnamed_clusters.remove(chosen)
for leftover in unnamed_clusters:
    cluster_names[int(leftover)] = 'Regular Shoppers'

# Keep the mapping in ascending cluster order for stable printing
cluster_names = dict(sorted(cluster_names.items()))

rfm_data['Cluster_Name'] = rfm_data['Cluster'].map(cluster_names)

print("\nBusiness-Friendly Segment Names:")
for cluster, name in cluster_names.items():
    in_cluster = rfm_data['Cluster'] == cluster
    count = int(in_cluster.sum())
    revenue = rfm_data.loc[in_cluster, 'Monetary'].sum()
    print(f"Cluster {cluster} - {name}: {count} customers, ${revenue:,.2f} total revenue")

## Marketing Recommendations by Segment

In [None]:
# Generate marketing recommendations
# Static playbook keyed by business segment name. Each entry carries a headline
# strategy, a list of tactics and a budget share stored as a display string.
marketing_strategies = {
    'High Value': {
        'Strategy': 'VIP Treatment',
        'Tactics': ['Exclusive early access', 'Premium support', 'Personalized recommendations'],
        'Budget_Allocation': '30%'
    },
    'Loyal Advocates': {
        'Strategy': 'Referral Programs',
        'Tactics': ['Referral incentives', 'Brand ambassador programs', 'Social media features'],
        'Budget_Allocation': '25%'
    },
    'Regular Shoppers': {
        'Strategy': 'Upselling & Cross-selling',
        'Tactics': ['Product bundles', 'Loyalty points', 'Targeted promotions'],
        'Budget_Allocation': '20%'
    },
    'Price Conscious': {
        'Strategy': 'Value Communication',
        'Tactics': ['Discount campaigns', 'Value messaging', 'Bulk purchase offers'],
        'Budget_Allocation': '15%'
    },
    'At Risk': {
        'Strategy': 'Win-Back Campaigns',
        'Tactics': ['Re-engagement emails', 'Special discounts', 'Feedback surveys'],
        'Budget_Allocation': '10%'
    }
}

print("Marketing Strategy Recommendations by Segment:")
print("=" * 50)

# Print one short section per segment, in the dict's insertion order
for segment, strategy in marketing_strategies.items():
    print(f"\n{segment.upper()}:")
    print(f"Strategy: {strategy['Strategy']}")
    print(f"Budget Allocation: {strategy['Budget_Allocation']}")
    print(f"Tactics: {', '.join(strategy['Tactics'])}")

# Customer Lifetime Value estimation by segment
# Per-segment means of the three RFM metrics, indexed by Cluster_Name
clv_analysis = rfm_data.groupby('Cluster_Name').agg({
    'Frequency': 'mean',
    'Monetary': 'mean',
    'Recency': 'mean'
})

# Simple CLV calculation (frequency * monetary * estimated lifetime)
# The "lifetime" factor used here is 365 divided by the segment's mean recency.
# NOTE(review): Monetary looks like a per-customer total in this notebook, so
# multiplying it by Frequency may count purchase volume twice - confirm the
# intended formula before using these figures for budgeting.
clv_analysis['Estimated_CLV'] = (clv_analysis['Frequency'] * clv_analysis['Monetary'] * 
                                (365 / clv_analysis['Recency'])).round(2)

print("\n\nEstimated Customer Lifetime Value by Segment:")
print(clv_analysis.sort_values('Estimated_CLV', ascending=False))

## Export Results for Dashboard

In [None]:
# Prepare data for dashboard export
from pathlib import Path

dashboard_data = rfm_data[['customer_id', 'Recency', 'Frequency', 'Monetary', 
                          'RFM_Score', 'Segment', 'Cluster', 'Cluster_Name']].copy()

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (no-op when it is already there).
dashboard_dir = Path('../dashboards')
dashboard_dir.mkdir(parents=True, exist_ok=True)

# Export to CSV for dashboard (one row per customer, index column omitted)
dashboard_data.to_csv(dashboard_dir / 'customer_segments.csv', index=False)

# Create segment summary for dashboard (index kept: it holds the segment name)
segment_summary.to_csv(dashboard_dir / 'segment_summary.csv')

print("Data exported to dashboards folder for visualization:")
print("- customer_segments.csv: Individual customer segment data")
print("- segment_summary.csv: Aggregated segment metrics")

# Final insights
# Rank clusters by summed Monetary: the headline talks about revenue, and the
# cluster with the highest *average* spend is not necessarily the one that
# contributes the most revenue in total.
revenue_by_cluster = rfm_data.groupby('Cluster')['Monetary'].sum()
top_cluster = revenue_by_cluster.idxmax()
top_cluster_size = int((rfm_data['Cluster'] == top_cluster).sum())
at_risk_count = int((rfm_data['Cluster_Name'] == 'At Risk').sum())

print("\n" + "="*60)
print("KEY BUSINESS INSIGHTS")
print("="*60)
print(f"• Total Customers Analyzed: {len(rfm_data):,}")
print(f"• Total Revenue: ${rfm_data['Monetary'].sum():,.2f}")
print(f"• Top Revenue Segment: Cluster {top_cluster} "
      f"({top_cluster_size:,} customers, ${revenue_by_cluster.max():,.2f} revenue)")
print(f"• Customer Retention Opportunity: Focus on {at_risk_count} at-risk customers")