In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide pandas display settings: never truncate columns, cap printed rows at 100
for option_name, option_value in (('display.max_columns', None),
                                  ('display.max_rows', 100)):
    pd.set_option(option_name, option_value)

print("Libraries imported successfully!")

In [None]:
# === PHASE 1: Data Loading & Cleaning ===
# Read the raw transactions, decoding the file as ISO-8859-1 (Latin-1)
df = pd.read_csv('online_retail.csv', encoding='ISO-8859-1')

n_rows, n_cols = df.shape
print("Dataset loaded successfully!")
print(f"Shape: {n_rows} rows, {n_cols} columns")


In [None]:
# Preview the top of the raw frame (rendered as the cell's rich output)
print("First 5 rows of the dataset:")
df.head(n=5)

In [None]:
# Inspect column names, non-null counts and dtypes.
# In particular, check whether InvoiceDate was read as object or datetime.
print("Column Information:")
# DataFrame.info() writes its report to stdout and returns None, so it must not
# be wrapped in print() -- that would emit a stray "None" line after the report.
df.info()

In [None]:
# Missing values per column: absolute counts, then share of all rows in percent
missing_counts = df.isnull().sum()
missing_pct = (missing_counts / len(df)) * 100

print("\nMissing Values Count:")
print(missing_counts)
print("\n% of Missing Values:")
print(missing_pct)

In [None]:
# Summary statistics of the raw frame (rendered as the cell's rich output)
print("Basic Statistics:")
basic_stats = df.describe()
basic_stats

In [None]:
# Work on a copy so the raw frame `df` stays untouched for later comparison
df_clean = df.copy()
print(f"Starting with {len(df_clean)} rows")

# Step 1: drop rows that have no CustomerID
df_clean = df_clean[df_clean['CustomerID'].notna()]
print(f"After removing missing CustomerID: {len(df_clean)} rows")

# Step 2: keep only strictly positive quantities. This drops negative
# quantities (returns/cancellations) and zero-quantity lines as well.
df_clean = df_clean[df_clean['Quantity'] > 0]
print(f"After removing non-positive quantities: {len(df_clean)} rows")

# Step 3: keep only strictly positive prices (drops zero and negative prices).
# The trailing .copy() makes df_clean an independent frame, so the column
# assignments in later cells do not operate on a slice of an earlier frame.
df_clean = df_clean[df_clean['UnitPrice'] > 0].copy()
print(f"After removing non-positive prices: {len(df_clean)} rows")

# Report how much was dropped; guard the percentage against an empty input
removed_rows = len(df) - len(df_clean)
removed_pct = removed_rows / len(df) * 100 if len(df) else 0.0
print(f"\nFinal clean dataset: {len(df_clean)} rows")
print(f"Removed {removed_rows} rows ({removed_pct:.1f}%)")

In [None]:
# Line-item revenue: units sold multiplied by the price per unit
df_clean['TotalPrice'] = df_clean['Quantity'].mul(df_clean['UnitPrice'])

print("TotalPrice column created!")
print("\nSample of TotalPrice calculation:")
price_cols = ['Quantity', 'UnitPrice', 'TotalPrice']
print(df_clean.loc[:, price_cols].head(10))

In [None]:
# Parse InvoiceDate into pandas datetimes (no explicit format is passed)
df_clean['InvoiceDate'] = pd.to_datetime(df_clean['InvoiceDate'])

first_invoice = df_clean['InvoiceDate'].min()
last_invoice = df_clean['InvoiceDate'].max()
print("InvoiceDate converted to datetime!")
print(f"Date range: {first_invoice} to {last_invoice}")

In [None]:
# Final check - headline numbers for the cleaned frame
invoice_dates = df_clean['InvoiceDate']
order_totals = df_clean.groupby('InvoiceNo')['TotalPrice'].sum()  # revenue per invoice

print("=== CLEANED DATASET SUMMARY ===")
print(f"\nTotal rows: {df_clean.shape[0]}")
print(f"Total columns: {df_clean.shape[1]}")
print(f"\nUnique customers: {df_clean['CustomerID'].nunique()}")
print(f"Unique invoices: {df_clean['InvoiceNo'].nunique()}")
print(f"Date range: {invoice_dates.min().date()} to {invoice_dates.max().date()}")
print(f"\nTotal revenue: ${df_clean['TotalPrice'].sum():,.2f}")
print(f"Average order value: ${order_totals.mean():,.2f}")

print("\n Phase 1 Complete - Data is clean and ready for RFM analysis!")

In [None]:
# === PHASE 2: RFM Calculation ===
# Define Analysis Date

# Latest transaction timestamp present in the cleaned data
max_date = df_clean['InvoiceDate'].max()
print(f"Most recent transaction in dataset: {max_date}")

# The reference "today" for recency is one day past the latest transaction
analysis_date = max_date + pd.Timedelta(1, unit='D')
print(f"Analysis date (our 'today'): {analysis_date}")
print(f"Analysis date (simplified): {analysis_date.date()}")

print("\n Analysis date set - ready to calculate Recency!")

In [None]:
# === Calculate RFM Values ===

print("Calculating RFM metrics for each customer...\n")

# One row per customer, built with named aggregation:
#   Recency   - whole days between the customer's latest invoice and analysis_date
#   Frequency - number of distinct invoices (orders, not line items)
#   Monetary  - total spend summed over all of the customer's line items
rfm = (
    df_clean
    .groupby('CustomerID')
    .agg(
        Recency=('InvoiceDate', lambda dates: (analysis_date - dates.max()).days),
        Frequency=('InvoiceNo', 'nunique'),
        Monetary=('TotalPrice', 'sum'),
    )
    .reset_index()  # CustomerID back as a regular column
)

print(f"RFM calculated for {len(rfm)} customers")
print("\nFirst 5 customers:")
print(rfm.head())

print("\nBasic statistics of RFM values:")
print(rfm.describe())

print("\n‚úÖ Raw RFM values calculated!")

In [None]:
# === Assign RFM Scores (1-5 scale using quintiles) ===


def _quintile_score(values, labels):
    """Bucket `values` into five quantile bins labelled with `labels`.

    Value-based quintiles are tried first. pd.qcut raises ValueError when the
    quantile edges are not unique (heavily tied data); in that case the values
    are ranked with method='first' to break ties and the ranks are binned instead.

    Parameters:
        values: numeric pandas Series to score.
        labels: five labels, assigned from the lowest bin to the highest bin.

    Returns:
        Categorical Series of labels aligned with `values`.
    """
    try:
        return pd.qcut(values, q=5, labels=labels)
    except ValueError:
        return pd.qcut(values.rank(method='first'), q=5, labels=labels)


print("Assigning RFM scores (1-5 scale based on quintiles)...\n")

# Recency: lower is better, so the labels are reversed (most recent bin -> 5)
rfm['R_Score'] = _quintile_score(rfm['Recency'], labels=[5, 4, 3, 2, 1])

# Frequency and Monetary: higher is better, so labels ascend with the bins
rfm['F_Score'] = _quintile_score(rfm['Frequency'], labels=[1, 2, 3, 4, 5])
rfm['M_Score'] = _quintile_score(rfm['Monetary'], labels=[1, 2, 3, 4, 5])

# RFM_Score is the three digits concatenated as a string (e.g. "555" = best customer)
rfm['RFM_Score'] = rfm['R_Score'].astype(str) + rfm['F_Score'].astype(str) + rfm['M_Score'].astype(str)

print("RFM Scores assigned!")
print("\nSample of customers with scores:")
print(rfm[['CustomerID', 'Recency', 'Frequency', 'Monetary', 'R_Score', 'F_Score', 'M_Score', 'RFM_Score']].head(10))

# How many customers landed in each score bucket, per metric
for score_col in ('R_Score', 'F_Score', 'M_Score'):
    print(f"\nDistribution of {score_col}:")
    print(rfm[score_col].value_counts().sort_index())

print("\n‚úÖ RFM Scoring complete!")

In [None]:
# === Validation and Exploration ===

print("=== RFM CALCULATION SUMMARY ===\n")

# Headline counts and the period covered by the cleaned transactions
invoice_dates = df_clean['InvoiceDate']
print(f"Total customers analyzed: {len(rfm)}")
print(f"Date range of analysis: {invoice_dates.min().date()} to {invoice_dates.max().date()}")

# Observed spread of each raw metric
print(f"\nRecency range: {rfm['Recency'].min()} to {rfm['Recency'].max()} days")
print(f"Frequency range: {rfm['Frequency'].min()} to {rfm['Frequency'].max()} purchases")
print(f"Monetary range: ${rfm['Monetary'].min():.2f} to ${rfm['Monetary'].max():.2f}")

detail_cols = ['CustomerID', 'Recency', 'Frequency', 'Monetary']

# Customers scoring 5 on all three metrics
best_customers = rfm.loc[rfm['RFM_Score'] == '555']
print(f"\nüèÜ Best customers (555 score): {len(best_customers)}")
if not best_customers.empty:
    print(best_customers[detail_cols].head())

# Customers scoring 1 on all three metrics
worst_customers = rfm.loc[rfm['RFM_Score'] == '111']
print(f"\n‚ö†Ô∏è  At-risk customers (111 score): {len(worst_customers)}")
if not worst_customers.empty:
    print(worst_customers[detail_cols].head())

# Ten largest spenders
print("\nüí∞ Top 10 customers by total spending:")
top_spenders = rfm.nlargest(10, 'Monetary')[detail_cols + ['RFM_Score']]
print(top_spenders)

# Ten most frequent score combinations
print("\nüìä Most common RFM score combinations:")
print(rfm['RFM_Score'].value_counts().head(10))

print("\n‚úÖ Phase 2 Complete - RFM values calculated and scored!")
print("\nüéØ Ready for Phase 3: Customer Segmentation (Clustering)")

In [None]:
# === SAVE RFM DATA ===

# Persist the scored table (without the index column) for later reuse
rfm.to_csv('rfm_data.csv', index=False)
print("‚úÖ RFM data saved to 'rfm_data.csv'")

# The frame also stays in memory under the name `rfm` for Phase 3
n_rfm_customers = len(rfm)
print(f"‚úÖ RFM dataframe ready in variable 'rfm' with {n_rfm_customers} customers")

In [None]:
# === PHASE 3: Customer Segmentation (K-Means Clustering) ===
# Prepare Data for Clustering

from sklearn.preprocessing import StandardScaler

print("Preparing data for clustering...\n")

# Cluster on the raw R/F/M values only (no scores, no CustomerID)
feature_cols = ['Recency', 'Frequency', 'Monetary']
rfm_values = rfm.loc[:, feature_cols].copy()

print(f"Selected {len(rfm_values)} customers with 3 features (R, F, M)")
print("\nBefore normalization - value ranges:")
print(rfm_values.describe())

# Standardise every feature to zero mean and unit variance so that no single
# metric dominates the distance computation purely because of its scale
scaler = StandardScaler()
rfm_normalized = scaler.fit_transform(rfm_values)

# Same numbers wrapped in a DataFrame, for readable summaries
rfm_normalized_df = pd.DataFrame(rfm_normalized, columns=feature_cols)

print("\nAfter normalization - all features now on same scale:")
print(rfm_normalized_df.describe())

print("\n‚úÖ Data normalized and ready for clustering!")

In [None]:
# === Determine Optimal Number of Clusters ===

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

print("Testing different numbers of clusters...\n")

# Fit one model per candidate k (2..10) and record inertia + silhouette score
K_range = range(2, 11)
inertias = []
silhouette_scores = []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(rfm_normalized)

    k_silhouette = silhouette_score(rfm_normalized, kmeans.labels_)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(k_silhouette)

    print(f"k={k}: Inertia={kmeans.inertia_:.2f}, Silhouette Score={k_silhouette:.3f}")

# Two panels side by side: elbow curve (left) and silhouette curve (right)
fig_k, (ax_elbow, ax_silhouette) = plt.subplots(1, 2, figsize=(14, 5))

ax_elbow.plot(K_range, inertias, 'bo-')
ax_elbow.set_xlabel('Number of Clusters (k)')
ax_elbow.set_ylabel('Inertia (Within-cluster sum of squares)')
ax_elbow.set_title('Elbow Method - Looking for the "Elbow"')
ax_elbow.grid(True)

ax_silhouette.plot(K_range, silhouette_scores, 'ro-')
ax_silhouette.set_xlabel('Number of Clusters (k)')
ax_silhouette.set_ylabel('Silhouette Score')
ax_silhouette.set_title('Silhouette Score - Higher is Better')
ax_silhouette.grid(True)

fig_k.tight_layout()
fig_k.savefig('optimal_clusters.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nüìä Plots saved as 'optimal_clusters.png'")
print("\nüí° Recommendation: Look for k where:")
print("   - Elbow curve starts to flatten (diminishing returns)")
print("   - Silhouette score is reasonably high")
print("   - Business constraint: 4-6 clusters preferred")

print("\n‚úÖ Cluster optimization analysis complete!")

In [None]:
# === Run K-Means Clustering ===

# DECISION: k is picked by hand from the elbow/silhouette plots plus business need.
# k=5 is the value used in this example; change it after inspecting the plots.
optimal_k = 5

print(f"Running K-Means with k={optimal_k} clusters...\n")

# Fit the final model, then attach each customer's cluster label to `rfm`
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
kmeans_final.fit(rfm_normalized)
rfm['Cluster'] = kmeans_final.labels_

print("Clustering complete!")
print("\nCluster distribution:")
print(rfm['Cluster'].value_counts().sort_index())

print("\nSample of customers with cluster assignments:")
sample_cols = ['CustomerID', 'Recency', 'Frequency', 'Monetary', 'RFM_Score', 'Cluster']
print(rfm[sample_cols].head(10))

print("\n‚úÖ Customers assigned to clusters!")

In [None]:
# === Profile Each Segment ===

print("Profiling each cluster...\n")

# Per-cluster averages of R/F/M, plus total spend and customer count
cluster_profile = rfm.groupby('Cluster').agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'Monetary': ['mean', 'sum'],
    'CustomerID': 'count'
}).round(2)

# Flatten the two-level column index produced by the mixed aggregation
cluster_profile.columns = ['Recency_Avg', 'Frequency_Avg', 'Monetary_Avg', 'Monetary_Total', 'Customer_Count']

# Each cluster's share of total revenue
total_revenue = rfm['Monetary'].sum()
cluster_profile['Revenue_%'] = (cluster_profile['Monetary_Total'] / total_revenue * 100).round(1)

# Each cluster's share of the customer base
total_customers = len(rfm)
cluster_profile['Customer_%'] = (cluster_profile['Customer_Count'] / total_customers * 100).round(1)

print("Cluster Profiles:")
print(cluster_profile)
print(f"\n{'='*80}")

# Top three RFM score strings inside each cluster. Iterate over the cluster
# ids actually present in `rfm` rather than a hard-coded range(5), so the
# report stays correct when the number of clusters is changed.
print("\nMost common RFM scores in each cluster:")
for cluster_id in sorted(rfm['Cluster'].unique()):
    print(f"\nCluster {cluster_id}:")
    top_scores = rfm.loc[rfm['Cluster'] == cluster_id, 'RFM_Score'].value_counts().head(3)
    print(top_scores)

print("\n‚úÖ Cluster profiling complete!")

In [None]:
# === Visualize Segments ===

print("Creating segment visualizations...\n")

# 2x2 grid: three pairwise scatter plots plus a cluster-size bar chart
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# (axis, x column, y column, x label, y label, title) for each scatter panel
scatter_panels = [
    (axes[0, 0], 'Recency', 'Frequency',
     'Recency (days)', 'Frequency (purchases)', 'Recency vs Frequency by Cluster'),
    (axes[0, 1], 'Recency', 'Monetary',
     'Recency (days)', 'Monetary ($)', 'Recency vs Monetary by Cluster'),
    (axes[1, 0], 'Frequency', 'Monetary',
     'Frequency (purchases)', 'Monetary ($)', 'Frequency vs Monetary by Cluster'),
]
for panel, x_col, y_col, x_label, y_label, panel_title in scatter_panels:
    # Points are coloured by their cluster label
    panel.scatter(rfm[x_col], rfm[y_col], c=rfm['Cluster'], cmap='viridis', alpha=0.6)
    panel.set_xlabel(x_label)
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.grid(True, alpha=0.3)

# Bottom-right panel: number of customers per cluster
cluster_counts = rfm['Cluster'].value_counts().sort_index()
size_panel = axes[1, 1]
size_panel.bar(cluster_counts.index, cluster_counts.values, color='skyblue', edgecolor='black')
size_panel.set_xlabel('Cluster')
size_panel.set_ylabel('Number of Customers')
size_panel.set_title('Customer Distribution Across Clusters')
size_panel.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('customer_segments_visualization.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Visualizations created and saved!")
print("\nüéØ Phase 3 Complete - Customer Segments Identified!")

In [None]:
# === PHASE 4: Business Recommendation ===
# Assign Business Names to Clusters

print("Assigning business names to segments...\n")

# Map cluster numbers to business-friendly names.
# NOTE(review): this id -> name mapping is hand-assigned; re-check it against
# the cluster profile whenever the clustering is re-run or k is changed.
segment_names = {
    0: 'Core Customers',
    1: 'At-Risk/Lost',
    2: 'Champions',
    3: 'Super VIPs',
    4: 'Mega Whales'
}

# Fail loudly if a cluster id has no name: Series.map would otherwise turn it
# into NaN and those customers would silently drop out of every segment report.
unmapped_clusters = sorted(int(c) for c in set(rfm['Cluster'].unique()) - set(segment_names))
if unmapped_clusters:
    raise ValueError(f"No segment name defined for cluster id(s): {unmapped_clusters}")

# Add segment name column
rfm['Segment'] = rfm['Cluster'].map(segment_names)

print("Segment distribution:")
print(rfm['Segment'].value_counts())

print("\nSample customers with segment names:")
print(rfm[['CustomerID', 'Recency', 'Frequency', 'Monetary', 'RFM_Score', 'Segment']].head(15))

# Save the final segmented data
rfm.to_csv('customer_segments_final.csv', index=False)
print("\n‚úÖ Segmented data saved to 'customer_segments_final.csv'")

In [None]:
# === Quantify Business Opportunities ===
# Every figure printed below is derived from `rfm`. The scenario inputs
# (success rates, churn rates, programme costs) are hard-coded assumptions.


def _mean_or_zero(series):
    """Return the mean of `series`, or 0.0 when it has no rows (avoids NaN output)."""
    return float(series.mean()) if len(series) else 0.0


# Revenue across all customers; used to express segment revenue as a share
overall_revenue = rfm['Monetary'].sum()

print("\n" + "="*80)
print("BUSINESS OPPORTUNITY ANALYSIS")
print("="*80 + "\n")

# Opportunity 1: Win back At-Risk customers
at_risk = rfm[rfm['Segment'] == 'At-Risk/Lost']
at_risk_revenue = at_risk['Monetary'].sum()
at_risk_avg_spend = _mean_or_zero(at_risk['Monetary'])
at_risk_avg_recency = _mean_or_zero(at_risk['Recency'])
at_risk_count = len(at_risk)

print("üéØ OPPORTUNITY 1: Win-Back Campaign for At-Risk Customers")
print("Current state:")
# Inactivity is reported from the data instead of a fixed "8+ months" claim
print(f"  - {at_risk_count} customers, on average {at_risk_avg_recency:.0f} days since their last purchase")
print(f"  - Average lifetime value: ${at_risk_avg_spend:.2f}")
print(f"  - Total potential revenue at risk: ${at_risk_revenue:,.2f}")
print("\nScenario: Win-back campaign with 15% success rate")
recovered_customers = int(at_risk_count * 0.15)
recovered_revenue = recovered_customers * at_risk_avg_spend
print(f"  - Customers recovered: {recovered_customers}")
print(f"  - Additional annual revenue: ${recovered_revenue:,.2f}")
print(f"  - ROI if campaign costs $10K: {(recovered_revenue / 10000 - 1) * 100:.0f}%")

# Opportunity 2: Increase Core Customer frequency
core = rfm[rfm['Segment'] == 'Core Customers']
core_avg_frequency = _mean_or_zero(core['Frequency'])
core_avg_monetary = _mean_or_zero(core['Monetary'])
core_count = len(core)

print(f"\n{'='*80}\n")
print("üéØ OPPORTUNITY 2: Increase Core Customer Purchase Frequency")
print("Current state:")
print(f"  - {core_count} customers with avg {core_avg_frequency:.1f} purchases/year")
print(f"  - Average customer value: ${core_avg_monetary:.2f}")
print("\nScenario: Increase purchase frequency by 20% (email campaigns, loyalty rewards)")
additional_purchases = core_count * core_avg_frequency * 0.20
# Guard the per-purchase value against an empty Core segment (frequency 0)
revenue_per_purchase = core_avg_monetary / core_avg_frequency if core_avg_frequency else 0.0
additional_revenue = additional_purchases * revenue_per_purchase
print(f"  - Additional purchases: {additional_purchases:.0f}")
print(f"  - Additional annual revenue: ${additional_revenue:,.2f}")

# Opportunity 3: Protect Champions (churn prevention)
champions = rfm[rfm['Segment'] == 'Champions']
champions_revenue = champions['Monetary'].sum()
champions_count = len(champions)
champions_share = champions_revenue / overall_revenue * 100 if overall_revenue else 0.0

print(f"\n{'='*80}\n")
print("üéØ OPPORTUNITY 3: Champions Retention Program")
print("Current state:")
print(f"  - {champions_count} Champions generate ${champions_revenue:,.2f} ({champions_share:.1f}% of revenue)")
print("  - Industry avg churn: 10-15% annually")
print("\nScenario: VIP program reduces churn from 12% to 5%")
prevented_churn = champions_count * 0.07  # 7% reduction
# 7% of the segment's revenue; written without dividing by champions_count so
# that an empty Champions segment yields 0 instead of a ZeroDivisionError
retained_revenue = champions_revenue * 0.07
print(f"  - Customers retained: {prevented_churn:.0f}")
print(f"  - Revenue protected: ${retained_revenue:,.2f}")
print(f"  - ROI if program costs $50K: {(retained_revenue / 50000 - 1) * 100:.0f}%")

# Opportunity 4: Protect Whales
whales = rfm[rfm['Segment'].isin(['Super VIPs', 'Mega Whales'])]
whales_revenue = whales['Monetary'].sum()
whales_count = len(whales)
whales_share = whales_revenue / overall_revenue * 100 if overall_revenue else 0.0
mega_whale_avg = _mean_or_zero(rfm.loc[rfm['Segment'] == 'Mega Whales', 'Monetary'])

print(f"\n{'='*80}\n")
print(f"üéØ OPPORTUNITY 4: White-Glove Service for Top {whales_count} Customers")
print("Current state:")
print(f"  - {whales_count} customers generate ${whales_revenue:,.2f} ({whales_share:.1f}% of revenue)")
print(f"  - Losing ONE Mega Whale = ${mega_whale_avg:,.2f} loss")
print("\nScenario: Dedicated account manager prevents 1 whale churn")
whale_avg_value = _mean_or_zero(whales['Monetary'])
print(f"  - Value protected by preventing 1 churn: ${whale_avg_value:,.2f}")
print(f"  - ROI if dedicated manager costs $80K/year: {(whale_avg_value / 80000 - 1) * 100:.0f}%")

# Total opportunity summary
print(f"\n{'='*80}")
print("üí∞ TOTAL ANNUAL OPPORTUNITY SUMMARY")
print(f"{'='*80}")
total_opportunity = recovered_revenue + additional_revenue + retained_revenue + whale_avg_value
print(f"Win-back campaign:           ${recovered_revenue:>12,.2f}")
print(f"Frequency increase:          ${additional_revenue:>12,.2f}")
print(f"Champions retention:         ${retained_revenue:>12,.2f}")
print(f"Whale protection:            ${whale_avg_value:>12,.2f}")
print(f"{'-'*80}")
print(f"TOTAL REVENUE OPPORTUNITY:   ${total_opportunity:>12,.2f}")
print(f"{'='*80}\n")

print("‚úÖ Business opportunity analysis complete!")

In [None]:
# === Strategic Recommendations by Segment ===

print("\n" + "="*80)
print("SEGMENT-SPECIFIC MARKETING STRATEGIES")
print("="*80 + "\n")

# Segment size and revenue share are computed from `rfm`, so the table cannot
# drift from the data when the clustering is re-run. Segments that are absent
# from `rfm` show up with 0 customers and 0.0% revenue.
segment_order = ['Core Customers', 'At-Risk/Lost', 'Champions', 'Super VIPs', 'Mega Whales']
segment_stats = (
    rfm.groupby('Segment')['Monetary']
    .agg(['count', 'sum'])
    .reindex(segment_order, fill_value=0)
)
n_all_customers = len(rfm)
all_revenue = rfm['Monetary'].sum()

# Formatted like "3,049 (70.3%)" and "45.8%"; shares fall back to 0 on empty data
size_labels = [
    f"{int(n):,} ({(n / n_all_customers * 100 if n_all_customers else 0.0):.1f}%)"
    for n in segment_stats['count']
]
revenue_labels = [
    f"{(rev / all_revenue * 100 if all_revenue else 0.0):.1f}%"
    for rev in segment_stats['sum']
]

# Recommendations table: one entry per segment, in `segment_order`.
# Strategy, cadence, budget and priority are hand-authored recommendations.
strategies = {
    'Segment': segment_order,
    'Size': size_labels,
    'Revenue_Share': revenue_labels,
    'Marketing_Strategy': [
        'Regular email campaigns, product recommendations, seasonal promotions',
        'Win-back campaign: "We miss you" emails, 20% discount for return',
        'VIP loyalty program, early access to new products, exclusive discounts',
        'Dedicated account manager, quarterly business reviews, custom solutions',
        'Personal relationship with CEO, custom pricing, white-glove service'
    ],
    'Communication_Frequency': [
        'Bi-weekly emails',
        'One-time campaign then remove if no response',
        'Weekly engagement',
        'Monthly check-ins',
        'Continuous personal contact'
    ],
    'Budget_Allocation': [
        '40%',
        '10%',
        '25%',
        '10%',
        '15%'
    ],
    'Priority': [
        'Medium',
        'Low (test campaign only)',
        'High',
        'Critical',
        'Critical'
    ]
}

strategies_df = pd.DataFrame(strategies)

print(strategies_df.to_string(index=False))

# Save to CSV
strategies_df.to_csv('segment_strategies.csv', index=False)
print("\n‚úÖ Strategies saved to 'segment_strategies.csv'\n")

print("="*80)
print("‚úÖ PHASE 4 COMPLETE - Business Recommendations Generated!")
print("="*80)

In [None]:
# === PORTFOLIO VISUALIZATIONS ===
# matplotlib (plt) and seaborn (sns) are already imported in the notebook's
# first cell, so they are not re-imported here.

print("Creating portfolio-ready visualizations...\n")

# Set style for professional look
sns.set_style("whitegrid")
plt.rcParams['figure.facecolor'] = 'white'

# ============================================================================
# VISUALIZATION 1: Segment Revenue Contribution (For Portfolio Page 2)
# ============================================================================

fig, ax = plt.subplots(figsize=(10, 6))

# Revenue share per segment, computed from `rfm` (not hard-coded) so the chart
# always matches the current segmentation. Absent segments contribute 0.
segments = ['Core Customers', 'At-Risk/Lost', 'Champions', 'Super VIPs', 'Mega Whales']
revenue_by_segment = rfm.groupby('Segment')['Monetary'].sum().reindex(segments, fill_value=0)
revenue_pct = (revenue_by_segment / revenue_by_segment.sum() * 100).tolist()
colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12', '#9b59b6']

# Create pie chart
wedges, texts, autotexts = ax.pie(revenue_pct,
                                  labels=segments,
                                  autopct='%1.1f%%',
                                  colors=colors,
                                  startangle=90,
                                  textprops={'fontsize': 12, 'weight': 'bold'})

# Make percentage text white and bold
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontsize(11)
    autotext.set_weight('bold')

ax.set_title('Revenue Distribution by Customer Segment', fontsize=16, weight='bold', pad=20)

plt.tight_layout()
plt.savefig('segment_revenue_pie.png', dpi=300, bbox_inches='tight', facecolor='white')
plt.show()

print("‚úÖ Saved: segment_revenue_pie.png")

# ============================================================================
# VISUALIZATION 2: Segment Profile Heatmap (For Portfolio Page 3)
# ============================================================================

fig, ax = plt.subplots(figsize=(10, 6))

# Per-segment 1-5 scores shown in the heatmap. These values are entered by hand
# here; they are not computed from `rfm` in this cell.
segment_data = {
    'Segment': ['Core Customers', 'At-Risk/Lost', 'Champions', 'Super VIPs', 'Mega Whales'],
    'Recency_Score': [5, 1, 5, 5, 5],  # Lower recency = higher score
    'Frequency_Score': [3, 1, 5, 5, 4],
    'Monetary_Score': [2, 1, 5, 5, 5]
}

# One [recency, frequency, monetary] row per segment
heatmap_data = [
    [recency_score, frequency_score, monetary_score]
    for recency_score, frequency_score, monetary_score in zip(
        segment_data['Recency_Score'],
        segment_data['Frequency_Score'],
        segment_data['Monetary_Score'],
    )
]

# Draw the annotated heatmap on a fixed 1-5 colour scale
heatmap_style = dict(
    annot=True,
    fmt='d',
    cmap='RdYlGn',
    cbar_kws={'label': 'Score (1=Low, 5=High)'},
    linewidths=2,
    linecolor='white',
    vmin=1,
    vmax=5,
)
sns.heatmap(
    heatmap_data,
    xticklabels=['Recency', 'Frequency', 'Monetary'],
    yticklabels=segment_data['Segment'],
    ax=ax,
    **heatmap_style,
)

ax.set_title('Customer Segment Profiles (RFM Scores)', fontsize=16, weight='bold', pad=20)
ax.set_xlabel('RFM Metrics', fontsize=12, weight='bold')
ax.set_ylabel('Customer Segment', fontsize=12, weight='bold')

plt.tight_layout()
plt.savefig('segment_profile_heatmap.png', dpi=300, bbox_inches='tight', facecolor='white')
plt.show()

print("‚úÖ Saved: segment_profile_heatmap.png")

# ============================================================================
# VISUALIZATION 3: Business Opportunities Bar Chart (For Portfolio Page 3)
# ============================================================================

fig, ax = plt.subplots(figsize=(10, 6))

opportunities = ['Win-Back\nCampaign', 'Frequency\nIncrease', 'Champions\nRetention', 'Whale\nProtection']
# Use the figures computed in the "Quantify Business Opportunities" cell rather
# than hard-coded numbers, so the chart always agrees with the printed analysis
values = [recovered_revenue, additional_revenue, retained_revenue, whale_avg_value]
colors_bars = ['#e74c3c', '#3498db', '#2ecc71', '#9b59b6']

bars = ax.bar(opportunities, values, color=colors_bars, edgecolor='black', linewidth=1.5)

# Add value labels on bars
for bar, value in zip(bars, values):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'${value:,.0f}',
            ha='center', va='bottom', fontsize=11, weight='bold')

ax.set_ylabel('Annual Revenue Opportunity ($)', fontsize=12, weight='bold')
ax.set_title('Identified Business Opportunities', fontsize=16, weight='bold', pad=20)
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x/1000:.0f}K'))
ax.grid(axis='y', alpha=0.3)

# Reference line at the average opportunity. The line sits at total / number of
# bars, so it is labelled as the average; the total is given in the same box.
total = sum(values)
average = total / len(values)
ax.axhline(y=average, color='red', linestyle='--', linewidth=2, alpha=0.5)
ax.text(len(opportunities)-0.5, average, f'Avg: ${average:,.0f}\nTotal: ${total:,.0f}',
        fontsize=11, weight='bold', color='red',
        bbox=dict(boxstyle='round', facecolor='white', edgecolor='red'))

plt.tight_layout()
plt.savefig('business_opportunities_bar.png', dpi=300, bbox_inches='tight', facecolor='white')
plt.show()

print("‚úÖ Saved: business_opportunities_bar.png")

# Closing banner followed by the list of image files produced by the notebook
banner = "="*80
print("\n" + banner)
print("‚úÖ ALL PORTFOLIO VISUALIZATIONS CREATED!")
print(banner)
print("Data:")
for output_line in (
    "  1. segment_revenue_pie.png",
    "  2. segment_profile_heatmap.png",
    "  3. business_opportunities_bar.png",
    "  4. customer_segments_visualization.png (from Phase 3)",
    "  5. optimal_clusters.png (from Phase 3)",
):
    print(output_line)
print("\nProject Done!!!")

In [None]:
# === FINAL VALIDATION ===

banner_line = "="*80
print(banner_line)
print("SANITY CHECKS - Verify Results Make Business Sense")
print(banner_line + "\n")

# Check 1: every customer carries a (non-null) segment label
total_customers = len(rfm)
segment_sum = rfm['Segment'].count()
assert total_customers == segment_sum, "‚ùå Segment counts don't match total!"
print(f"‚úÖ Check 1 Passed: {total_customers} customers = {segment_sum} in segments")

# Check 2: per-segment revenue adds back up to the overall total
total_revenue = rfm['Monetary'].sum()
segment_revenue = rfm.groupby('Segment')['Monetary'].sum().sum()
assert abs(total_revenue - segment_revenue) < 0.01, "‚ùå Revenue mismatch!"
print(f"‚úÖ Check 2 Passed: Total revenue ${total_revenue:,.2f} matches segment sum")

# Check 3: Champions spend more on average than At-Risk/Lost customers
champions_avg = rfm.loc[rfm['Segment'] == 'Champions', 'Monetary'].mean()
at_risk_avg = rfm.loc[rfm['Segment'] == 'At-Risk/Lost', 'Monetary'].mean()
assert champions_avg > at_risk_avg, "‚ùå Champions should spend more than At-Risk!"
print(f"‚úÖ Check 3 Passed: Champions (${champions_avg:,.2f}) > At-Risk (${at_risk_avg:,.2f})")

# Check 4 (informational, no assertion): revenue share of the top 5% of spenders
top_5pct_count = int(len(rfm) * 0.05)
top_5pct_revenue = rfm['Monetary'].nlargest(top_5pct_count).sum()
top_5pct_pct = (top_5pct_revenue / total_revenue) * 100
print(f"‚úÖ Check 4: Top 5% of customers = {top_5pct_pct:.1f}% of revenue (expected ~50%)")

print("\n" + banner_line)
print("‚úÖ ALL SANITY CHECKS PASSED - Results are valid!")
print(banner_line)