# 🎯 Day 4: Explainability + Segmentation + Retention Strategy

**Customer Churn Analytics Project**

This notebook covers:
1. Load Model and Data
2. Customer Risk Scoring
3. Global Feature Importance
4. Local Explainability
5. Customer Segmentation (KMeans + PCA)
6. Retention Strategy Engine
7. Final Summary

In [1]:
# Imports
import sys
# Put the project's `src` directory first on the import path so the
# project modules below resolve. The path is relative, so it depends on the
# kernel's working directory — assumes the notebook is run from its own folder; TODO confirm.
sys.path.insert(0, '../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Silences every warning for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings raised by the
# modelling and clustering steps — consider narrowing the filter.
warnings.filterwarnings('ignore')

from pathlib import Path

# Project modules
# These imports only work after the sys.path.insert call above has run.
from config import OUTPUT_DIR
from preprocess import load_clean_data
from features import prepare_features, get_feature_names
# Risk scoring and explainability helpers.
# NOTE(review): the saved output shows "SHAP not available, will use permutation importance"
# printed during this cell — presumably emitted by `explain` at import time; confirm.
from explain import (
    load_best_model, compute_risk_scores, save_risk_scores,
    get_global_feature_importance, plot_global_importance,
    explain_high_risk_customers
)
# Clustering / PCA helpers and the list of columns used for segmentation.
from segmentation import (
    prepare_features_for_clustering, apply_pca,
    perform_kmeans_clustering, segment_customers,
    plot_pca_clusters, plot_segment_profiles, save_segments,
    SEGMENTATION_FEATURES
)
# Retention recommendation helpers.
from retention_strategy import (
    generate_retention_actions, summarize_retention_actions,
    print_retention_summary, save_retention_actions
)

# Reached only if every import above succeeded.
print("‚úì All modules loaded successfully!")

SHAP not available, will use permutation importance
‚úì All modules loaded successfully!


## 1. Load Model and Data

In [2]:
# Obtain the model through the explain-module helper and report its class name
model = load_best_model()
model_class_name = type(model).__name__
print("Model loaded: " + model_class_name)

Model loaded: Pipeline


In [3]:
# Read the cleaned customer table, then report its dimensions and the mean of
# the `churn` column as a percentage (assumes churn is coded 0/1 — TODO confirm)
df = load_clean_data()
churn_rate_pct = df['churn'].mean() * 100
print(f"\nData shape: {df.shape}")
print(f"Churn rate: {churn_rate_pct:.2f}%")

Loaded cleaned dataset: 5,000 rows √ó 17 columns

Data shape: (5000, 17)
Churn rate: 21.94%


In [4]:
# Build the engineered feature table (interaction terms + log transforms enabled)
df_features = prepare_features(df, apply_log=True, create_interactions=True)

# Ask the features module for the matching numeric / categorical column names
numeric_features, categorical_features = get_feature_names(
    include_log=True, include_interactions=True
)

# Keep only the names that actually exist in the engineered table
available_columns = set(df_features.columns)
numeric_features = [name for name in numeric_features if name in available_columns]
categorical_features = [name for name in categorical_features if name in available_columns]

# Model input: numeric columns first, then categorical ones
all_features = [*numeric_features, *categorical_features]
X = df_features[all_features]

print(f"Features: {len(all_features)}")

Created interaction features: sessions_per_crash, payment_failure_rate, support_per_session, avg_minutes_per_session
Applied log1p transform to 7 columns
Features: 26


## 2. Customer Risk Scoring

In [5]:
# Compute risk scores for all customers.
# The cut-offs are named once here so the masks and the printed labels cannot drift apart.
CHURN_DECISION_THRESHOLD = 0.5  # forwarded as `threshold`; presumably the cut-off behind churn_prediction — confirm in explain.py
HIGH_RISK_MIN = 0.7             # probability >= this is reported as high risk
LOW_RISK_MAX = 0.3              # probability < this is reported as low risk

risk_scores = compute_risk_scores(
    model, X,
    customer_ids=df['customer_id'],
    threshold=CHURN_DECISION_THRESHOLD,
)

# Build each band mask once and reuse it for the counts below
churn_prob = risk_scores['churn_probability']
is_high_risk = churn_prob >= HIGH_RISK_MIN
is_medium_risk = (churn_prob >= LOW_RISK_MAX) & (churn_prob < HIGH_RISK_MIN)
is_low_risk = churn_prob < LOW_RISK_MAX

# Sanity check: the three bands must cover every scored customer exactly once
# (this fails if any probability is NaN, which would otherwise be silently dropped from the summary)
assert is_high_risk.sum() + is_medium_risk.sum() + is_low_risk.sum() == len(risk_scores), \
    "risk bands do not partition the scored customers (NaN probabilities?)"

# NOTE(review): the saved output shows zero medium-risk customers and probabilities
# pinned near 0 or 1. Scoring runs on the full cleaned dataset loaded above — confirm
# the model is not being evaluated on rows it was trained on.
print("\nüìä Risk Score Summary:")
print(f"   Total customers: {len(risk_scores):,}")
print(f"   High risk (>={HIGH_RISK_MIN:.0%}): {is_high_risk.sum():,}")
print(f"   Medium risk ({LOW_RISK_MAX * 100:.0f}-{HIGH_RISK_MIN:.0%}): {is_medium_risk.sum():,}")
print(f"   Low risk (<{LOW_RISK_MAX:.0%}): {is_low_risk.sum():,}")


üìä Risk Score Summary:
   Total customers: 5,000
   High risk (>=70%): 1,097
   Medium risk (30-70%): 0
   Low risk (<30%): 3,903


In [6]:
# Show the first ten rows of the score table.
# The heading assumes risk_scores is already ordered highest-risk first — confirm in compute_risk_scores.
top_10_scores = risk_scores.head(10)
print("\nüö® Top 10 High-Risk Customers:")
display(top_10_scores)


üö® Top 10 High-Risk Customers:


Unnamed: 0,customer_id,churn_probability,churn_prediction,risk_category
4156,C004157,1.0,1,Very High
1887,C001888,1.0,1,Very High
1234,C001235,1.0,1,Very High
1664,C001665,1.0,1,Very High
2633,C002634,1.0,1,Very High
1572,C001573,1.0,1,Very High
4908,C004909,1.0,1,Very High
1779,C001780,1.0,1,Very High
3065,C003066,1.0,1,Very High
3730,C003731,1.0,1,Very High


In [7]:
# Persist the score table; the helper's return value is echoed as the cell result
# (the saved output shows it is the path of the written CSV)
scores_csv_path = save_risk_scores(risk_scores)
scores_csv_path

‚úì Saved customer scores: C:\Users\Lenovo\Desktop\churn\churn_project\outputs\customer_scores.csv


WindowsPath('C:/Users/Lenovo/Desktop/churn/churn_project/outputs/customer_scores.csv')

## 3. Global Feature Importance

In [8]:
# Rank features by their global importance for the loaded model.
# NOTE(review): in the saved output the raw and *_log versions of the same signal both
# rank highly, so their individual importances may be split between correlated columns.
importance_df = get_global_feature_importance(model, all_features)
top_10_importance = importance_df.head(10)

print("\nüîù Top 10 Most Important Features:")
display(top_10_importance)


üîù Top 10 Most Important Features:


Unnamed: 0,feature,importance
3,num__total_sessions_30d,2.532245
4,num__avg_session_minutes_30d,2.212337
15,num__avg_session_minutes_30d_log,1.472334
14,num__total_sessions_30d_log,1.218215
2,num__auto_renew,0.964961
13,num__avg_minutes_per_session,0.713551
10,num__sessions_per_crash,0.698469
5,num__total_crashes_30d,0.469092
9,num__avg_resolution_time_30d,0.404964
21,cat__gender_Female,0.346182


In [9]:
# Directory for every figure produced in this notebook (reused by the later plotting cells)
plots_dir = OUTPUT_DIR / 'plots'

# Draw the 15 highest-ranked features; the helper's return value is echoed as the cell result
importance_plot_path = plot_global_importance(
    importance_df,
    plots_dir=plots_dir,
    top_n=15,
)
importance_plot_path

  ‚úì Saved: global_feature_importance.png


WindowsPath('C:/Users/Lenovo/Desktop/churn/churn_project/outputs/plots/global_feature_importance.png')

## 4. Local Explainability

In [10]:
# Build per-customer explanations for the five highest-ranked customers.
# NOTE(review): in the saved output `top_risk_factors` is an empty list for every customer and
# `key_features` is the same global ranking for all of them, so nothing here is customer-specific
# yet — verify explain_high_risk_customers (possibly tied to SHAP being unavailable).
explain_options = dict(top_n=5, feature_importance=importance_df)
explanations = explain_high_risk_customers(model, X, risk_scores, **explain_options)

print("\nüìã Top 5 High-Risk Customer Explanations:")
display(explanations)


üìã Top 5 High-Risk Customer Explanations:


Unnamed: 0,customer_id,churn_probability,risk_category,top_risk_factors,key_features
0,C004157,1.0,Very High,[],"num__total_sessions_30d, num__avg_session_minu..."
1,C001888,1.0,Very High,[],"num__total_sessions_30d, num__avg_session_minu..."
2,C001235,1.0,Very High,[],"num__total_sessions_30d, num__avg_session_minu..."
3,C001665,1.0,Very High,[],"num__total_sessions_30d, num__avg_session_minu..."
4,C002634,1.0,Very High,[],"num__total_sessions_30d, num__avg_session_minu..."


In [11]:
# For each of the five highest-ranked customers, print the raw values of a few
# behavioural / billing columns next to their churn probability.
print("\nüîç Detailed Analysis of Top 5 High-Risk Customers:\n")

top_5 = risk_scores.head(5)
top_5_ids = top_5['customer_id'].tolist()
key_features = ['total_sessions_30d', 'failed_payments_30d', 'support_tickets_30d',
                'monthly_price', 'avg_session_minutes_30d']

# Walk id/probability pairs straight from the top-5 rows instead of re-querying
# risk_scores for a value that is already in hand.
for cust_id, prob in zip(top_5['customer_id'], top_5['churn_probability']):
    print(f"Customer: {cust_id} | Risk: {prob*100:.1f}%")

    matching_rows = df.loc[df['customer_id'] == cust_id]
    if matching_rows.empty:
        # A scored id with no row in df would otherwise surface as a bare IndexError from .iloc[0]
        print("   (no matching row found in df)")
        print()
        continue

    cust_data = matching_rows.iloc[0]  # first match, same row the original lookup used
    for feat in key_features:
        # Skip columns that are absent from the cleaned table rather than failing
        if feat in cust_data.index:
            print(f"   {feat}: {cust_data[feat]}")
    print()


üîç Detailed Analysis of Top 5 High-Risk Customers:

Customer: C004157 | Risk: 100.0%
   total_sessions_30d: 1
   failed_payments_30d: 0
   support_tickets_30d: 1
   monthly_price: 499
   avg_session_minutes_30d: 1.16

Customer: C001888 | Risk: 100.0%
   total_sessions_30d: 0
   failed_payments_30d: 0
   support_tickets_30d: 2
   monthly_price: 499
   avg_session_minutes_30d: 0.0

Customer: C001235 | Risk: 100.0%
   total_sessions_30d: 2
   failed_payments_30d: 0
   support_tickets_30d: 2
   monthly_price: 999
   avg_session_minutes_30d: 1.3

Customer: C001665 | Risk: 100.0%
   total_sessions_30d: 0
   failed_payments_30d: 0
   support_tickets_30d: 2
   monthly_price: 199
   avg_session_minutes_30d: 0.0

Customer: C002634 | Risk: 100.0%
   total_sessions_30d: 0
   failed_payments_30d: 0
   support_tickets_30d: 2
   monthly_price: 499
   avg_session_minutes_30d: 0.0



## 5. Customer Segmentation

In [12]:
# Show which columns the segmentation module is configured to use
print(f"Segmentation features: {SEGMENTATION_FEATURES}")

# Build the scaled clustering matrix; the helper also returns the fitted scaler
# and the list of columns it actually used
X_scaled, scaler, features_used = prepare_features_for_clustering(df)
n_clustering_features = len(features_used)
print(f"\nUsing {n_clustering_features} features for clustering")

Segmentation features: ['total_sessions_30d', 'avg_session_minutes_30d', 'failed_payments_30d', 'support_tickets_30d', 'monthly_price']

Using 5 features for clustering


In [13]:
# Reduce the scaled clustering matrix to two components so clusters can be drawn in 2-D
X_pca, pca = apply_pca(
    X_scaled,
    n_components=2,
)

PCA with 2 components explains 59.7% of variance


In [14]:
# Cluster the scaled matrix into four groups; `labels` feeds the PCA scatter plot later on
labels, kmeans = perform_kmeans_clustering(
    X_scaled,
    k=4,
)

K-Means clustering with k=4
Cluster distribution:
  Cluster 0: 3,193 customers (63.9%)
  Cluster 1: 1,024 customers (20.5%)
  Cluster 2: 627 customers (12.5%)
  Cluster 3: 156 customers (3.1%)


In [15]:
# Build the per-customer segment table.
# NOTE(review): the saved output prints the K-Means log a second time here, so segment_customers
# appears to run its own clustering rather than reuse `labels`/`kmeans` from the previous cell.
segments = segment_customers(df, k=4)
segment_counts = segments['segment_name'].value_counts()

print("\nüìä Segment Distribution:")
print(segment_counts)

K-Means clustering with k=4
Cluster distribution:
  Cluster 0: 3,193 customers (63.9%)
  Cluster 1: 1,024 customers (20.5%)
  Cluster 2: 627 customers (12.5%)
  Cluster 3: 156 customers (3.1%)

üìä Segment Distribution:
segment_name
High Value Engaged    3193
At Risk               1024
Low Engagement         627
Price Sensitive        156
Name: count, dtype: int64


In [16]:
# Scatter the 2-D PCA projection coloured by cluster label and save it under plots_dir.
# NOTE(review): `labels` comes from the standalone perform_kmeans_clustering call, while `segments`
# was produced by segment_customers — confirm both always yield the same assignment.
pca_plot_path = plot_pca_clusters(X_pca, labels, plots_dir=plots_dir)
pca_plot_path

  ‚úì Saved: pca_clusters.png


WindowsPath('C:/Users/Lenovo/Desktop/churn/churn_project/outputs/plots/pca_clusters.png')

In [17]:
# Draw the per-segment profile figure and save it under plots_dir;
# the helper's return value is echoed as the cell result
profiles_plot_path = plot_segment_profiles(df, segments, plots_dir=plots_dir)
profiles_plot_path

  ‚úì Saved: segment_profiles.png


WindowsPath('C:/Users/Lenovo/Desktop/churn/churn_project/outputs/plots/segment_profiles.png')

In [18]:
# Persist the segment table; the helper's return value is echoed as the cell result
segments_csv_path = save_segments(segments)
segments_csv_path

‚úì Saved segments: C:\Users\Lenovo\Desktop\churn\churn_project\outputs\segments.csv


WindowsPath('C:/Users/Lenovo/Desktop/churn/churn_project/outputs/segments.csv')

## 6. Retention Strategy Engine

In [19]:
# Combine the customer table, risk scores and segments into one recommendation per customer
actions_df = generate_retention_actions(segments=segments, risk_scores=risk_scores, df=df)

n_recommendations = len(actions_df)
print(f"\nGenerated {n_recommendations:,} retention recommendations")

Generating retention recommendations...

Generated 5,000 retention recommendations


In [20]:
# Show the first ten recommendations
sample_actions = actions_df.head(10)
print("\nüìã Sample Retention Actions:")
display(sample_actions)


üìã Sample Retention Actions:


Unnamed: 0,customer_id,churn_probability,segment_id,recommended_action
0,C000001,1.1e-05,0,Offer loyalty reward + Exclusive content access
1,C000002,1.6e-05,0,Offer loyalty reward + Exclusive content access
2,C000003,2.8e-05,0,Offer loyalty reward + Exclusive content access
3,C000004,0.000109,0,Offer loyalty reward + Exclusive content access
4,C000005,0.999951,1,Offer premium discount (20%) + Schedule retent...
5,C000006,0.000108,0,Offer loyalty reward + Exclusive content access
6,C000007,1.9e-05,0,Offer loyalty reward + Exclusive content access
7,C000008,0.000194,3,Offer loyalty reward + Exclusive content access
8,C000009,0.999999,1,Offer premium discount (20%) + Schedule retent...
9,C000010,0.0008,3,Offer loyalty reward + Exclusive content access


In [21]:
# Aggregate the per-customer recommendations into summary statistics.
# `summary` is read again in the final recap cell via summary.get('action_distribution'),
# so it is expected to be dict-like.
summary = summarize_retention_actions(actions_df)

# Print the formatted report; per the saved output it covers risk bands, segment sizes,
# the most common actions and the highest-risk customers.
print_retention_summary(actions_df, summary)


RETENTION STRATEGY SUMMARY

üìä Risk Distribution:
   High Risk (>=70%): 1,097
   Medium Risk (30-70%): 0
   Low Risk (<30%): 3,903

üéØ Customers per Segment:
   Segment 0: 3,193
   Segment 1: 1,024
   Segment 2: 627
   Segment 3: 156

üí° Top 3 Recommended Actions:
   ‚Ä¢ Offer loyalty reward + Exclusive content access: 3,903 customers
   ‚Ä¢ Send re-engagement push notification + Personalized content: 646 customers
   ‚Ä¢ Offer premium discount (20%) + Schedule retention call: 269 customers

üö® Top 20 High-Risk Customers:
   C004157: 100.0% - Send re-engagement push notification + Personalize
   C001888: 100.0% - Assign priority support + Issue apology credit (‚Çπ1
   C001235: 100.0% - Offer premium discount (20%) + Schedule retention 
   C001665: 100.0% - Assign priority support + Issue apology credit (‚Çπ1
   C002634: 100.0% - Assign priority support + Issue apology credit (‚Çπ1
   C001573: 100.0% - Send re-engagement push notification + Personalize
   C004909: 100.0% - Send

In [22]:
# Persist the recommendations; the helper's return value is echoed as the cell result
actions_csv_path = save_retention_actions(actions_df)
actions_csv_path

‚úì Saved retention actions: C:\Users\Lenovo\Desktop\churn\churn_project\outputs\retention_actions.csv


WindowsPath('C:/Users/Lenovo/Desktop/churn/churn_project/outputs/retention_actions.csv')

## 7. Final Summary

In [23]:
# Recap of everything this notebook produced, built from the objects created above.
banner = "=" * 60
print(banner)
print("DAY 4 SUMMARY")
print(banner)

# Risk scoring: 0.7 is the same high-risk cut-off used in the risk summary cell
high_risk_count = (risk_scores['churn_probability'] >= 0.7).sum()
print("\nüìä Risk Scoring:")
print(f"   - Scored {len(risk_scores):,} customers")
print(f"   - High risk: {high_risk_count:,}")

# Segmentation: derive the segment count from the data instead of hard-coding it,
# so the recap stays correct if k or the segment naming changes
segment_sizes = segments['segment_name'].value_counts()
print("\nüéØ Segmentation:")
print(f"   - Created {len(segment_sizes)} customer segments")
for seg, count in segment_sizes.items():
    print(f"   - {seg}: {count:,}")

# Retention: first three entries of the action distribution (empty if the key is absent);
# action text is truncated to 50 characters to keep lines short
top_actions = list(summary.get('action_distribution', {}).items())[:3]
print("\nüí° Retention Actions:")
for action, count in top_actions:
    print(f"   - {action[:50]}: {count:,}")

# Files written by the save/plot cells (relative to the project root)
saved_outputs = (
    "outputs/customer_scores.csv",
    "outputs/segments.csv",
    "outputs/retention_actions.csv",
    "outputs/plots/*.png",
)
print("\n‚úì Saved outputs:")
for rel_path in saved_outputs:
    print(f"   - {rel_path}")

DAY 4 SUMMARY

üìä Risk Scoring:
   - Scored 5,000 customers
   - High risk: 1,097

üéØ Segmentation:
   - Created 4 customer segments
   - High Value Engaged: 3,193
   - At Risk: 1,024
   - Low Engagement: 627
   - Price Sensitive: 156

üí° Retention Actions:
   - Offer loyalty reward + Exclusive content access: 3,903
   - Send re-engagement push notification + Personalize: 646
   - Offer premium discount (20%) + Schedule retention : 269

‚úì Saved outputs:
   - outputs/customer_scores.csv
   - outputs/segments.csv
   - outputs/retention_actions.csv
   - outputs/plots/*.png
