# ML Model Analysis & Results

This notebook analyzes the results of the three ML models trained on the clickstream data:
1. **Purchase Conversion Prediction** - Which sessions will convert?
2. **Customer Churn Prediction** - Which users are at risk of leaving?
3. **Customer Lifetime Value (CLV)** - How much revenue will each customer generate over the next 90 days? (Called LTV in the code and outputs below.)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path

# Notebook-wide display defaults: white-grid seaborn style, 12x6 figures,
# no column truncation and 4-decimal floats when DataFrames are rendered.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.4f}'.format

print('Libraries loaded successfully')

## 1. Load ML Predictions & Metrics

In [None]:
# Locate the ML pipeline outputs. The path is relative to the kernel's working
# directory, so fail early and report the resolved location when it is missing.
ml_path = Path('../data/ml_output')
if not ml_path.is_dir():
    raise FileNotFoundError(
        f'ML output directory not found: {ml_path.resolve()} '
        '(check the notebook working directory and that the ML pipeline has run)'
    )


def load_metrics(model_name):
    """Return the parsed contents of `<ml_path>/metrics/<model_name>_metrics.json`."""
    metrics_file = ml_path / 'metrics' / f'{model_name}_metrics.json'
    # Explicit UTF-8: without it open() falls back to the platform locale encoding.
    with open(metrics_file, encoding='utf-8') as f:
        return json.load(f)


# Load predictions (one parquet file per model)
predictions_dir = ml_path / 'predictions'
conversion_preds = pd.read_parquet(predictions_dir / 'conversion_predictions.parquet')
churn_preds = pd.read_parquet(predictions_dir / 'churn_predictions.parquet')
ltv_preds = pd.read_parquet(predictions_dir / 'ltv_predictions.parquet')

# Load metrics (one JSON file per model)
conversion_metrics = load_metrics('conversion')
churn_metrics = load_metrics('churn')
ltv_metrics = load_metrics('ltv')

print(f'Conversion predictions: {len(conversion_preds):,} sessions')
print(f'Churn predictions: {len(churn_preds):,} users')
print(f'LTV predictions: {len(ltv_preds):,} users')

## 2. Model Performance Summary

In [None]:
# Side-by-side headline metrics for the two classifiers, then the LTV regressor.
banner = '=' * 60
print(banner)
print('CLASSIFICATION MODELS PERFORMANCE')
print(banner)

# Row label shown in the table -> key looked up in each metrics dict, in display order.
metric_keys = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1',
    'ROC-AUC': 'roc_auc',
    'CV ROC-AUC': 'cv_roc_auc_mean',
}
summary = pd.DataFrame(
    {
        'Conversion Model': [conversion_metrics[key] for key in metric_keys.values()],
        'Churn Model': [churn_metrics[key] for key in metric_keys.values()],
    },
    index=pd.Index(list(metric_keys), name='Metric'),
)

# axis=None shades every cell on one common scale rather than per column.
display(summary.style.format('{:.4f}').background_gradient(cmap='Greens', axis=None))

print('\n' + banner)
print('REGRESSION MODEL PERFORMANCE (LTV)')
print(banner)
print(f"RMSE:  ${ltv_metrics['rmse']:.2f}")
print(f"MAE:   ${ltv_metrics['mae']:.2f}")
print(f"R\u00b2:    {ltv_metrics['r2']:.4f}")
print(f"MAPE:  {ltv_metrics['mape']:.2f}%")

## 3. Conversion Model Analysis

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Purchase Conversion Prediction Analysis', fontsize=14, fontweight='bold')

# Panel 1: distribution of predicted probabilities with a reference line at 0.5.
axes[0].hist(conversion_preds['conversion_probability'], bins=50, alpha=0.7, edgecolor='black')
axes[0].axvline(0.5, color='red', linestyle='--', label='Threshold')
axes[0].set_xlabel('Conversion Probability')
axes[0].set_ylabel('Count')
axes[0].set_title('Probability Distribution')
axes[0].legend()

# Colour per risk bucket; the dict's insertion order (Low, Medium, High) is also
# the order in which the buckets are drawn.
colors = {'Low': '#2ecc71', 'Medium': '#f39c12', 'High': '#e74c3c'}
risk_rank = {label: position for position, label in enumerate(colors)}


def in_risk_order(series):
    """Reorder a risk-indexed Series to Low, Medium, High, then any other labels."""
    ordered_labels = sorted(series.index, key=lambda label: risk_rank.get(label, len(risk_rank)))
    return series.reindex(ordered_labels)


# Panel 2: number of sessions per risk bucket.
# value_counts() sorts by frequency and groupby() sorts by label, so both are put
# into the same explicit order to keep the two bar panels comparable.
risk_counts = in_risk_order(conversion_preds['conversion_risk'].value_counts())
axes[1].bar(risk_counts.index, risk_counts.values,
            color=[colors.get(x, 'gray') for x in risk_counts.index])
axes[1].set_ylabel('Number of Sessions')
axes[1].set_title('Conversion Risk Categories')

# Panel 3: observed conversion rate (%) within each risk bucket.
conv_by_risk = in_risk_order(
    conversion_preds.groupby('conversion_risk')['actual_converted'].mean() * 100
)
axes[2].bar(conv_by_risk.index, conv_by_risk.values, color='steelblue')
axes[2].set_ylabel('Actual Conversion Rate (%)')
axes[2].set_title('Actual Conversion by Risk Category')

plt.tight_layout()
plt.show()

# Ten largest entries of the feature-importance mapping stored with the metrics.
print('\nTop 10 Features for Conversion Prediction:')
top_features = pd.Series(conversion_metrics['feature_importance']).nlargest(10)
display(top_features.to_frame('Importance').style.format('{:.4f}').bar(color='steelblue'))

## 4. Churn Model Analysis

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Customer Churn Prediction Analysis', fontsize=14, fontweight='bold')

# Panel 1: distribution of predicted churn probabilities with a reference line at 0.5.
axes[0].hist(churn_preds['churn_probability'], bins=50, alpha=0.7, edgecolor='black', color='purple')
axes[0].axvline(0.5, color='red', linestyle='--', label='Threshold')
axes[0].set_xlabel('Churn Probability')
axes[0].set_ylabel('Count')
axes[0].set_title('Churn Probability Distribution')
axes[0].legend()

# Panel 2: number of users per risk bucket.
# The palette is defined in this cell so the section runs without any other
# plotting cell having been executed; its insertion order (Low, Medium, High)
# doubles as the bar order, with unknown labels placed last.
churn_risk_colors = {'Low': '#2ecc71', 'Medium': '#f39c12', 'High': '#e74c3c'}
churn_risk_rank = {label: position for position, label in enumerate(churn_risk_colors)}
churn_risk_counts = churn_preds['churn_risk'].value_counts()
# value_counts() returns buckets by frequency; reorder them explicitly.
churn_risk_counts = churn_risk_counts.reindex(
    sorted(churn_risk_counts.index,
           key=lambda label: churn_risk_rank.get(label, len(churn_risk_rank)))
)
axes[1].bar(churn_risk_counts.index, churn_risk_counts.values,
            color=[churn_risk_colors.get(x, 'gray') for x in churn_risk_counts.index])
axes[1].set_ylabel('Number of Users')
axes[1].set_title('Churn Risk Categories')

# Panel 3: distribution of the retention priority score.
axes[2].hist(churn_preds['retention_priority'], bins=50, alpha=0.7, edgecolor='black', color='coral')
axes[2].set_xlabel('Retention Priority Score')
axes[2].set_ylabel('Count')
axes[2].set_title('Retention Priority Distribution')

plt.tight_layout()
plt.show()

# Ten largest entries of the feature-importance mapping stored with the metrics.
print('\nTop 10 Features for Churn Prediction:')
top_features = pd.Series(churn_metrics['feature_importance']).nlargest(10)
display(top_features.to_frame('Importance').style.format('{:.4f}').bar(color='purple'))

## 5. Customer Lifetime Value Analysis

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Customer Lifetime Value (CLV) Analysis', fontsize=14, fontweight='bold')

# Panel 1: actual vs predicted on a fixed-seed sample of at most 1000 users,
# with the y = x line marking a perfect prediction.
sample = ltv_preds.sample(min(1000, len(ltv_preds)), random_state=42)
axes[0].scatter(sample['actual_revenue'], sample['predicted_ltv_90days'], alpha=0.5, s=10)
max_val = max(sample['actual_revenue'].max(), sample['predicted_ltv_90days'].max())
axes[0].plot([0, max_val], [0, max_val], 'r--', label='Perfect Prediction')
axes[0].set_xlabel('Actual Revenue ($)')
axes[0].set_ylabel('Predicted LTV ($)')
axes[0].set_title('Actual vs Predicted Revenue')
axes[0].legend()

# One category order for both bar panels: ascending average predicted LTV.
# value_counts() orders by frequency and groupby() by label, so without this the
# panels list categories differently and the positional colours below would
# follow bucket size rather than bucket value.
revenue_by_cat = (
    ltv_preds.groupby('ltv_category')['predicted_ltv_90days'].mean().sort_values()
)
ltv_counts = ltv_preds['ltv_category'].value_counts().reindex(revenue_by_cat.index)

# Panel 2: number of users per LTV category (colours assigned by position).
ltv_colors = ['#d3d3d3', '#90ee90', '#ffa500', '#ff4500']
axes[1].bar(range(len(ltv_counts)), ltv_counts.values, color=ltv_colors[:len(ltv_counts)])
axes[1].set_xticks(range(len(ltv_counts)))
axes[1].set_xticklabels(ltv_counts.index)
axes[1].set_ylabel('Number of Users')
axes[1].set_title('LTV Category Distribution')

# Panel 3: average predicted LTV per category, in the same order as panel 2.
axes[2].bar(revenue_by_cat.index, revenue_by_cat.values, color='green', alpha=0.7)
axes[2].set_ylabel('Avg Predicted LTV ($)')
axes[2].set_title('Average LTV by Category')

plt.tight_layout()
plt.show()

# Ten largest entries of the feature-importance mapping stored with the metrics.
print('\nTop 10 Features for LTV Prediction:')
top_features = pd.Series(ltv_metrics['feature_importance']).nlargest(10)
display(top_features.to_frame('Importance').style.format('{:.4f}').bar(color='green'))

## 6. Business Insights & Actionable Recommendations

In [None]:
# Retention shortlist: users labelled 'High' churn risk whose total revenue
# exceeds 200, ranked by retention priority (highest first).
is_high_risk = churn_preds['churn_risk'] == 'High'
is_valuable = churn_preds['total_revenue'] > 200
high_risk_churn = churn_preds.loc[is_high_risk & is_valuable].sort_values(
    'retention_priority', ascending=False
)

rule = '=' * 60
print(rule)
print('TOP 20 USERS FOR RETENTION CAMPAIGNS')
print('(High churn risk + High revenue)')
print(rule)
shortlist_columns = ['user_id', 'churn_probability', 'total_revenue', 'retention_priority']
display(high_risk_churn[shortlist_columns].head(20))

# Totals cover the whole selection, not only the 20 rows displayed.
print(f'\nTotal high-risk valuable users: {len(high_risk_churn)}')
print(f'At-risk revenue: ${high_risk_churn["total_revenue"].sum():,.2f}')

In [None]:
# High-conversion probability sessions.
# Single source of truth for the cut-off: used by the filter and by the message.
HIGH_CONVERSION_THRESHOLD = 0.7
high_conv = conversion_preds[conversion_preds['conversion_probability'] > HIGH_CONVERSION_THRESHOLD]

print('=' * 60)
print('HIGH-PROBABILITY CONVERSION SESSIONS')
print('=' * 60)
print(f'Sessions with >{HIGH_CONVERSION_THRESHOLD:.0%} conversion probability: {len(high_conv):,}')
if high_conv.empty:
    # mean() of an empty selection is NaN; report that plainly instead of "nan%".
    print('Actual conversion rate in this group: n/a (no sessions above the threshold)')
else:
    print(f'Actual conversion rate in this group: {high_conv["actual_converted"].mean()*100:.1f}%')
    print('\nThese sessions should be targeted with real-time promotions.')

In [None]:
# VIP customers: rows whose ltv_category is 'VIP', highest predicted 90-day LTV first.
vip = ltv_preds[ltv_preds['ltv_category'] == 'VIP'].sort_values('predicted_ltv_90days', ascending=False)

print('=' * 60)
print('VIP CUSTOMER SEGMENT')
print('=' * 60)
print(f'Number of VIP customers: {len(vip):,}')
print(f'Expected 90-day revenue from VIP segment: ${vip["predicted_ltv_90days"].sum():,.2f}')
if vip.empty:
    # mean() of an empty selection is NaN; report that plainly instead of "$nan".
    print('Average VIP LTV: n/a (no customers in the VIP category)')
else:
    print(f'Average VIP LTV: ${vip["predicted_ltv_90days"].mean():,.2f}')
    print('\nTop 10 VIP customers:')
    display(vip[['user_id', 'actual_revenue', 'predicted_ltv_90days', 'ltv_category']].head(10))

## 7. Key Takeaways

### Model Performance
- **Conversion Model**: Effectively identifies sessions likely to purchase using behavioral signals
- **Churn Model**: Detects at-risk users based on engagement and recency patterns
- **LTV Model**: Forecasts customer revenue based on historical purchase behavior

### Business Actions
1. **Conversion Optimization**: Target high-probability sessions with personalized offers
2. **Churn Reduction**: Launch retention campaigns for high-risk valuable users
3. **VIP Treatment**: Provide premium experiences for top LTV customers
4. **Budget Allocation**: Allocate acquisition spend in proportion to predicted LTV