In [None]:
# %% [markdown]
# # Modeling Results Analysis
# ## AlphaCare Insurance Solutions (ACIS)
# 
# This notebook analyzes the results from the `modeling.py` pipeline.

# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import json
from pathlib import Path
import os
import sys
import warnings
# NOTE(review): this silences every warning for the rest of the session (including
# scikit-learn convergence and pandas deprecation warnings); narrow it when debugging.
warnings.filterwarnings('ignore')

# Set plotting style
# The 'seaborn-v0_8-*' style names were introduced in matplotlib 3.6; older releases
# ship the same style as 'seaborn-darkgrid'. Use whichever one is installed instead of
# letting plt.style.use() raise on an unknown name.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    print("Note: no seaborn dark-grid matplotlib style available; using the default style")
sns.set_palette("husl")

# %%
# Set project paths
# The default is the original author's checkout. Set the ACIS_PROJECT_ROOT environment
# variable to run this notebook from any other location without editing the cell.
DEFAULT_PROJECT_ROOT = "D:/Python/Week-3/Insurance-Analytics-Week-3-"
PROJECT_ROOT = Path(os.environ.get("ACIS_PROJECT_ROOT", DEFAULT_PROJECT_ROOT)).resolve()
print(f"Project root: {PROJECT_ROOT}")

# Fail early with an actionable message. Without this check the first symptom is an
# opaque FileNotFoundError from REPORTS_DIR.mkdir() below (its parent does not exist).
if not PROJECT_ROOT.is_dir():
    raise FileNotFoundError(
        f"Project root not found: {PROJECT_ROOT}. "
        "Set the ACIS_PROJECT_ROOT environment variable to your checkout directory."
    )

# Set data paths
DATA_PATH = PROJECT_ROOT / "data" / "processed" / "claim_data_prepared.csv"
MODELS_DIR = PROJECT_ROOT / "models"
REGRESSION_DIR = MODELS_DIR / "regression"
CLASSIFICATION_DIR = MODELS_DIR / "classification"
NOTEBOOKS_DIR = PROJECT_ROOT / "notebooks"
SRC_DIR = PROJECT_ROOT / "src"
REPORTS_DIR = PROJECT_ROOT / "reports"
# Every figure/CSV/JSON produced below is written here, so make sure it exists.
REPORTS_DIR.mkdir(exist_ok=True)

print(f"Data path: {DATA_PATH}")
print(f"Data file exists: {DATA_PATH.exists()}")
print(f"Models directory exists: {MODELS_DIR.exists()}")
print(f"Reports directory: {REPORTS_DIR}")

# %%
# Load the prepared data
# low_memory=False makes pandas read the file in one pass instead of chunk by chunk.
df = pd.read_csv(DATA_PATH, low_memory=False)
print(f"Dataset shape: {df.shape}")
print(f"Columns: {len(df.columns)}")

# Show basic info
print("\nFirst 10 columns:")
for i, col in enumerate(df.columns[:10]):
    print(f"{i:2d}. {col}")

# Report which of the target columns used later in the notebook are present.
# (The check/cross markers were mojibake in the original strings and are repaired here.)
print("\nTarget columns present:")
for target in ['TotalClaims', 'Log_TotalClaims', 'HighClaim', 'ClaimSeverityCategory']:
    marker = "✓" if target in df.columns else "✗"
    print(f"  {marker} {target}")

# %%
# Basic data statistics
print("BASIC DATA STATISTICS")
print("=" * 50)

if 'TotalClaims' in df.columns:
    # Build the two boolean masks once and reuse them for counts and shares.
    has_claim = df['TotalClaims'] > 0
    zero_claim = df['TotalClaims'] == 0

    print("\nTotalClaims Distribution:")
    print(f"Total policies: {len(df):,}")
    print(f"Policies with claims (>0): {has_claim.sum():,} ({has_claim.mean()*100:.2f}%)")
    print(f"Policies with zero claims: {zero_claim.sum():,} ({zero_claim.mean()*100:.2f}%)")

    # Summary statistics are restricted to policies that actually had a claim.
    print(f"\nClaim Amount Statistics (for policies with claims):")
    claims_df = df[has_claim]
    claim_amounts = claims_df['TotalClaims']
    print(f"Mean claim: ${claim_amounts.mean():,.2f}")
    print(f"Median claim: ${claim_amounts.median():,.2f}")
    print(f"Std claim: ${claim_amounts.std():,.2f}")
    print(f"Min claim: ${claim_amounts.min():,.2f}")
    print(f"Max claim: ${claim_amounts.max():,.2f}")

if 'HighClaim' in df.columns:
    high_flag = df['HighClaim']
    not_high = high_flag == 0

    print(f"\nHighClaim Distribution:")
    print(f"High claims: {high_flag.sum():,} ({high_flag.mean()*100:.2f}%)")
    print(f"Non-high claims: {not_high.sum():,} ({not_high.mean()*100:.2f}%)")


In [None]:

# %%
# Visualize data distributions
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

has_total_claims = 'TotalClaims' in df.columns
# Mask of policies with a positive claim amount; shared by plots 1, 2 and 4.
positive_claims = (df['TotalClaims'] > 0) if has_total_claims else None

# Plot 1: TotalClaims distribution (log-scaled frequency axis)
if has_total_claims:
    axes[0, 0].hist(df.loc[positive_claims, 'TotalClaims'], bins=50,
                    edgecolor='black', alpha=0.7, color='skyblue')
    axes[0, 0].set_xlabel('Claim Amount ($)')
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 0].set_title('Distribution of Claim Amounts (Claims > 0)')
    axes[0, 0].set_yscale('log')
    axes[0, 0].grid(True, alpha=0.3)

# Plot 2: Claims vs No Claims
if has_total_claims:
    claim_status = ['No Claims', 'Has Claims']
    claim_counts = [(df['TotalClaims'] == 0).sum(), positive_claims.sum()]
    colors = ['lightcoral', 'lightgreen']
    axes[0, 1].bar(claim_status, claim_counts, color=colors, edgecolor='black')
    axes[0, 1].set_ylabel('Number of Policies')
    axes[0, 1].set_title('Policies with vs without Claims')
    for i, v in enumerate(claim_counts):
        axes[0, 1].text(i, v + max(claim_counts)*0.01, f'{v:,}', ha='center', fontweight='bold')

# Plot 3: HighClaim distribution
if 'HighClaim' in df.columns:
    labels = ['Non-High Claim', 'High Claim']
    # Count each class explicitly in label order. value_counts() orders by frequency,
    # so it would mislabel the bars whenever high claims outnumber non-high ones, and
    # would produce a length mismatch when only one class is present.
    high_claim_counts = [
        int((df['HighClaim'] == 0).sum()),
        int((df['HighClaim'] == 1).sum()),
    ]
    colors = ['lightblue', 'salmon']
    axes[1, 0].bar(labels, high_claim_counts, color=colors, edgecolor='black')
    axes[1, 0].set_ylabel('Number of Policies')
    axes[1, 0].set_title('High Claim Distribution')
    for i, v in enumerate(high_claim_counts):
        axes[1, 0].text(i, v + max(high_claim_counts)*0.01, f'{v:,}', ha='center', fontweight='bold')

# Plot 4: Log_TotalClaims distribution (needs both columns and at least one claim)
if has_total_claims and 'Log_TotalClaims' in df.columns and positive_claims.any():
    axes[1, 1].hist(df.loc[positive_claims, 'Log_TotalClaims'], bins=30,
                    edgecolor='black', alpha=0.7, color='gold')
    axes[1, 1].set_xlabel('Log(TotalClaims)')
    axes[1, 1].set_ylabel('Frequency')
    axes[1, 1].set_title('Distribution of Log-Transformed Claims')
    axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:

# %% [markdown]
# ### 2. Import and Use the Modeling Module

# %%
# Make the project importable.
# `from src.modelling.modeling import ...` resolves the top-level package `src`, so the
# directory that CONTAINS src/ (PROJECT_ROOT) has to be on sys.path. SRC_DIR is kept as
# well so modules imported without the `src.` prefix keep working. The membership check
# stops the path from growing every time the cell is re-run.
for _import_root in (PROJECT_ROOT, SRC_DIR):
    if str(_import_root) not in sys.path:
        sys.path.append(str(_import_root))

# Try to import the modeling module
try:
    from src.modelling.modeling import(
        load_data, build_preprocessor,
        train_regression_models, train_classification_models,
        run_pipeline, select_features
    )
    print("✅ Successfully imported modeling module")

    # You can now use functions from your modeling module
    # For example, load data using your function:
    df_from_module = load_data(str(DATA_PATH))
    print(f"Loaded data shape via module: {df_from_module.shape}")

except ImportError as e:
    # Only import problems are handled here; an error raised by load_data() itself
    # still propagates so a broken data file is not silently hidden.
    print(f"❌ Could not import modeling module: {e}")
    print("\nMake sure you're running from the correct directory or")
    print("the module structure is correct.")

# %% [markdown]
# ### 3. Load and Analyze Model Results

# %%
# Load metadata
# `metadata` is always defined afterwards: the parsed JSON when the file exists and is
# valid, otherwise an empty dict.
metadata_path = MODELS_DIR / "model_metadata.json"
metadata = {}
if metadata_path.exists():
    try:
        # Explicit encoding: the platform default (e.g. cp1252 on Windows) would
        # mis-decode any non-ASCII text in the file.
        with open(metadata_path, 'r', encoding='utf-8') as f:
            metadata = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Could not parse {metadata_path.name}: {e}")
    else:
        print("Model Metadata:")
        print(json.dumps(metadata, indent=2))
else:
    print("No metadata file found")

# %%
# Load all available models
def load_models_from(directory, prefix, model_type):
    """Load every ``*.pkl`` file directly inside ``directory`` with joblib.

    Args:
        directory: ``pathlib.Path`` to scan (non-recursive).
        prefix: Filename prefix stripped from the stem to form the model name
            (e.g. ``'reg_'`` turns ``reg_rf.pkl`` into ``'rf'``). Only a leading
            occurrence is removed; pass ``''`` to keep the stem unchanged.
        model_type: Label stored under the ``'type'`` key of each entry.

    Returns:
        dict mapping model name -> ``{'model', 'path', 'type'}``. Files that fail to
        load are reported on stdout and skipped.
    """
    loaded = {}
    # sorted() gives a stable, reproducible listing order across platforms.
    for pkl_file in sorted(directory.glob("*.pkl")):
        try:
            # SECURITY: joblib.load unpickles the file, which can execute arbitrary
            # code. Only load model files produced by this project.
            model = joblib.load(pkl_file)
        except Exception as e:
            print(f"  ✗ Error loading {pkl_file.name}: {e}")
            continue

        stem = pkl_file.stem
        # Strip the prefix only at the start; str.replace() would also remove it
        # from the middle of a name such as 'reg_xgb_reg_v2'.
        model_name = stem[len(prefix):] if prefix and stem.startswith(prefix) else stem
        loaded[model_name] = {
            'model': model,
            'path': pkl_file,
            'type': model_type
        }
        print(f"  ✓ {model_name}")
    return loaded


print("\nLoading saved models...")

# Load regression models
reg_models = {}
if REGRESSION_DIR.exists():
    print(f"\nRegression models in {REGRESSION_DIR}:")
    reg_models = load_models_from(REGRESSION_DIR, 'reg_', 'regression')

# Load classification models
clf_models = {}
if CLASSIFICATION_DIR.exists():
    print(f"\nClassification models in {CLASSIFICATION_DIR}:")
    clf_models = load_models_from(CLASSIFICATION_DIR, 'clf_', 'classification')

# Load other models from root models directory
# (glob("*.pkl") is non-recursive, so only root-level files are picked up.)
print(f"\nOther models in {MODELS_DIR}:")
other_models = load_models_from(MODELS_DIR, '', 'other')

# %%
# Performance metrics from your modeling.py output
# NOTE(review): these numbers are hard-coded rather than read from the saved models or
# metadata, so they must be updated by hand after every retraining run.
regression_performance = {
    'linear': {'rmse': 1.0311, 'r2': 0.6068},
    'rf': {'rmse': 1.0046, 'r2': 0.6268},
    'xgb': {'rmse': 1.0730, 'r2': 0.5742}
}

classification_performance = {
    'logistic': {'auc': 0.9953, 'accuracy': 0.9975},
    'rf': {'auc': 0.9891, 'accuracy': 0.9982},
    'xgb': {'auc': 0.9697, 'accuracy': 0.9987}
}

REGRESSION_COLORS = ['#1f77b4', '#2ca02c', '#ff7f0e']
CLASSIFICATION_COLORS = ['#d62728', '#9467bd', '#8c564b']


def plot_metric_bars(ax, performance, metric, ylabel, title, colors,
                     ylim=None, label_scale=1.02):
    """Draw one labelled bar per model for a single metric.

    Args:
        ax: Matplotlib axes to draw on.
        performance: dict of model name -> dict of metric name -> value.
        metric: Key of the metric to plot (e.g. ``'rmse'``).
        ylabel: Y-axis label.
        title: Axes title.
        colors: Bar colours; cycled if there are more models than colours.
        ylim: Optional ``(low, high)`` y-axis limits.
        label_scale: Multiplier on the bar height that positions the value label
            just above the bar (use values close to 1 for tightly zoomed axes).

    Returns:
        The bar container returned by ``ax.bar``.
    """
    names = list(performance)
    values = [performance[name][metric] for name in names]
    bar_colors = [colors[i % len(colors)] for i in range(len(names))]

    bars = ax.bar(names, values, color=bar_colors, edgecolor='black')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.grid(True, alpha=0.3, axis='y')
    for bar, val in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() * label_scale,
                f'{val:.4f}', ha='center', va='bottom', fontweight='bold')
    return bars


# Visualize performance
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Regression RMSE
plot_metric_bars(axes[0, 0], regression_performance, 'rmse', 'RMSE',
                 'Regression Models: RMSE\n(Lower is Better)', REGRESSION_COLORS)

# Regression R²
plot_metric_bars(axes[0, 1], regression_performance, 'r2', 'R² Score',
                 'Regression Models: R² Score\n(Higher is Better)', REGRESSION_COLORS,
                 ylim=(0, 1))

# Classification AUC (axis zoomed to 0.9-1.0, hence the small label offset)
plot_metric_bars(axes[1, 0], classification_performance, 'auc', 'AUC Score',
                 'Classification Models: AUC\n(Higher is Better)', CLASSIFICATION_COLORS,
                 ylim=(0.9, 1.0), label_scale=1.001)

# Classification Accuracy (axis zoomed to 0.99-1.0)
plot_metric_bars(axes[1, 1], classification_performance, 'accuracy', 'Accuracy',
                 'Classification Models: Accuracy\n(Higher is Better)', CLASSIFICATION_COLORS,
                 ylim=(0.99, 1.0), label_scale=1.0005)

plt.tight_layout()
plt.show()


In [None]:

# %% [markdown]
# ### 4. Feature Importance Analysis

# %%
def extract_feature_importance(model_dict, model_name):
    """Extract per-feature importance scores from a stored model.

    Args:
        model_dict: Mapping of model name -> info dict holding at least the keys
            ``'model'`` (an estimator or a pipeline exposing ``named_steps``) and
            ``'type'``.
        model_name: Key of the model to inspect.

    Returns:
        ``None`` when the model is missing or its estimator exposes neither
        ``feature_importances_`` nor ``coef_``. Otherwise a dict with:

        * ``'importance'``: ``feature_importances_``, or the flattened absolute
          ``coef_`` values.
        * ``'feature_names'``: names reported by the pipeline's ``'pre'`` step, or
          ``None`` when there is no such step, it cannot report names, or the number
          of names does not match the number of importance values.
        * ``'model_type'``: the ``'type'`` value of the stored entry.
    """
    model_info = model_dict.get(model_name)
    if not model_info:
        return None

    model = model_info['model']
    # Default for every path below; previously a pipeline without a 'pre' step left
    # this name unbound and the return statement raised UnboundLocalError.
    feature_names = None

    # Handle pipeline
    if hasattr(model, 'named_steps'):
        # Get the estimator (last step)
        estimator_name = list(model.named_steps.keys())[-1]
        estimator = model.named_steps[estimator_name]

        # Get feature names from preprocessor if available
        if 'pre' in model.named_steps:
            preprocessor = model.named_steps['pre']
            try:
                feature_names = preprocessor.get_feature_names_out()
            except Exception:
                # Best effort: an unfitted or older transformer cannot report names.
                feature_names = None
    else:
        estimator = model

    # Extract importance based on model type
    if hasattr(estimator, 'feature_importances_'):
        importance = estimator.feature_importances_
    elif hasattr(estimator, 'coef_'):
        importance = np.abs(estimator.coef_.flatten())
    else:
        return None

    # Names that do not line up one-to-one with the scores would be misleading
    # (and break DataFrame construction in callers), so drop them.
    if feature_names is not None and len(feature_names) != len(importance):
        feature_names = None

    return {
        'importance': importance,
        'feature_names': feature_names,
        'model_type': model_info['type']
    }

def plot_top_feature_importance(importance_info, title, heading, color, top_n=20):
    """Plot and print the highest-ranked features of one model.

    Args:
        importance_info: Result of ``extract_feature_importance`` (may be ``None``).
        title: Title of the horizontal bar chart.
        heading: Line printed above the top-10 table.
        color: Bar colour.
        top_n: Number of features shown in the chart.

    Returns:
        DataFrame with columns ``feature`` and ``importance`` for the ``top_n``
        features, or ``None`` when nothing could be plotted.
    """
    if not importance_info or importance_info['feature_names'] is None:
        print(f"Skipping '{title.splitlines()[0]}': importance or feature names unavailable")
        return None

    names = importance_info['feature_names']
    scores = importance_info['importance']
    if len(names) != len(scores):
        # pd.DataFrame would raise on unequal column lengths.
        print(f"Skipping '{title.splitlines()[0]}': {len(names)} names for {len(scores)} scores")
        return None

    top_features = (
        pd.DataFrame({'feature': names, 'importance': scores})
        .sort_values('importance', ascending=False)
        .head(top_n)
    )

    # Reverse the order so the most important feature is drawn at the top.
    plt.figure(figsize=(12, 8))
    plt.barh(top_features['feature'][::-1], top_features['importance'][::-1], color=color)
    plt.xlabel('Feature Importance')
    plt.title(title)
    plt.grid(True, alpha=0.3, axis='x')
    plt.tight_layout()
    plt.show()

    print(heading)
    print(top_features.head(10).to_string(index=False))
    return top_features


# Extract feature importance from Random Forest regression
rf_importance = extract_feature_importance(reg_models, 'rf')
reg_feat_df = plot_top_feature_importance(
    rf_importance,
    title='Top 20 Features for Claim Severity Prediction\n(Random Forest Regression)',
    heading="Top 10 Features for Claim Severity Prediction:",
    color='teal',
)

# Extract feature importance from Random Forest classification
if 'rf' in clf_models:
    rf_clf_importance = extract_feature_importance(clf_models, 'rf')
    clf_feat_df = plot_top_feature_importance(
        rf_clf_importance,
        title='Top 20 Features for Claim Probability Prediction\n(Random Forest Classification)',
        heading="Top 10 Features for Claim Probability Prediction:",
        color='coral',
    )


In [None]:

# %% [markdown]
# ### 5. SHAP Analysis for Model Interpretability

# %%
# Check SHAP availability
# shap is optional: HAS_SHAP gates the SHAP cell below so the rest of the notebook
# still runs without it. (The status icons were mojibake in the original strings.)
try:
    import shap
    HAS_SHAP = True
    print(f"✅ SHAP version: {shap.__version__}")
except ImportError:
    HAS_SHAP = False
    print("❌ SHAP not installed. Install with: pip install shap")

# Helper function to convert numpy types to JSON-serializable
def convert_to_serializable(obj):
    """Recursively convert numpy/pandas objects into JSON-serializable Python values.

    Conversions:
        * dict -> dict with converted values (keys are left untouched)
        * list / tuple -> list with converted items
        * ``np.ndarray`` / ``pd.Series`` -> list with converted items
        * ``pd.DataFrame`` -> nested dict (``DataFrame.to_dict()``), converted
        * ``np.bool_`` / ``np.integer`` / ``np.floating`` -> ``bool`` / ``int`` / ``float``
        * missing scalars (``None``, NaN, ``pd.NaT``, ``pd.NA``) -> ``None``

    Anything else is returned unchanged.

    Args:
        obj: Value to convert.

    Returns:
        The converted value.
    """
    # Containers first: recurse so nested numpy values and NaNs are converted too.
    if isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [convert_to_serializable(item) for item in obj]
    if isinstance(obj, (np.ndarray, pd.Series)):
        return convert_to_serializable(obj.tolist())
    if isinstance(obj, pd.DataFrame):
        return convert_to_serializable(obj.to_dict())

    # np.bool_ is not a subclass of bool and the json module cannot encode it.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        # Fall through to the missing-value check so a numpy NaN becomes None too.
        obj = float(obj)

    # pd.isna() returns an array for array-likes, whose truth value is ambiguous,
    # so only ask it about scalars.
    if pd.api.types.is_scalar(obj) and pd.isna(obj):
        return None
    return obj

# %%
# SIMPLE SHAP FOR REGRESSION
# Explains the stored 'rf' regression model on a small sample of policies that had a
# claim, then saves a top-10 bar chart plus CSV/JSON summaries to REPORTS_DIR.
if HAS_SHAP and 'rf' in reg_models:
    print("\n" + "=" * 80)
    print("SHAP ANALYSIS - REGRESSION MODEL")
    print("=" * 80)

    # Load the model
    rf_model = reg_models['rf']['model']

    try:
        # Prepare small sample: policies with a claim, minus target/leakage columns
        df_sev = df[df['TotalClaims'] > 0].copy()
        X_sev = df_sev.drop(columns=['TotalClaims', 'Log_TotalClaims', 'Sqrt_TotalClaims',
                                     'Std_TotalClaims', 'HighClaim', 'ClaimSeverityCategory',
                                     'StratifyBin', 'PolicyID'], errors='ignore')

        # Take very small sample for stability
        sample_size = min(50, len(X_sev))
        if sample_size == 0:
            raise ValueError("no policies with TotalClaims > 0 to explain")
        X_sample = X_sev.sample(sample_size, random_state=42)

        print(f"Using {sample_size} samples for SHAP analysis")

        if hasattr(rf_model, 'named_steps') and len(rf_model.steps) > 1:
            # The final estimator of a pipeline is fitted on the OUTPUT of the preceding
            # steps, so SHAP must be given that same matrix. Feeding it the raw columns
            # (as before) uses the wrong feature space: a different column count/order
            # once categoricals are encoded.
            estimator = rf_model.steps[-1][1]
            preprocessing = rf_model[:-1]  # NOTE(review): pipeline slicing needs scikit-learn >= 0.21
            X_matrix = preprocessing.transform(X_sample)
            if hasattr(X_matrix, 'toarray'):
                # Encoders may return a scipy sparse matrix; densify it.
                X_matrix = X_matrix.toarray()
            X_matrix = np.asarray(X_matrix, dtype=float)
            try:
                shap_feature_names = [str(name) for name in preprocessing.get_feature_names_out()]
            except Exception:
                # Older transformers cannot report output names; fall back to indices.
                shap_feature_names = [f"feature_{i}" for i in range(X_matrix.shape[1])]
        else:
            # Bare estimator (or single-step pipeline): integer-encode categoricals.
            # NOTE(review): only meaningful if the model was trained on this encoding.
            estimator = rf_model.steps[-1][1] if hasattr(rf_model, 'named_steps') else rf_model
            X_sample_numeric = X_sample.copy()
            for col in X_sample_numeric.select_dtypes(include=['object', 'category']).columns:
                X_sample_numeric[col] = pd.factorize(X_sample_numeric[col])[0]
            X_matrix = X_sample_numeric.values
            shap_feature_names = [str(col) for col in X_sample.columns]

        print(f"Processed data shape: {X_matrix.shape}")

        # Create SHAP explainer
        explainer = shap.TreeExplainer(estimator)

        # Calculate SHAP values
        # assumes a single-output regressor, i.e. shape (n_samples, n_features) — TODO confirm
        shap_values = np.asarray(explainer.shap_values(X_matrix))

        # Get expected value (scalar or length-1 array depending on the shap version)
        expected_value = float(np.ravel(explainer.expected_value)[0])

        print(f"SHAP calculation successful!")
        print(f"Expected value: {expected_value:.4f}")

        # Mean |SHAP| per feature, then the ten largest in descending order
        shap_importance = np.abs(shap_values).mean(0)
        top_indices = np.argsort(shap_importance)[-10:][::-1]

        # Create manual bar plot
        plt.figure(figsize=(12, 6))
        top_features = [shap_feature_names[i] for i in top_indices]
        top_importance = shap_importance[top_indices]

        bars = plt.barh(top_features[::-1], top_importance[::-1], color='teal')
        plt.xlabel('Mean Absolute SHAP Value')
        plt.title('Top 10 Feature Importance (SHAP) - Regression Model')
        plt.grid(True, alpha=0.3, axis='x')
        plt.tight_layout()
        plt.savefig(REPORTS_DIR / "shap_regression_importance.png", dpi=150, bbox_inches='tight')
        plt.show()

        # Save results
        results = {
            'top_features': top_features,
            'shap_importance': top_importance.tolist(),
            'expected_value': expected_value,
            'sample_size': sample_size,
            'analysis_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
        }

        # Create results DataFrame
        results_df = pd.DataFrame({
            'Feature': top_features,
            'SHAP_Importance': top_importance
        })

        print("\n📊 TOP 10 FEATURES BY SHAP IMPORTANCE (Regression):")
        print(results_df.to_string(index=False))

        # Save to files
        results_df.to_csv(REPORTS_DIR / "shap_regression_results.csv", index=False)
        with open(REPORTS_DIR / "shap_regression_results.json", 'w', encoding='utf-8') as f:
            json.dump(convert_to_serializable(results), f, indent=2)

        print(f"\n✅ Results saved to {REPORTS_DIR}")

    except Exception as e:
        # Best effort: report the failure and let the rest of the notebook continue.
        print(f"❌ SHAP calculation failed: {e}")
else:
    print("❌ SHAP not available or regression model not found")


In [None]:

# %%
# TRADITIONAL FEATURE IMPORTANCE FOR CLASSIFICATION
# Plots and saves the ten largest feature_importances_ of the stored 'rf' classifier.
print("\n" + "=" * 80)
print("FEATURE IMPORTANCE - CLASSIFICATION MODEL")
print("=" * 80)

if 'rf' in clf_models:
    # Get the model
    clf_model = clf_models['rf']['model']
    is_pipeline = hasattr(clf_model, 'named_steps')

    # Extract RandomForest estimator (step named 'rf', else the last step)
    if is_pipeline:
        estimator = clf_model.named_steps.get('rf')
        if estimator is None:
            estimator_name = list(clf_model.named_steps.keys())[-1]
            estimator = clf_model.named_steps[estimator_name]
    else:
        estimator = clf_model

    # Get traditional feature importance
    if hasattr(estimator, 'feature_importances_'):
        feature_importance = estimator.feature_importances_
        n_features = len(feature_importance)
        feature_names = None

        # Preferred source: the names emitted by the pipeline's preprocessing steps,
        # because those are the columns the estimator was actually fitted on.
        if is_pipeline and len(clf_model.steps) > 1:
            try:
                candidate_names = [str(name) for name in clf_model[:-1].get_feature_names_out()]
                if len(candidate_names) == n_features:
                    feature_names = candidate_names
            except Exception:
                # Older scikit-learn / transformers cannot report output names.
                feature_names = None

        if feature_names is None:
            raw_columns = df.drop(columns=['TotalClaims', 'Log_TotalClaims', 'Sqrt_TotalClaims',
                                           'Std_TotalClaims', 'HighClaim', 'ClaimSeverityCategory',
                                           'StratifyBin', 'PolicyID'], errors='ignore').columns.tolist()
            if len(raw_columns) == n_features:
                # NOTE(review): matched by count only; correct only if the model was
                # trained on exactly these columns in this order — confirm.
                feature_names = raw_columns
            else:
                feature_names = [f"feature_{i}" for i in range(n_features)]

        # Get top features
        top_indices = np.argsort(feature_importance)[-10:][::-1]
        top_features = [feature_names[i] for i in top_indices]
        top_importance = feature_importance[top_indices]

        # Plot (reversed so the most important feature is at the top)
        plt.figure(figsize=(12, 6))
        bars = plt.barh(top_features[::-1], top_importance[::-1], color='coral')
        plt.xlabel('Feature Importance Score')
        plt.title('Top 10 Feature Importance - Classification Model')
        plt.grid(True, alpha=0.3, axis='x')
        plt.tight_layout()
        plt.savefig(REPORTS_DIR / "classification_feature_importance.png", dpi=150, bbox_inches='tight')
        plt.show()

        # Save results
        results_df = pd.DataFrame({
            'Feature': top_features,
            'Importance': top_importance
        })

        print("\n📊 TOP 10 FEATURES BY TRADITIONAL IMPORTANCE (Classification):")
        print(results_df.to_string(index=False))

        results_df.to_csv(REPORTS_DIR / "classification_feature_importance.csv", index=False)

        print(f"\n✅ Results saved to {REPORTS_DIR}")

    else:
        print("❌ Model doesn't have feature_importances_ attribute")
else:
    print("❌ Classification model not found")

# %% [markdown]
# ### 6. Business Implications: Premium Calculation

# %%
def calculate_premium(p_claim, expected_severity, expense_ratio=0.15, profit_margin=0.10):
    """Build a risk-based premium from claim probability and expected severity.

    The pure premium (probability times severity) is loaded first with the expense
    ratio and then with the profit margin, each applied multiplicatively.

    Args:
        p_claim: Probability that the policy produces a claim.
        expected_severity: Expected claim amount given that a claim occurs.
        expense_ratio: Fractional expense loading applied to the pure premium.
        profit_margin: Fractional margin applied to the expense-loaded premium.

    Returns:
        dict with ``pure_premium``, ``expense_loading``, ``profit_margin_amount``
        and ``final_premium``.
    """
    breakdown = {'pure_premium': p_claim * expected_severity}

    # Stage 1: add the expense loading on top of the pure risk cost.
    loaded_premium = breakdown['pure_premium'] * (1 + expense_ratio)
    breakdown['expense_loading'] = loaded_premium - breakdown['pure_premium']

    # Stage 2: add the profit margin on top of the expense-loaded premium.
    total_premium = loaded_premium * (1 + profit_margin)
    breakdown['profit_margin_amount'] = total_premium - loaded_premium
    breakdown['final_premium'] = total_premium

    return breakdown

# Example risk profiles
# Illustrative (claim probability, severity) pairs; not derived from the fitted models.
risk_profiles = [
    {"name": "Very Low Risk", "p_claim": 0.0005, "severity": 1000},
    {"name": "Low Risk", "p_claim": 0.001, "severity": 2000},
    {"name": "Medium Risk", "p_claim": 0.01, "severity": 5000},
    {"name": "High Risk", "p_claim": 0.05, "severity": 15000},
    {"name": "Very High Risk", "p_claim": 0.10, "severity": 30000}
]

print("RISK-BASED PREMIUM CALCULATION EXAMPLES")
print("=" * 70)

# Print the premium breakdown for each profile.
# (The bullet characters were mojibake in the original strings and are repaired here.)
for profile in risk_profiles:
    premium = calculate_premium(profile['p_claim'], profile['severity'])
    print(f"\n{profile['name']}:")
    print(f"  • Claim Probability: {profile['p_claim']*100:.3f}%")
    print(f"  • Expected Severity: ${profile['severity']:,.2f}")
    print(f"  • Pure Premium: ${premium['pure_premium']:,.2f}")
    print(f"  • + Expenses: ${premium['expense_loading']:,.2f}")
    print(f"  • + Profit: ${premium['profit_margin_amount']:,.2f}")
    print(f"  • FINAL PREMIUM: ${premium['final_premium']:,.2f}")

# %%
# Visualize premium components
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
pie_ax, bar_ax = axes

# Left panel: how a medium-risk premium splits into risk cost, expenses and profit
medium_premium = calculate_premium(0.01, 5000)
components = ['Pure Risk', 'Expenses (15%)', 'Profit (10%)']
values = [
    medium_premium[part]
    for part in ('pure_premium', 'expense_loading', 'profit_margin_amount')
]
colors = ['lightblue', 'lightgreen', 'gold']

wedges, texts, autotexts = pie_ax.pie(
    values,
    labels=components,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
    explode=(0.05, 0.05, 0.05),
)
pie_ax.set_title('Premium Composition\n(Medium Risk Policy)')

# Right panel: final premium for every example risk profile
risk_names = []
premiums = []
for profile in risk_profiles:
    risk_names.append(profile['name'])
    premiums.append(calculate_premium(profile['p_claim'], profile['severity'])['final_premium'])

bars = bar_ax.bar(
    risk_names,
    premiums,
    color=['lightgreen', 'lightblue', 'gold', 'orange', 'salmon'],
    edgecolor='black',
)
bar_ax.set_ylabel('Annual Premium ($)')
bar_ax.set_title('Premium Comparison Across Risk Profiles')
bar_ax.tick_params(axis='x', rotation=45)
bar_ax.grid(True, alpha=0.3, axis='y')

# Add value labels slightly above each bar
for bar, premium in zip(bars, premiums):
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2.
    bar_ax.text(label_x, height * 1.02, f'${premium:,.0f}',
                ha='center', va='bottom', fontweight='bold', fontsize=9)

plt.tight_layout()
plt.savefig(REPORTS_DIR / "premium_calculations.png", dpi=150, bbox_inches='tight')
plt.show()


In [None]:

# %% [markdown]
# ### 7. Generate Comprehensive Report

# %%
# Create analysis report
# Builds a summary dict, writes it as JSON and Markdown to REPORTS_DIR, then prints a
# short recap.

# Human-readable names for the short keys used in the performance dicts.
MODEL_DISPLAY_NAMES = {
    'linear': 'Linear Regression',
    'logistic': 'Logistic Regression',
    'rf': 'Random Forest',
    'xgb': 'XGBoost',
}


def _format_count(value):
    """Format an integer with thousands separators, or 'n/a' when it is None."""
    return f"{value:,}" if value is not None else "n/a"


def _bullets(items):
    """Render a list of strings as a Markdown bullet list."""
    return "\n".join(f"- {item}" for item in items)


# Derive the winners from the metrics instead of hard-coding them, so the report
# cannot drift from the numbers above (highest R² for regression, highest AUC for
# classification).
best_reg_key = max(regression_performance, key=lambda k: regression_performance[k]['r2'])
best_clf_key = max(classification_performance, key=lambda k: classification_performance[k]['auc'])
best_reg = regression_performance[best_reg_key]
best_clf = classification_performance[best_clf_key]
best_reg_name = MODEL_DISPLAY_NAMES.get(best_reg_key, best_reg_key)
best_clf_name = MODEL_DISPLAY_NAMES.get(best_clf_key, best_clf_key)

# Counts are None when the corresponding column is absent from the dataset.
policies_with_claims = int((df['TotalClaims'] > 0).sum()) if 'TotalClaims' in df.columns else None
high_claim_policies = int(df['HighClaim'].sum()) if 'HighClaim' in df.columns else None

analysis_report = {
    "analysis_date": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
    "project_info": {
        "project_root": str(PROJECT_ROOT),
        "data_file": str(DATA_PATH),
        "dataset_shape": {
            "rows": int(df.shape[0]),
            "columns": int(df.shape[1])
        },
        "total_policies": int(len(df)),
        "policies_with_claims": policies_with_claims,
        "high_claim_policies": high_claim_policies
    },
    "model_performance": {
        "regression": regression_performance,
        "classification": classification_performance
    },
    "best_models": {
        "regression": {
            "name": best_reg_name,
            "r2": best_reg['r2'],
            "rmse": best_reg['rmse']
        },
        "classification": {
            "name": best_clf_name,
            "auc": best_clf['auc'],
            "accuracy": best_clf['accuracy']
        }
    },
    "key_findings": [
        f"{best_reg_name} performed best for claim severity prediction (R²={best_reg['r2']:.4f})",
        "All classification models showed excellent performance (AUC > 0.96)",
        "Data leakage detected: StratifyBin was used in classification training",
        "Logistic regression showed convergence warning (needs more iterations)"
    ],
    "critical_issues": [
        "DATA LEAKAGE: StratifyBin must be removed from classification model training",
        "SUSPICIOUS METRICS: Classification AUC > 0.99 suggests potential data leakage",
        "CATBOOST FAILED: NaN handling issues in categorical features"
    ],
    "recommendations": [
        "Retrain classification models without StratifyBin leakage feature",
        f"Use {best_reg_name} for production claim severity prediction",
        "Investigate potential data leakage in classification models",
        "Fix NaN handling for CatBoost compatibility",
        "Implement cross-validation for more robust performance estimates"
    ],
    "business_implications": [
        "Risk-based pricing can be implemented using model predictions",
        "High-risk profiles can be identified for targeted underwriting",
        "Premium optimization opportunities exist through better risk assessment",
        "Coverage amount validation is critical for claim severity management"
    ]
}

# Save report
# Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows) cannot
# represent all of the report text and would raise UnicodeEncodeError.
with open(REPORTS_DIR / "modeling_analysis_report.json", 'w', encoding='utf-8') as f:
    json.dump(convert_to_serializable(analysis_report), f, indent=2)

# Also save a markdown version for easy reading
project_info = analysis_report['project_info']
md_report = f"""# Modeling Analysis Report

**Date:** {analysis_report['analysis_date']}
**Project:** {project_info['project_root']}

## Dataset Overview
- Total policies: {_format_count(project_info['total_policies'])}
- Policies with claims: {_format_count(project_info['policies_with_claims'])}
- High claim policies: {_format_count(project_info['high_claim_policies'])}

## Best Performing Models

### Regression (Claim Severity)
**Model:** {analysis_report['best_models']['regression']['name']}
- R² Score: {analysis_report['best_models']['regression']['r2']:.4f}
- RMSE: {analysis_report['best_models']['regression']['rmse']:.4f}

### Classification (Claim Probability)
**Model:** {analysis_report['best_models']['classification']['name']}
- AUC: {analysis_report['best_models']['classification']['auc']:.4f}
- Accuracy: {analysis_report['best_models']['classification']['accuracy']:.4f}

## ⚠️ CRITICAL ISSUES
{_bullets(analysis_report['critical_issues'])}

## Key Findings
{_bullets(analysis_report['key_findings'])}

## Recommendations
{_bullets(analysis_report['recommendations'])}

## Business Implications
{_bullets(analysis_report['business_implications'])}

## Next Steps Priority
1. **IMMEDIATE**: Fix data leakage (remove StratifyBin) and retrain models
2. **SHORT-TERM**: Validate classification model performance after leakage fix
3. **MEDIUM-TERM**: Implement production monitoring for model drift
4. **LONG-TERM**: Develop automated risk assessment dashboard

## Files Generated
All analysis files are available in: {REPORTS_DIR}
"""

md_report_path = REPORTS_DIR / "modeling_analysis_report.md"
with open(md_report_path, 'w', encoding='utf-8') as f:
    f.write(md_report)

print("\n" + "="*80)
print("ANALYSIS COMPLETE")
print("="*80)

print("\n📊 Key Insights:")
print(f"1. Best regression model: {best_reg_name} (R² = {best_reg['r2']:.4f})")
print(f"2. Best classification model: {best_clf_name} (AUC = {best_clf['auc']:.4f})")
print(f"3. Dataset: {len(df):,} policies, {_format_count(policies_with_claims)} with claims")

print("\n⚠️  Critical Issues to Address:")
print("1. DATA LEAKAGE: StratifyBin in classification model")
print("2. Suspiciously high classification AUC (> 0.99)")
print("3. CatBoost compatibility issues")

print("\n✅ Next Steps:")
print("1. Fix data leakage and retrain models")
print("2. Validate model performance after fixes")
print("3. Implement risk-based pricing model")
print("4. Monitor model performance in production")

print(f"\n📁 Reports saved in: {REPORTS_DIR}")
print("="*80)