In [None]:
# %% [markdown]
# # Insurance Claim Severity Modeling - EDA & Results Dashboard
# 
# This notebook provides a comprehensive overview of the insurance claim severity modeling pipeline, including data exploration, model results, and insights.
# 

# %% [markdown]
# ## 1. Setup and Configuration

# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pickle
import warnings
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pathlib import Path
import sys
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import seaborn as sns

# Set up paths
# BASE_PATH is the parent of the kernel's working directory, so every path
# below assumes the notebook is run from a direct child of the project root.
BASE_PATH = Path.cwd().parent
DATA_PATH = BASE_PATH / "data"
MODELS_PATH = BASE_PATH / "models"
RESULTS_PATH = BASE_PATH / "Results"
NOTEBOOKS_PATH = BASE_PATH / "notebooks"

# Add src to path (guarded so re-running this cell does not append duplicates)
SRC_PATH = str(BASE_PATH / "src")
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)

# Configure plotting
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

# Silences every warning category for the rest of the session
warnings.filterwarnings('ignore')

print("Setup complete!")
print(f"Base path: {BASE_PATH}")

# %% [markdown]
# ## 2. Load Data

# %%
def load_data(processed_dir=None):
    """Load every CSV file in the processed-data directory.

    Parameters
    ----------
    processed_dir : str or Path, optional
        Directory scanned for ``*.csv`` files. When omitted,
        ``DATA_PATH / "processed"`` is used.

    Returns
    -------
    dict
        Maps each file's stem (file name without extension) to the DataFrame
        read from it. Files that cannot be read are reported and skipped; a
        missing directory is reported and yields an empty dict.
    """
    processed_path = Path(processed_dir) if processed_dir is not None else DATA_PATH / "processed"
    data_dict = {}

    # glob() on a missing directory just yields nothing, which would hide a
    # wrong path behind an empty result - report it explicitly instead.
    if not processed_path.is_dir():
        print(f"Error loading processed data: directory not found: {processed_path}")
        return data_dict

    try:
        # Sorted so the load order (and the printed log) is the same on every run
        files = sorted(processed_path.glob("*.csv"))
    except OSError as e:
        print(f"Error loading processed data: {e}")
        return data_dict

    for file in files:
        name = file.stem
        print(f"Loading {name}...")
        try:
            data_dict[name] = pd.read_csv(file)
            print(f"  Shape: {data_dict[name].shape}")
        except Exception as e:
            # Best effort: one unreadable file must not prevent loading the rest
            print(f"  Error loading {file}: {e}")

    return data_dict

# Read every processed dataset into memory
print("Loading datasets...")
data_dict = load_data()

# One summary line per dataset: its name and (rows, columns)
print("\nAvailable datasets:")
for name, df in data_dict.items():
    print("  - {}: {}".format(name, df.shape))


In [None]:

# %% [markdown]
# ## 3. Data Exploration

# %%
if 'claim_policies' in data_dict:
    # Explore a copy of the loaded frame rather than the object held in data_dict
    df = data_dict['claim_policies'].copy()

    print("="*60)
    print("DATA EXPLORATION")
    print("="*60)

    # Basic info
    print(f"\nüìä Dataset Shape: {df.shape}")
    print(f"üìã Columns: {len(df.columns)}")

    # Display first few rows
    print("\nFirst 5 rows:")
    display(df.head())

    # Data types: number of columns per dtype
    print("\nData Types:")
    dtype_counts = df.dtypes.value_counts()
    for dtype, count in dtype_counts.items():
        print(f"  {dtype}: {count} columns")

    # Missing values: count and percentage per column, worst first
    print("\nMissing Values:")
    missing = df.isnull().sum()
    missing_pct = (missing / len(df) * 100).round(2)
    missing_df = pd.DataFrame({
        'Missing Count': missing,
        'Missing %': missing_pct
    })
    missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values('Missing %', ascending=False)

    if len(missing_df) > 0:
        display(missing_df)
    else:
        print("  No missing values found!")

    # Target variable analysis
    if 'TotalClaims' in df.columns:
        print("\nüéØ Target Variable Analysis (TotalClaims):")
        target_stats = df['TotalClaims'].describe()
        display(pd.DataFrame(target_stats).T)

        # A histogram cannot derive its bin range from NaN, so plot only the
        # observed claim amounts. mean()/median() already skip NaN, so the
        # reference lines are unchanged by this.
        claims = df['TotalClaims'].dropna()

        # Plot target distribution
        fig, axes = plt.subplots(1, 2, figsize=(15, 5))

        # Histogram with mean and median markers
        axes[0].hist(claims, bins=50, edgecolor='black', alpha=0.7)
        axes[0].axvline(claims.mean(), color='red', linestyle='--',
                       label=f'Mean: R{claims.mean():,.2f}')
        axes[0].axvline(claims.median(), color='green', linestyle='--',
                       label=f'Median: R{claims.median():,.2f}')
        axes[0].set_xlabel('Claim Amount (R)')
        axes[0].set_ylabel('Frequency')
        axes[0].set_title('Distribution of Claim Amounts')
        axes[0].legend()
        axes[0].grid(True, alpha=0.3)

        # Log transformation. log1p(0) == 0 is well defined, so zero-valued
        # claims are allowed; only negative amounts rule the transform out,
        # which is what the fallback message reports.
        if (claims >= 0).all():
            log_claims = np.log1p(claims)
            axes[1].hist(log_claims, bins=50, edgecolor='black', alpha=0.7)
            axes[1].set_xlabel('Log(1 + Claim Amount)')
            axes[1].set_ylabel('Frequency')
            axes[1].set_title('Log-Transformed Claim Amounts')
            axes[1].grid(True, alpha=0.3)
        else:
            axes[1].text(0.5, 0.5, 'Log transform not possible\n(negative values present)',
                        ha='center', va='center', transform=axes[1].transAxes)
            axes[1].set_title('Log-Transformed Claim Amounts')

        plt.tight_layout()
        plt.show()

        # Top claims; the identifier column is shown only when the dataset has one
        print("\nüí∞ Top 10 Largest Claims:")
        top_cols = [col for col in ['PolicyID', 'TotalClaims'] if col in df.columns]
        top_claims = df.nlargest(10, 'TotalClaims')[top_cols].copy()
        top_claims['TotalClaims'] = top_claims['TotalClaims'].apply(lambda x: f'R{x:,.2f}')
        display(top_claims)


In [None]:

# %% [markdown]
# ## 4. Load Model Results

# %%
def _read_optional(path, label, parse=json.load):
    """Read an optional artefact file.

    Returns ``(True, payload)`` when ``path`` exists and ``parse`` succeeds,
    and ``(False, None)`` when the file is absent or cannot be read/parsed
    (the failure is printed using ``label``).
    """
    if not path.exists():
        return False, None
    try:
        # Explicit UTF-8 so reading does not depend on the platform's locale encoding
        with open(path, 'r', encoding='utf-8') as f:
            return True, parse(f)
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses
        print(f"Error loading {label}: {e}")
        return False, None


def load_model_results(models_path=None, results_path=None):
    """Load the saved model-evaluation artefacts that are present on disk.

    Parameters
    ----------
    models_path : str or Path, optional
        Directory holding the model JSON files. Defaults to ``MODELS_PATH``.
    results_path : str or Path, optional
        Directory containing the ``Task4_Reports`` folder. Defaults to
        ``RESULTS_PATH``.

    Returns
    -------
    dict
        May contain the keys ``model_comparison``, ``detailed_metrics``,
        ``best_model``, ``cv_results``, ``lasso_params``, ``lr_params``,
        ``task4_report`` and ``task4_md``. A key is present only if its file
        exists and was read successfully; unreadable files are reported and
        skipped so one bad file cannot hide the others.
    """
    models_dir = Path(models_path) if models_path is not None else MODELS_PATH
    results_dir = Path(results_path) if results_path is not None else RESULTS_PATH
    reports_dir = results_dir / "Task4_Reports"
    results = {}

    # Model comparison: either {"model_comparison": {...}, ...} or a flat
    # {model_name: metrics} mapping.
    found, model_data = _read_optional(models_dir / "model_comparison.json", "model comparison")
    if found:
        print(f"Model comparison data type: {type(model_data)}")
        if isinstance(model_data, dict):
            print(f"Model comparison keys: {list(model_data.keys())}")
            if 'model_comparison' in model_data:
                print("Found 'model_comparison' key - extracting nested data")
                for key in ('model_comparison', 'detailed_metrics', 'best_model'):
                    if key in model_data:
                        results[key] = model_data[key]
            else:
                # It's already in the right format
                results['model_comparison'] = model_data
            print("‚úì Loaded model comparison results")
        else:
            # Nothing usable was stored, so do not claim the file was loaded
            print("Error loading model comparison: expected a JSON object at the top level")

    # Remaining artefacts are stored as-is: (result key, file, label, parser)
    optional_files = [
        ('cv_results', models_dir / "cross_validation_results.json",
         "cross-validation results", json.load),
        ('lasso_params', models_dir / "Lasso_best_params.json",
         "Lasso parameters", json.load),
        ('lr_params', models_dir / "LinearRegression_best_params.json",
         "Linear Regression parameters", json.load),
        ('task4_report', reports_dir / "task4_comprehensive_report.json",
         "Task 4 comprehensive report", json.load),
        ('task4_md', reports_dir / "task4_final_report.md",
         "Task 4 markdown report", lambda f: f.read()),
    ]
    for key, path, label, parse in optional_files:
        found, payload = _read_optional(path, label, parse)
        if found:
            results[key] = payload
            print(f"‚úì Loaded {label}")

    return results

print("Loading model results...")
model_results = load_model_results()

# List each loaded artefact with its Python type; the markdown report entry
# is left out of the listing.
print("\nüìä Loaded model results:")
for key, value in model_results.items():
    if key == 'task4_md':
        continue
    print(f"  {key}: {type(value)}")


In [None]:

# %% [markdown]
# ## 5. Model Performance Analysis

# %%
def _plot_metric_bars(ax, frame, column, color, xlabel, title, label_fmt,
                      missing_text, missing_title, label_offset=None):
    """Draw one horizontal-bar panel for ``column`` of ``frame`` on ``ax``.

    When the column is absent or holds no values at all, a centred
    ``missing_text`` message is shown under ``missing_title`` instead.
    ``label_fmt`` turns a metric value into its bar label; ``label_offset`` is
    the horizontal gap between bar end and label (default: 1% of the largest
    value in the column).
    """
    values = frame[column] if column in frame.columns else None
    if values is None or values.isna().all():
        ax.text(0.5, 0.5, missing_text, ha='center', va='center', transform=ax.transAxes)
        ax.set_title(missing_title)
        return

    ax.barh(frame.index, values, color=color)
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    ax.grid(True, alpha=0.3, axis='x')

    offset = label_offset if label_offset is not None else values.max() * 0.01
    for position, value in enumerate(values):
        if pd.isna(value):
            continue  # this model has no value for the metric, so nothing to label
        ax.text(value + offset, position, label_fmt(value), va='center', fontsize=10)


if 'model_comparison' in model_results:
    print("="*60)
    print("MODEL PERFORMANCE COMPARISON")
    print("="*60)

    model_data = model_results['model_comparison']
    print(f"Model comparison data type: {type(model_data)}")

    if isinstance(model_data, dict):
        print(f"Number of models: {len(model_data)}")
        print(f"Model names: {list(model_data.keys())}")

        # Let's see the structure of the first model's data
        if model_data:
            first_model = list(model_data.keys())[0]
            print(f"\nFirst model '{first_model}' data structure:")
            print(f"  Type: {type(model_data[first_model])}")
            if isinstance(model_data[first_model], dict):
                print(f"  Keys: {list(model_data[first_model].keys())}")
                print(f"  Sample values:")
                for key, value in list(model_data[first_model].items())[:5]:
                    print(f"    {key}: {value}")

    # Extract metrics from the nested structure
    print("\nüìà Extracting model metrics...")

    # One record per model; each metric is looked up under its lower-case key
    # first, then the capitalised variant, and defaults to NaN.
    model_metrics = []

    if isinstance(model_data, dict):
        for model_name, metrics in model_data.items():
            if isinstance(metrics, dict):
                model_info = {
                    'Model': model_name,
                    'R2': metrics.get('r2', metrics.get('R2', np.nan)),
                    'MAE': metrics.get('mae', metrics.get('MAE', np.nan)),
                    'RMSE': metrics.get('rmse', metrics.get('RMSE', np.nan)),
                    'Training_Time': metrics.get('training_time', metrics.get('Training_Time', np.nan))
                }
                model_metrics.append(model_info)

    if model_metrics:
        # Index by model, coerce metrics to numbers (JSON nulls/strings become
        # NaN) and rank by R-squared, best first.
        comp_df = (
            pd.DataFrame(model_metrics)
            .set_index('Model')
            .apply(pd.to_numeric, errors='coerce')
            .sort_values('R2', ascending=False)
        )

        print(f"\nModel comparison DataFrame shape: {comp_df.shape}")
        print(f"Columns: {comp_df.columns.tolist()}")

        print("\nüìä Model Performance Metrics:")
        display(comp_df)

        # Create visualization: one panel per metric
        num_models = len(comp_df)
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        _plot_metric_bars(
            axes[0, 0], comp_df, 'R2', 'skyblue',
            'R¬≤ Score', 'Model R¬≤ Scores',
            lambda v: f"{v:.3f}",
            'R¬≤ data not available', 'R¬≤ Scores (Not Available)',
            label_offset=0.01,
        )
        _plot_metric_bars(
            axes[0, 1], comp_df, 'MAE', 'lightcoral',
            'MAE (R)', 'Mean Absolute Error',
            lambda v: f"R{v:,.2f}",
            'MAE data not available', 'MAE (Not Available)',
        )
        _plot_metric_bars(
            axes[1, 0], comp_df, 'RMSE', 'lightgreen',
            'RMSE (R)', 'Root Mean Squared Error',
            lambda v: f"R{v:,.2f}",
            'RMSE data not available', 'RMSE (Not Available)',
        )
        _plot_metric_bars(
            axes[1, 1], comp_df, 'Training_Time', 'gold',
            'Training Time (seconds)', 'Model Training Time',
            lambda v: f"{v:.2f}s",
            'Training time data not available', 'Training Time (Not Available)',
        )

        plt.tight_layout()
        plt.show()

        # Best model = highest R-squared among the models that report one
        if comp_df['R2'].notna().any():
            best_model = comp_df['R2'].idxmax()
            best_r2 = comp_df.loc[best_model, 'R2']
            print(f"\nüèÜ Best Performing Model: {best_model}")
            print(f"   R¬≤ Score: {best_r2:.4f}")

            if not pd.isna(comp_df.loc[best_model, 'MAE']):
                print(f"   MAE: R{comp_df.loc[best_model, 'MAE']:,.2f}")

            if not pd.isna(comp_df.loc[best_model, 'RMSE']):
                print(f"   RMSE: R{comp_df.loc[best_model, 'RMSE']:,.2f}")

            if not pd.isna(comp_df.loc[best_model, 'Training_Time']):
                print(f"   Training Time: {comp_df.loc[best_model, 'Training_Time']:.2f}s")

            # Also check if we have a 'best_model' key
            if 'best_model' in model_results:
                print(f"\nüìå Designated Best Model from results: {model_results['best_model']}")
        else:
            print("\n‚ö†Ô∏è Could not determine best model (R¬≤ scores not available)")
    else:
        print("\n‚ö†Ô∏è No model metrics found in the data")

# Preview the per-model detailed metrics, at most five entries per model
if 'detailed_metrics' in model_results:
    print("\n" + "="*60, "DETAILED MODEL METRICS", "="*60, sep="\n")

    detailed = model_results['detailed_metrics']
    if isinstance(detailed, dict):
        print("Detailed metrics available for models:")
        for model_name, metrics in detailed.items():
            print(f"\n  {model_name}:")
            if not isinstance(metrics, dict):
                continue
            # Entries beyond the first five are only counted, not listed
            hidden_count = len(metrics) - 5
            for key, value in list(metrics.items())[:5]:
                print(f"    {key}: {value}")
            if hidden_count > 0:
                print(f"    ... and {hidden_count} more metrics")


In [None]:

# %% [markdown]
# ## 6. Cross-Validation Results

# %%
def _plot_cv_folds(ax, cv_data, metric_key, marker, ylabel, title,
                   missing_text, missing_title):
    """Plot each model's per-fold ``metric_key`` scores on ``ax``.

    Only models whose entry is a dict holding a non-empty list under
    ``metric_key`` are drawn. If no model qualifies, ``missing_text`` is shown
    under ``missing_title`` instead. Returns True when something was plotted.
    """
    plotted = False
    for model, scores in cv_data.items():
        fold_scores = scores.get(metric_key) if isinstance(scores, dict) else None
        # An empty list carries no folds to draw, so it is treated as missing
        if isinstance(fold_scores, list) and fold_scores:
            ax.plot(range(1, len(fold_scores) + 1), fold_scores,
                    marker=marker, label=model, linewidth=2)
            plotted = True

    if plotted:
        ax.set_xlabel('Fold Number')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.legend()
        ax.grid(True, alpha=0.3)
    else:
        ax.text(0.5, 0.5, missing_text, ha='center', va='center', transform=ax.transAxes)
        ax.set_title(missing_title)
    return plotted


if 'cv_results' in model_results:
    print("="*60)
    print("CROSS-VALIDATION RESULTS")
    print("="*60)

    cv_data = model_results['cv_results']

    # Check structure
    print(f"CV data type: {type(cv_data)}")

    if isinstance(cv_data, dict):
        # Create visualization: R-squared on the left, RMSE on the right
        fig, axes = plt.subplots(1, 2, figsize=(15, 6))

        plotted_r2 = _plot_cv_folds(
            axes[0], cv_data, 'test_r2', 'o',
            'R¬≤ Score', 'Cross-Validation R¬≤ Scores by Fold',
            'R¬≤ scores not available', 'R¬≤ Scores (Not Available)',
        )
        plotted_rmse = _plot_cv_folds(
            axes[1], cv_data, 'test_rmse', 's',
            'RMSE (R)', 'Cross-Validation RMSE by Fold',
            'RMSE scores not available', 'RMSE Scores (Not Available)',
        )

        plt.tight_layout()
        plt.show()

        # Display summary statistics (mean, standard deviation and range of
        # the per-fold R-squared scores for each model)
        print("\nüìä Cross-Validation Summary Statistics:")
        summary_data = []

        for model, scores in cv_data.items():
            if isinstance(scores, dict) and 'test_r2' in scores:
                if isinstance(scores['test_r2'], list) and scores['test_r2']:
                    r2_mean = np.mean(scores['test_r2'])
                    r2_std = np.std(scores['test_r2'])
                    summary_data.append({
                        'Model': model,
                        'Mean R¬≤': f"{r2_mean:.4f}",
                        'Std R¬≤': f"{r2_std:.4f}",
                        'R¬≤ Range': f"{min(scores['test_r2']):.4f} - {max(scores['test_r2']):.4f}"
                    })

        if summary_data:
            summary_df = pd.DataFrame(summary_data)
            display(summary_df)
        else:
            print("No R¬≤ score data available for summary")
    else:
        print("CV data is not in expected dictionary format")

# %% [markdown]
# ## 7. Hyperparameter Tuning Results

# %%
def _print_best_params(heading, params):
    """Print ``heading``, then the parameters beneath it.

    A dict is printed one ``name: value`` pair per line; any other payload is
    printed raw on a single line.
    """
    print(heading)
    if not isinstance(params, dict):
        print(f"  Parameters (raw): {params}")
        return
    for param, value in params.items():
        print(f"  {param}: {value}")


print("="*60)
print("HYPERPARAMETER TUNING RESULTS")
print("="*60)

if 'lasso_params' in model_results:
    lasso_params = model_results['lasso_params']
    _print_best_params("\nüîß Lasso Regression - Best Parameters:", lasso_params)

if 'lr_params' in model_results:
    lr_params = model_results['lr_params']
    _print_best_params("\nüîß Linear Regression - Best Parameters:", lr_params)

# %% [markdown]
# ## 8. Feature Importance Analysis

# %%
def _resolve_feature_names(n_features):
    """Return exactly ``n_features`` names for labelling feature importances.

    Sources, in order of preference:
      1. ``get_feature_names_out()`` of the pickled preprocessor, if it has one;
      2. the header of the processed training CSV, minus the target columns;
      3. generic ``feature_<i>`` placeholders.
    If the source yields too many names the list is truncated; if it yields
    too few, the tail is filled with placeholders. Either mismatch is reported.
    """
    generic = [f"feature_{i}" for i in range(n_features)]

    try:
        preprocessor_path = MODELS_PATH / "preprocessor.pkl"
        if not preprocessor_path.exists():
            return generic

        # NOTE(security): pickle.load can execute arbitrary code; only load
        # artefacts produced by this project.
        with open(preprocessor_path, 'rb') as f:
            preprocessor = pickle.load(f)

        if hasattr(preprocessor, 'get_feature_names_out'):
            names = list(preprocessor.get_feature_names_out())
        else:
            train_path = DATA_PATH / "processed" / "train_data.csv"
            if not train_path.exists():
                return generic
            # nrows=0 parses only the header row; the data itself is not needed
            header = pd.read_csv(train_path, nrows=0).columns
            names = [col for col in header
                     if col not in ['Log_TotalClaims', 'TotalClaims', 'HighClaim']]
    except Exception as e:
        print(f"  Warning: Could not get feature names: {e}")
        return generic

    if len(names) != n_features:
        print(f"  Warning: found {len(names)} feature names for {n_features} importances")
    # Truncate a longer list; pad a shorter one with placeholders so the
    # importance table can always be built.
    return names[:n_features] + generic[len(names):]


def analyze_feature_importance():
    """Show the top-10 feature importances of each saved tree-based model.

    For every known model name, looks for ``<name>.pkl`` under ``MODELS_PATH``.
    Models that load and expose ``feature_importances_`` get a table and a
    horizontal bar chart of their ten most important features. Missing files,
    models without importances and load/plot errors are reported and skipped.
    Returns None.
    """
    print("="*60)
    print("FEATURE IMPORTANCE ANALYSIS")
    print("="*60)

    # Try to load tree-based models for feature importance
    tree_models = ['RandomForest', 'GradientBoosting', 'XGBoost', 'LightGBM', 'DecisionTree']

    for model_name in tree_models:
        model_path = MODELS_PATH / f"{model_name}.pkl"
        if not model_path.exists():
            print(f"  {model_name}.pkl not found")
            continue

        try:
            # NOTE(security): pickle.load can execute arbitrary code; only
            # load model files produced by this project.
            with open(model_path, 'rb') as f:
                model = pickle.load(f)

            if not hasattr(model, 'feature_importances_'):
                print(f"  {model_name} does not expose feature_importances_; skipped")
                continue

            print(f"\nüå≥ {model_name} Feature Importance (Top 10):")

            importances = model.feature_importances_
            feature_names = _resolve_feature_names(len(importances))

            # Create importance DataFrame, most important first
            importance_df = pd.DataFrame({
                'feature': feature_names,
                'importance': importances
            }).sort_values('importance', ascending=False)

            # Display top 10
            display(importance_df.head(10))

            # Plot: re-sorted ascending so the largest bar ends up on top
            fig, ax = plt.subplots(figsize=(12, 6))
            top_10 = importance_df.head(10).sort_values('importance')
            ax.barh(top_10['feature'], top_10['importance'], color='teal')
            ax.set_xlabel('Importance Score')
            ax.set_title(f'{model_name} - Top 10 Feature Importances')
            ax.grid(True, alpha=0.3, axis='x')
            plt.tight_layout()
            plt.show()

        except Exception as e:
            # Best effort: a failure on one model must not stop the others
            print(f"Error loading {model_name}: {e}")

# Run the feature importance analysis for whichever of the known tree-model
# pickles exist under MODELS_PATH (missing ones are reported, not fatal)
analyze_feature_importance()


In [None]:

# %% [markdown]
# ## 9. Task 4 Insights

# %%
def _print_numbered(heading, items):
    """Print ``heading`` followed by ``items`` as a 1-based numbered list.

    A value that is not a list/tuple (for example a single string) is printed
    as one indented line rather than being enumerated element by element.
    """
    print(heading)
    if isinstance(items, (list, tuple)):
        for i, item in enumerate(items, 1):
            print(f"  {i}. {item}")
    else:
        print(f"  {items}")


if 'task4_report' in model_results:
    print("="*60)
    print("TASK 4: COMPREHENSIVE INSIGHTS")
    print("="*60)

    report = model_results['task4_report']

    if not isinstance(report, dict):
        # Sections can only be looked up by key on a dict; show anything else raw
        print(f"  {report}")
    else:
        # Display key insights ('key_insights' takes precedence over 'insights')
        insights_key = next((k for k in ('key_insights', 'insights') if k in report), None)
        if insights_key is not None:
            _print_numbered("\nüîç Key Insights:", report[insights_key])

        if 'recommendations' in report:
            _print_numbered("\nüí° Business Recommendations:", report['recommendations'])

        if 'limitations' in report:
            _print_numbered("\n‚ö†Ô∏è Limitations:", report['limitations'])

        # Display technical metrics; one level of nested dicts is expanded
        if 'technical_summary' in report:
            print("\nüìä Technical Summary:")
            tech_summary = report['technical_summary']
            if isinstance(tech_summary, dict):
                for key, value in tech_summary.items():
                    if isinstance(value, dict):
                        print(f"\n  {key}:")
                        for sub_key, sub_value in value.items():
                            print(f"    {sub_key}: {sub_value}")
                    else:
                        print(f"  {key}: {value}")
            else:
                print(f"  {tech_summary}")


In [None]:

# %% [markdown]
# ## 10. Model Comparison Visualizations

# %%
# Show the pre-rendered model comparison charts saved under MODELS_PATH, if any
print("="*60)
print("MODEL COMPARISON VISUALIZATIONS")
print("="*60)

# Label -> expected image file
comparison_images = dict(
    combined=MODELS_PATH / "model_comparison_combined.png",
    r2=MODELS_PATH / "model_comparison_r2.png",
)

for img_name, img_path in comparison_images.items():
    if not img_path.exists():
        print(f"\n‚ÑπÔ∏è {img_name} image not found at {img_path}")
        continue

    print(f"\nüìä Displaying {img_name} comparison:")
    try:
        img = plt.imread(img_path)
        plt.figure(figsize=(12, 8))
        plt.imshow(img)
        plt.axis('off')  # the image is already a finished chart; hide the outer axes
        plt.title(f'Model Comparison - {img_name.upper()}')
        plt.show()
    except Exception as e:
        print(f"Error displaying {img_name}: {e}")


In [None]:

# %% [markdown]
# ## 11. Prediction Analysis

# %%
def _best_model_name_by_r2(model_data):
    """Return the name of the entry in ``model_data`` with the highest R².

    ``model_data`` is expected to map model names to metric dicts carrying the
    score under ``'r2'`` or ``'R2'``. Entries that are not dicts, or whose
    score is missing, non-numeric or NaN, are skipped so that one malformed
    entry cannot abort the whole search. Returns ``None`` when nothing usable
    is found (including when ``model_data`` is not a dict).
    """
    if not isinstance(model_data, dict):
        return None

    best_name, best_r2 = None, -np.inf
    for name, metrics in model_data.items():
        if not isinstance(metrics, dict):
            continue
        raw_r2 = metrics.get('r2')
        if raw_r2 is None:
            raw_r2 = metrics.get('R2')
        try:
            r2 = float(raw_r2)
        except (TypeError, ValueError):
            continue
        # NaN compares False against everything, so it can never win here.
        if r2 > best_r2:
            best_name, best_r2 = name, r2
    return best_name


def _plot_prediction_diagnostics(pred_df, mae, target_col):
    """Draw the 2x2 diagnostic grid for a prediction frame.

    ``pred_df`` must hold the columns 'Actual', 'Predicted', 'Residual' and
    'Absolute_Error'; ``mae`` is drawn as a reference line on the error
    histogram and ``target_col`` is used in the first panel's title.
    """
    actual = pred_df['Actual']
    predicted = pred_df['Predicted']
    abs_error = pred_df['Absolute_Error']

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Predicted vs actual, with the y = x line as the "perfect model" reference
    ax = axes[0, 0]
    ax.scatter(actual, predicted, alpha=0.5, color='blue')
    lo, hi = actual.min(), actual.max()
    ax.plot([lo, hi], [lo, hi], 'r--', lw=2)
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    ax.set_title(f'Predictions vs Actual ({target_col})')
    ax.grid(True, alpha=0.3)

    # Residuals against predictions
    ax = axes[0, 1]
    ax.scatter(predicted, pred_df['Residual'], alpha=0.5, color='green')
    ax.axhline(y=0, color='r', linestyle='--')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Residuals')
    ax.set_title('Residual Plot')
    ax.grid(True, alpha=0.3)

    # Distribution of absolute errors
    ax = axes[1, 0]
    ax.hist(abs_error, bins=30, edgecolor='black', alpha=0.7, color='orange')
    ax.axvline(mae, color='red', linestyle='--', label=f'Mean AE: {mae:.4f}')
    ax.set_xlabel('Absolute Error')
    ax.set_ylabel('Frequency')
    ax.set_title('Absolute Error Distribution')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Absolute error as a function of the true value
    ax = axes[1, 1]
    ax.scatter(actual, abs_error, alpha=0.5, color='purple')
    ax.set_xlabel('Actual Value')
    ax.set_ylabel('Absolute Error')
    ax.set_title('Error by Actual Value')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()


def analyze_predictions():
    """Analyze the best model's predictions against the held-out test data.

    Reads ``test_data.csv`` from ``DATA_PATH / "processed"``, picks the target
    column ('Log_TotalClaims' if present, otherwise 'TotalClaims'), selects the
    model with the highest R² in ``model_results['model_comparison']`` (falling
    back to ``model_results['best_model']``), loads ``<name>.pkl`` and
    ``preprocessor.pkl`` from ``MODELS_PATH``, then prints R²/MAE/RMSE, draws
    four diagnostic plots and displays the ten largest absolute errors.

    Every unmet precondition (missing file, no target column, no identifiable
    model) is reported with a printed message and ends the analysis early.
    Relies on the notebook globals ``DATA_PATH``, ``MODELS_PATH`` and
    ``model_results``. Returns ``None``.
    """
    print("="*60)
    print("PREDICTION ANALYSIS")
    print("="*60)

    # Try to load test data
    test_path = DATA_PATH / "processed" / "test_data.csv"
    if not test_path.exists():
        print(f"Test data not found at {test_path}")
        return

    test_df = pd.read_csv(test_path)

    # Use the first available target, in order of preference
    target_columns = ['Log_TotalClaims', 'TotalClaims']
    available_targets = [col for col in target_columns if col in test_df.columns]
    if not available_targets:
        print("No target variables found in test data")
        return

    target_col = available_targets[0]
    y_true = test_df[target_col]
    print(f"Using target variable: {target_col}")

    # Best model by R², then the explicit 'best_model' key as a fallback
    best_model_name = None
    if 'model_comparison' in model_results:
        best_model_name = _best_model_name_by_r2(model_results['model_comparison'])
    if not best_model_name and 'best_model' in model_results:
        best_model_name = model_results['best_model']
    if not best_model_name:
        print("Could not determine best model from comparison results")
        return

    model_path = MODELS_PATH / f"{best_model_name}.pkl"
    if not model_path.exists():
        print(f"Best model {best_model_name} not found at {model_path}")
        return

    # Without the fitted preprocessor the raw test features cannot be scored;
    # say so instead of ending silently.
    preprocessor_path = MODELS_PATH / "preprocessor.pkl"
    if not preprocessor_path.exists():
        print(f"Preprocessor not found at {preprocessor_path}")
        return

    try:
        # NOTE(security): pickle.load can execute arbitrary code. Only load
        # artifacts written by this project's own training pipeline.
        with open(model_path, 'rb') as f:
            best_model = pickle.load(f)
        with open(preprocessor_path, 'rb') as f:
            preprocessor = pickle.load(f)

        # Features = everything except the targets and the HighClaim flag
        X_test = test_df.drop(columns=target_columns + ['HighClaim'],
                              errors='ignore')
        X_test_transformed = preprocessor.transform(X_test)
        y_pred = best_model.predict(X_test_transformed)

        residual = y_true - y_pred
        pred_df = pd.DataFrame({
            'Actual': y_true,
            'Predicted': y_pred,
            'Residual': residual,
            'Absolute_Error': np.abs(residual)
        })

        # Calculate metrics
        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        r2 = r2_score(y_true, y_pred)

        print(f"\nüìà Prediction Metrics for {best_model_name}:")
        print(f"  R¬≤: {r2:.4f}")
        print(f"  MAE: {mae:.4f}")
        print(f"  RMSE: {rmse:.4f}")

        _plot_prediction_diagnostics(pred_df, mae, target_col)

        # Display worst predictions, formatted to two decimals for readability
        print("\n‚ö†Ô∏è Top 10 Worst Predictions (Highest Absolute Error):")
        worst_preds = pred_df.nlargest(10, 'Absolute_Error')
        worst_preds_display = worst_preds.copy()
        two_decimals = "{:.2f}".format
        worst_preds_display['Actual'] = worst_preds_display['Actual'].apply(two_decimals)
        worst_preds_display['Predicted'] = worst_preds_display['Predicted'].apply(two_decimals)
        worst_preds_display['Error'] = worst_preds_display['Absolute_Error'].apply(two_decimals)
        display(worst_preds_display[['Actual', 'Predicted', 'Error']])

    except Exception as e:
        # Keep the notebook running, but show the full traceback for diagnosis.
        print(f"Error analyzing predictions: {e}")
        import traceback
        traceback.print_exc()

# Run the prediction analysis. The function reports through printed output and
# figures and has no return value, so nothing is echoed as the cell result.
analyze_predictions()


In [None]:

# %% [markdown]
# ## 12. Business Impact Analysis

# %%
print("="*60)
print("BUSINESS IMPACT ANALYSIS")
print("="*60)

# Calculate potential business impact
if 'claim_policies' in data_dict:
    df = data_dict['claim_policies']

    # Best model = highest finite R² in the comparison results. None (not a
    # -inf sentinel) means "unknown", so a real score of 0.0 is still reported
    # and a missing score is never printed as "-inf".
    best_model_name = None
    best_r2 = None
    best_rmse = None

    if 'model_comparison' in model_results:
        model_data = model_results['model_comparison']
        if isinstance(model_data, dict):
            for model, metrics in model_data.items():
                if not isinstance(metrics, dict):
                    continue
                raw_r2 = metrics.get('r2')
                if raw_r2 is None:
                    raw_r2 = metrics.get('R2')
                try:
                    r2 = float(raw_r2)
                except (TypeError, ValueError):
                    continue  # missing or non-numeric score: skip this entry only
                if not np.isfinite(r2):
                    continue
                if best_r2 is None or r2 > best_r2:
                    best_r2 = r2
                    best_model_name = model
                    # Always take the RMSE of *this* model (or None), so a
                    # previous candidate's RMSE is never attributed to the winner.
                    raw_rmse = metrics.get('rmse')
                    if raw_rmse is None:
                        raw_rmse = metrics.get('RMSE')
                    try:
                        best_rmse = float(raw_rmse)
                    except (TypeError, ValueError):
                        best_rmse = None

    # Also check the 'best_model' key
    if not best_model_name and 'best_model' in model_results:
        best_model_name = model_results['best_model']

    # Business metrics
    if 'TotalPremium' in df.columns and 'TotalClaims' in df.columns:
        total_premium = df['TotalPremium'].sum()
        total_claims = df['TotalClaims'].sum()
        avg_claim = df['TotalClaims'].mean()
        # Loss ratio in percent; guarded against a zero/negative premium total
        loss_ratio = total_claims / total_premium * 100 if total_premium > 0 else 0

        print(f"\nüí∞ Business Metrics:")
        print(f"  Total Premium: R{total_premium:,.2f}")
        print(f"  Total Claims: R{total_claims:,.2f}")
        print(f"  Average Claim: R{avg_claim:,.2f}")
        print(f"  Loss Ratio: {loss_ratio:.2f}%")

        if best_model_name:
            print(f"\nüéØ Best Model: {best_model_name}")

            if best_r2 is not None:
                print(f"  R¬≤ Score: {best_r2:.4f}")
                if best_rmse is not None:
                    print(f"  Prediction Error (RMSE): R{best_rmse:,.2f}")

                # Potential impact
                print("\nüìà Potential Business Impact:")
                if best_r2 > 0.5:
                    print(f"  ‚Ä¢ Model explains {best_r2*100:.1f}% of claim variability")
                    if best_rmse is not None:
                        print(f"  ‚Ä¢ Average prediction error: R{best_rmse:,.2f}")
                    print(f"  ‚Ä¢ Better pricing accuracy could improve profitability")
                elif best_r2 > 0.3:
                    print(f"  ‚Ä¢ Model explains {best_r2*100:.1f}% of claim variability")
                    print(f"  ‚Ä¢ Moderate predictive power - useful for risk assessment")
                else:
                    print(f"  ‚Ä¢ Model explains only {best_r2*100:.1f}% of claim variability")
                    print(f"  ‚Ä¢ Consider feature engineering or alternative approaches")
            else:
                print("  R¬≤ score not available for best model")

            if loss_ratio > 100:
                print(f"  ‚ö†Ô∏è  High loss ratio ({loss_ratio:.1f}%) indicates underpricing")
                print(f"  ‚úÖ Model can help identify optimal premium levels")
            elif loss_ratio < 70:
                print(f"  ‚úÖ Good loss ratio ({loss_ratio:.1f}%) indicates healthy margins")
                print(f"  üìä Model can help maintain competitive pricing")
            else:
                print(f"  üìä Loss ratio ({loss_ratio:.1f}%) within typical range")
                print(f"  üîç Model can optimize for profitability and growth")
        else:
            print("\n‚ö†Ô∏è  Model Performance Note:")
            print("  Could not determine best model from results")
    else:
        print("Required columns (TotalPremium, TotalClaims) not found in data")


In [None]:

# %% [markdown]
# ## 13. Export Summary Report

# %%
def create_summary_report():
    """Assemble the run summary, save it as JSON and return it.

    The report combines:
      * data statistics from ``data_dict['claim_policies']`` (record/feature
        counts, premium and claim totals when those columns exist);
      * the raw ``model_results['model_comparison']`` payload and the entry
        with the highest R² (plus its MAE, RMSE and training time when they
        are numeric), falling back to ``model_results['best_model']`` for the
        name only;
      * insights from ``model_results['task4_report']``.

    Metric values that are missing or not convertible to ``float`` are skipped
    rather than raising. The report is written to
    ``RESULTS_PATH / "modeling_eda_summary.json"``; the directory is created
    if it does not exist yet.

    Relies on the notebook globals ``data_dict``, ``model_results`` and
    ``RESULTS_PATH``.

    Returns:
        dict: the report that was written to disk.
    """
    def _first_number(metrics, *keys):
        """Return the first value among ``keys`` that converts to float, else None."""
        for key in keys:
            value = metrics.get(key)
            if value is None:
                continue
            try:
                return float(value)
            except (TypeError, ValueError):
                continue
        return None

    print("="*60)
    print("SUMMARY REPORT GENERATION")
    print("="*60)

    report_data = {
        "timestamp": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
        "data_statistics": {},
        "model_performance": {},
        "best_model": {},
        "business_insights": []
    }

    # Data statistics
    if 'claim_policies' in data_dict:
        df = data_dict['claim_policies']
        stats = {
            "total_records": len(df),
            "total_features": len(df.columns)
        }

        if 'TotalPremium' in df.columns:
            stats["total_premium"] = float(df['TotalPremium'].sum())

        if 'TotalClaims' in df.columns:
            claims = df['TotalClaims']
            stats["total_claims"] = float(claims.sum())
            stats["average_claim"] = float(claims.mean())
            stats["max_claim"] = float(claims.max())
            stats["min_claim"] = float(claims.min())

        report_data["data_statistics"] = stats

    # Model performance
    if 'model_comparison' in model_results:
        model_data = model_results['model_comparison']
        report_data["model_performance"] = model_data

        # Try to find best model
        if isinstance(model_data, dict):
            best_model = None
            best_r2 = -np.inf

            for model_name, metrics in model_data.items():
                if not isinstance(metrics, dict):
                    continue
                r2 = _first_number(metrics, 'r2', 'R2')
                # NaN never satisfies the comparison, so it cannot be selected.
                if r2 is not None and r2 > best_r2:
                    best_r2 = r2
                    best_model = model_name

            if best_model is not None:
                best_metrics = model_data[best_model]
                report_data["best_model"] = {"name": best_model, "r2": best_r2}

                optional_fields = (
                    ("mae", ("mae", "MAE")),
                    ("rmse", ("rmse", "RMSE")),
                    ("training_time", ("training_time", "Training_Time")),
                )
                for field, keys in optional_fields:
                    value = _first_number(best_metrics, *keys)
                    if value is not None:
                        report_data["best_model"][field] = value

    # Also check the explicit best_model key
    if 'best_model' in model_results and not report_data["best_model"]:
        report_data["best_model"]["name"] = model_results['best_model']

    # Business insights
    report = model_results['task4_report'] if 'task4_report' in model_results else None
    if isinstance(report, dict):
        if 'key_insights' in report:
            report_data["business_insights"] = report['key_insights']
        elif isinstance(report.get('insights'), list):
            report_data["business_insights"] = report['insights']

    # Save report (create the results directory on first use)
    report_path = RESULTS_PATH / "modeling_eda_summary.json"
    report_path.parent.mkdir(parents=True, exist_ok=True)
    with open(report_path, 'w', encoding='utf-8') as f:
        # default=str keeps non-JSON types (e.g. numpy scalars) serialisable
        json.dump(report_data, f, indent=2, default=str)

    print(f"\n‚úÖ Summary report saved to: {report_path}")

    # Display report summary
    print("\nüìã Report Summary:")
    print(f"  Generated: {report_data['timestamp']}")
    print(f"  Records analyzed: {report_data['data_statistics'].get('total_records', 'N/A')}")
    print(f"  Best model: {report_data['best_model'].get('name', 'N/A')}")
    # `is not None` so that a legitimate R² of 0.0 is still shown
    if report_data['best_model'].get('r2') is not None:
        print(f"  Best R¬≤: {report_data['best_model']['r2']:.4f}")

    return report_data

# Generate the summary report: the function writes modeling_eda_summary.json
# under RESULTS_PATH and returns the report dict, kept here as summary_report.
summary_report = create_summary_report()


In [None]:

# %% [markdown]
# ## 14. Interactive Dashboard (Optional - requires Plotly)

# %%
# Create an interactive dashboard if plotly is available.
# Reads the notebook globals `model_results` and `data_dict`; any failure is
# reported and the rest of the notebook keeps running.
try:
    import plotly.express as px

    print("="*60)
    print("INTERACTIVE DASHBOARD")
    print("="*60)

    if 'model_comparison' in model_results:
        # Try to create comparison DataFrame
        model_data = model_results['model_comparison']

        if isinstance(model_data, dict):
            # One row per model that actually reports an R² score. Missing
            # values are kept as missing (None / NaN) instead of 0, so a real
            # score of 0.0 is not dropped and a missing RMSE is not drawn as a
            # perfect error of 0.
            plot_data = []
            for model_name, metrics in model_data.items():
                if not isinstance(metrics, dict):
                    continue
                r2 = metrics.get('r2', metrics.get('R2'))
                if r2 is None:
                    continue
                plot_data.append({
                    'Model': model_name,
                    'R2': r2,
                    'MAE': metrics.get('mae', metrics.get('MAE', np.nan)),
                    'RMSE': metrics.get('rmse', metrics.get('RMSE', np.nan))
                })

            if plot_data:
                comp_df = pd.DataFrame(plot_data)

                # Create interactive bar chart
                fig = px.bar(comp_df,
                             x='Model',
                             y='R2',
                             title='Model R¬≤ Scores - Interactive View',
                             color='R2',
                             color_continuous_scale='Viridis',
                             text='R2')

                fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
                fig.update_layout(xaxis_title='Model',
                                 yaxis_title='R¬≤ Score',
                                 showlegend=False)

                fig.show()

                # Create scatter plot of R2 vs RMSE (only if some RMSE exists)
                if comp_df['RMSE'].notna().any():
                    fig2 = px.scatter(comp_df,
                                     x='R2',
                                     y='RMSE',
                                     text='Model',
                                     title='Model Performance: R¬≤ vs RMSE',
                                     labels={'R2': 'R¬≤ Score', 'RMSE': 'Root Mean Squared Error'})

                    fig2.update_traces(textposition='top center', marker=dict(size=12))
                    fig2.show()
                else:
                    print("No RMSE values available for the R2 vs RMSE scatter plot")

            else:
                print("No model data available for interactive visualization")

    # Try to create scatter plot from data
    if 'claim_policies' in data_dict:
        df = data_dict['claim_policies']
        if 'TotalClaims' in df.columns and 'VehicleAge' in df.columns:
            # Sample data for visualization (avoid too many points). A fixed
            # seed keeps the figure identical across Restart & Run All.
            sample_size = min(1000, len(df))
            sample_df = df.sample(sample_size, random_state=42) if sample_size < len(df) else df

            # Show whichever of the descriptive columns are present on hover
            hover_columns = [col for col in ('VehicleType', 'Province') if col in df.columns]

            fig3 = px.scatter(sample_df,
                             x='VehicleAge',
                             y='TotalClaims',
                             color='TotalClaims',
                             size='SumInsured' if 'SumInsured' in df.columns else None,
                             hover_data=hover_columns or None,
                             title='Claims by Vehicle Age and Sum Insured',
                             labels={'TotalClaims': 'Claim Amount (R)',
                                    'VehicleAge': 'Vehicle Age (years)'})
            fig3.show()

except ImportError:
    print("\n‚ÑπÔ∏è Plotly not available for interactive visualizations")
    print("Install with: pip install plotly")
except Exception as e:
    print(f"\nError creating interactive visualizations: {e}")

# %% [markdown]
# ## 15. Conclusion and Next Steps

# %%
# Closing summary: static checklist text only, no computation.
print("=" * 60, "CONCLUSION AND NEXT STEPS", "=" * 60, sep="\n")

print(
    "\nüéØ Key Findings:",
    "1. Data Quality: Check missing values and data types above",
    "2. Model Performance: Review model comparison metrics",
    "3. Best Model: Identify from the comparison results",
    "4. Feature Importance: Check which features drive predictions",
    sep="\n",
)

print(
    "\nüöÄ Recommended Next Steps:",
    "1. Review model performance and select best model for deployment",
    "2. Validate predictions on new/unseen data",
    "3. Implement model monitoring for production",
    "4. Conduct feature engineering to improve model performance",
    "5. Develop business rules based on model insights",
    sep="\n",
)

print(
    "\nüìä Additional Analyses to Consider:",
    "‚Ä¢ Time-series analysis of claim patterns",
    "‚Ä¢ Geospatial analysis of claim hotspots",
    "‚Ä¢ Customer segmentation based on risk profiles",
    "‚Ä¢ Cost-benefit analysis of model implementation",
    sep="\n",
)

print("\n" + "=" * 60, "‚úÖ MODELING EDA COMPLETE", "=" * 60, sep="\n")

# %% [markdown]
# ## 16. Debug Information (Optional)

# %%
# Optional: dump the shape of the loaded data structures for troubleshooting.
print("=" * 60, "DEBUG INFORMATION", "=" * 60, sep="\n")

print(f"\nüìÅ Data files loaded: {list(data_dict.keys())}")

print(f"\nüìä Model results loaded:")
for key in model_results.keys():
    if key == 'task4_md':
        continue  # large markdown blob, not worth echoing
    print(f"  - {key}: {type(model_results[key])}")

# Check model_comparison structure in detail
if 'model_comparison' in model_results:
    print("\nüîç Detailed model_comparison structure:")
    model_data = model_results['model_comparison']
    if isinstance(model_data, dict):
        print(f"  Number of models: {len(model_data)}")
        print(f"  Models: {list(model_data.keys())}")

        # Preview at most three models, and at most three metrics of each
        preview = list(model_data.items())[:3]
        for position, (model_name, metrics) in enumerate(preview, start=1):
            print(f"\n  Model {position}: {model_name}")
            if not isinstance(metrics, dict):
                print(f"    Type: {type(metrics)}")
                print(f"    Value: {metrics}")
                continue

            print(f"    Type: dict with {len(metrics)} keys")
            print(f"    Keys: {list(metrics.keys())}")
            for key, value in list(metrics.items())[:3]:
                print(f"      {key}: {value}")
            hidden_metrics = len(metrics) - 3
            if hidden_metrics > 0:
                print(f"      ... and {hidden_metrics} more")

# Check if model files exist
print("\nüîç Checking model files:")
model_files = list(MODELS_PATH.glob("*.pkl"))
print(f"  Found {len(model_files)} .pkl files")
for file in model_files[:10]:  # list at most ten file names
    print(f"    {file.name}")
hidden_files = len(model_files) - 10
if hidden_files > 0:
    print(f"    ... and {hidden_files} more")