# Insurance Risk-Based Pricing ML Pipeline
### Task 4: Build and evaluate predictive models for dynamic, risk-based pricing


In [6]:
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (mean_squared_error, r2_score, accuracy_score, 
                           precision_score, recall_score, f1_score, 
                           classification_report, confusion_matrix, roc_auc_score)
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn diagnostics (mixed dtypes,
# undefined metrics, ...) - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Seed NumPy's legacy global RNG for reproducibility; the estimators and splits
# further down additionally pin their own random_state=42.
np.random.seed(42)

In [7]:
class InsurancePricingML:
    
    def __init__(self):
        """Create an empty pipeline.

        Nothing is loaded or fitted here; the attributes below are placeholders that
        ``load_data``, ``feature_engineering``, ``prepare_features`` and the
        ``build_*`` methods fill in.
        """
        self.data = None  # working DataFrame (set by load_data, replaced by feature_engineering)
        self.claim_data = None  # Subset with claims > 0 (kept for compatibility; no method in this class assigns it)
        self.features = None  # model input matrix built by prepare_features
        self.target_claim_severity = None  # TotalClaims column
        self.target_claim_probability = None  # HasClaim indicator
        self.target_premium = None  # TotalPremium column
        # Declared up front so they read as None, instead of raising AttributeError,
        # before prepare_features has run.
        self.claim_features = None  # rows of self.features with TotalClaims > 0
        self.claim_target = None  # TotalClaims for those rows
        self.scaler = StandardScaler()
        self.label_encoders = {}  # column name -> fitted LabelEncoder
        self.models = {}  # task name -> {model name: fitted estimator}
        self.results = {}  # task name -> {model name: metrics / predictions dict}
        
    def load_data(self, file_path):
        """Load and initial data inspection"""
        try:
            # Assuming the data is pipe-delimited based on the sample
            self.data = pd.read_csv(file_path, delimiter='|')
            print(f"Data loaded successfully: {self.data.shape}")
            print(f"Columns: {list(self.data.columns)}")
            print(f"\nFirst few rows:")
            print(self.data.head())
            return True
        except Exception as e:
            print(f"Error loading data: {e}")
            return False
    
    def explore_data(self):
        """Print summary statistics and plot the main monetary distributions.

        Reports the dataset shape, the columns that contain missing values and the
        claim frequency (share of rows with ``TotalClaims`` > 0), shows a 2x2 grid of
        histograms, and prints the first ten numeric columns when ranked by
        correlation with ``TotalClaims`` in descending order.
        """
        rule = "=" * 50
        print(rule)
        print("DATA EXPLORATION")
        print(rule)

        frame = self.data

        # Shape and missing values
        print(f"Dataset shape: {frame.shape}")
        print(f"\nMissing values:")
        null_counts = frame.isnull().sum()
        print(null_counts[null_counts > 0])

        # Claim frequency
        print(f"\nClaims Analysis:")
        print(f"Total policies: {len(frame)}")
        with_claims = frame[frame['TotalClaims'] > 0]
        print(f"Policies with claims: {len(with_claims)}")
        print(f"Claim frequency: {len(with_claims) / len(frame):.2%}")

        # One histogram per panel: (axis position, source frame, column, bins, color, title, x label).
        # color=None keeps matplotlib's default color for the first panel.
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        panels = [
            ((0, 0), frame, 'TotalClaims', 50, None,
             'TotalClaims Distribution', 'Total Claims'),
            ((0, 1), with_claims, 'TotalClaims', 30, 'orange',
             'TotalClaims Distribution (Claims > 0)', 'Total Claims'),
            ((1, 0), frame, 'TotalPremium', 50, 'green',
             'TotalPremium Distribution', 'Total Premium'),
            ((1, 1), frame, 'CalculatedPremiumPerTerm', 50, 'red',
             'CalculatedPremiumPerTerm Distribution', 'Premium Per Term'),
        ]
        for position, source, column, bins, color, title, xlabel in panels:
            ax = axes[position]
            ax.hist(source[column], bins=bins, alpha=0.7, color=color)
            ax.set_title(title)
            ax.set_xlabel(xlabel)

        plt.tight_layout()
        plt.show()

        # Correlation of every numeric column with the claim amount
        numeric_frame = frame.select_dtypes(include=[np.number])
        print(f"\nCorrelation with TotalClaims:")
        ranked = numeric_frame.corr()['TotalClaims'].sort_values(ascending=False)
        without_self = ranked[ranked.index != 'TotalClaims']
        print(without_self.head(10))
    
    def feature_engineering(self):
        """Create new features and prepare data"""
        print("="*50)
        print("FEATURE ENGINEERING")
        print("="*50)
        
        # Create a copy for processing
        df = self.data.copy()
        
        # Parse TransactionMonth
        df['TransactionMonth'] = pd.to_datetime(df['TransactionMonth'])
        df['TransactionYear'] = df['TransactionMonth'].dt.year
        df['TransactionMonth_num'] = df['TransactionMonth'].dt.month
        
        # Vehicle age
        df['VehicleAge'] = df['TransactionYear'] - df['RegistrationYear']
        
        # Create claim indicator
        df['HasClaim'] = (df['TotalClaims'] > 0).astype(int)
        
        # Premium to sum insured ratio
        df['PremiumToSumInsuredRatio'] = df['TotalPremium'] / (df['SumInsured'] + 1)
        
        # Claim to premium ratio (for policies with claims)
        df['ClaimToPremiumRatio'] = df['TotalClaims'] / (df['TotalPremium'] + 1)
        
        # Engine power indicators
        df['HighPowerEngine'] = (df['kilowatts'] > df['kilowatts'].median()).astype(int)
        df['LargeEngine'] = (df['cubiccapacity'] > df['cubiccapacity'].median()).astype(int)
        
        # Province risk grouping (simplified)
        province_risk = df.groupby('Province')['HasClaim'].mean().to_dict()
        df['ProvinceRisk'] = df['Province'].map(province_risk)
        
        # Make risk grouping
        make_risk = df.groupby('make')['HasClaim'].mean().to_dict()
        df['MakeRisk'] = df['make'].map(make_risk)
        
        self.data = df
        print("Feature engineering completed")
        print(f"New features created: VehicleAge, HasClaim, PremiumToSumInsuredRatio, ClaimToPremiumRatio, etc.")
    
    def prepare_features(self):
        """Clean, impute and encode ``self.data`` into the modelling matrices.

        Steps, in order: coerce numeric and target columns to numbers (comma decimals
        accepted), fill numeric gaps with the column median (0 when the median itself
        is NaN), fill categorical gaps with ``'Unknown'``, label-encode the categorical
        columns into ``<name>_encoded`` columns.

        Side effects:
            Mutates ``self.data`` in place (cleaning / imputation), stores the fitted
            encoders in ``self.label_encoders`` and sets ``self.features``,
            ``self.target_claim_severity``, ``self.target_claim_probability``,
            ``self.target_premium``, ``self.claim_features`` and ``self.claim_target``.

        Target columns are coerced but not imputed; the NaN counts printed at the end
        show whether any target value was lost in the coercion.
        """
        print("="*50)
        print("FEATURE PREPARATION")
        print("="*50)
        
        # Select relevant features
        # NOTE(review): 'PremiumToSumInsuredRatio' is TotalPremium / (SumInsured + 1) and
        # 'SumInsured' is a feature too, so together they reveal TotalPremium to the
        # premium models. 'ProvinceRisk' / 'MakeRisk' are HasClaim rates computed on the
        # full table in feature_engineering. Confirm this leakage is acceptable.
        numeric_features = [
            'RegistrationYear', 'Cylinders', 'cubiccapacity', 'kilowatts', 
            'NumberOfDoors', 'CapitalOutstanding', 'SumInsured', 
            'CalculatedPremiumPerTerm', 'VehicleAge', 'TransactionYear',
            'TransactionMonth_num', 'PremiumToSumInsuredRatio', 'ProvinceRisk', 'MakeRisk'
        ]
        
        categorical_features = [
            'IsVATRegistered', 'LegalType', 'Gender', 'Province', 'VehicleType',
            'make', 'bodytype', 'AlarmImmobiliser', 'TrackingDevice', 'NewVehicle',
            'TermFrequency', 'CoverCategory', 'CoverType', 'Product'
        ]
        
        def clean_numeric_column(series):
            """Return ``series`` as numbers; text values may use ',' as decimal separator.

            Both object columns and pandas' dedicated string dtypes take the text path;
            checking only ``dtype == 'object'`` would send string-dtype columns straight
            to ``pd.to_numeric`` and turn every comma-decimal value into NaN.
            """
            if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
                # Replace comma with dot for decimal separator
                cleaned = series.astype(str).str.replace(',', '.', regex=False)
                # Convert to numeric, coercing errors to NaN
                return pd.to_numeric(cleaned, errors='coerce')
            return pd.to_numeric(series, errors='coerce')
        
        # Clean feature columns, then the target columns. Several targets are also
        # features, so remember what has been cleaned and convert each column once.
        target_columns = ['TotalClaims', 'TotalPremium', 'CalculatedPremiumPerTerm', 'SumInsured', 'CapitalOutstanding']
        already_cleaned = set()
        for label, columns in (('numeric', numeric_features), ('target', target_columns)):
            for col in columns:
                if col in self.data.columns and col not in already_cleaned:
                    print(f"Cleaning {label} column: {col}")
                    self.data[col] = clean_numeric_column(self.data[col])
                    already_cleaned.add(col)
        
        # Handle missing values AFTER cleaning (coercion can introduce new NaNs)
        for col in numeric_features:
            if col in self.data.columns:
                median_val = self.data[col].median()
                if pd.isna(median_val):  # If median is NaN (all values missing), use 0
                    median_val = 0
                self.data[col] = self.data[col].fillna(median_val)
        
        for col in categorical_features:
            if col in self.data.columns:
                self.data[col] = self.data[col].fillna('Unknown')
        
        # Encode categorical variables on a copy so self.data keeps the raw labels
        df_encoded = self.data.copy()
        
        for col in categorical_features:
            if col in df_encoded.columns:
                le = LabelEncoder()
                df_encoded[col + '_encoded'] = le.fit_transform(df_encoded[col].astype(str))
                self.label_encoders[col] = le
        
        # Select final features, dropping any that are absent from this dataset
        final_features = numeric_features + [col + '_encoded' for col in categorical_features 
                                           if col in self.data.columns]
        final_features = [f for f in final_features if f in df_encoded.columns]
        
        self.features = df_encoded[final_features]
        self.target_claim_severity = df_encoded['TotalClaims']
        self.target_claim_probability = df_encoded['HasClaim']
        self.target_premium = df_encoded['TotalPremium']
        
        # Create claim severity dataset (only policies with claims > 0)
        claim_mask = df_encoded['TotalClaims'] > 0
        self.claim_features = self.features[claim_mask]
        self.claim_target = self.target_claim_severity[claim_mask]
        
        print(f"Final feature matrix shape: {self.features.shape}")
        print(f"Claim severity dataset shape: {self.claim_features.shape}")
        print(f"Features used: {list(self.features.columns)}")
        
        # Display data types to verify cleaning
        print(f"\nData types after cleaning:")
        for col in ['TotalClaims', 'TotalPremium', 'CalculatedPremiumPerTerm']:
            if col in self.data.columns:
                print(f"{col}: {self.data[col].dtype}")
        
        print(f"Number of NaN values in TotalClaims: {self.data['TotalClaims'].isna().sum()}")
        print(f"Number of NaN values in TotalPremium: {self.data['TotalPremium'].isna().sum()}")
    
    def build_claim_severity_models(self):
        """Train and evaluate regressors for claim severity.

        Severity is ``TotalClaims`` on the policies with claims > 0, i.e. the
        ``self.claim_features`` / ``self.claim_target`` pair prepared by
        ``prepare_features``. An 80/20 split (``random_state=42``) is used and every
        model is scored with RMSE and R² on the held-out 20%.

        The linear model is wrapped in a pipeline with its own ``StandardScaler`` so
        the scaling it was trained with travels with the estimator. It used to be
        fitted on arrays scaled by the shared ``self.scaler``, which the other
        ``build_*`` methods refit; ``generate_risk_based_premium`` then called it on
        unscaled rows.

        Side effects:
            Stores the fitted estimators in ``self.models['claim_severity']`` and the
            per-model metrics / predictions in ``self.results['claim_severity']``;
            shows a 2x2 grid of actual-vs-predicted scatter plots.

        Raises:
            ValueError: if fewer than two claim rows are available to split.
        """
        print("="*50)
        print("CLAIM SEVERITY PREDICTION MODELS")
        print("="*50)

        claim_features = getattr(self, 'claim_features', None)
        claim_target = getattr(self, 'claim_target', None)
        if claim_features is None or claim_target is None or len(claim_features) < 2:
            raise ValueError(
                "Claim severity models need at least two policies with TotalClaims > 0; "
                "run prepare_features() on data that contains claims first."
            )

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            claim_features, claim_target, test_size=0.2, random_state=42
        )

        # Kept only so self.scaler ends up fitted as before; no model below reads it.
        self.scaler.fit(X_train)

        models = {
            # Scaling lives inside the estimator, so predict() accepts raw feature rows.
            'Linear Regression': make_pipeline(StandardScaler(), LinearRegression()),
            'Decision Tree': DecisionTreeRegressor(random_state=42, max_depth=10),
            'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10),
            'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42, max_depth=6)
        }

        severity_results = {}

        for name, model in models.items():
            print(f"\nTraining {name}...")

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            # Calculate metrics on the held-out split
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            r2 = r2_score(y_test, y_pred)

            severity_results[name] = {
                'model': model,
                'rmse': rmse,
                'r2': r2,
                'predictions': y_pred,
                'actual': y_test
            }

            print(f"{name} - RMSE: {rmse:.2f}, R²: {r2:.4f}")

        self.models['claim_severity'] = models
        self.results['claim_severity'] = severity_results

        # Plot actual vs predicted, one panel per model, with the y = x reference line
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        axes = axes.ravel()

        for i, (name, result) in enumerate(severity_results.items()):
            low, high = result['actual'].min(), result['actual'].max()
            axes[i].scatter(result['actual'], result['predictions'], alpha=0.5)
            axes[i].plot([low, high], [low, high], 'r--')
            axes[i].set_xlabel('Actual')
            axes[i].set_ylabel('Predicted')
            axes[i].set_title(f'{name}\nRMSE: {result["rmse"]:.2f}, R²: {result["r2"]:.4f}')

        plt.tight_layout()
        plt.show()
    
    def build_claim_probability_models(self):
        """Train and evaluate classifiers for the probability that a policy has a claim.

        Uses ``self.features`` / ``self.target_claim_probability`` with an 80/20 split
        (``random_state=42``). The split is stratified on the target whenever both
        classes have at least two members, so the claim rate is the same in the
        training and test parts. Each model is scored with accuracy, precision,
        recall, F1 and ROC AUC on the held-out 20%.

        Side effects:
            Stores the fitted estimators in ``self.models['claim_probability']`` and
            the per-model metrics / predictions in
            ``self.results['claim_probability']``; shows one confusion-matrix heatmap
            per model.
        """
        print("="*50)
        print("CLAIM PROBABILITY PREDICTION MODELS")
        print("="*50)

        X = self.features
        y = self.target_claim_probability

        # Stratification needs two classes with >= 2 members each; otherwise fall back
        # to the plain random split instead of letting train_test_split raise.
        class_counts = y.value_counts()
        can_stratify = len(class_counts) > 1 and class_counts.min() >= 2
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42,
            stratify=y if can_stratify else None
        )

        # Kept only so self.scaler ends up fitted as before. The tree-based models
        # below are trained on unscaled features, so no scaled copies are built.
        self.scaler.fit(X_train)

        models = {
            'Decision Tree': DecisionTreeClassifier(random_state=42, max_depth=10),
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10),
            'XGBoost': xgb.XGBClassifier(n_estimators=100, random_state=42, max_depth=6)
        }

        probability_results = {}

        for name, model in models.items():
            print(f"\nTraining {name}...")

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            y_pred_proba = model.predict_proba(X_test)[:, 1]

            # Calculate metrics. zero_division=0 makes the "no positive predicted"
            # case an explicit 0 rather than a value that depends on a hidden warning.
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, zero_division=0)
            recall = recall_score(y_test, y_pred, zero_division=0)
            f1 = f1_score(y_test, y_pred, zero_division=0)
            auc = roc_auc_score(y_test, y_pred_proba)

            probability_results[name] = {
                'model': model,
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'auc': auc,
                'predictions': y_pred,
                'probabilities': y_pred_proba,
                'actual': y_test
            }

            print(f"{name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}")

        self.models['claim_probability'] = models
        self.results['claim_probability'] = probability_results

        # Plot confusion matrices; the grid width follows the number of models
        n_models = len(probability_results)
        fig, axes = plt.subplots(1, n_models, figsize=(5 * n_models, 4), squeeze=False)

        for ax, (name, result) in zip(axes[0], probability_results.items()):
            cm = confusion_matrix(result['actual'], result['predictions'])
            sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap='Blues')
            ax.set_title(f'{name}\nAccuracy: {result["accuracy"]:.4f}')
            ax.set_xlabel('Predicted')
            ax.set_ylabel('Actual')

        plt.tight_layout()
        plt.show()
    
    def build_premium_models(self):
        """Train and evaluate regressors that predict ``TotalPremium``.

        Uses ``self.features`` / ``self.target_premium`` with an 80/20 split
        (``random_state=42``); every model is scored with RMSE and R² on the held-out
        20%.

        The linear model is wrapped in a pipeline with its own ``StandardScaler`` so it
        no longer depends on the shared ``self.scaler``, which every ``build_*`` method
        refits, and can be called on raw feature rows after training.

        NOTE(review): the feature matrix contains ``PremiumToSumInsuredRatio``
        (``TotalPremium / (SumInsured + 1)``) together with ``SumInsured``, which
        determines the target exactly - treat the scores of these models as
        optimistic until that feature is excluded.

        Side effects:
            Stores the fitted estimators in ``self.models['premium']`` and the
            per-model metrics / predictions in ``self.results['premium']``.
        """
        print("="*50)
        print("PREMIUM PREDICTION MODELS")
        print("="*50)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            self.features, self.target_premium, test_size=0.2, random_state=42
        )

        # Kept only so self.scaler ends up fitted as before; no model below reads it.
        self.scaler.fit(X_train)

        models = {
            # Scaling lives inside the estimator, so predict() accepts raw feature rows.
            'Linear Regression': make_pipeline(StandardScaler(), LinearRegression()),
            'Decision Tree': DecisionTreeRegressor(random_state=42, max_depth=10),
            'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10),
            'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42, max_depth=6)
        }

        premium_results = {}

        for name, model in models.items():
            print(f"\nTraining {name}...")

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            # Calculate metrics on the held-out split
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            r2 = r2_score(y_test, y_pred)

            premium_results[name] = {
                'model': model,
                'rmse': rmse,
                'r2': r2,
                'predictions': y_pred,
                'actual': y_test
            }

            print(f"{name} - RMSE: {rmse:.2f}, R²: {r2:.4f}")

        self.models['premium'] = models
        self.results['premium'] = premium_results
    
    def model_interpretability(self, model_type='claim_severity'):
        """Use SHAP for model interpretability"""
        print("="*50)
        print(f"MODEL INTERPRETABILITY - {model_type.upper()}")
        print("="*50)
        
        if model_type not in self.results:
            print(f"No results found for {model_type}")
            return
        
        # Get the best model (highest R² for regression, highest F1 for classification)
        if model_type in ['claim_severity', 'premium']:
            best_model_name = max(self.results[model_type].keys(), 
                                key=lambda x: self.results[model_type][x]['r2'])
            best_model = self.models[model_type][best_model_name]
            print(f"Analyzing best model: {best_model_name}")
            
            # Use appropriate dataset
            if model_type == 'claim_severity':
                X = self.claim_features
            else:
                X = self.features
            
        else:  # claim_probability
            best_model_name = max(self.results[model_type].keys(), 
                                key=lambda x: self.results[model_type][x]['f1'])
            best_model = self.models[model_type][best_model_name]
            X = self.features
        
        # SHAP analysis
        try:
            if best_model_name in ['Random Forest', 'XGBoost']:
                # For tree-based models
                explainer = shap.TreeExplainer(best_model)
                shap_values = explainer.shap_values(X.iloc[:1000])  # Sample for performance
                
                # Summary plot
                plt.figure(figsize=(10, 6))
                shap.summary_plot(shap_values, X.iloc[:1000], show=False)
                plt.title(f'SHAP Summary Plot - {best_model_name}')
                plt.tight_layout()
                plt.show()
                
                # Feature importance
                feature_importance = pd.DataFrame({
                    'feature': X.columns,
                    'importance': np.abs(shap_values).mean(0)
                }).sort_values('importance', ascending=False)
                
                print(f"\nTop 10 Most Important Features ({best_model_name}):")
                print(feature_importance.head(10))
                
                # Plot feature importance
                plt.figure(figsize=(10, 6))
                plt.barh(range(10), feature_importance.head(10)['importance'])
                plt.yticks(range(10), feature_importance.head(10)['feature'])
                plt.xlabel('Mean |SHAP value|')
                plt.title(f'Top 10 Feature Importance - {best_model_name}')
                plt.gca().invert_yaxis()
                plt.tight_layout()
                plt.show()
                
        except Exception as e:
            print(f"SHAP analysis failed: {e}")
            
            # Fallback to feature importance for tree-based models
            if hasattr(best_model, 'feature_importances_'):
                feature_importance = pd.DataFrame({
                    'feature': X.columns,
                    'importance': best_model.feature_importances_
                }).sort_values('importance', ascending=False)
                
                print(f"\nTop 10 Most Important Features ({best_model_name}):")
                print(feature_importance.head(10))
                
                # Plot feature importance
                plt.figure(figsize=(10, 6))
                plt.barh(range(10), feature_importance.head(10)['importance'])
                plt.yticks(range(10), feature_importance.head(10)['feature'])
                plt.xlabel('Feature Importance')
                plt.title(f'Top 10 Feature Importance - {best_model_name}')
                plt.gca().invert_yaxis()
                plt.tight_layout()
                plt.show()
    
    def generate_risk_based_premium(self, sample_size=1000, expense_loading=0.15, profit_margin=0.10):
        """Price a sample of policies from the best probability and severity models.

        Formula:
            Risk Premium = Predicted Probability of Claim * Predicted Claim Severity
            Premium      = Risk Premium * (1 + expense_loading + profit_margin)

        Both loadings are proportions of the risk premium, not absolute amounts.

        The best probability model is the one with the highest F1 and the best
        severity model the one with the highest R² in ``self.results``.

        Args:
            sample_size: maximum number of policies drawn from ``self.features``
                (``random_state=42``).
            expense_loading: expense loading as a fraction of the risk premium.
            profit_margin: profit margin as a fraction of the risk premium.

        Returns:
            pandas.DataFrame: one row per sampled policy with the predicted
            probability, severity, risk premium, loaded premium, actual
            ``TotalPremium`` and their difference.

        Raises:
            KeyError: if the claim probability / severity models have not been built.

        NOTE(review): the sample is drawn from all of ``self.features`` and therefore
        includes rows the models were trained on, so the comparison is not an
        out-of-sample evaluation.
        """
        print("="*50)
        print("RISK-BASED PREMIUM CALCULATION")
        print("="*50)
        
        # Get best models
        best_prob_model = max(self.results['claim_probability'].keys(), 
                            key=lambda x: self.results['claim_probability'][x]['f1'])
        best_severity_model = max(self.results['claim_severity'].keys(), 
                                key=lambda x: self.results['claim_severity'][x]['r2'])
        
        prob_model = self.models['claim_probability'][best_prob_model]
        severity_model = self.models['claim_severity'][best_severity_model]
        
        # Sample data for prediction
        sample_data = self.features.sample(n=min(sample_size, len(self.features)), random_state=42)
        
        # Predict claim probability (column 1 = probability of the positive class)
        claim_prob = prob_model.predict_proba(sample_data)[:, 1]
        
        # Predict claim severity. Severity is an amount on policies that did claim, so
        # a negative prediction is floored at 0 to keep the premium non-negative.
        claim_severity = np.clip(severity_model.predict(sample_data), 0, None)
        
        # Calculate risk-based premium
        risk_premium = claim_prob * claim_severity
        total_premium = risk_premium * (1 + expense_loading + profit_margin)
        
        # Compare with actual premiums of the same policies
        actual_premium = self.target_premium.loc[sample_data.index]
        
        results_df = pd.DataFrame({
            'ClaimProbability': claim_prob,
            'ClaimSeverity': claim_severity,
            'RiskPremium': risk_premium,
            'TotalPremium_Predicted': total_premium,
            'TotalPremium_Actual': actual_premium,
            'Difference': total_premium - actual_premium
        })
        
        print(f"Risk-Based Premium Analysis (Sample of {len(results_df)} policies):")
        print(f"Average Claim Probability: {claim_prob.mean():.4f}")
        print(f"Average Claim Severity: {claim_severity.mean():.2f}")
        print(f"Average Risk Premium: {risk_premium.mean():.2f}")
        print(f"Average Total Premium (Predicted): {total_premium.mean():.2f}")
        print(f"Average Total Premium (Actual): {actual_premium.mean():.2f}")
        print(f"RMSE vs Actual Premium: {np.sqrt(mean_squared_error(actual_premium, total_premium)):.2f}")
        
        # Plot comparison
        plt.figure(figsize=(12, 8))
        
        plt.subplot(2, 2, 1)
        plt.scatter(actual_premium, total_premium, alpha=0.5)
        plt.plot([actual_premium.min(), actual_premium.max()], 
                [actual_premium.min(), actual_premium.max()], 'r--')
        plt.xlabel('Actual Premium')
        plt.ylabel('Predicted Premium')
        plt.title('Actual vs Risk-Based Premium')
        
        plt.subplot(2, 2, 2)
        plt.hist(results_df['Difference'], bins=30, alpha=0.7)
        plt.xlabel('Difference (Predicted - Actual)')
        plt.title('Premium Difference Distribution')
        
        plt.subplot(2, 2, 3)
        plt.scatter(claim_prob, total_premium, alpha=0.5)
        plt.xlabel('Claim Probability')
        plt.ylabel('Total Premium')
        plt.title('Claim Probability vs Premium')
        
        plt.subplot(2, 2, 4)
        plt.scatter(claim_severity, total_premium, alpha=0.5)
        plt.xlabel('Claim Severity')
        plt.ylabel('Total Premium')
        plt.title('Claim Severity vs Premium')
        
        plt.tight_layout()
        plt.show()
        
        return results_df
    
    def generate_model_comparison_report(self):
        """Generate comprehensive model comparison report"""
        print("="*50)
        print("COMPREHENSIVE MODEL COMPARISON REPORT")
        print("="*50)
        
        # Claim Severity Models
        print("\n1. CLAIM SEVERITY PREDICTION MODELS")
        print("-" * 40)
        severity_df = pd.DataFrame({
            'Model': list(self.results['claim_severity'].keys()),
            'RMSE': [self.results['claim_severity'][model]['rmse'] for model in self.results['claim_severity'].keys()],
            'R²': [self.results['claim_severity'][model]['r2'] for model in self.results['claim_severity'].keys()]
        }).sort_values('R²', ascending=False)
        print(severity_df.to_string(index=False))
        
        # Claim Probability Models
        print("\n2. CLAIM PROBABILITY PREDICTION MODELS")
        print("-" * 40)
        prob_df = pd.DataFrame({
            'Model': list(self.results['claim_probability'].keys()),
            'Accuracy': [self.results['claim_probability'][model]['accuracy'] for model in self.results['claim_probability'].keys()],
            'Precision': [self.results['claim_probability'][model]['precision'] for model in self.results['claim_probability'].keys()],
            'Recall': [self.results['claim_probability'][model]['recall'] for model in self.results['claim_probability'].keys()],
            'F1-Score': [self.results['claim_probability'][model]['f1'] for model in self.results['claim_probability'].keys()],
            'AUC': [self.results['claim_probability'][model]['auc'] for model in self.results['claim_probability'].keys()]
        }).sort_values('F1-Score', ascending=False)
        print(prob_df.to_string(index=False))
        
        # Premium Models
        print("\n3. PREMIUM PREDICTION MODELS")
        print("-" * 40)
        premium_df = pd.DataFrame({
            'Model': list(self.results['premium'].keys()),
            'RMSE': [self.results['premium'][model]['rmse'] for model in self.results['premium'].keys()],
            'R²': [self.results['premium'][model]['r2'] for model in self.results['premium'].keys()]
        }).sort_values('R²', ascending=False)
        print(premium_df.to_string(index=False))
        
        # Best Models Summary
        print("\n4. BEST PERFORMING MODELS")
        print("-" * 40)
        best_severity = severity_df.iloc[0]['Model']
        best_probability = prob_df.iloc[0]['Model']
        best_premium = premium_df.iloc[0]['Model']
        
        print(f"Best Claim Severity Model: {best_severity} (R² = {severity_df.iloc[0]['R²']:.4f})")
        print(f"Best Claim Probability Model: {best_probability} (F1 = {prob_df.iloc[0]['F1-Score']:.4f})")
        print(f"Best Premium Model: {best_premium} (R² = {premium_df.iloc[0]['R²']:.4f})")
        
        return {
            'claim_severity': severity_df,
            'claim_probability': prob_df,
            'premium': premium_df
        }
    
    def run_full_pipeline(self, file_path):
        """Run the complete ML pipeline"""
        print("Starting Insurance Risk-Based Pricing ML Pipeline...")
        
        # Load and explore data
        if not self.load_data(file_path):
            return
        
        self.explore_data()
        
        # Feature engineering and preparation
        self.feature_engineering()
        self.prepare_features()
        
        # Build models
        self.build_claim_severity_models()
        self.build_claim_probability_models()
        self.build_premium_models()
        
        # Model interpretability
        self.model_interpretability('claim_severity')
        self.model_interpretability('claim_probability')
        
        # Generate risk-based premium
        risk_premium_results = self.generate_risk_based_premium()
        
        # Generate comprehensive report
        model_comparison = self.generate_model_comparison_report()
        
        print("\n" + "="*50)
        print("PIPELINE COMPLETED SUCCESSFULLY!")
        print("="*50)
        
        return {
            'risk_premium_results': risk_premium_results,
            'model_comparison': model_comparison,
            'models': self.models,
            'results': self.results
        }


if __name__ == "__main__":
    # Pipe-delimited policy/claims extract, relative to the notebook's working directory.
    DATA_PATH = '../data/cleaned/MachineLearningRating_v3.txt'

    # Initialize the pipeline
    pipeline = InsurancePricingML()

    # Run the complete pipeline
    results = pipeline.run_full_pipeline(DATA_PATH)

    # run_full_pipeline returns None when the file could not be loaded, so only
    # announce completion when there are results to look at.
    if results is None:
        print("\nPipeline aborted: the data file could not be loaded.")
    else:
        print("\nPipeline completed! Check the results above for detailed analysis.")

Starting Insurance Risk-Based Pricing ML Pipeline...
Data loaded successfully: (1000098, 46)
Columns: ['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth', 'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language', 'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province', 'PostalCode', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode', 'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders', 'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors', 'VehicleIntroDate', 'AlarmImmobiliser', 'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm', 'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section', 'Product', 'StatutoryClass', 'StatutoryRiskType', 'TotalPremium', 'TotalClaims']

First few rows:
   UnderwrittenCoverID  PolicyID     TransactionMonth  IsVATRegistered  \
0               145249     12827  2015-03-01 00:00:00             True   
1               145249     12827  