In [3]:
import os
import sys

# Step 1: Get the parent directory of your current working directory
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Step 2: Add project root to sys.path if not already present
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Step 3: Confirm
print("Project root added to sys.path:", project_root in sys.path)
print("src.data_loader readable:", os.path.exists(os.path.join(project_root, "src", "data_loader.py")))



Project root added to sys.path: True
src.data_loader readable: True


In [4]:
from src.analysis.hypothesis_testing import (
    load_data,
    calculate_risk_metrics,
    run_all_hypothesis_tests,
    interpret_results
)

In [5]:
# Cell 1: Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import shap
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

In [7]:
# Cell 2: Data Preparation
print("Preparing data for modeling...")

# Load data
df = load_data("../data/MachineLearningRating_v3.txt")
print("Data loaded successfully!")

# Print the first few rows to understand the structure
print("\nFirst few rows of raw data:")
print(df.head())

# Print column names
print("\nColumn names:")
print(df.columns.tolist())

# Print data types
print("\nData types:")
print(df.dtypes)

# Print basic information about the dataset
print("\nDataset information:")
print(df.info())

Preparing data for modeling...
Data loaded successfully!

First few rows of raw data:
   UnderwrittenCoverID  PolicyID     TransactionMonth  IsVATRegistered  \
0               145249     12827  2015-03-01 00:00:00             True   
1               145249     12827  2015-05-01 00:00:00             True   
2               145249     12827  2015-07-01 00:00:00             True   
3               145255     12827  2015-05-01 00:00:00             True   
4               145255     12827  2015-07-01 00:00:00             True   

  Citizenship          LegalType Title Language                 Bank  \
0              Close Corporation    Mr  English  First National Bank   
1              Close Corporation    Mr  English  First National Bank   
2              Close Corporation    Mr  English  First National Bank   
3              Close Corporation    Mr  English  First National Bank   
4              Close Corporation    Mr  English  First National Bank   

       AccountType  ...             

In [8]:
# Cell 2: Data Preparation
print("Preparing data for modeling...")

# Load data
df = load_data("../data/MachineLearningRating_v3.txt")
print("Data loaded successfully!")

# Convert TransactionMonth to datetime
df['TransactionMonth'] = pd.to_datetime(df['TransactionMonth'])

# Define categorical and numeric features
categorical_features = ['Province', 'PostalCode', 'Gender', 'VehicleType', 'make', 
                       'Model', 'bodytype', 'CoverCategory', 'CoverType', 'CoverGroup',
                       'LegalType', 'Country', 'MainCrestaZone', 'SubCrestaZone']

numeric_features = ['mmcode', 'Cylinders', 'cubiccapacity', 'kilowatts', 
                   'NumberOfDoors', 'CustomValueEstimate', 'SumInsured',
                   'RegistrationYear', 'CalculatedPremiumPerTerm']

# Create target variables
df['has_claim'] = df['TotalClaims'] > 0
df['claim_severity'] = df.apply(lambda x: x['TotalClaims'] if x['has_claim'] else 0, axis=1)

# Handle missing values
for col in numeric_features:
    df[col] = df[col].fillna(df[col].median())

for col in categorical_features:
    df[col] = df[col].fillna(df[col].mode()[0])

print("\nData preparation completed!")
print("\nSample of prepared data:")
print(df[categorical_features + numeric_features + ['TotalClaims', 'has_claim', 'claim_severity']].head())

# Print basic statistics
print("\nBasic statistics of target variables:")
print("\nClaim Frequency:")
print(df['has_claim'].value_counts(normalize=True))
print("\nClaim Severity (for claims > 0):")
print(df[df['has_claim']]['claim_severity'].describe())

Preparing data for modeling...
Data loaded successfully!

Data preparation completed!

Sample of prepared data:
  Province  PostalCode         Gender        VehicleType           make  \
0  Gauteng        1459  Not specified  Passenger Vehicle  MERCEDES-BENZ   
1  Gauteng        1459  Not specified  Passenger Vehicle  MERCEDES-BENZ   
2  Gauteng        1459  Not specified  Passenger Vehicle  MERCEDES-BENZ   
3  Gauteng        1459  Not specified  Passenger Vehicle  MERCEDES-BENZ   
4  Gauteng        1459  Not specified  Passenger Vehicle  MERCEDES-BENZ   

   Model bodytype CoverCategory   CoverType            CoverGroup  ...  \
0  E 240      S/D    Windscreen  Windscreen  Comprehensive - Taxi  ...   
1  E 240      S/D    Windscreen  Windscreen  Comprehensive - Taxi  ...   
2  E 240      S/D    Windscreen  Windscreen  Comprehensive - Taxi  ...   
3  E 240      S/D    Own damage  Own Damage  Comprehensive - Taxi  ...   
4  E 240      S/D    Own damage  Own Damage  Comprehensive - Taxi  

In [9]:
# Cell 3: Feature Engineering
print("Performing feature engineering...")

# Create feature engineering pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Prepare data for modeling
X = df[categorical_features + numeric_features]
y_severity = df[df['has_claim']]['claim_severity']
y_probability = df['has_claim']

# Split data for severity model
X_severity = X[df['has_claim']]
X_train_sev, X_test_sev, y_train_sev, y_test_sev = train_test_split(
    X_severity, y_severity, test_size=0.2, random_state=42
)

# Split data for probability model
X_train_prob, X_test_prob, y_train_prob, y_test_prob = train_test_split(
    X, y_probability, test_size=0.2, random_state=42
)

print("\nFeature engineering completed!")
print(f"Training set size for severity model: {X_train_sev.shape}")
print(f"Training set size for probability model: {X_train_prob.shape}")

Performing feature engineering...

Feature engineering completed!
Training set size for severity model: (2230, 23)
Training set size for probability model: (800078, 23)


In [10]:
# Cell 4: Model Building - Claim Severity
print("Building claim severity models...")

# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate severity models
severity_results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Create pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    # Train model
    pipeline.fit(X_train_sev, y_train_sev)
    
    # Make predictions
    y_pred = pipeline.predict(X_test_sev)
    
    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_test_sev, y_pred))
    r2 = r2_score(y_test_sev, y_pred)
    
    severity_results[name] = {
        'RMSE': rmse,
        'R2': r2,
        'model': pipeline
    }
    
    print(f"RMSE: {rmse:,.2f}")
    print(f"R2 Score: {r2:.4f}")

# Display results
print("\nSeverity Model Comparison:")
results_df = pd.DataFrame(severity_results).T
print(results_df[['RMSE', 'R2']])

Building claim severity models...

Training Linear Regression...
RMSE: 38,548.92
R2 Score: 0.0760

Training Random Forest...
RMSE: 40,110.15
R2 Score: -0.0004

Training XGBoost...
RMSE: 40,334.05
R2 Score: -0.0116

Severity Model Comparison:
                           RMSE        R2
Linear Regression  38548.921672  0.076001
Random Forest      40110.152733 -0.000358
XGBoost            40334.052065 -0.011558


In [None]:
# Cell 5: Model Building - Claim Probability
print("Building claim probability models...")

# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate probability models
probability_results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Create pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    # Train model
    pipeline.fit(X_train_prob, y_train_prob)
    
    # Make predictions
    y_pred = pipeline.predict(X_test_prob)
    
    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_test_prob, y_pred))
    r2 = r2_score(y_test_prob, y_pred)
    
    probability_results[name] = {
        'RMSE': rmse,
        'R2': r2,
        'model': pipeline
    }
    
    print(f"RMSE: {rmse:,.2f}")
    print(f"R2 Score: {r2:.4f}")

# Display results
print("\nProbability Model Comparison:")
results_df = pd.DataFrame(probability_results).T
print(results_df[['RMSE', 'R2']])

Building claim probability models...

Training Linear Regression...
RMSE: 0.05
R2 Score: 0.0083

Training Random Forest...


In [None]:
# Cell 6: Feature Importance Analysis
print("Analyzing feature importance...")

# Get the best model for each task
best_severity_model = severity_results[max(severity_results, key=lambda x: severity_results[x]['R2'])]['model']
best_probability_model = probability_results[max(probability_results, key=lambda x: probability_results[x]['R2'])]['model']

# Get feature names after preprocessing
feature_names = (numeric_features + 
                best_severity_model.named_steps['preprocessor']
                .named_transformers_['cat']
                .named_steps['onehot']
                .get_feature_names_out(categorical_features))

# Calculate SHAP values for severity model
print("\nCalculating SHAP values for severity model...")
explainer_severity = shap.TreeExplainer(best_severity_model.named_steps['model'])
shap_values_severity = explainer_severity.shap_values(X_test_sev)

# Plot SHAP summary for severity
plt.figure(figsize=(10, 6))
shap.summary_plot(shap_values_severity, X_test_sev, feature_names=feature_names)
plt.title('Feature Importance - Claim Severity')
plt.show()

# Calculate SHAP values for probability model
print("\nCalculating SHAP values for probability model...")
explainer_probability = shap.TreeExplainer(best_probability_model.named_steps['model'])
shap_values_probability = explainer_probability.shap_values(X_test_prob)

# Plot SHAP summary for probability
plt.figure(figsize=(10, 6))
shap.summary_plot(shap_values_probability, X_test_prob, feature_names=feature_names)
plt.title('Feature Importance - Claim Probability')
plt.show()

In [None]:
# Cell 7: Premium Optimization
print("Calculating optimized premiums...")

# Get predictions from best models
severity_pred = best_severity_model.predict(X)
probability_pred = best_probability_model.predict(X)

# Calculate risk-based premium
expense_loading = 0.1  # 10% expense loading
profit_margin = 0.15   # 15% profit margin

df['optimized_premium'] = (probability_pred * severity_pred) * (1 + expense_loading + profit_margin)

# Compare with actual premium
print("\nPremium Comparison:")
print(df[['TotalPremium', 'optimized_premium']].describe())

# Visualize premium distribution
plt.figure(figsize=(10, 6))
plt.hist(df['TotalPremium'], bins=50, alpha=0.5, label='Actual Premium')
plt.hist(df['optimized_premium'], bins=50, alpha=0.5, label='Optimized Premium')
plt.title('Distribution of Actual vs Optimized Premiums')
plt.xlabel('Premium Amount')
plt.ylabel('Frequency')
plt.legend()
plt.show()

In [None]:
# Cell 8: Summary and Recommendations
print("Summary and Recommendations\n")

# Model Performance Summary
print("1. Model Performance:")
print("\nClaim Severity Models:")
print(results_df[['RMSE', 'R2']])

print("\nClaim Probability Models:")
print(results_df[['RMSE', 'R2']])

# Feature Importance Summary
print("\n2. Key Risk Factors:")
print("Based on SHAP analysis, the most important features for predicting claims are:")
# Add top features from SHAP analysis

# Premium Optimization Summary
print("\n3. Premium Optimization:")
print("The optimized premium model suggests:")
print(f"Average Actual Premium: ${df['TotalPremium'].mean():,.2f}")
print(f"Average Optimized Premium: ${df['optimized_premium'].mean():,.2f}")
print(f"Premium Difference: ${df['optimized_premium'].mean() - df['TotalPremium'].mean():,.2f}")

# Business Recommendations
print("\n4. Business Recommendations:")
print("1. Implement risk-based pricing using the optimized premium model")
print("2. Focus on high-risk factors identified in the SHAP analysis")
print("3. Consider adjusting pricing based on claim probability and severity predictions")
print("4. Monitor model performance and update regularly with new data")