In [1]:
## Setup
import sys
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dvc.api

# Treat the parent of the current working directory as the project root and
# make it importable, so the `src` package can be resolved from this notebook.
# The path is appended only when it is not already listed, which keeps the
# cell safe to re-run.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Import the modeling pipeline functions from the src.modeling module
from src.modeling import (prepare_data, build_models, evaluate_models, interpret_model)

# Set plot style: select the 'seaborn-v0_8' Matplotlib style sheet for the plots in this notebook.
# NOTE(review): this style name is presumably only available in newer Matplotlib releases —
# confirm against the pinned Matplotlib version.
plt.style.use('seaborn-v0_8')

# Data locations, both relative to the notebook's working directory:
# the cleaned CSV and the `.dvc` file that sits next to it. Both values are
# handed to prepare_data() in the data-preparation cell.
cleaned_data_path, dvc_file = (
    '../data/cleaned_data.csv',
    '../data/cleaned_data.csv.dvc',
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## Data Preparation
# prepare_data() returns eight objects: a train/test split of features and
# target for the claim-severity task, followed by the same four pieces for the
# claim-probability task.
# NOTE(review): the saved output of this cell ends in
# "ValueError: No valid data left for severity modeling after cleaning." —
# every later cell depends on the names bound here.
print("Starting data preparation...")
(
    X_train_sev, X_test_sev, y_train_sev, y_test_sev,      # severity split
    X_train_prob, X_test_prob, y_train_prob, y_test_prob,  # probability split
) = prepare_data(cleaned_data_path, dvc_file)
print("Data preparation completed.")


Starting data preparation...


  df['ClaimSeverity'] = df['ClaimSeverity'].fillna(0)


ValueError: No valid data left for severity modeling after cleaning.

In [None]:
## Model Building
# Fit the severity and probability model collections on their training splits.
# Both return values are indexed by model name further down the notebook.
# NOTE(review): the saved output of this cell shows LinearRegression rejecting
# NaN values in X — presumably the features need imputing or dropping upstream;
# confirm in src.modeling.
print("Building models...")
models_sev, models_prob = build_models(
    X_train_sev,
    y_train_sev,
    X_train_prob,
    y_train_prob,
)
print("Model building completed.")



Building models...


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
## Model Evaluation
# Score both model collections on their held-out test splits. The two results
# are iterated with .items() and indexed by metric name in the cells below.
print("Evaluating models...")
results_sev, results_prob = evaluate_models(
    models_sev, models_prob,
    X_test_sev, y_test_sev,
    X_test_prob, y_test_prob,
)



Evaluating models...


NameError: name 'models_sev' is not defined

In [None]:
# Report the evaluation metrics, one line per model.

# Severity models: RMSE (2 decimals) and R² (4 decimals).
print("\n=== Claim Severity Model Results ===")
for model_name, metrics in results_sev.items():
    rmse, r2 = metrics['RMSE'], metrics['R²']
    print(f"{model_name}: RMSE = {rmse:.2f}, R² = {r2:.4f}")

# Probability models: accuracy, precision, recall and F1 (4 decimals each).
print("\n=== Claim Probability Model Results ===")
for model_name, metrics in results_prob.items():
    accuracy, precision = metrics['Accuracy'], metrics['Precision']
    recall, f1 = metrics['Recall'], metrics['F1']
    print(
        f"{model_name}: Accuracy = {accuracy:.4f}, Precision = {precision:.4f}, "
        f"Recall = {recall:.4f}, F1 = {f1:.4f}"
    )



In [None]:
# Comparison: pick the winning model of each task from the evaluation results.
def _best_model_name(results, metric):
    """Return the model name whose metrics hold the largest value for `metric`.

    Parameters:
        results: mapping of model name -> mapping of metric name -> score.
        metric: key of the score to maximise (e.g. 'R²' or 'F1').

    Returns:
        The name of the best-scoring model; on ties the first one in
        iteration order wins (standard `max` behaviour).

    Raises:
        ValueError: if `results` is empty, with a message naming the metric
            instead of the built-in "max() arg is an empty sequence".
        KeyError: if a model's metrics do not contain `metric`.
    """
    # len() rather than truthiness so that any sized, dict-like container works.
    if len(results) == 0:
        raise ValueError(
            f"Cannot select the best model by {metric}: no evaluation results available."
        )
    return max(results, key=lambda name: results[name][metric])


best_sev_model = _best_model_name(results_sev, 'R²')   # higher R² is better
best_prob_model = _best_model_name(results_prob, 'F1')  # higher F1 is better
print("\n=== Model Comparison ===")
print(f"Best Claim Severity Model (by R²): {best_sev_model}")
print(f"Best Claim Probability Model (by F1): {best_prob_model}")



In [None]:
## Model Interpretability
# Run interpret_model() on the best model of each task, severity first. Each
# lookup happens immediately before its own call, so a failure on the
# probability side cannot prevent the severity interpretation from running.
print("Performing model interpretability...")

# Severity: look up the winner selected by R² and interpret it.
best_sev = models_sev[best_sev_model]
interpret_model(
    best_sev,
    X_train_sev,
    X_test_sev,
    'severity',
)

# Probability: look up the winner selected by F1 and interpret it.
best_prob = models_prob[best_prob_model]
interpret_model(
    best_prob,
    X_train_prob,
    X_test_prob,
    'probability',
)