In [None]:
# Severity baseline: the severity preprocessor feeding an ordinary least-squares regressor.
lr_severity_steps = [
    ('preprocessor', preprocessor_severity),
    ('regressor', LinearRegression()),
]
pipeline_lr_severity = Pipeline(steps=lr_severity_steps)
# Kept as the final expression so the cell displays the fitted pipeline.
pipeline_lr_severity.fit(X_train_severity, y_train_severity)

In [None]:
# Severity model: the severity preprocessor feeding a random forest regressor.
# random_state pins the forest's randomness; n_jobs=-1 asks for all available cores.
rf_severity_regressor = RandomForestRegressor(random_state=42, n_jobs=-1)
pipeline_rf_severity = Pipeline(
    steps=[
        ('preprocessor', preprocessor_severity),
        ('regressor', rf_severity_regressor),
    ]
)
# Kept as the final expression so the cell displays the fitted pipeline.
pipeline_rf_severity.fit(X_train_severity, y_train_severity)

In [None]:
# Severity model: the severity preprocessor feeding an XGBoost regressor
# trained with the squared-error objective.
xgb_severity_regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1,
)
pipeline_xgb_severity = Pipeline(
    steps=[
        ('preprocessor', preprocessor_severity),
        ('regressor', xgb_severity_regressor),
    ]
)
# Kept as the final expression so the cell displays the fitted pipeline.
pipeline_xgb_severity.fit(X_train_severity, y_train_severity)

In [None]:
# Claim-probability baseline: the probability preprocessor feeding a logistic
# regression fitted with the liblinear solver.
lr_prob_classifier = LogisticRegression(random_state=42, solver='liblinear')
pipeline_lr_prob = Pipeline(
    steps=[
        ('preprocessor', preprocessor_prob),
        ('classifier', lr_prob_classifier),
    ]
)
# Kept as the final expression so the cell displays the fitted pipeline.
pipeline_lr_prob.fit(X_train_prob, y_train_prob)

In [None]:
# Claim-probability model: the probability preprocessor feeding a random forest classifier.
# random_state pins the forest's randomness; n_jobs=-1 asks for all available cores.
rf_prob_classifier = RandomForestClassifier(random_state=42, n_jobs=-1)
pipeline_rf_prob = Pipeline(
    steps=[
        ('preprocessor', preprocessor_prob),
        ('classifier', rf_prob_classifier),
    ]
)
# Kept as the final expression so the cell displays the fitted pipeline.
pipeline_rf_prob.fit(X_train_prob, y_train_prob)

In [None]:
# Claim-probability model: the probability preprocessor feeding an XGBoost classifier
# with a binary-logistic objective, evaluated internally with log loss.
xgb_prob_classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
)
pipeline_xgb_prob = Pipeline(
    steps=[
        ('preprocessor', preprocessor_prob),
        ('classifier', xgb_prob_classifier),
    ]
)
# Kept as the final expression so the cell displays the fitted pipeline.
pipeline_xgb_prob.fit(X_train_prob, y_train_prob)

In [None]:
# Fitted severity pipelines, keyed by the label used when reporting scores.
models_severity = {
    "Linear Regression": pipeline_lr_severity,
    "Random Forest Regressor": pipeline_rf_severity,
    "XGBoost Regressor": pipeline_xgb_severity,
}

# Score each pipeline on X_test_severity / y_test_severity and keep RMSE and R2
# per model so the results can be compared afterwards.
results_severity = {}
for name, model in models_severity.items():
    y_pred = model.predict(X_test_severity)

    r2 = r2_score(y_test_severity, y_pred)
    mse = mean_squared_error(y_test_severity, y_pred)
    rmse = np.sqrt(mse)  # report the error in the same units as the target

    model_scores = {'RMSE': rmse, 'R2': r2}
    results_severity[name] = model_scores
    print(f"{name} - RMSE: {rmse:.2f}, R2: {r2:.2f}")

# Tip: results_severity can be turned into a comparison table with
# pd.DataFrame(results_severity).T

In [None]:
# Fitted claim-probability pipelines, keyed by the label used when reporting scores.
models_prob = {
    "Logistic Regression": pipeline_lr_prob,
    "Random Forest Classifier": pipeline_rf_prob,
    "XGBoost Classifier": pipeline_xgb_prob,
}

# Score each classifier on X_test_prob / y_test_prob. The threshold-based metrics
# use the hard predictions; ROC-AUC uses the predicted probability of class 1.
results_prob = {}
for name, model in models_prob.items():
    y_pred = model.predict(X_test_prob)
    class_probabilities = model.predict_proba(X_test_prob)
    y_pred_proba = class_probabilities[:, 1]  # column 1 = positive class

    accuracy = accuracy_score(y_test_prob, y_pred)
    precision = precision_score(y_test_prob, y_pred)
    recall = recall_score(y_test_prob, y_pred)
    f1 = f1_score(y_test_prob, y_pred)
    roc_auc = roc_auc_score(y_test_prob, y_pred_proba)

    model_scores = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc,
    }
    results_prob[name] = model_scores
    print(f"\n{name} - Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}, ROC-AUC: {roc_auc:.2f}")

# Tip: pd.DataFrame(results_prob).T gives a comparison table.

In [None]:
# --- For the best severity model (e.g., XGBoost Regressor) ---
# Look the fitted steps up once instead of repeating the dictionary/step access.
severity_pipeline = models_severity["XGBoost Regressor"]
severity_preprocessor = severity_pipeline.named_steps['preprocessor']
best_severity_model = severity_pipeline.named_steps['regressor']

# The preprocessor was already fitted when the pipeline was trained, so only
# transform() is used here. Calling fit_transform() again would refit, in place,
# the same preprocessor object that the other severity pipelines also hold.
X_train_severity_transformed = severity_preprocessor.transform(X_train_severity)
X_test_severity_transformed = severity_preprocessor.transform(X_test_severity)

# The preprocessor (a ColumnTransformer, judging by `named_transformers_`) may
# return a SciPy sparse matrix when its one-hot output is sparse; densify so
# pd.DataFrame receives a plain 2-D array. Dense outputs are left untouched.
if hasattr(X_train_severity_transformed, "toarray"):
    X_train_severity_transformed = X_train_severity_transformed.toarray()
if hasattr(X_test_severity_transformed, "toarray"):
    X_test_severity_transformed = X_test_severity_transformed.toarray()

# Get feature names after one-hot encoding.
# NOTE(review): assumes the numerical columns come first in the preprocessor's
# output, followed only by the 'cat' block (no remainder columns) — confirm
# against the preprocessor definition.
ohe_feature_names = severity_preprocessor.named_transformers_['cat'].get_feature_names_out(severity_categorical_features)
# list(...) keeps this working whether the numerical names are a list, tuple or Index.
all_feature_names_severity = list(severity_numerical_features) + ohe_feature_names.tolist()

# Create DataFrames for the transformed data with proper column names so the
# SHAP plots are labelled with feature names rather than column positions.
X_train_severity_transformed_df = pd.DataFrame(X_train_severity_transformed, columns=all_feature_names_severity)
X_test_severity_transformed_df = pd.DataFrame(X_test_severity_transformed, columns=all_feature_names_severity)


explainer_severity = shap.TreeExplainer(best_severity_model)
shap_values_severity = explainer_severity.shap_values(X_test_severity_transformed_df)

# Plot global feature importances (SHAP summary plot).
# show=False keeps the figure open so the title can be set and the file saved.
shap.summary_plot(shap_values_severity, X_test_severity_transformed_df, plot_type="bar", show=False)
plt.title("SHAP Feature Importance for Claim Severity Prediction")
plt.tight_layout()
plt.savefig("../notebook/severity_shap_summary_bar.png") # Save plot
plt.show()

# Default summary plot: per-sample SHAP values for each feature.
shap.summary_plot(shap_values_severity, X_test_severity_transformed_df, show=False) # For individual impacts
plt.title("SHAP Feature Impacts for Claim Severity Prediction")
plt.tight_layout()
plt.savefig("../notebook/severity_shap_summary_dot.png") # Save plot
plt.show()

# --- Repeat for the best probability model (e.g., XGBoost Classifier) ---
# Look the fitted steps up once instead of repeating the dictionary/step access.
prob_pipeline = models_prob["XGBoost Classifier"]
prob_preprocessor = prob_pipeline.named_steps['preprocessor']
best_prob_model = prob_pipeline.named_steps['classifier']

# The preprocessor was already fitted when the pipeline was trained, so only
# transform() is used here. Calling fit_transform() again would refit, in place,
# the same preprocessor object that the other probability pipelines also hold.
X_train_prob_transformed = prob_preprocessor.transform(X_train_prob)
X_test_prob_transformed = prob_preprocessor.transform(X_test_prob)

# The preprocessor (a ColumnTransformer, judging by `named_transformers_`) may
# return a SciPy sparse matrix when its one-hot output is sparse; densify so
# pd.DataFrame receives a plain 2-D array. Dense outputs are left untouched.
if hasattr(X_train_prob_transformed, "toarray"):
    X_train_prob_transformed = X_train_prob_transformed.toarray()
if hasattr(X_test_prob_transformed, "toarray"):
    X_test_prob_transformed = X_test_prob_transformed.toarray()

# NOTE(review): assumes the numerical columns come first in the preprocessor's
# output, followed only by the 'cat' block (no remainder columns) — confirm
# against the preprocessor definition.
ohe_feature_names_prob = prob_preprocessor.named_transformers_['cat'].get_feature_names_out(prob_categorical_features)
# list(...) keeps this working whether the numerical names are a list, tuple or Index.
all_feature_names_prob = list(prob_numerical_features) + ohe_feature_names_prob.tolist()

X_train_prob_transformed_df = pd.DataFrame(X_train_prob_transformed, columns=all_feature_names_prob)
X_test_prob_transformed_df = pd.DataFrame(X_test_prob_transformed, columns=all_feature_names_prob)


explainer_prob = shap.TreeExplainer(best_prob_model)
# shap_values() does not have one fixed return shape for classifiers:
#   * a list with one (samples, features) array per class,
#   * a single 3-D array (samples, features, classes), or
#   * a single 2-D (samples, features) array — the case for a binary XGBoost model.
# Blindly indexing with [1] on the 2-D form would pick the second test ROW, not
# the positive class, so select the class-1 values according to the actual shape.
raw_shap_values_prob = explainer_prob.shap_values(X_test_prob_transformed_df)
if isinstance(raw_shap_values_prob, list):
    shap_values_prob = raw_shap_values_prob[1]
elif np.ndim(raw_shap_values_prob) == 3:
    shap_values_prob = raw_shap_values_prob[:, :, 1]
else:
    shap_values_prob = raw_shap_values_prob

# show=False keeps the figure open so the title can be set and the file saved.
shap.summary_plot(shap_values_prob, X_test_prob_transformed_df, plot_type="bar", show=False)
plt.title("SHAP Feature Importance for Claim Probability Prediction")
plt.tight_layout()
plt.savefig("../notebook/prob_shap_summary_bar.png") # Save plot
plt.show()

# Default summary plot: per-sample SHAP values for each feature.
shap.summary_plot(shap_values_prob, X_test_prob_transformed_df, show=False)
plt.title("SHAP Feature Impacts for Claim Probability Prediction")
plt.tight_layout()
plt.savefig("../notebook/prob_shap_summary_dot.png") # Save plot
plt.show()