In [None]:
import sys
import os

In [None]:
# Put the notebook's parent directory on the import path so that the
# `scripts` package imported further down can be found. The membership
# check keeps the cell idempotent: re-running it no longer stacks
# duplicate entries onto sys.path.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

# Project helpers for loading and preparing the data
from scripts.data_preprocessing import load_and_prepare_data, prepare_data_for_claim_probability


In [None]:
# Read the pipe-delimited raw insurance file into a DataFrame.
# low_memory=False has pandas infer column dtypes from the whole file
# instead of chunk by chunk.
raw_data_path = "../data/raw/insurance_data.txt"
df_raw = pd.read_csv(
    raw_data_path,
    sep="|",
    low_memory=False,
)

In [None]:
# Hand the untouched raw frame to the claim-probability preparation helper.
# It returns a pair that is unpacked into X and y, which the cells below
# use as model inputs and labels.
prepared = prepare_data_for_claim_probability(df_raw)
X, y = prepared

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Train Random Forest and XGBoost classifiers on the training split, then
# score both on the held-out test split.

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)                 # hard class labels
rf_probs = rf.predict_proba(X_test)[:, 1]     # probability of the second class (column 1)

# XGBoost
# random_state is set explicitly so this model is seeded like the Random
# Forest and the train/test split.
# NOTE(review): use_label_encoder is deprecated/ignored in newer xgboost
# releases; drop it once the minimum supported version is confirmed.
xgb = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
xgb.fit(X_train, y_train)
xgb_preds = xgb.predict(X_test)
xgb_probs = xgb.predict_proba(X_test)[:, 1]

# Report hold-out metrics for both models. Previously the predictions were
# computed but never evaluated. zero_division=0 avoids warnings when a
# model predicts no samples for a class.
for model_name, model_preds, model_probs in (
    ("Random Forest", rf_preds, rf_probs),
    ("XGBoost", xgb_preds, xgb_probs),
):
    print(f"=== {model_name} ===")
    print(classification_report(y_test, model_preds, zero_division=0))
    print(f"ROC AUC: {roc_auc_score(y_test, model_probs):.4f}")


In [None]:
# Compare the two classifiers' ROC curves on the test split
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_probs)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, xgb_probs)

# Explicit figure/axes interface instead of the pyplot state machine
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_rf, tpr_rf, label="Random Forest")
ax.plot(fpr_xgb, tpr_xgb, label="XGBoost")
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve - Claim Probability Prediction")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Rank the XGBoost feature importances and chart the 15 largest
importance = xgb.feature_importances_
ranked_importance = pd.Series(importance, index=X.columns).sort_values(ascending=False)
features = ranked_importance.head(15)

# Horizontal bars: importance score on the x-axis, feature name on the y-axis
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=features.values, y=features.index, ax=ax)
ax.set_title("Top 15 Important Features (XGBoost)")
ax.set_xlabel("Importance Score")
fig.tight_layout()
plt.show()