In [4]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

# 📥 Load dataset
df = pd.read_csv('../data/raw/insurance.csv')

# 🎯 Encode target column
df['fraud_reported'] = df['fraud_reported'].map({'Y': 1, 'N': 0})

# ➕ Create numeric feature (optional)
df['claim_ratio'] = df['total_claim_amount'] / (df['age'] + 1e-5)  # prevent divide-by-zero

# ✅ Select features (only numeric)
X = df.drop(columns=['fraud_reported']).select_dtypes(include='number')
y = df['fraud_reported']

print(f"✅ Using {X.shape[1]} numeric features")

# ✂️ Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 🤖 Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 📊 Evaluate
y_pred = model.predict(X_test)
print("📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("📋 Classification Report:\n", classification_report(y_test, y_pred))

# 💾 Save model
joblib.dump(model, '../models/fraud_model.pkl')
print("✅ Model saved to ../models/fraud_model.pkl")


✅ Using 20 numeric features
📊 Confusion Matrix:
 [[145   0]
 [ 54   1]]
📋 Classification Report:
               precision    recall  f1-score   support

           0       0.73      1.00      0.84       145
           1       1.00      0.02      0.04        55

    accuracy                           0.73       200
   macro avg       0.86      0.51      0.44       200
weighted avg       0.80      0.73      0.62       200

✅ Model saved to ../models/fraud_model.pkl
