# How can we build an accurate machine learning model to classify imbalanced data where one class has significantly fewer samples than the other?

In [3]:
# 📦 Import Libraries
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, roc_auc_score, roc_curve,
    precision_score, recall_score, f1_score, confusion_matrix
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [4]:
# 📄 Load Data
# The CSV location defaults to the path this notebook was originally written
# against, which only exists on one machine. Set the INSURANCE_CLAIMS_CSV
# environment variable to point at the file elsewhere; no cell edit is needed.
DATA_PATH = os.environ.get(
    "INSURANCE_CLAIMS_CSV",
    r"C:\Users\tajud\OneDrive\Desktop\NRIT\sudents and projects\S1\Classification\Insurance-claims-data\Insurance claims data.csv",
)
df = pd.read_csv(DATA_PATH)

In [7]:
# 🧹 Preprocessing
# Remove three columns that are not used as model features.
# `df` is re-bound instead of mutated with inplace=True, and errors='ignore'
# skips labels that are already gone, so running this cell a second time no
# longer raises KeyError.
# NOTE(review): errors='ignore' also hides a misspelled column name on the
# first run — confirm the three labels match the CSV header.
df = df.drop(
    columns=['policy_id', 'transmission_type', 'rear_brakes_type'],
    errors='ignore',
)

In [9]:
binary_map = {'Yes': 1, 'No': 0}


def _is_yes_no_column(col):
    """Return True when `col` holds text and every non-null value is a key of binary_map.

    Checking the actual labels (not just "two distinct values") matters:
    mapping a text column with any other labels through binary_map turns every
    value into NaN, and the dropna() at the end of this cell would then discard
    every row of the dataset.
    """
    holds_text = pd.api.types.is_object_dtype(col) or pd.api.types.is_string_dtype(col)
    if not holds_text:
        return False
    observed = set(col.dropna().unique())
    return bool(observed) and observed <= set(binary_map)


# Encode Yes/No flag columns as 1/0; every other column passes through untouched.
df = df.apply(lambda col: col.map(binary_map) if _is_yes_no_column(col) else col)

# Keep the first run of digits/dots found in each text value of the two engine
# spec columns and convert it to a float. Non-text entries and text without a
# parseable number become NaN (and are removed by dropna() below).
# The numeric-dtype guard makes the step safe to re-run: without it, a second
# execution would see floats instead of strings and blank the whole column.
for spec_col in ('max_power', 'max_torque'):
    if not pd.api.types.is_numeric_dtype(df[spec_col]):
        first_number = df[spec_col].str.extract(r'([\d.]+)', expand=False)
        df[spec_col] = pd.to_numeric(first_number, errors='coerce')

# One-hot encode the categorical columns, dropping the first level of each.
# Only columns still present are encoded, so re-running the cell after the
# originals were already replaced by dummies is a no-op instead of a KeyError.
categorical_cols = [
    'region_code', 'segment', 'model', 'fuel_type', 'engine_type', 'steering_type'
]
pending_cols = [c for c in categorical_cols if c in df.columns]
if pending_cols:
    df = pd.get_dummies(df, columns=pending_cols, drop_first=True)

# Discard any row that still contains a missing value.
df = df.dropna()

In [11]:
# 🎯 Target & Features
# The label vector is the claim_status column; every other column of df is a predictor.
y = df['claim_status']
X = df.drop(columns=['claim_status'])

In [13]:
# ⚖️ SMOTE + Train-Test Split
# Hold out 20% of the rows for testing, stratified on the label so both
# partitions keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Oversample the training partition only; X_test / y_test are left as split.
# NOTE(review): resampling runs on the features as they are at this point —
# scaling is applied in a later cell.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [15]:
# 📏 Scaling
# Learn the standardisation statistics from the (resampled) training matrix
# only, then apply that same fitted transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [17]:
# 🤖 Models
# Display name -> unfitted estimator. Every estimator here exposes
# predict_proba, which the evaluation loop needs for ROC-AUC (SVC only does so
# because probability=True is set).
# The stochastic estimators are seeded so that metrics are reproducible from
# run to run; 42 is the same value the split/SMOTE cell uses.
RANDOM_STATE = 42

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(probability=True, random_state=RANDOM_STATE),
    # NOTE(review): use_label_encoder is kept as in the original; newer xgboost
    # releases may ignore it or warn about it — confirm against the installed version.
    "XGBoost": XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=RANDOM_STATE,
    ),
}

In [None]:
# 📊 Training, Evaluation, and Plotting
# Fit every model on the training data, score it on the held-out test data,
# collect one summary row per model in `results`, and overlay all ROC curves
# on a single labelled figure.
# NOTE(review): SVC(probability=True) may take a long time to fit on the
# oversampled training set — confirm the runtime is acceptable.
results = []
fig, ax = plt.subplots(figsize=(8, 6))

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    # Column 1 of predict_proba is the score for the second class in
    # model.classes_ (label 1 in the printed reports).
    probs = model.predict_proba(X_test)[:, 1]

    # Metrics
    precision = precision_score(y_test, preds)
    recall = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    auc = roc_auc_score(y_test, probs)

    results.append({
        "Model": name,
        "Precision": round(precision, 4),
        "Recall": round(recall, 4),
        "F1-Score": round(f1, 4),
        "ROC-AUC": round(auc, 4)
    })

    # Report and ROC Curve
    print(f"\n{name} Report:")
    print(classification_report(y_test, preds))

    fpr, tpr, _ = roc_curve(y_test, probs)
    ax.plot(fpr, tpr, label=f"{name} (AUC = {auc:.3f})")

# Without a legend the per-model labels passed to plot() are never shown, so
# the curves could not be told apart; the diagonal marks a random classifier.
ax.plot([0, 1], [0, 1], linestyle="--", color="grey", label="Chance")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curves on the Test Set")
ax.legend(loc="lower right")
plt.show()



Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.95      0.60      0.74     10969
           1       0.09      0.58      0.16       750

    accuracy                           0.60     11719
   macro avg       0.52      0.59      0.45     11719
weighted avg       0.90      0.60      0.70     11719


Random Forest Report:
              precision    recall  f1-score   support

           0       0.94      0.90      0.92     10969
           1       0.08      0.13      0.10       750

    accuracy                           0.85     11719
   macro avg       0.51      0.52      0.51     11719
weighted avg       0.88      0.85      0.87     11719


Naive Bayes Report:
              precision    recall  f1-score   support

           0       0.94      0.55      0.70     10969
           1       0.07      0.52      0.13       750

    accuracy                           0.55     11719
   macro avg       0.51      0.54      0.41     11719
w

In [None]:
# 📋 Results Table
# Turn the per-model metric dictionaries gathered during evaluation into one
# DataFrame (one row per model) and print it under a heading.
results_df = pd.DataFrame(results)
print("\nModel Comparison Table:", results_df, sep="\n")