In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score
import joblib
import os

# Loading the dataset
df = pd.read_csv("../data/data_with_anomalies.csv")

# Defining features and label
X = df.drop(['claim_status', 'anomaly_flag'], axis=1)
y = df['claim_status']

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Handling the class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_scaled, y)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)



In [3]:
# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)
lr_acc = accuracy_score(y_test, lr_preds)
lr_f1 = f1_score(y_test, lr_preds)
print("Logistic Regression Results")
print(classification_report(y_test, lr_preds))



Logistic Regression Results
              precision    recall  f1-score   support

           0       0.60      0.56      0.58     11072
           1       0.58      0.61      0.60     10866

    accuracy                           0.59     21938
   macro avg       0.59      0.59      0.59     21938
weighted avg       0.59      0.59      0.59     21938



In [4]:
# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
rf_acc = accuracy_score(y_test, rf_preds)
rf_f1 = f1_score(y_test, rf_preds)
print("Random Forest Results")
print(classification_report(y_test, rf_preds))


Random Forest Results
              precision    recall  f1-score   support

           0       0.92      0.92      0.92     11072
           1       0.92      0.92      0.92     10866

    accuracy                           0.92     21938
   macro avg       0.92      0.92      0.92     21938
weighted avg       0.92      0.92      0.92     21938



In [8]:
# XGBoost
xgb = XGBClassifier(eval_metric='logloss')
xgb.fit(X_train, y_train)
xgb_preds = xgb.predict(X_test)
xgb_acc = accuracy_score(y_test, xgb_preds)
xgb_f1 = f1_score(y_test, xgb_preds)
print("XGBoost Results")
print(classification_report(y_test, xgb_preds))


XGBoost Results
              precision    recall  f1-score   support

           0       0.89      0.98      0.93     11072
           1       0.98      0.87      0.92     10866

    accuracy                           0.93     21938
   macro avg       0.93      0.93      0.93     21938
weighted avg       0.93      0.93      0.93     21938



In [6]:
# Model comparison
print(" Model Comparison:")
print(f"Logistic Regression - Accuracy: {lr_acc:.4f}, F1 Score: {lr_f1:.4f}")
print(f"Random Forest       - Accuracy: {rf_acc:.4f}, F1 Score: {rf_f1:.4f}")
print(f"XGBoost             - Accuracy: {xgb_acc:.4f}, F1 Score: {xgb_f1:.4f}")


 Model Comparison:
Logistic Regression - Accuracy: 0.5885, F1 Score: 0.5959
Random Forest       - Accuracy: 0.9187, F1 Score: 0.9179
XGBoost             - Accuracy: 0.9285, F1 Score: 0.9235


In [7]:
# Finding the best model
best_model = max([
    ("Logistic Regression", lr_f1),
    ("Random Forest", rf_f1),
    ("XGBoost", xgb_f1)
], key=lambda x: x[1])

print(f"\n Best Model Based on F1 Score: {best_model[0]} ({best_model[1]:.4f})")


 Best Model Based on F1 Score: XGBoost (0.9235)
