In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
import joblib
import numpy as np


In [2]:
# Load the dataset
url = "https://raw.githubusercontent.com/FlipRoboTechnologies/ML_-Datasets/main/Insurance%20Claim%20Fraud%20Detection/Automobile_insurance_fraud.csv"
data = pd.read_csv(url, header=None)

In [3]:
data.columns = [
    'months_as_customer', 'age', 'policy_number', 'policy_bind_date', 'policy_state',
    'policy_csl', 'policy_deductable', 'policy_annual_premium', 'umbrella_limit',
    'insured_zip', 'insured_sex', 'insured_education_level', 'insured_occupation',
    'insured_hobbies', 'insured_relationship', 'capital-gains', 'capital-loss',
    'incident_date', 'incident_type', 'collision_type', 'incident_severity',
    'authorities_contacted', 'incident_state', 'incident_city', 'incident_location',
    'incident_hour_of_the_day', 'number_of_vehicles_involved', 'property_damage',
    'bodily_injuries', 'witnesses', 'police_report_available', 'total_claim_amount',
    'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make', 'auto_model',
    'auto_year', 'fraud_reported'
]

In [4]:
data.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,328,48,521585,17-10-2014,OH,250/500,1000,1406.91,0,466132,...,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y
1,228,42,342868,27-06-2006,IN,250/500,2000,1197.22,5000000,468176,...,0,?,5070,780,780,3510,Mercedes,E400,2007,Y
2,134,29,687698,06-09-2000,OH,100/300,2000,1413.14,5000000,430632,...,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N
3,256,41,227811,25-05-1990,IL,250/500,2000,1415.74,6000000,608117,...,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y
4,228,44,367455,06-06-2014,IL,500/1000,1000,1583.91,6000000,610706,...,1,NO,6500,1300,650,4550,Accura,RSX,2009,N


In [5]:
# Check for the presence of the '_c39' column and drop it if it exists
if '_c39' in data.columns:
    data = data.drop(columns=['_c39'])

In [6]:
data.columns

Index(['months_as_customer', 'age', 'policy_number', 'policy_bind_date',
       'policy_state', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'capital-gains', 'capital-loss',
       'incident_date', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported'],
      dtype='object')

In [7]:
# Preprocess the dataset
# Handle missing values using simple imputation (can be improved)
data = data.ffill().bfill()

In [8]:
# Encode categorical variables
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

In [9]:
# Task 1: Predict whether the claim is fraudulent (classification)
X_classification = data.drop(['fraud_reported'], axis=1)
y_classification = data['fraud_reported']

# Split the dataset into training and testing sets for classification
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X_classification, y_classification, test_size=0.2, random_state=42)

# Standardize the feature variables
scaler_cls = StandardScaler()
X_train_cls = scaler_cls.fit_transform(X_train_cls)
X_test_cls = scaler_cls.transform(X_test_cls)

# Train the Logistic Regression model for classification
model_cls = LogisticRegression()
model_cls.fit(X_train_cls, y_train_cls)

# Make predictions on the testing set for classification
y_pred_cls = model_cls.predict(X_test_cls)

# Evaluate the classification model
accuracy_cls = accuracy_score(y_test_cls, y_pred_cls)
report_cls = classification_report(y_test_cls, y_pred_cls)

print("Classification Model Accuracy: {:.2f}%".format(accuracy_cls * 100))
print("Classification Report:")
print(report_cls)

Classification Model Accuracy: 70.00%
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.88      0.81       145
           1       0.41      0.22      0.29        55

    accuracy                           0.70       200
   macro avg       0.58      0.55      0.55       200
weighted avg       0.66      0.70      0.67       200



In [10]:
# Task 2: Predict the total claim amount (regression)
# Prepare data for regression (excluding rows with zero total_claim_amount if any)
X_regression = data.drop(['total_claim_amount'], axis=1)
y_regression = data['total_claim_amount'].astype(float)

# Split the dataset into training and testing sets for regression
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_regression, y_regression, test_size=0.2, random_state=42)

# Standardize the feature variables
scaler_reg = StandardScaler()
X_train_reg = scaler_reg.fit_transform(X_train_reg)
X_test_reg = scaler_reg.transform(X_test_reg)

# Train the Linear Regression model for regression
model_reg = LinearRegression()
model_reg.fit(X_train_reg, y_train_reg)

# Make predictions on the testing set for regression
y_pred_reg = model_reg.predict(X_test_reg)

# Evaluate the regression model
mse_reg = mean_squared_error(y_test_reg, y_pred_reg)
rmse_reg = mse_reg ** 0.5

print("Regression Model RMSE: {:.2f}".format(rmse_reg))

Regression Model RMSE: 0.00


In [11]:
# Save the trained models and preprocessing objects
joblib.dump(model_cls, 'logistic_regression_fraud_model.pkl')
joblib.dump(scaler_cls, 'scaler_cls.pkl')
joblib.dump(model_reg, 'linear_regression_claim_model.pkl')
joblib.dump(scaler_reg, 'scaler_reg.pkl')
for column, encoder in label_encoders.items():
    joblib.dump(encoder, f'label_encoder_{column}.pkl')

print("Models and preprocessing objects saved.")

Models and preprocessing objects saved.
