In [None]:
!pip install pandas-profiling==3.4.0

In [None]:
import warnings
warnings.filterwarnings("error", message=".*check_inverse*.",category=UserWarning, append=False)

**Load Dataset**

In [None]:
# Data profiling
import pandas as pd
import pandas_profiling as pp
# Read the raw claims table from the CSV file.
df = pd.read_csv('/content/insurance_claims.csv')
# Build a pandas-profiling report object for the whole frame.
profile = pp.ProfileReport(df)
# Final expression of the cell, so the notebook renders the report.
profile

**Data Preprocessing - Kibtia Chowdhury**

In [None]:
import pandas as pd
import numpy as np

# Load the raw claims table.
df = pd.read_csv('/content/insurance_claims.csv')

# Structure of the dataset (column names, non-null counts, dtypes); info() prints directly.
df.info()

# A notebook only renders the final expression of a cell, so the row preview
# must come last -- placed before df.info() its result was silently discarded.
df.head()

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA

# Preprocessing pipeline: load -> impute -> de-duplicate -> normalise labels ->
# select features -> scale -> one-hot encode. The result is left in `df`.
df = pd.read_csv('/content/insurance_claims.csv')

# --- Data cleaning ---
imputer = SimpleImputer(strategy='mean') # Fill missing values with mean
numeric_columns = ['months_as_customer', 'age', 'policy_number', 'policy_deductable', 'policy_annual_premium',
                   'umbrella_limit', 'insured_zip', 'incident_hour_of_the_day',
                   'number_of_vehicles_involved', 'bodily_injuries', 'witnesses', 'total_claim_amount',
                   'injury_claim', 'property_claim', 'vehicle_claim', 'auto_year']
df[numeric_columns] = imputer.fit_transform(df[numeric_columns])

# Removing duplicate records. The index is rebuilt afterwards: the one-hot frame
# created further down is joined column-wise, and concat aligns on index labels,
# so gaps left by dropped rows would otherwise produce misaligned / NaN rows.
df = df.drop_duplicates().reset_index(drop=True)

# Correct the inconsistent values to a standard format. Assigning the result back
# (instead of a chained inplace replace) guarantees the frame itself is updated.
df['incident_severity'] = df['incident_severity'].replace(
    {'Major Damage': 'Major', 'Minor Damage': 'Minor'})

# --- Data reduction: feature selection ---
selected_features = ['age', 'insured_education_level', 'insured_occupation', 'incident_type', 'collision_type',
                     'incident_severity', 'authorities_contacted', 'incident_state', 'incident_city',
                     'auto_make', 'auto_model', 'fraud_reported']
# .copy() makes the subset an independent frame, so the column assignments below
# are not writes into a slice of the original.
df = df[selected_features].copy()

# --- Scaling and normalization ---
# NOTE(review): the scaler is fitted on the full table, i.e. before the train/test
# split performed later in the notebook -- confirm this is acceptable.
scaler = MinMaxScaler()
numeric_columns = ['age']
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# --- Encoding categorical variables ---
categorical_columns = ['insured_education_level', 'insured_occupation', 'incident_type', 'collision_type',
                       'incident_severity', 'authorities_contacted', 'incident_state', 'incident_city',
                       'auto_make', 'auto_model']
encoder = OneHotEncoder()
encoded_features = pd.DataFrame(encoder.fit_transform(df[categorical_columns]).toarray(),
                                columns=encoder.get_feature_names_out(categorical_columns),
                                index=df.index)  # same row labels as df -> exact row alignment
df = pd.concat([df.drop(columns=categorical_columns), encoded_features], axis=1)
print(df.head())

In [None]:
# `data` is a second name for the same preprocessed frame (no copy is made); the plotting cell reads it.
data = df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import math

def plotPerColumnDistribution(data):
    """Plot one histogram (with a KDE overlay) per column of ``data``.

    All panels share a single figure laid out three to a row; the figure
    height grows by 4 inches per row. Each panel is titled with its column
    name and its x-axis label is blanked. The figure is shown before returning.

    Parameters
    ----------
    data : object exposing ``.columns`` and column lookup via ``data[name]``
        (a pandas DataFrame in this notebook).
    """
    panels_per_row = 3
    column_names = list(data.columns)
    row_count = math.ceil(len(column_names) / panels_per_row)

    plt.figure(figsize=(12, 4 * row_count))
    plt.subplots_adjust(hspace=0.5)

    # subplot positions are 1-based, hence start=1
    for position, name in enumerate(column_names, start=1):
        plt.subplot(row_count, panels_per_row, position)
        sns.histplot(data[name], kde=True)
        plt.title(name)
        plt.xlabel('')

    plt.tight_layout()
    plt.show()

plotPerColumnDistribution(data)

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Feature Engineering and Preprocessing
# Text (object-dtype) columns other than the target are the ones to label-encode.
categorical_cols = df.select_dtypes(include=['object']).columns.drop('fraud_reported')

# Label encoding: the single encoder is re-fitted for each column in turn,
# replacing the column's values with integer codes.
label_encoder = LabelEncoder()
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])

# Separating Features and Target
y = label_encoder.fit_transform(df['fraud_reported'])
X = df.drop(columns='fraud_reported')

# Train-test split: 20% of the rows are held out, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Print the information of the split datasets
print("\t\t\t---Split into Train-Test---")
for caption, part in (("X_train shape:", X_train), ("y_train shape:", y_train),
                      ("X_test shape:", X_test), ("y_test shape:", y_test)):
    print(caption, part.shape)

# **Random Forest**




In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fit a random forest on the training split. A fixed random_state makes the
# forest -- and therefore every metric printed below -- reproducible on re-run.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
rf_y_pred = rf_classifier.predict(X_test)

# Accuracy on the data the model was fitted on versus the held-out split.
rf_model_train_acc = accuracy_score(y_train, rf_classifier.predict(X_train))
print ("Random Forest Train Accuracy: ", rf_model_train_acc*100)
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print("Random Forest Test Accuracy:", rf_accuracy*100)

# Rows of the matrix are the actual classes, columns the predicted ones.
confusion_mat = confusion_matrix(y_test, rf_y_pred)
print("Confusion Matrix:")
print(confusion_mat)

labels = ['Not Fraud', 'Fraud']
sns.heatmap(confusion_mat, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)

plt.title('Confusion Matrix - Random Forest Classifier')
plt.xlabel('Predicted Class')
plt.ylabel('Actual')
plt.show()

In [None]:
from sklearn.metrics import precision_score #printing precision and recall only
from sklearn.metrics import recall_score
# Precision and recall of the random forest on the held-out split, shown as percentages.
precision_rf = precision_score(y_true=y_test, y_pred=rf_y_pred)
recall_rf = recall_score(y_true=y_test, y_pred=rf_y_pred)
print('Precision: ', 100 * precision_rf)
print('Recall: ', 100 * recall_rf)

In [None]:
# Raw confusion-matrix counts, then the per-class text report, for the random forest.
for summary in (confusion_matrix(y_test, rf_y_pred), classification_report(y_test, rf_y_pred)):
    print(summary)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# Repeated 5-fold cross-validation of the random forest: ten shuffles, one per seed.
kfold = 5
seeds = range(10)
scores = []

for i in seeds:
    # The splitter object is handed to cross_val_score directly; with an integer
    # random_state it yields the same folds as the one-shot .split(X) generator.
    kf = KFold(n_splits=kfold, random_state=i, shuffle=True)
    # The estimator is created inline and seeded with the loop seed:
    #  * each repetition is reproducible, and
    #  * the fitted `rf_classifier` from the training cell is no longer
    #    overwritten by an unfitted estimator.
    scores.append(cross_val_score(RandomForestClassifier(random_state=i), X, y, cv=kf).mean())

# Mean and spread of the ten per-seed mean accuracies.
print('Cross Validation Mean Accuracy with fold=5:', np.mean(scores))
print('Standard Deviation with fold=5:', np.std(scores))

# **XGB**

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Fit a gradient-boosted tree classifier with library-default settings.
xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_train, y_train)

# Accuracy on the data the model was fitted on versus the held-out split.
xgb_model_train_acc = accuracy_score(y_train, xgb_classifier.predict(X_train))
y_pred_xgb = xgb_classifier.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print ("XGBoost Train Accuracy: ", xgb_model_train_acc*100)
print("XGBoost Test Accuracy:", accuracy_xgb*100)

# Rows of the matrix are the actual classes, columns the predicted ones.
confusion_mat_xgb = confusion_matrix(y_test, y_pred_xgb)
print("Confusion Matrix - XGBoost:")
print(confusion_mat_xgb)

labels = ['Not Fraud', 'Fraud']
sns.heatmap(
    confusion_mat_xgb,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
plt.title('Confusion Matrix - XGBoost Classifier')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Raw confusion-matrix counts, then the per-class text report, for XGBoost.
for summary in (confusion_matrix(y_test, y_pred_xgb), classification_report(y_test, y_pred_xgb)):
    print(summary)

In [None]:
from sklearn.metrics import precision_score #printing precision and recall only
from sklearn.metrics import recall_score

# Precision and recall of XGBoost on the held-out split, shown as percentages.
# (The variable really is spelled `precision_xbg`; the comparison cell reads that name.)
precision_xbg = precision_score(y_true=y_test, y_pred=y_pred_xgb)
recall_xgb = recall_score(y_true=y_test, y_pred=y_pred_xgb)
print('Precision: ', 100 * precision_xbg)
print('Recall: ', 100 * recall_xgb)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# Repeated 5-fold cross-validation of XGBoost: ten shuffles, one per seed.
kfold = 5
seeds = range(10)
scores = []

for i in seeds:
    # The splitter object is handed to cross_val_score directly; with an integer
    # random_state it yields the same folds as the one-shot .split(X) generator.
    kf = KFold(n_splits=kfold, random_state=i, shuffle=True)
    # The estimator is created inline so the fitted `xgb_classifier` from the
    # training cell is no longer overwritten by an unfitted one.
    scores.append(cross_val_score(XGBClassifier(), X, y, cv=kf).mean())

# Mean and spread of the ten per-seed mean accuracies.
print('Cross Validation Mean Accuracy with fold=5:', np.mean(scores))
print('Standard Deviation with fold=5:', np.std(scores))

# ***MLP***



In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Fit a multi-layer perceptron on the training split. A fixed random_state pins
# the network's random initialisation so the metrics below are reproducible.
mlp_classifier = MLPClassifier(random_state=42)
mlp_classifier.fit(X_train, y_train)
y_pred_mlp = mlp_classifier.predict(X_test)

# Accuracy on the data the model was fitted on versus the held-out split.
mlp_model_train_acc = accuracy_score(y_train, mlp_classifier.predict(X_train))
print ("MLP Train Accuracy: ", mlp_model_train_acc*100)
accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
print("MLP Accuracy:", accuracy_mlp*100)

# Rows of the matrix are the actual classes, columns the predicted ones.
confusion_mat_mlp = confusion_matrix(y_test, y_pred_mlp)
print("Confusion Matrix - MLP:")
print(confusion_mat_mlp)

labels = ['Not Fraud', 'Fraud']
sns.heatmap(confusion_mat_mlp, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.title('Confusion Matrix - MLP Classifier')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
from sklearn.metrics import precision_score #printing precision and recall only
from sklearn.metrics import recall_score

# Precision and recall of the MLP on the held-out split, shown as percentages.
precision_mlp = precision_score(y_true=y_test, y_pred=y_pred_mlp)
recall_mlp = recall_score(y_true=y_test, y_pred=y_pred_mlp)
print('Precision: ', 100 * precision_mlp)
print('Recall: ', 100 * recall_mlp)

In [None]:
# Raw confusion-matrix counts, then the per-class text report, for the MLP.
for summary in (confusion_matrix(y_test, y_pred_mlp), classification_report(y_test, y_pred_mlp)):
    print(summary)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import numpy as np
# Repeated 5-fold cross-validation of the MLP: ten shuffles, one per seed.
kfold = 5
seeds = range(10)
scores = []

for i in seeds:
    # The splitter object is handed to cross_val_score directly; with an integer
    # random_state it yields the same folds as the one-shot .split(X) generator.
    kf = KFold(n_splits=kfold, random_state=i, shuffle=True)
    # The estimator is created inline and seeded with the loop seed:
    #  * each repetition is reproducible, and
    #  * the fitted `mlp_classifier` from the training cell is no longer
    #    overwritten by an unfitted estimator.
    scores.append(cross_val_score(MLPClassifier(random_state=i), X, y, cv=kf).mean())

# Mean and spread of the ten per-seed mean accuracies.
print('Cross Validation Mean Accuracy with fold=5:', np.mean(scores))
print('Standard Deviation with fold=5:', np.std(scores))

# ***CART***



In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fit a CART decision tree on the training split. A fixed random_state makes
# the fitted tree, and the metrics below, reproducible on re-run.
decision_tree_model = DecisionTreeClassifier(random_state=42)

decision_tree_model.fit(X_train, y_train)
y_pred_dt = decision_tree_model.predict(X_test)

# Accuracy on the data the model was fitted on versus the held-out split.
decision_tree_model_train_acc = accuracy_score(y_train, decision_tree_model.predict(X_train))
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print ("DT Train Accuracy: ", decision_tree_model_train_acc*100)
print("DT Accuracy:", accuracy_dt*100)


# Rows of the matrix are the actual classes, columns the predicted ones.
confusion_mat_dt = confusion_matrix(y_test, y_pred_dt)
print("Confusion Matrix - CART:")
print(confusion_mat_dt)

# Defined here as well so the heatmap does not rely on `labels` leaking from another model's cell.
labels = ['Not Fraud', 'Fraud']
sns.heatmap(confusion_mat_dt, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.title('Confusion Matrix - DT Classifier')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Raw confusion-matrix counts, then the per-class text report, for the decision tree.
for summary in (confusion_matrix(y_test, y_pred_dt), classification_report(y_test, y_pred_dt)):
    print(summary)

In [None]:
# Precision and recall of the decision tree on the held-out split, shown as percentages.
precision_dt = precision_score(y_true=y_test, y_pred=y_pred_dt)
recall_dt = recall_score(y_true=y_test, y_pred=y_pred_dt)
print('Precision: ', 100 * precision_dt)
print('Recall: ', 100 * recall_dt)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# Repeated 5-fold cross-validation of the decision tree: ten shuffles, one per seed.
kfold = 5
seeds = range(10)
scores = []

for i in seeds:
    # The splitter object is handed to cross_val_score directly; with an integer
    # random_state it yields the same folds as the one-shot .split(X) generator.
    kf = KFold(n_splits=kfold, random_state=i, shuffle=True)
    # Seeding the tree with the loop seed makes each repetition reproducible.
    dt_classifier = DecisionTreeClassifier(random_state=i)
    scores.append(cross_val_score(dt_classifier, X, y, cv=kf).mean())

# Mean and spread of the ten per-seed mean accuracies.
print('Cross Validation Mean Accuracy with fold=5:', np.mean(scores))
print('Standard Deviation with fold=5:', np.std(scores))

**Comparison - All Members**

In [None]:
import matplotlib.pyplot as plt

# Bar chart of held-out accuracy, one bar per model (values are fractions in [0, 1]).
algorithms = ['Random Forest', 'XGBoost', 'MLP', 'CART']
accuracy_scores = [rf_accuracy, accuracy_xgb, accuracy_mlp, accuracy_dt]

plt.bar(algorithms, accuracy_scores)
plt.title('Comparison of Algorithm Accuracies')
plt.xlabel('Algorithms')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
plt.show()

In [None]:

import numpy as np
import matplotlib.pyplot as plt

# Grouped bar chart: recall and precision side by side for each model.
# The tick labels live in their own variable; binding them to `X` replaced the
# feature matrix of the same name, breaking any later (or re-run) cell that uses it.
algorithm_names = ['Random Forest', 'XGBoost', 'MLP', 'CART']
recall_scores = [ recall_rf, recall_xgb, recall_mlp, recall_dt ]
precision_Scores = [ precision_rf, precision_xbg, precision_mlp, precision_dt]

# One integer position per model; the two bar series are offset by +/-0.2 around it.
X_axis = np.arange(len(algorithm_names))

plt.bar(X_axis - 0.2, recall_scores, 0.4, label = 'Recall')
plt.bar(X_axis + 0.2, precision_Scores, 0.4, label = 'Precision')

plt.xticks(X_axis, algorithm_names)
plt.xlabel("Algorithms")
plt.ylabel("Recall and Precision Scores")
plt.title("Comparision")
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Bar chart of held-out recall, one bar per model (values are fractions in [0, 1]).
algorithms = ['Random Forest', 'XGBoost', 'MLP', 'DT']
recall_scores = [recall_rf, recall_xgb, recall_mlp, recall_dt]

plt.bar(algorithms, recall_scores)
plt.title('Comparison of Recall%')
plt.xlabel('Algorithms')
plt.ylabel('Recall')
plt.ylim(0, 1)
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier


# Random Forest
# ROC inputs for each model, computed from the held-out labels and the model's predictions.
# NOTE(review): the *_pred arrays come from .predict(), i.e. hard class labels rather
# than scores, so each curve has very few points; feeding predict_proba scores would
# trace a fuller curve -- confirm which is intended.
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, rf_y_pred)       # Random Forest
fpr_mlp, tpr_mlp, thresholds_mlp = roc_curve(y_test, y_pred_mlp)   # MLP
fpr_xgb, tpr_xgb, thresholds_xgb = roc_curve(y_test, y_pred_xgb)   # XGBoost
fpr_dt, tpr_dt, thresholds_dt = roc_curve(y_test, y_pred_dt)       # CART

# Area under each curve.
auc_rf = auc(fpr_rf, tpr_rf)
auc_mlp = auc(fpr_mlp, tpr_mlp)
auc_xgb = auc(fpr_xgb, tpr_xgb)
auc_dt = auc(fpr_dt, tpr_dt)

# Plot the AUC curve
plt.figure(figsize=(8, 6))
for fpr, tpr, caption in (
    (fpr_rf, tpr_rf, f'Random Forest (AUC = {auc_rf:.2f})'),
    (fpr_mlp, tpr_mlp, f'MLP (AUC = {auc_mlp:.2f})'),
    (fpr_xgb, tpr_xgb, f'XGBoost (AUC = {auc_xgb:.2f})'),
    (fpr_dt, tpr_dt, f'CART (AUC = {auc_dt:.2f})'),
):
    plt.plot(fpr, tpr, label=caption)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.legend(loc='lower right')
plt.show()