# Feature Engineering

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from category_encoders.ordinal import OrdinalEncoder
from category_encoders.one_hot import OneHotEncoder

import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, recall_score, f1_score, accuracy_score

from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier

import joblib

In [10]:
# Load the fraud dataset CSV (the path is relative to the current working directory).
df = pd.read_csv('../data/fraud_oracle_Formatted.csv')
# Preview the first rows; the file's unnamed leading column shows up as 'Unnamed: 0'.
df.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


In [11]:
# Explicit category -> integer codes for every binary or ordered column.
# Calendar columns share one table each so the two month columns (and the two
# weekday columns) are guaranteed to use identical codes.
_month_codes = {
    month: number
    for number, month in enumerate(
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
        start=1,
    )
}
_weekday_codes = {
    day: number
    for number, day in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
        start=1,
    )
}
_policy_day_codes = {'more than 30': 4, '15 to 30': 3, 'none': 0, '1 to 7': 1, '8 to 15': 2}

# Column name -> mapping, listed in the order the encoder should receive them.
_codes_by_column = {
    # Binary flags
    'AccidentArea': {'Urban': 1, 'Rural': 0},
    'Sex': {'Female': 1, 'Male': 0},
    'Fault': {'Policy Holder': 1, 'Third Party': 0},
    'PoliceReportFiled': {'Yes': 1, 'No': 0},
    'WitnessPresent': {'Yes': 1, 'No': 0},
    'AgentType': {'External': 1, 'Internal': 0},
    # Calendar columns
    'Month': _month_codes,
    'DayOfWeek': _weekday_codes,
    'DayOfWeekClaimed': _weekday_codes,
    'MonthClaimed': _month_codes,
    # Ordered ranges
    'PastNumberOfClaims': {'none': 0, '1': 1, '2 to 4': 2, 'more than 4': 3},
    'NumberOfSuppliments': {'none': 0, '1 to 2': 1, '3 to 5': 2, 'more than 5': 3},
    'VehiclePrice': {
        'less than 20000': 0,
        '20000 to 29000': 1,
        '30000 to 39000': 2,
        '40000 to 59000': 3,
        '60000 to 69000': 4,
        'more than 69000': 5,
    },
    'AgeOfVehicle': {
        '3 years': 3,
        '6 years': 6,
        '7 years': 7,
        'more than 7': 8,
        '5 years': 5,
        'new': 0,
        '4 years': 4,
        '2 years': 2,
    },
    'Days_Policy_Accident': _policy_day_codes,
    'Days_Policy_Claim': _policy_day_codes,
    'AgeOfPolicyHolder': {
        '16 to 17': 1,
        '18 to 20': 2,
        '21 to 25': 3,
        '26 to 30': 4,
        '31 to 35': 5,
        '36 to 40': 6,
        '41 to 50': 7,
        '51 to 65': 8,
        'over 65': 9,
    },
    'AddressChange_Claim': {
        'no change': 0,
        'under 6 months': 1,
        '1 year': 2,
        '2 to 3 years': 3,
        '4 to 8 years': 4,
    },
    'NumberOfCars': {'1 vehicle': 1, '2 vehicles': 2, '3 to 4': 3, '5 to 8': 4, 'more than 8': 5},
}

# The encoder expects a list of {'col': ..., 'mapping': ...} entries; every entry
# gets its own dict copy so no two columns share a mapping object.
col_ordering = [
    {'col': column, 'mapping': dict(codes)}
    for column, codes in _codes_by_column.items()
]
ord_encoder = OrdinalEncoder(mapping=col_ordering, return_df=True)

In [12]:
# Fit the ordinal encoder (configured with the col_ordering mappings) on df and encode it in one step.
df_ord_encoder = ord_encoder.fit_transform(df)

In [13]:
# Columns that are one-hot encoded; use_cat_names=True names each indicator
# column after its category (e.g. Make_Honda).
nominal_columns = ['Make', 'MaritalStatus', 'VehicleCategory', 'BasePolicy']
OHE = OneHotEncoder(cols=nominal_columns, use_cat_names=True, return_df=True)
one_encoder_df = OHE.fit_transform(df_ord_encoder)


In [14]:
# Columns removed from the encoded table before modelling.
columns_to_drop = [
    'Days_Policy_Claim',
    'DayOfWeek',
    'WitnessPresent',
    'WeekOfMonthClaimed',
    'DayOfWeekClaimed',
    'DriverRating',
    'WeekOfMonth',
    'NumberOfCars',
    'RepNumber',
]
new_df = one_encoder_df.drop(columns=columns_to_drop)

In [15]:
# Preview the encoded table after the column drop.
new_df.head()

Unnamed: 0,Month,Make_Honda,Make_Toyota,Make_Ford,Make_Mazda,Make_Chevrolet,Make_Pontiac,Make_Accura,Make_Dodge,Make_Mercury,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,AgentType,NumberOfSuppliments,AddressChange_Claim,Year,BasePolicy_Liability,BasePolicy_Collision,BasePolicy_All Perils
0,12,1,0,0,0,0,0,0,0,0,...,3,4,0,1,0,2,1994,1,0,0
1,1,1,0,0,0,0,0,0,0,0,...,6,5,1,1,0,0,1994,0,1,0
2,10,1,0,0,0,0,0,0,0,0,...,7,7,0,1,0,0,1994,0,1,0
3,6,0,1,0,0,0,0,0,0,0,...,8,8,1,1,3,0,1994,1,0,0
4,1,1,0,0,0,0,0,0,0,0,...,5,5,0,1,0,0,1994,0,1,0


# Data Splitting

In [16]:
# Target column.
y = new_df['FraudFound_P']

# Predictors: everything except the target. 'Unnamed: 0' appears to be the row
# index that was saved into the CSV (it counts 0, 1, 2, ... in the preview), so it
# is an identifier rather than a property of the claim and must not be fed to the
# models. errors='ignore' keeps this cell working if the index column is absent.
X = new_df.drop(columns='FraudFound_P').drop(columns='Unnamed: 0', errors='ignore')

In [17]:
# Hold out 20% of the rows for testing. stratify=y keeps the class proportions of y
# the same in both splits; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48, stratify=y)


# Modelling

## Five Base Models (DT, MLP, LR, NB, KNN)

In [None]:
def evaluate_classifier(name, classifier, X_train, y_train, X_test, y_test):
    """Fit ``classifier`` on the training split and report its test-set metrics.

    Prints accuracy, recall, specificity and F1 (four decimals each) under a
    banner carrying ``name``, then draws the confusion matrix as a heatmap.

    Args:
        name: Label printed in the report header.
        classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: Data the classifier is fitted on.
        X_test, y_test: Data the metrics are computed on.

    Returns:
        None. The results are printed and plotted only.
    """
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)

    scores = {
        "Accuracy": accuracy_score(y_test, predicted),
        "Recall": recall_score(y_test, predicted),
    }

    matrix = confusion_matrix(y_test, predicted)
    # Row 0 of the matrix holds the first label's actual rows: [TN, FP].
    first_row = matrix[0]
    scores["Specificity"] = first_row[0] / (first_row[0] + first_row[1])
    scores["F1-Score"] = f1_score(y_test, predicted)

    print("#" * 50)
    print(f"{name} - Results ")
    for metric_name, value in scores.items():
        print(f"{metric_name}: {value:.4f}")

    print("Confusion Matrix:")
    sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues", cbar=False)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()
    print()

# The five baseline models, all trained on the untouched training split.
dt_classifier = DecisionTreeClassifier(random_state=42)
knn = KNeighborsClassifier(n_neighbors=3)
nb_classifier = GaussianNB()
mlp_classifier = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
lr_classifier = LogisticRegression(random_state=42)

# Report order: Decision Tree, KNN, Naive Bayes, MLP, Logistic Regression.
for model_name, model in [
    ("Decision Tree", dt_classifier),
    ("KNN", knn),
    ("NB", nb_classifier),
    ("MLP", mlp_classifier),
    ("LR", lr_classifier),
]:
    evaluate_classifier(model_name, model, X_train, y_train, X_test, y_test)

## SMOTE & Random Undersampling (DT, NB & LR)

In [None]:
def apply_smote(X_train, y_train, X_test, y_test, oversample_percentage, classifier):
    """Oversample class 1 with SMOTE, fit ``classifier`` and score it on the test split.

    Class 0 keeps its current size; class 1 is grown to
    ``round(n_class_0 * oversample_percentage)`` samples.

    Args:
        X_train, y_train: Training data to resample and fit on (labels 0 / 1).
        X_test, y_test: Data the metrics are computed on (never resampled).
        oversample_percentage: Target size of class 1 as a fraction of class 0.
        classifier: Estimator exposing ``fit`` and ``predict``; fitted in place.

    Returns:
        Tuple ``(accuracy, recall, specificity, f1)``.
    """
    majority_count = int((y_train == 0).sum())
    target_sizes = {0: majority_count, 1: round(majority_count * oversample_percentage)}

    resampler = SMOTE(sampling_strategy=target_sizes, random_state=42)
    X_resampled, y_resampled = resampler.fit_resample(X_train, y_train)

    classifier.fit(X_resampled, y_resampled)
    predicted = classifier.predict(X_test)

    accuracy = accuracy_score(y_test, predicted)
    recall = recall_score(y_test, predicted)

    # Row 0 of the confusion matrix is [TN, FP]; specificity = TN / (TN + FP).
    first_row = confusion_matrix(y_test, predicted)[0]
    specificity = first_row[0] / (first_row[0] + first_row[1])
    f1 = f1_score(y_test, predicted)

    return accuracy, recall, specificity, f1

def apply_random_undersampling(X_train, y_train, X_test, y_test, undersample_percentage, classifier):
    """Randomly undersample class 0, fit ``classifier`` and score it on the test split.

    Class 1 keeps its current size; class 0 is cut down to
    ``round(n_class_1 / undersample_percentage)`` samples.

    Args:
        X_train, y_train: Training data to resample and fit on (labels 0 / 1).
        X_test, y_test: Data the metrics are computed on (never resampled).
        undersample_percentage: Size of class 1 relative to the retained class 0.
        classifier: Estimator exposing ``fit`` and ``predict``; fitted in place.

    Returns:
        Tuple ``(accuracy, recall, specificity, f1)``.
    """
    minority_count = int((y_train == 1).sum())
    target_sizes = {0: round(minority_count / undersample_percentage), 1: minority_count}

    resampler = RandomUnderSampler(sampling_strategy=target_sizes, random_state=42)
    X_resampled, y_resampled = resampler.fit_resample(X_train, y_train)

    classifier.fit(X_resampled, y_resampled)
    predicted = classifier.predict(X_test)

    accuracy = accuracy_score(y_test, predicted)
    recall = recall_score(y_test, predicted)

    # Row 0 of the confusion matrix is [TN, FP]; specificity = TN / (TN + FP).
    first_row = confusion_matrix(y_test, predicted)[0]
    specificity = first_row[0] / (first_row[0] + first_row[1])
    f1 = f1_score(y_test, predicted)

    return accuracy, recall, specificity, f1

def plot_results(percentages, accuracies, recalls, specificities, f1_scores, title):
    """Plot the four metric curves against the sampling percentages on one chart.

    Args:
        percentages: X values (one per evaluated sampling ratio).
        accuracies, recalls, specificities, f1_scores: Y values, one series per
            metric, each aligned with ``percentages``.
        title: Chart title.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    labelled_series = (
        (accuracies, 'Accuracy'),
        (recalls, 'Recall'),
        (specificities, 'Specificity'),
        (f1_scores, 'F1-Score'),
    )
    for values, legend_label in labelled_series:
        plt.plot(percentages, values, marker='o', label=legend_label)

    plt.title(title)
    plt.xlabel('Percentage')
    plt.ylabel('Score')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
# Sweep SMOTE oversampling and random undersampling ratios for DT, NB and LR and
# plot how the test-set metrics respond to each ratio.
oversampling_percentages = np.arange(0.1, 1.1, 0.1)
undersampling_percentages = np.arange(0.1, 0.9, 0.1)


def run_sampling_sweep(apply_sampling, percentages, classifier, title):
    """Evaluate ``classifier`` at every sampling ratio and plot the four metrics.

    Args:
        apply_sampling: ``apply_smote`` or ``apply_random_undersampling``; called as
            ``apply_sampling(X_train, y_train, X_test, y_test, percentage, classifier)``
            and expected to return ``(accuracy, recall, specificity, f1)``.
        percentages: Sampling ratios to evaluate (also the x-axis of the plot).
        classifier: Estimator that is refitted at every ratio.
        title: Title of the resulting chart.

    Returns:
        None. The chart is drawn through ``plot_results``.
    """
    # Fresh lists for every sweep: each plot needs exactly one score per
    # percentage, so results of an earlier sweep must never leak into this one.
    accuracies, recalls, specificities, f1_scores = [], [], [], []
    for percentage in percentages:
        accuracy, recall, specificity, f1 = apply_sampling(
            X_train, y_train, X_test, y_test, percentage, classifier
        )
        accuracies.append(accuracy)
        recalls.append(recall)
        specificities.append(specificity)
        f1_scores.append(f1)

    plot_results(percentages, accuracies, recalls, specificities, f1_scores, title)


# Decision Tree
dt_classifier = DecisionTreeClassifier(random_state=42)
run_sampling_sweep(apply_smote, oversampling_percentages, dt_classifier,
                   'SMOTE Oversampling on Decision Tree')
run_sampling_sweep(apply_random_undersampling, undersampling_percentages, dt_classifier,
                   'Random Under Sampling on Decision Tree')

# Naive Bayes
nb_classifier = GaussianNB()
run_sampling_sweep(apply_smote, oversampling_percentages, nb_classifier,
                   'SMOTE Oversampling on Naive Bayes')
run_sampling_sweep(apply_random_undersampling, undersampling_percentages, nb_classifier,
                   'Random Under Sampling on Naive Bayes')

# Logistic Regression (kept in its own variable so nb_classifier stays a Naive Bayes model)
lr_classifier = LogisticRegression(random_state=42)
run_sampling_sweep(apply_smote, oversampling_percentages, lr_classifier,
                   'SMOTE Oversampling on Logistic Regression')
run_sampling_sweep(apply_random_undersampling, undersampling_percentages, lr_classifier,
                   'Random Under Sampling on Logistic Regression')



## Hybrid Sampling (SMOTE + Random Undersampler)

In [None]:
# DT with hybrid sampling: SMOTE the minority class, then randomly undersample the
# majority class, for every (oversampling, undersampling) ratio pair. The pair with
# the highest test recall is reported at the end.
oversampling_percentages = np.arange(0.1, 1.1, 0.1)
undersampling_percentages = np.arange(0.1, 0.9, 0.1)

# One dict per evaluated combination; results_df is built once after the sweep
# (DataFrame.append no longer exists in pandas 2.x and was quadratic anyway).
result_rows = []

best_combination = None
best_recall = 0
best_accuracy = 0
best_specificity = 0
best_f1 = 0

class_0 = int(len(y_train[y_train == 0]))
class_1 = int(len(y_train[y_train == 1]))

for oversample_percentage in oversampling_percentages:
    # Grow class 1 to the requested fraction of class 0; class 0 is left as is.
    smote = SMOTE(sampling_strategy={0: class_0, 1: round(class_0 * oversample_percentage)}, random_state=42)
    x_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    # Class-1 size after SMOTE; the undersampler must keep all of these rows.
    class_1_first = int(len(y_train_smote[y_train_smote == 1]))

    for undersample_percentage in undersampling_percentages:
        # NOTE(review): the class-0 target is derived from the ORIGINAL class-1 count
        # (class_1), not the post-SMOTE count (class_1_first) - confirm this is intended.
        undersampler = RandomUnderSampler(sampling_strategy={0: round(class_1 / undersample_percentage), 1: class_1_first}, random_state=42)
        x_train_hybrid, y_train_hybrid = undersampler.fit_resample(x_train_smote, y_train_smote)

        dt_classifier = DecisionTreeClassifier(random_state=42)
        dt_classifier.fit(x_train_hybrid, y_train_hybrid)

        y_pred = dt_classifier.predict(X_test)

        recall = recall_score(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)

        conf_matrix = confusion_matrix(y_test, y_pred)
        true_negative = conf_matrix[0, 0]
        false_positive = conf_matrix[0, 1]
        # Specificity = TN / (TN + FP), read from row 0 of the confusion matrix.
        specificity = true_negative / (true_negative + false_positive)

        f1 = f1_score(y_test, y_pred)

        # Strictly-greater comparison: on ties the earliest combination is kept.
        if recall > best_recall:
            best_recall = recall
            best_accuracy = accuracy
            best_specificity = specificity
            best_f1 = f1
            best_combination = (oversample_percentage, undersample_percentage)

        print(f'Oversampling Percentage: {oversample_percentage * 100}%')
        print(f'Undersampling Percentage: {undersample_percentage * 100}%')
        print(f'Recall: {recall:.4f}')
        print(f'Accuracy: {accuracy:.4f}')
        print(f'Specificity: {specificity:.4f}')
        print(f'F1-Score: {f1:.4f}')
        print('-' * 30)

        result_rows.append({
            'Oversampling': oversample_percentage,
            'Undersampling': undersample_percentage,
            'Accuracy': accuracy,
            'Recall': recall,
            'Specificity': specificity,
            'F1-Score': f1
        })

results_df = pd.DataFrame(
    result_rows,
    columns=['Oversampling', 'Undersampling', 'Accuracy', 'Recall', 'Specificity', 'F1-Score'],
)

# best_combination stays None when no run reached a recall above 0.
if best_combination is None:
    print('No sampling combination achieved a recall above 0.')
else:
    print(f'Best Combination: Oversample {best_combination[0] * 100}%, Undersample {best_combination[1] * 100}%')
    print(f'Best Recall: {best_recall:.4f}')
    print(f'Best Accuracy: {best_accuracy:.4f}')
    print(f'Best Specificity: {best_specificity:.4f}')
    print(f'Best F1: {best_f1:.4f}')

In [None]:
# NB hybrid sampling: SMOTE the minority class, then randomly undersample the
# majority class, for every (oversampling, undersampling) ratio pair. The pair with
# the highest test recall is reported at the end.
oversampling_percentages = np.arange(0.1, 1.1, 0.1)
undersampling_percentages = np.arange(0.1, 0.9, 0.1)

# One dict per evaluated combination; results_df is built once after the sweep
# (DataFrame.append no longer exists in pandas 2.x and was quadratic anyway).
result_rows = []

best_combination = None
best_recall = 0
best_accuracy = 0
best_specificity = 0
best_f1 = 0

class_0 = int(len(y_train[y_train == 0]))
class_1 = int(len(y_train[y_train == 1]))


for oversample_percentage in oversampling_percentages:
    # Grow class 1 to the requested fraction of class 0; class 0 is left as is.
    smote = SMOTE(sampling_strategy={0: class_0, 1: round(class_0 * oversample_percentage)}, random_state=42)
    x_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    # Class-1 size after SMOTE; the undersampler must keep all of these rows.
    class_1_first = int(len(y_train_smote[y_train_smote == 1]))

    for undersample_percentage in undersampling_percentages:
        # NOTE(review): the class-0 target is derived from the ORIGINAL class-1 count
        # (class_1), not the post-SMOTE count (class_1_first) - confirm this is intended.
        undersampler = RandomUnderSampler(sampling_strategy={0: round(class_1 / undersample_percentage), 1: class_1_first}, random_state=42)
        x_train_hybrid, y_train_hybrid = undersampler.fit_resample(x_train_smote, y_train_smote)
        nb_classifier = GaussianNB()
        nb_classifier.fit(x_train_hybrid, y_train_hybrid)
        y_pred = nb_classifier.predict(X_test)

        recall = recall_score(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)

        conf_matrix = confusion_matrix(y_test, y_pred)
        true_negative = conf_matrix[0, 0]
        false_positive = conf_matrix[0, 1]
        # Specificity = TN / (TN + FP), read from row 0 of the confusion matrix.
        specificity = true_negative / (true_negative + false_positive)

        f1 = f1_score(y_test, y_pred)

        # Strictly-greater comparison: on ties the earliest combination is kept.
        if recall > best_recall:
            best_recall = recall
            best_accuracy = accuracy
            best_specificity = specificity
            best_f1 = f1
            best_combination = (oversample_percentage, undersample_percentage)

        result_rows.append({
            'Oversampling': oversample_percentage,
            'Undersampling': undersample_percentage,
            'Accuracy': accuracy,
            'Recall': recall,
            'Specificity': specificity,
            'F1-Score': f1
        })

        print(f'Oversampling Percentage: {oversample_percentage * 100}%')
        print(f'Undersampling Percentage: {undersample_percentage * 100}%')
        print(f'Recall: {recall:.4f}')
        print(f'Accuracy: {accuracy:.4f}')
        print(f'Specificity: {specificity:.4f}')
        print(f'F1-Score: {f1:.4f}')
        print('-' * 30)

results_df = pd.DataFrame(
    result_rows,
    columns=['Oversampling', 'Undersampling', 'Accuracy', 'Recall', 'Specificity', 'F1-Score'],
)

# best_combination stays None when no run reached a recall above 0.
if best_combination is None:
    print('No sampling combination achieved a recall above 0.')
else:
    print(f'Best Combination: Oversample {best_combination[0] * 100}%, Undersample {best_combination[1] * 100}%')
    print(f'Best Recall: {best_recall:.4f}')
    print(f'Best Accuracy: {best_accuracy:.4f}')
    print(f'Best Specificity: {best_specificity:.4f}')
    print(f'Best F1: {best_f1:.4f}')


In [None]:
# LR hybrid sampling: SMOTE the minority class, then randomly undersample the
# majority class, for every (oversampling, undersampling) ratio pair. The pair with
# the highest test recall is reported at the end.
oversampling_percentages = np.arange(0.1, 1.1, 0.1)
undersampling_percentages = np.arange(0.1, 0.9, 0.1)

# One dict per evaluated combination; results_df is built once after the sweep
# (DataFrame.append no longer exists in pandas 2.x and was quadratic anyway).
result_rows = []

best_combination = None
best_recall = 0
best_accuracy = 0
best_specificity = 0
best_f1 = 0

class_0 = int(len(y_train[y_train == 0]))
class_1 = int(len(y_train[y_train == 1]))


for oversample_percentage in oversampling_percentages:
    # Grow class 1 to the requested fraction of class 0; class 0 is left as is.
    smote = SMOTE(sampling_strategy={0: class_0, 1: round(class_0 * oversample_percentage)}, random_state=42)
    x_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    # Visual separator between oversampling ratios in the printed log.
    print('*' * 50)

    # Class-1 size after SMOTE; the undersampler must keep all of these rows.
    class_1_first = int(len(y_train_smote[y_train_smote == 1]))

    for undersample_percentage in undersampling_percentages:
        # NOTE(review): the class-0 target is derived from the ORIGINAL class-1 count
        # (class_1), not the post-SMOTE count (class_1_first) - confirm this is intended.
        undersampler = RandomUnderSampler(sampling_strategy={0: round(class_1 / undersample_percentage), 1: class_1_first}, random_state=42)
        x_train_hybrid, y_train_hybrid = undersampler.fit_resample(x_train_smote, y_train_smote)

        lr_classifier = LogisticRegression(random_state=42)
        lr_classifier.fit(x_train_hybrid, y_train_hybrid)

        y_pred = lr_classifier.predict(X_test)

        recall = recall_score(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)

        conf_matrix = confusion_matrix(y_test, y_pred)
        true_negative = conf_matrix[0, 0]
        false_positive = conf_matrix[0, 1]
        # Specificity = TN / (TN + FP), read from row 0 of the confusion matrix.
        specificity = true_negative / (true_negative + false_positive)

        f1 = f1_score(y_test, y_pred)

        # Strictly-greater comparison: on ties the earliest combination is kept.
        if recall > best_recall:
            best_recall = recall
            best_accuracy = accuracy
            best_specificity = specificity
            best_f1 = f1
            best_combination = (oversample_percentage, undersample_percentage)

        print(f'Oversampling Percentage: {oversample_percentage * 100}%')
        print(f'Undersampling Percentage: {undersample_percentage * 100}%')
        print(f'Recall: {recall:.4f}')
        print(f'Accuracy: {accuracy:.4f}')
        print(f'Specificity: {specificity:.4f}')
        print(f'F1-Score: {f1:.4f}')
        print('-' * 30)

        result_rows.append({
            'Oversampling': oversample_percentage,
            'Undersampling': undersample_percentage,
            'Accuracy': accuracy,
            'Recall': recall,
            'Specificity': specificity,
            'F1-Score': f1
        })

results_df = pd.DataFrame(
    result_rows,
    columns=['Oversampling', 'Undersampling', 'Accuracy', 'Recall', 'Specificity', 'F1-Score'],
)

# best_combination stays None when no run reached a recall above 0.
if best_combination is None:
    print('No sampling combination achieved a recall above 0.')
else:
    print(f'Best Combination: Oversample {best_combination[0] * 100}%, Undersample {best_combination[1] * 100}%')
    print(f'Best Recall: {best_recall:.4f}')
    print(f'Best Accuracy: {best_accuracy:.4f}')
    print(f'Best Specificity: {best_specificity:.4f}')
    print(f'Best F1: {best_f1:.4f}')

## Ensemble Learning (DT)

In [None]:
# ENSEMBLE BOOSTING: XGBoost baseline on the original (non-resampled) training split.
# XGBoost builds its own boosted trees; XGBClassifier has no `base_classifier`
# option, so no scikit-learn tree is passed to it.
xgb_classifier = XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train)

y_pred = xgb_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

conf_matrix = confusion_matrix(y_test, y_pred)
true_negative = conf_matrix[0, 0]
false_positive = conf_matrix[0, 1]
# Specificity = TN / (TN + FP), read from row 0 of the confusion matrix.
specificity = true_negative / (true_negative + false_positive)

f1 = f1_score(y_test, y_pred)

print('#' * 30)
print(f'Accuracy: {accuracy:.4f}')
print(f'Recall: {recall:.4f}')
print(f'Specificity: {specificity:.4f}')
print(f'F1-Score: {f1:.4f}')
print('-' * 30)

In [None]:
# ENSEMBLE BOOSTING + SMOTE: refit XGBoost at every oversampling ratio and plot
# the test-set metrics against the ratio.
oversampling_percentages = np.arange(0.1, 1.1, 0.1)
print(oversampling_percentages)
accuracies = []
recalls = []
specificities = []
f1Scores = []

class_0 = int(len(y_train[y_train == 0]))

# One dict per ratio; results_df is built once after the sweep
# (DataFrame.append no longer exists in pandas 2.x).
result_rows = []

for oversample_percentage in oversampling_percentages:
    # Grow class 1 to the requested fraction of class 0; class 0 is left as is.
    smote = SMOTE(sampling_strategy={0: class_0, 1: round(class_0 * oversample_percentage)}, random_state=42)
    x_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    # XGBClassifier has no `base_classifier` option; it builds its own boosted trees.
    xgb_classifier = XGBClassifier(random_state=42)
    xgb_classifier.fit(x_train_smote, y_train_smote)

    y_pred = xgb_classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

    recall = recall_score(y_test, y_pred)
    recalls.append(recall)

    conf_matrix = confusion_matrix(y_test, y_pred)
    true_negative = conf_matrix[0, 0]
    false_positive = conf_matrix[0, 1]
    # Specificity = TN / (TN + FP), read from row 0 of the confusion matrix.
    specificity = true_negative / (true_negative + false_positive)
    specificities.append(specificity)

    f1 = f1_score(y_test, y_pred)
    f1Scores.append(f1)

    print('#' * 30)
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'Specificity: {specificity:.4f}')
    print(f'F1-Score: {f1:.4f}')
    print('-' * 30)

    result_rows.append({
        'Oversampling': oversample_percentage,
        'Accuracy': accuracy,
        'Recall': recall,
        'Specificity': specificity,
        'F1-Score': f1
    })

results_df = pd.DataFrame(
    result_rows,
    columns=['Oversampling', 'Accuracy', 'Recall', 'Specificity', 'F1-Score'],
)

plt.plot(oversampling_percentages, accuracies, marker='o', label='Accuracy')
plt.plot(oversampling_percentages, recalls, marker='o', label='Recall')
plt.plot(oversampling_percentages, specificities, marker='o', label='Specificity')
plt.plot(oversampling_percentages, f1Scores, marker='o', label='F1-Score')
plt.title('SMOTE on XGBoost with Decision Trees')
plt.xlabel('Oversampling')
plt.ylabel('Percentage')
plt.xticks(oversampling_percentages)
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# ENSEMBLE BOOSTING + random undersampling: refit XGBoost at every undersampling
# ratio and plot the test-set metrics against the ratio.
undersampling_percentages = np.arange(0.1, 0.9, 0.1)
print(undersampling_percentages)
accuracies = []
recalls = []
specificities = []
f1Scores = []

class_1 = int(len(y_train[y_train == 1]))

# One dict per ratio; results_df is built once after the sweep
# (DataFrame.append no longer exists in pandas 2.x).
result_rows = []

for undersample_percentage in undersampling_percentages:
    # Keep every class-1 row and cut class 0 down to class_1 / ratio rows.
    undersampler = RandomUnderSampler(sampling_strategy={0: round(class_1 / undersample_percentage), 1: class_1}, random_state=42)
    x_train_undersampled, y_train_undersampled = undersampler.fit_resample(X_train, y_train)

    # XGBClassifier has no `base_classifier` option; it builds its own boosted trees.
    xgb_classifier = XGBClassifier(random_state=42)
    xgb_classifier.fit(x_train_undersampled, y_train_undersampled)

    y_pred = xgb_classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

    recall = recall_score(y_test, y_pred)
    recalls.append(recall)

    conf_matrix = confusion_matrix(y_test, y_pred)
    true_negative = conf_matrix[0, 0]
    false_positive = conf_matrix[0, 1]
    # Specificity = TN / (TN + FP), read from row 0 of the confusion matrix.
    specificity = true_negative / (true_negative + false_positive)
    specificities.append(specificity)

    f1 = f1_score(y_test, y_pred)
    f1Scores.append(f1)

    print('#' * 30)
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'Specificity: {specificity:.4f}')
    print(f'F1-Score: {f1:.4f}')
    print('-' * 30)

    # Record the ratio this row was actually produced with (the loop variable of
    # THIS sweep), not a value left over from another cell.
    result_rows.append({
        'Undersampling': undersample_percentage,
        'Accuracy': accuracy,
        'Recall': recall,
        'Specificity': specificity,
        'F1-Score': f1
    })

results_df = pd.DataFrame(
    result_rows,
    columns=['Undersampling', 'Accuracy', 'Recall', 'Specificity', 'F1-Score'],
)

# Plotting the results
plt.plot(undersampling_percentages, accuracies, marker='o', label='Accuracy')
plt.plot(undersampling_percentages, recalls, marker='o', label='Recall')
plt.plot(undersampling_percentages, specificities, marker='o', label='Specificity')
plt.plot(undersampling_percentages, f1Scores, marker='o', label='F1-Score')
plt.title('Under Sampling on XGBoost with Decision Tree')
plt.xlabel('Undersampling')
plt.ylabel('Percentage')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# BOOSTING HYBRID: SMOTE the minority class, then randomly undersample the majority
# class, and refit XGBoost for every (oversampling, undersampling) ratio pair.
oversampling_percentages = np.arange(0.1, 1.1, 0.1)
undersampling_percentages = np.arange(0.1, 0.9, 0.1)

best_combination = None
best_recall = 0
best_accuracy = 0
best_specificity = 0
best_f1 = 0

# One dict per evaluated combination; results_df is built once after the sweep
# (DataFrame.append no longer exists in pandas 2.x and was quadratic anyway).
result_rows = []

class_0 = int(len(y_train[y_train == 0]))
class_1 = int(len(y_train[y_train == 1]))


for oversample_percentage in oversampling_percentages:
    # Grow class 1 to the requested fraction of class 0; class 0 is left as is.
    smote = SMOTE(sampling_strategy={0: class_0, 1: round(class_0 * oversample_percentage)}, random_state=42)
    x_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    # Class-1 size after SMOTE; the undersampler must keep all of these rows.
    class_1_first = int(len(y_train_smote[y_train_smote == 1]))

    for undersample_percentage in undersampling_percentages:
        # NOTE(review): the class-0 target is derived from the ORIGINAL class-1 count
        # (class_1), not the post-SMOTE count (class_1_first) - confirm this is intended.
        undersampler = RandomUnderSampler(sampling_strategy={0: round(class_1 / undersample_percentage), 1: class_1_first}, random_state=42)
        x_train_hybrid, y_train_hybrid = undersampler.fit_resample(x_train_smote, y_train_smote)

        # XGBClassifier has no `base_classifier` option; it builds its own boosted trees.
        xgb_classifier = XGBClassifier(random_state=42)
        xgb_classifier.fit(x_train_hybrid, y_train_hybrid)

        y_pred = xgb_classifier.predict(X_test)

        recall = recall_score(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)

        conf_matrix = confusion_matrix(y_test, y_pred)
        true_negative = conf_matrix[0, 0]
        false_positive = conf_matrix[0, 1]
        # Specificity = TN / (TN + FP), read from row 0 of the confusion matrix.
        specificity = true_negative / (true_negative + false_positive)
        f1 = f1_score(y_test, y_pred)

        # A combination replaces the current best only when it beats it on BOTH F1
        # and recall. NOTE(review): the other hybrid cells select on recall alone -
        # confirm the stricter rule is intended here.
        if f1 > best_f1 and recall > best_recall:
            best_recall = recall
            best_accuracy = accuracy
            best_specificity = specificity
            best_f1 = f1
            best_combination = (oversample_percentage, undersample_percentage)

        print(f'Oversampling Percentage: {oversample_percentage * 100}%')
        print(f'Undersampling Percentage: {undersample_percentage * 100}%')
        print(f'Recall: {recall:.4f}')
        print(f'Accuracy: {accuracy:.4f}')
        print(f'Specificity: {specificity:.4f}')
        print(f'F1-Score: {f1:.4f}')

        result_rows.append({
            'Oversampling': oversample_percentage,
            'Undersampling': undersample_percentage,
            'Accuracy': accuracy,
            'Recall': recall,
            'Specificity': specificity,
            'F1-Score': f1
        })

        print('-' * 30)

results_df = pd.DataFrame(
    result_rows,
    columns=['Oversampling', 'Undersampling', 'Accuracy', 'Recall', 'Specificity', 'F1-Score'],
)

# best_combination stays None when no run had both F1 and recall above 0.
if best_combination is None:
    print('No sampling combination achieved both an F1-score and a recall above 0.')
else:
    print(f'Best Combination: Oversample {best_combination[0] * 100}%, Undersample {best_combination[1] * 100}%')
    print(f'Best Recall: {best_recall:.4f}')
    print(f'Best Accuracy: {best_accuracy:.4f}')
    print(f'Best Specificity: {best_specificity:.4f}')
    print(f'Best F1: {best_f1:.4f}')

In [None]:
# ENSEMBLE BAGGING: baseline of 10 bagged decision trees on the original
# (non-resampled) training split.
base_classifier = DecisionTreeClassifier(random_state=42)
bagging_classifier = BaggingClassifier(base_classifier, n_estimators=10, random_state=42)

bagging_classifier.fit(X_train, y_train)

y_pred = bagging_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

conf_matrix = confusion_matrix(y_test, y_pred)
true_negative = conf_matrix[0, 0]
false_positive = conf_matrix[0, 1]
# Specificity = TN / (TN + FP), read from row 0 of the confusion matrix.
specificity = true_negative / (true_negative + false_positive)

f1 = f1_score(y_test, y_pred)

# Print the baseline results
print('#' * 30)
print(f'Accuracy: {accuracy:.4f}')
print(f'Recall: {recall:.4f}')
print(f'Specificity: {specificity:.4f}')
print(f'F1-Score: {f1:.4f}')
print('-' * 30)

In [None]:
# ENSEMBLE BAGGING + SMOTE: refit 10 bagged decision trees at every oversampling
# ratio and plot the test-set metrics against the ratio.
oversampling_percentages = np.arange(0.1, 1.1, 0.1)
print(oversampling_percentages)
accuracies = []
recalls = []
specificities = []
f1Scores = []

class_0 = int(len(y_train[y_train == 0]))

# One dict per ratio; results_df is built once after the sweep
# (DataFrame.append no longer exists in pandas 2.x).
result_rows = []

for oversample_percentage in oversampling_percentages:
    # Grow class 1 to the requested fraction of class 0; class 0 is left as is.
    smote = SMOTE(sampling_strategy={0: class_0, 1: round(class_0 * oversample_percentage)}, random_state=42)
    x_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    base_classifier = DecisionTreeClassifier(random_state=42)
    # Named for what it is: a bagging ensemble, not an XGBoost model.
    bagging_smote_classifier = BaggingClassifier(base_classifier, n_estimators=10, random_state=42)

    bagging_smote_classifier.fit(x_train_smote, y_train_smote)

    y_pred = bagging_smote_classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

    recall = recall_score(y_test, y_pred)
    recalls.append(recall)

    conf_matrix = confusion_matrix(y_test, y_pred)
    true_negative = conf_matrix[0, 0]
    false_positive = conf_matrix[0, 1]
    # Specificity = TN / (TN + FP), read from row 0 of the confusion matrix.
    specificity = true_negative / (true_negative + false_positive)
    specificities.append(specificity)

    f1 = f1_score(y_test, y_pred)
    f1Scores.append(f1)

    print('#' * 30)
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'Specificity: {specificity:.4f}')
    print(f'F1-Score: {f1:.4f}')
    print('-' * 30)

    result_rows.append({
        'Oversampling': oversample_percentage,
        'Accuracy': accuracy,
        'Recall': recall,
        'Specificity': specificity,
        'F1-Score': f1
    })

results_df = pd.DataFrame(
    result_rows,
    columns=['Oversampling', 'Accuracy', 'Recall', 'Specificity', 'F1-Score'],
)

plt.plot(oversampling_percentages, accuracies, marker='o', label='Accuracy')
plt.plot(oversampling_percentages, recalls, marker='o', label='Recall')
plt.plot(oversampling_percentages, specificities, marker='o', label='Specificity')
plt.plot(oversampling_percentages, f1Scores, marker='o', label='F1-Score')
plt.title('SMOTE on Bagging with Decision Trees')
plt.xlabel('Oversampling')
plt.ylabel('Percentage')
plt.xticks(oversampling_percentages)
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Random undersampling sweep for a bagged decision-tree ensemble.
# For each ratio r in `undersampling_percentages`, class 0 is reduced to
# round(class_1 / r) samples while class 1 keeps its original count. A fresh
# BaggingClassifier is fitted on the reduced training data and scored on
# X_test / y_test. Per-ratio metrics are kept both in parallel lists (used by
# the plot) and in `results_df` (one row per ratio).
undersampling_percentages = np.arange(0.1, 0.9, 0.1)
print(undersampling_percentages)
accuracies = []
recalls = []
specificities = []
f1Scores = []

class_0 = int(len(y_train[y_train == 0]))
class_1 = int(len(y_train[y_train == 1]))

# One dict per ratio. The frame is built once after the loop because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
result_records = []

for undersample_percentage in undersampling_percentages:
    # Absolute class-0 target count passed to the undersampler.
    majority_target = round(class_1 / undersample_percentage)

    print()
    print('#' * 30)
    print(f'Undersampling Percentage: {undersample_percentage * 100}%')
    print(f'Count of 0: {majority_target}')
    print(f'Count of 1: {class_1}')

    undersampler = RandomUnderSampler(sampling_strategy={0: majority_target, 1: class_1}, random_state=42)
    x_train_undersampled, y_train_undersampled = undersampler.fit_resample(X_train, y_train)

    # NOTE: despite its name, `xgb_classifier` is a bagging ensemble of decision
    # trees; the name is kept so that other cells referring to it keep working.
    base_classifier = DecisionTreeClassifier(random_state=42)
    xgb_classifier = BaggingClassifier(base_classifier, n_estimators=10, random_state=42)
    xgb_classifier.fit(x_train_undersampled, y_train_undersampled)

    y_pred = xgb_classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)  # computed once and reused below

    # Specificity = TN / (TN + FP), read from the first row of the confusion matrix.
    conf_matrix = confusion_matrix(y_test, y_pred)
    true_negative = conf_matrix[0, 0]
    false_positive = conf_matrix[0, 1]
    specificity = true_negative / (true_negative + false_positive)

    accuracies.append(accuracy)
    recalls.append(recall)
    specificities.append(specificity)
    f1Scores.append(f1)

    print('#' * 30)
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'Specificity: {specificity:.4f}')
    print(f'F1-Score: {f1:.4f}')
    print('-' * 30)

    # Record the ratio of THIS sweep. The previous version stored the stale
    # `oversample_percentage` left over from another cell under an
    # 'Oversampling' column, which mislabelled every row (or raised NameError
    # on a fresh kernel).
    result_records.append({
        'Undersampling': undersample_percentage,
        'Accuracy': accuracy,
        'Recall': recall,
        'Specificity': specificity,
        'F1-Score': f1,
    })

# Explicit columns keep the schema stable even if the sweep produced no rows.
results_df = pd.DataFrame(
    result_records,
    columns=['Undersampling', 'Accuracy', 'Recall', 'Specificity', 'F1-Score'],
)

# Plot every metric against the undersampling ratio.
fig, ax = plt.subplots()
ax.plot(undersampling_percentages, accuracies, marker='o', label='Accuracy')
ax.plot(undersampling_percentages, recalls, marker='o', label='Recall')
ax.plot(undersampling_percentages, specificities, marker='o', label='Specificity')
ax.plot(undersampling_percentages, f1Scores, marker='o', label='F1-Score')
ax.set_title('Under Sampling on Bagging with Decision Trees')
ax.set_xlabel('Undersampling')
ax.set_ylabel('Percentage')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Bagging HYBRID
# Grid search over (SMOTE oversampling ratio, random undersampling ratio) for a
# bagged decision-tree ensemble. For each oversampling ratio the training set is
# first SMOTE-resampled; each undersampling ratio is then applied on top of that
# result. Every combination is scored on X_test / y_test, logged, and stored in
# `results_df`; the combination with the highest recall is reported at the end.
oversampling_percentages = np.arange(0.1, 1.1, 0.1)
undersampling_percentages = np.arange(0.1, 0.9, 0.1)

# One dict per combination. The frame is built once after the loops because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
result_records = []

best_combination = None
best_recall = 0
best_accuracy = 0
best_specificity = 0
best_f1 = 0

class_0 = int(len(y_train[y_train == 0]))
class_1 = int(len(y_train[y_train == 1]))


for oversample_percentage in oversampling_percentages:
    # SMOTE depends only on the oversampling ratio, so it runs once per outer step.
    smote = SMOTE(sampling_strategy={0: class_0, 1: round(class_0 * oversample_percentage)}, random_state=42)
    x_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    # Class-1 count after SMOTE; kept unchanged by the undersampler below.
    class_1_first = int(len(y_train_smote[y_train_smote == 1]))

    for undersample_percentage in undersampling_percentages:
        # NOTE(review): the class-0 target is derived from the ORIGINAL class-1
        # count (`class_1`), not the post-SMOTE count (`class_1_first`).
        # Behaviour is preserved here; confirm this is the intended ratio.
        majority_target = round(class_1 / undersample_percentage)
        undersampler = RandomUnderSampler(sampling_strategy={0: majority_target, 1: class_1_first}, random_state=42)
        x_train_hybrid, y_train_hybrid = undersampler.fit_resample(x_train_smote, y_train_smote)

        # NOTE: despite its name, `xgb_classifier` is a bagging ensemble of
        # decision trees; the name is kept so other cells keep working.
        base_classifier = DecisionTreeClassifier(random_state=42)
        xgb_classifier = BaggingClassifier(base_classifier, n_estimators=10, random_state=42)
        xgb_classifier.fit(x_train_hybrid, y_train_hybrid)

        y_pred = xgb_classifier.predict(X_test)

        recall = recall_score(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # Specificity = TN / (TN + FP), read from the first row of the confusion matrix.
        conf_matrix = confusion_matrix(y_test, y_pred)
        true_negative = conf_matrix[0, 0]
        false_positive = conf_matrix[0, 1]
        specificity = true_negative / (true_negative + false_positive)

        # Selection criterion is recall only; the other metrics are recorded
        # for the winning combination. Ties keep the first combination seen.
        if recall > best_recall:
            best_recall = recall
            best_accuracy = accuracy
            best_specificity = specificity
            best_f1 = f1
            best_combination = (oversample_percentage, undersample_percentage)

        print(f'Oversampling Percentage: {oversample_percentage * 100}%')
        print(f'Undersampling Percentage: {undersample_percentage * 100}%')
        print(f'Recall: {recall:.4f}')
        print(f'Accuracy: {accuracy:.4f}')
        print(f'Specificity: {specificity:.4f}')
        print(f'F1-Score: {f1:.4f}')
        print('-' * 30)

        result_records.append({
            'Oversampling': oversample_percentage,
            'Undersampling': undersample_percentage,
            'Accuracy': accuracy,
            'Recall': recall,
            'Specificity': specificity,
            'F1-Score': f1,
        })

# Explicit columns keep the schema stable even if the grid produced no rows.
results_df = pd.DataFrame(
    result_records,
    columns=['Oversampling', 'Undersampling', 'Accuracy', 'Recall', 'Specificity', 'F1-Score'],
)

# `best_combination` stays None when no run reached a recall above 0; guard so
# the summary does not fail with a TypeError when indexing None.
if best_combination is None:
    print('No combination achieved a recall above 0.')
else:
    print(f'Best Combination: Oversample {best_combination[0] * 100}%, Undersample {best_combination[1] * 100}%')
    print(f'Best Recall: {best_recall:.4f}')
    print(f'Best Accuracy: {best_accuracy:.4f}')
    print(f'Best Specificity: {best_specificity:.4f}')
    print(f'Best F1: {best_f1:.4f}')