In [17]:
# Standard libraries
import sys  # System-specific parameters and functions
import os   # Miscellaneous operating system interfaces
import warnings  # Warning control
warnings.filterwarnings("ignore")

# Data manipulation
import pandas as pd  # Data manipulation and analysis
import numpy as np  # Numerical operations

# Visualization
import matplotlib.pyplot as plt  # Plotting library
import seaborn as sns  # Statistical data visualization
from matplotlib.colors import ListedColormap  # Colormap utilities

# Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler  # Preprocessing tools
from sklearn import model_selection, metrics, preprocessing  # Model selection, evaluation, and preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV  # Model selection and evaluation
from sklearn.tree import DecisionTreeClassifier  # Decision tree classifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier  # Ensemble methods
from xgboost import XGBClassifier  # Extreme Gradient Boosting
from sklearn.neighbors import KNeighborsClassifier  # k-Nearest Neighbors
from sklearn.neural_network import MLPClassifier  # Multi-layer Perceptron

# Statistical analysis
from statsmodels.stats.outliers_influence import variance_inflation_factor  # Variance inflation factor
from scipy.stats import pointbiserialr, chi2_contingency, spearmanr, entropy  # Statistical functions
from statsmodels.graphics.gofplots import qqplot  # Q-Q plot
from collections import Counter  # Container datatypes

# Tabulate
from tabulate import tabulate  # Pretty-print tabular data

# Set visualization style
#sns.set()  # Set Seaborn default style
#plt.style.use('ggplot')  # Set ggplot style for matplotlib

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

# Balancing techniques
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

In [16]:

from google.colab import files


uploaded = files.upload()

Saving encoded_df (2).csv to encoded_df (2) (1).csv


In [19]:
#import encoded dataset
import io

encoded_df = pd.read_csv(io.BytesIO(uploaded['encoded_df (2) (1).csv']))
encoded_df.head()

Unnamed: 0,age_of_driver,gender,marital_status,safty_rating,annual_income,high_education_ind,address_change_ind,living_status,zip_code,claim_day_of_week,...,claim_est_payout,age_of_vehicle,vehicle_category,vehicle_price,vehicle_color,vehicle_weight,fraud,Month,Day,Year
0,46,1,1.0,85,38301,1,1,1,80006,4,...,7530.940993,9.0,0,12885.45235,6,16161.33381,0,12,16,2016
1,21,0,0.0,75,30445,0,1,1,15021,3,...,2966.024895,4.0,2,29429.45218,6,28691.96422,0,2,12,2015
2,49,0,0.0,87,38923,0,1,0,20158,1,...,6283.888333,3.0,0,21701.18195,6,22090.94758,1,12,6,2016
3,58,0,1.0,58,40605,1,0,0,15024,3,...,6169.747994,4.0,1,13198.27344,3,38329.58106,1,5,5,2016
4,38,1,1.0,95,36380,1,0,1,50034,1,...,4541.38715,7.0,1,38060.21122,2,25876.56319,0,10,27,2015


In [20]:
X=encoded_df.drop(columns=['fraud'])
y=encoded_df['fraud']

#20% allocated for test data and 80% for train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=28)

In [21]:
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [22]:
continuous_vars = encoded_df.select_dtypes(include=['int64', 'float64']).columns
Continuous_Variables_in_X_train = continuous_vars

categorical_vars = encoded_df.select_dtypes(include=['object', 'category']).columns
Categorical_Variables_in_X_train = [var for var in categorical_vars if var != 'fraud']

Date_Month_Year= encoded_df.select_dtypes(include=['int32']).columns

# Display the variables
print("Continuous Variables in X_train:")
print(Continuous_Variables_in_X_train)
print("\nCategorical Variables in X train:")
print(Categorical_Variables_in_X_train)
print("\nDate_Month_Year:")
print(Date_Month_Year)

Continuous Variables in X_train:
Index(['age_of_driver', 'gender', 'marital_status', 'safty_rating',
       'annual_income', 'high_education_ind', 'address_change_ind',
       'living_status', 'zip_code', 'claim_day_of_week', 'accident_site',
       'past_num_of_claims', 'witness_present_ind', 'liab_prct', 'channel',
       'policy_report_filed_ind', 'claim_est_payout', 'age_of_vehicle',
       'vehicle_category', 'vehicle_price', 'vehicle_color', 'vehicle_weight',
       'fraud', 'Month', 'Day', 'Year'],
      dtype='object')

Categorical Variables in X train:
[]

Date_Month_Year:
Index([], dtype='object')


#### checking imbalance of response

In [23]:
print ('Total not committed fraud :  {} and its percentage is {} %'.format(encoded_df.fraud.value_counts()[0], round(encoded_df.fraud.value_counts()[0]/encoded_df.fraud.value_counts().sum()*100,2)) )
print ('Total committed fraud :  {} and its percentage is {} %'.format(encoded_df.fraud.value_counts()[1], round(encoded_df.fraud.value_counts()[1]/encoded_df.fraud.value_counts().sum()*100,2)) )

Total not committed fraud :  15040 and its percentage is 84.32 %
Total committed fraud :  2796 and its percentage is 15.68 %


## --------------------------------------------------------------------------------------------------------

In [24]:
df_backup = encoded_df.copy()
df_backup.head()

Unnamed: 0,age_of_driver,gender,marital_status,safty_rating,annual_income,high_education_ind,address_change_ind,living_status,zip_code,claim_day_of_week,...,claim_est_payout,age_of_vehicle,vehicle_category,vehicle_price,vehicle_color,vehicle_weight,fraud,Month,Day,Year
0,46,1,1.0,85,38301,1,1,1,80006,4,...,7530.940993,9.0,0,12885.45235,6,16161.33381,0,12,16,2016
1,21,0,0.0,75,30445,0,1,1,15021,3,...,2966.024895,4.0,2,29429.45218,6,28691.96422,0,2,12,2015
2,49,0,0.0,87,38923,0,1,0,20158,1,...,6283.888333,3.0,0,21701.18195,6,22090.94758,1,12,6,2016
3,58,0,1.0,58,40605,1,0,0,15024,3,...,6169.747994,4.0,1,13198.27344,3,38329.58106,1,5,5,2016
4,38,1,1.0,95,36380,1,0,1,50034,1,...,4541.38715,7.0,1,38060.21122,2,25876.56319,0,10,27,2015


## Upsampled

In [25]:
X_backup = df_backup.drop(columns=['fraud'])
y_backup = df_backup['fraud']

#20% allocated for test data and 80% for train data
X_train_backup, X_test_backup, y_train_backup, y_test_backup = train_test_split(X_backup, y_backup, test_size=0.2, random_state=28)

In [26]:
#separate majority and minority classes
majority_class = X_train_backup[y_train_backup == 0]
minority_class = X_train_backup[y_train_backup == 1]

print("Size of majority class before upsampling:", majority_class.shape[0])
print("Size of minority class before upsampling:", minority_class.shape[0])

Size of majority class before upsampling: 12036
Size of minority class before upsampling: 2232


In [27]:
#upsample minority class
minority_upsampled = resample(minority_class,
                              replace=True,
                              n_samples=len(majority_class),
                              random_state=28)

X_upsampled = np.vstack([majority_class, minority_upsampled])
y_upsampled = np.concatenate([np.zeros(len(majority_class)), np.ones(len(majority_class))])

#shuffle data
shuffle_indices = np.arange(len(X_upsampled))
np.random.shuffle(shuffle_indices)
X_upsampled = X_upsampled[shuffle_indices]
y_upsampled = y_upsampled[shuffle_indices]

unique_classes, class_counts = np.unique(y_upsampled, return_counts=True)
print("Class counts after upsampling:")
for cls, count in zip(unique_classes, class_counts):
  print(f"Class {int(cls)}: {count}")



Class counts after upsampling:
Class 0: 12036
Class 1: 12036


### Logistic Regression

In [None]:
lr = LogisticRegression()
result = lr.fit(X_upsampled,y_upsampled)
y_pred = lr.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  57.57 %
Misclassification rate of this model:  42.43 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.872     0.581     0.698      3004
           1      0.197     0.546     0.289       564

    accuracy                          0.576      3568
   macro avg      0.534     0.564     0.493      3568
weighted avg      0.765     0.576     0.633      3568

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                 1746 |                 1258 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  256 |                  308 |
+-----------------+----------------------+----------------------+


### Decision Trees

In [None]:
dt = DecisionTreeClassifier(random_state=28)
param_grid = {
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'log2',None]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_upsampled, y_upsampled)

# Print the best hyperparameters
print(grid_search.best_params_)

# Print the best score
print(grid_search.best_score_)

{'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2}
0.9103938452272944


In [None]:
dtree = DecisionTreeClassifier(max_depth = None,  min_samples_split = 2, min_samples_leaf = 1, max_features = 'log2', random_state = 28)
dtree.fit(X_upsampled,y_upsampled)
y_pred = dtree.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  75.28 %
Misclassification rate of this model:  24.72 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.851     0.857     0.854      3004
           1      0.208     0.200     0.204       564

    accuracy                          0.753      3568
   macro avg      0.529     0.528     0.529      3568
weighted avg      0.749     0.753     0.751      3568

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                 2573 |                  431 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  451 |                  113 |
+-----------------+----------------------+----------------------+


### Random Forest

In [None]:
np.random.seed(28)

rf = RandomForestClassifier(random_state=28)
param_grid = {
    'n_estimators': [50, 80, 100, 120, 150],
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'log2',None]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_upsampled, y_upsampled)

# Print the best hyperparameters
print(grid_search.best_params_)

# Print the best score
print(grid_search.best_score_)

In [None]:
rf = RandomForestClassifier(n_estimators=50, max_depth=None, max_features='sqrt', min_samples_leaf=1, min_samples_split=3, random_state=28)
rf.fit(X_upsampled,y_upsampled)
y_pred = rf.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

### Ada boosting

In [None]:
np.random.seed(28)

ada_classifier = AdaBoostClassifier(random_state=28)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
        'algorithm': ['SAMME', 'SAMME.R']
}

grid_search = GridSearchCV(estimator=ada_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

grid_search.fit(X_upsampled, y_upsampled)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 150}
Best Score: 0.6790048840759967


In [None]:
ada_classifier = AdaBoostClassifier(n_estimators=150, learning_rate=1.0, algorithm='SAMME.R', random_state=28)

ada_classifier.fit(X_upsampled, y_upsampled)

y_pred_ada = ada_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred_ada) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_ada) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_ada, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred_ada)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  67.26 %
Misclassification rate of this model:  32.74 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.909     0.679     0.777      3004
           1      0.272     0.640     0.382       564

    accuracy                          0.673      3568
   macro avg      0.591     0.659     0.580      3568
weighted avg      0.809     0.673     0.715      3568

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                 2039 |                  965 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  203 |                  361 |
+-----------------+----------------------+----------------------+


### Gradient Boosting

In [29]:
np.random.seed(28)

gb_classifier = GradientBoostingClassifier(random_state=28)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150, 200,None],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, None],
}

grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_upsampled, y_upsampled)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': None, 'n_estimators': 200}
Best Score: 0.91326031163002


In [30]:
gb_classifier = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth = None, random_state=28)
gb_classifier.fit(X_upsampled, y_upsampled)
y_pred = gb_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  76.26 %
Misclassification rate of this model:  23.74 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.851     0.871     0.861      3004
           1      0.213     0.186     0.199       564

    accuracy                          0.763      3568
   macro avg      0.532     0.529     0.530      3568
weighted avg      0.750     0.763     0.756      3568

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                 2616 |                  388 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  459 |                  105 |
+-----------------+----------------------+----------------------+


### Multilayer Perceptron

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_upsampled)
X_test_scaled = scaler.transform(X_test)


In [None]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam',alpha=0.0001, max_iter=1000, random_state=42)
mlp.fit(X_train_scaled, y_upsampled)
y_pred = mlp.predict(X_test_scaled)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  71.75 %
Misclassification rate of this model:  28.25 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.863     0.790     0.825      3004
           1      0.228     0.330     0.270       564

    accuracy                          0.717      3568
   macro avg      0.545     0.560     0.547      3568
weighted avg      0.762     0.717     0.737      3568

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                 2374 |                  630 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  378 |                  186 |
+-----------------+----------------------+----------------------+


### k Nearest Neighbours

In [None]:


np.random.seed(28)

# Initialize KNN classifier
knn_classifier = KNeighborsClassifier()

# Define parameter grid for KNN
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=knn_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model
grid_search.fit(X_upsampled, y_upsampled)


# Display best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Best Hyperparameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
Best Score: 0.8431792094794475


In [None]:
# Use the best estimator from GridSearchCV to predict on test data
knn_classifier = grid_search.best_estimator_
knn_classifier.fit(X_upsampled, y_upsampled)

y_pred_knn = knn_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_knn) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_knn) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_knn, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_knn)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  65.19 %
Misclassification rate of this model:  34.81 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.844     0.720     0.777      3004
           1      0.162     0.289     0.208       564

    accuracy                          0.652      3568
   macro avg      0.503     0.505     0.492      3568
weighted avg      0.736     0.652     0.687      3568

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                 2163 |                  841 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  401 |                  163 |
+-----------------+----------------------+----------------------+


### Support Vector Machines

In [14]:
np.random.seed(28)

g = [0.0001, 0.001, 0.01, 0.1]
hist = []
for val in g:
  clf = SVC(gamma=val)
  cross_val = cross_val_score(clf, X_upsampled, y_upsampled, cv=5)
  hist.append(np.mean(cross_val))
print(hist)


[0.9948904221462065, 0.9948904221462065, 0.9948904221462065, 0.9948904221462065]


In [28]:
# Use the best estimator from GridSearchCV to predict on test data
svm_classifier = SVC(gamma=0.001)
svm_classifier.fit(X_upsampled, y_upsampled)

y_pred_svm = svm_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_svm) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_svm) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_svm, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_svm)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  84.19 %
Misclassification rate of this model:  15.81 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.842     1.000     0.914      3004
           1      0.000     0.000     0.000       564

    accuracy                          0.842      3568
   macro avg      0.421     0.500     0.457      3568
weighted avg      0.709     0.842     0.770      3568

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                 3004 |                    0 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  564 |                    0 |
+-----------------+----------------------+----------------------+


### Naive Bias

In [None]:
np.random.seed(28)

# Initialize Naive Bayes classifier
nb_classifier = GaussianNB()

# Define parameter grid for Naive Bayes (even though GaussianNB has few hyperparameters)
param_grid = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=nb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model
grid_search.fit(X_upsampled, y_upsampled)

# Display best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'var_smoothing': 9.999999999999999e-10}
Best Score: 0.6230059350086996


In [None]:
# Use the best estimator from GridSearchCV to predict on test data
nb_classifier = grid_search.best_estimator_
nb_classifier.fit(X_upsampled, y_upsampled)

y_pred_nb = nb_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_nb) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_nb) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_nb, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_nb)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  65.5 %
Misclassification rate of this model:  34.5 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.901     0.663     0.764      3004
           1      0.254     0.610     0.359       564

    accuracy                          0.655      3568
   macro avg      0.577     0.637     0.561      3568
weighted avg      0.798     0.655     0.700      3568

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                 1993 |                 1011 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  220 |                  344 |
+-----------------+----------------------+----------------------+
