In [1]:
# Standard libraries
import sys  # System-specific parameters and functions
import os   # Miscellaneous operating system interfaces
import warnings  # Warning control
warnings.filterwarnings("ignore")

# Data manipulation
import pandas as pd  # Data manipulation and analysis
import numpy as np  # Numerical operations

# Visualization
import matplotlib.pyplot as plt  # Plotting library
import seaborn as sns  # Statistical data visualization
from matplotlib.colors import ListedColormap  # Colormap utilities

# Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler  # Preprocessing tools
from sklearn import model_selection, metrics, preprocessing  # Model selection, evaluation, and preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV  # Model selection and evaluation
from sklearn.tree import DecisionTreeClassifier  # Decision tree classifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier  # Ensemble methods
from xgboost import XGBClassifier  # Extreme Gradient Boosting
from sklearn.neighbors import KNeighborsClassifier  # k-Nearest Neighbors
from sklearn.neural_network import MLPClassifier  # Multi-layer Perceptron

# Statistical analysis
from statsmodels.stats.outliers_influence import variance_inflation_factor  # Variance inflation factor
from scipy.stats import pointbiserialr, chi2_contingency, spearmanr, entropy  # Statistical functions
from statsmodels.graphics.gofplots import qqplot  # Q-Q plot
from collections import Counter  # Container datatypes

# Tabulate
from tabulate import tabulate  # Pretty-print tabular data

# Set visualization style
#sns.set()  # Set Seaborn default style
#plt.style.use('ggplot')  # Set ggplot style for matplotlib

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

# Balancing techniques
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

In [2]:
#import encoded dataset
import io

df = pd.read_csv('cleaned_dataset.csv')
df.head()

Unnamed: 0,Survey_id,Ville_id,sex,Age,Married,Number_children,education_level,total_members,gained_asset,durable_asset,...,incoming_salary,incoming_own_farm,incoming_business,incoming_no_business,incoming_agricultural,farm_expenses,labor_primary,lasting_investment,no_lasting_investmen,depressed
0,926,91,1,28,1,4,10,5,28912201,22861940,...,0,0,0,0,30028818,31363432,0,28411718,28292707.0,0
1,747,57,1,23,1,3,8,5,28912201,22861940,...,0,0,0,0,30028818,31363432,0,28411718,28292707.0,1
2,1190,115,1,22,1,3,9,5,28912201,22861940,...,0,0,0,0,30028818,31363432,0,28411718,28292707.0,0
3,1065,97,1,27,1,2,10,4,52667108,19698904,...,0,1,0,1,22288055,18751329,0,7781123,69219765.0,0
4,806,42,0,59,0,4,10,6,82606287,17352654,...,1,0,0,0,53384566,20731006,1,20100562,43419447.0,0


In [3]:
X=df.drop(columns=['depressed'])
y=df['depressed']

#20% allocated for test data and 80% for train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=28)

In [4]:
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [5]:
continuous_vars = df.select_dtypes(include=['int64', 'float64']).columns
Continuous_Variables_in_X_train = continuous_vars

categorical_vars = df.select_dtypes(include=['object', 'category']).columns
Categorical_Variables_in_X_train = [var for var in categorical_vars if var != 'depressed']

Date_Month_Year= df.select_dtypes(include=['int32']).columns

# Display the variables
print("Continuous Variables in X_train:")
print(Continuous_Variables_in_X_train)
print("\nCategorical Variables in X train:")
print(Categorical_Variables_in_X_train)
print("\nDate_Month_Year:")
print(Date_Month_Year)

Continuous Variables in X_train:
Index(['Survey_id', 'Ville_id', 'sex', 'Age', 'Married', 'Number_children',
       'education_level', 'total_members', 'gained_asset', 'durable_asset',
       'save_asset', 'living_expenses', 'other_expenses', 'incoming_salary',
       'incoming_own_farm', 'incoming_business', 'incoming_no_business',
       'incoming_agricultural', 'farm_expenses', 'labor_primary',
       'lasting_investment', 'no_lasting_investmen', 'depressed'],
      dtype='object')

Categorical Variables in X train:
[]

Date_Month_Year:
Index([], dtype='object')


#### checking imbalance of response

In [None]:
print ('Total not committed fraud :  {} and its percentage is {} %'.format(df.depressed.value_counts()[0], round(df.depressed.value_counts()[0]/df.depressed.value_counts().sum()*100,2)) )
print ('Total committed fraud :  {} and its percentage is {} %'.format(df.depressed.value_counts()[1], round(df.depressed.value_counts()[1]/df.depressed.value_counts().sum()*100,2)) )

Total not committed fraud :  1174 and its percentage is 83.32 %
Total committed fraud :  235 and its percentage is 16.68 %


## --------------------------------------------------------------------------------------------------------

In [6]:
df_backup = df.copy()
df_backup.head()

Unnamed: 0,Survey_id,Ville_id,sex,Age,Married,Number_children,education_level,total_members,gained_asset,durable_asset,...,incoming_salary,incoming_own_farm,incoming_business,incoming_no_business,incoming_agricultural,farm_expenses,labor_primary,lasting_investment,no_lasting_investmen,depressed
0,926,91,1,28,1,4,10,5,28912201,22861940,...,0,0,0,0,30028818,31363432,0,28411718,28292707.0,0
1,747,57,1,23,1,3,8,5,28912201,22861940,...,0,0,0,0,30028818,31363432,0,28411718,28292707.0,1
2,1190,115,1,22,1,3,9,5,28912201,22861940,...,0,0,0,0,30028818,31363432,0,28411718,28292707.0,0
3,1065,97,1,27,1,2,10,4,52667108,19698904,...,0,1,0,1,22288055,18751329,0,7781123,69219765.0,0
4,806,42,0,59,0,4,10,6,82606287,17352654,...,1,0,0,0,53384566,20731006,1,20100562,43419447.0,0


## Upsampled

In [7]:
X_backup = df_backup.drop(columns=['depressed'])
y_backup = df_backup['depressed']

#20% allocated for test data and 80% for train data
X_train_backup, X_test_backup, y_train_backup, y_test_backup = train_test_split(X_backup, y_backup, test_size=0.2, random_state=28)

In [8]:
#separate majority and minority classes
majority_class = X_train_backup[y_train_backup == 0]
minority_class = X_train_backup[y_train_backup == 1]

print("Size of majority class before upsampling:", majority_class.shape[0])
print("Size of minority class before upsampling:", minority_class.shape[0])

Size of majority class before upsampling: 938
Size of minority class before upsampling: 189


In [9]:
#upsample minority class
minority_upsampled = resample(minority_class,
                              replace=True,
                              n_samples=len(majority_class),
                              random_state=28)

X_upsampled = np.vstack([majority_class, minority_upsampled])
y_upsampled = np.concatenate([np.zeros(len(majority_class)), np.ones(len(majority_class))])

#shuffle data
shuffle_indices = np.arange(len(X_upsampled))
np.random.shuffle(shuffle_indices)
X_upsampled = X_upsampled[shuffle_indices]
y_upsampled = y_upsampled[shuffle_indices]

unique_classes, class_counts = np.unique(y_upsampled, return_counts=True)
print("Class counts after upsampling:")
for cls, count in zip(unique_classes, class_counts):
  print(f"Class {int(cls)}: {count}")

Class counts after upsampling:
Class 0: 938
Class 1: 938


### Logistic Regression

In [None]:
lr = LogisticRegression()
result = lr.fit(X_upsampled,y_upsampled)
y_pred = lr.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  52.13 %
Misclassification rate of this model:  47.87 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.822     0.547     0.656       236
           1      0.144     0.391     0.211        46

    accuracy                          0.521       282
   macro avg      0.483     0.469     0.434       282
weighted avg      0.711     0.521     0.584       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  129 |                  107 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   28 |                   18 |
+-----------------+----------------------+----------------------+


### Decision Trees

In [None]:
dt = DecisionTreeClassifier(random_state=28)
param_grid = {
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'log2', None]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_upsampled, y_upsampled)

# Print the best hyperparameters
print(grid_search.best_params_)

# Print the best score
print(grid_search.best_score_)

{'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3}
0.8944666666666666


In [None]:
dtree = DecisionTreeClassifier(max_depth = None,  min_samples_split = 3, min_samples_leaf = 1, max_features = None, random_state = 28)
dtree.fit(X_upsampled,y_upsampled)
y_pred = dtree.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  72.7 %
Misclassification rate of this model:  27.3 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.824     0.856     0.840       236
           1      0.081     0.065     0.072        46

    accuracy                          0.727       282
   macro avg      0.453     0.461     0.456       282
weighted avg      0.703     0.727     0.715       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  202 |                   34 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   43 |                    3 |
+-----------------+----------------------+----------------------+


### Random Forest

In [None]:
np.random.seed(28)

rf = RandomForestClassifier(random_state=28)
param_grid = {
    'n_estimators': [50, 80, 100, 120, 150],
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'log2',None]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_upsampled, y_upsampled)

# Print the best hyperparameters
print(grid_search.best_params_)

# Print the best score
print(grid_search.best_score_)

{'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
0.9722780141843972


In [None]:
rf = RandomForestClassifier(n_estimators=150, max_depth=None, max_features='log2', min_samples_leaf=1, min_samples_split=2, random_state=28)
rf.fit(X_upsampled,y_upsampled)
y_pred = rf.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  82.27 %
Misclassification rate of this model:  17.73 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.837     0.979     0.902       236
           1      0.167     0.022     0.038        46

    accuracy                          0.823       282
   macro avg      0.502     0.500     0.470       282
weighted avg      0.728     0.823     0.761       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  231 |                    5 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   45 |                    1 |
+-----------------+----------------------+----------------------+


### Ada boosting

In [None]:
np.random.seed(28)

ada_classifier = AdaBoostClassifier(random_state=28)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
        'algorithm': ['SAMME', 'SAMME.R']
}

grid_search = GridSearchCV(estimator=ada_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

grid_search.fit(X_upsampled, y_upsampled)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 150}
Best Score: 0.7276212765957446


In [None]:
ada_classifier = AdaBoostClassifier(n_estimators=150, learning_rate=1.0, algorithm='SAMME.R', random_state=28)

ada_classifier.fit(X_upsampled, y_upsampled)

y_pred_ada = ada_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred_ada) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_ada) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_ada, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred_ada)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  64.18 %
Misclassification rate of this model:  35.82 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.833     0.716     0.770       236
           1      0.152     0.261     0.192        46

    accuracy                          0.642       282
   macro avg      0.492     0.488     0.481       282
weighted avg      0.721     0.642     0.676       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  169 |                   67 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   34 |                   12 |
+-----------------+----------------------+----------------------+


### Gradient Boosting

In [None]:
np.random.seed(28)

gb_classifier = GradientBoostingClassifier(random_state=28)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150, 200,None],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, None],
}

grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_upsampled, y_upsampled)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 150}
Best Score: 0.9525631205673758


In [None]:
gb_classifier = GradientBoostingClassifier(n_estimators=150, learning_rate=0.2, max_depth = 5, random_state=28)
gb_classifier.fit(X_upsampled, y_upsampled)
y_pred = gb_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  79.08 %
Misclassification rate of this model:  20.92 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.837     0.932     0.882       236
           1      0.158     0.065     0.092        46

    accuracy                          0.791       282
   macro avg      0.497     0.499     0.487       282
weighted avg      0.726     0.791     0.753       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  220 |                   16 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   43 |                    3 |
+-----------------+----------------------+----------------------+


### Multilayer Perceptron

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_upsampled)
X_test_scaled = scaler.transform(X_test)


In [None]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam',alpha=0.0001, max_iter=1000, random_state=42)
mlp.fit(X_train_scaled, y_upsampled)
y_pred = mlp.predict(X_test_scaled)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  74.47 %
Misclassification rate of this model:  25.53 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.857     0.835     0.845       236
           1      0.250     0.283     0.265        46

    accuracy                          0.745       282
   macro avg      0.553     0.559     0.555       282
weighted avg      0.758     0.745     0.751       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  197 |                   39 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   33 |                   13 |
+-----------------+----------------------+----------------------+


### k Nearest Neighbours

In [None]:
np.random.seed(28)

# Initialize KNN classifier
knn_classifier = KNeighborsClassifier()

# Define parameter grid for KNN
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=knn_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model
grid_search.fit(X_upsampled, y_upsampled)


# Display best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
Best Score: 0.8288865248226951


In [None]:
# Use the best estimator from GridSearchCV to predict on test data
knn_classifier = grid_search.best_estimator_
knn_classifier.fit(X_upsampled, y_upsampled)

y_pred_knn = knn_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_knn) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_knn) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_knn, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_knn)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  60.64 %
Misclassification rate of this model:  39.36 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.821     0.678     0.742       236
           1      0.126     0.239     0.165        46

    accuracy                          0.606       282
   macro avg      0.473     0.459     0.454       282
weighted avg      0.707     0.606     0.648       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  160 |                   76 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   35 |                   11 |
+-----------------+----------------------+----------------------+


### Support Vector Machines

In [None]:
np.random.seed(28)

g = [0.0001, 0.001, 0.01, 0.1]
hist = []
for val in g:
  clf = SVC(gamma=val)
  cross_val = cross_val_score(clf, X_upsampled, y_upsampled, cv=5)
  hist.append(np.mean(cross_val))
print(hist)

[0.8661999999999999, 0.934954609929078, 0.9834695035460992, 0.993068085106383]


In [None]:
# Use the best estimator from GridSearchCV to predict on test data
svm_classifier = SVC(gamma=0.1)
svm_classifier.fit(X_upsampled, y_upsampled)

y_pred_svm = svm_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_svm) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_svm) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_svm, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_svm)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  83.69 %
Misclassification rate of this model:  16.31 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.837     1.000     0.911       236
           1      0.000     0.000     0.000        46

    accuracy                          0.837       282
   macro avg      0.418     0.500     0.456       282
weighted avg      0.700     0.837     0.763       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  236 |                    0 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   46 |                    0 |
+-----------------+----------------------+----------------------+


### Naive Bias

In [None]:
np.random.seed(28)

# Initialize Naive Bayes classifier
nb_classifier = GaussianNB()

# Define parameter grid for Naive Bayes (even though GaussianNB has few hyperparameters)
param_grid = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=nb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model
grid_search.fit(X_upsampled, y_upsampled)

# Display best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'var_smoothing': 0.43287612810830584}
Best Score: 0.5463929078014185


In [None]:
# Use the best estimator from GridSearchCV to predict on test data
nb_classifier = grid_search.best_estimator_
nb_classifier.fit(X_upsampled, y_upsampled)

y_pred_nb = nb_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_nb) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_nb) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_nb, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_nb)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  63.83 %
Misclassification rate of this model:  36.17 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.828     0.716     0.768       236
           1      0.141     0.239     0.177        46

    accuracy                          0.638       282
   macro avg      0.485     0.478     0.473       282
weighted avg      0.716     0.638     0.672       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  169 |                   67 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   35 |                   11 |
+-----------------+----------------------+----------------------+


### XGBoost

In [10]:
np.random.seed(28)

xgb_classifier = XGBClassifier(random_state=28)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150, 200,None],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, None],
}

grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_upsampled, y_upsampled)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': None, 'n_estimators': 150}
Best Score: 0.9392453900709219


In [11]:
xgb_classifier = XGBClassifier(n_estimators=150, learning_rate=0.2, max_depth = None, random_state=28)
xgb_classifier.fit(X_upsampled, y_upsampled)
y_pred = xgb_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  78.37 %
Misclassification rate of this model:  21.63 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.835     0.924     0.877       236
           1      0.143     0.065     0.090        46

    accuracy                          0.784       282
   macro avg      0.489     0.494     0.483       282
weighted avg      0.722     0.784     0.749       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  218 |                   18 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   43 |                    3 |
+-----------------+----------------------+----------------------+


## Downsampled

In [12]:
#downsample minority class
majority_downsampled = resample(majority_class,
                              replace=True,
                              n_samples=len(minority_class),
                              random_state=28)

X_downsampled = np.vstack([minority_class, majority_downsampled])
y_downsampled = np.concatenate([np.zeros(len(minority_class)), np.ones(len(minority_class))])

#shuffle data
shuffle_indices = np.arange(len(X_downsampled))
np.random.shuffle(shuffle_indices)
X_downsampled = X_downsampled[shuffle_indices]
y_downsampled = y_downsampled[shuffle_indices]

unique_classes, class_counts = np.unique(y_downsampled, return_counts=True)
print("Class counts after upsampling:")
for cls, count in zip(unique_classes, class_counts):
  print(f"Class {int(cls)}: {count}")

Class counts after upsampling:
Class 0: 189
Class 1: 189


### Logistic regreesion

In [None]:
lr = LogisticRegression()
result = lr.fit(X_downsampled,y_downsampled)
y_pred = lr.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  50.35 %
Misclassification rate of this model:  49.65 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.838     0.504     0.630       236
           1      0.164     0.500     0.247        46

    accuracy                          0.504       282
   macro avg      0.501     0.502     0.438       282
weighted avg      0.728     0.504     0.567       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  119 |                  117 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   23 |                   23 |
+-----------------+----------------------+----------------------+


### Decision tree

In [None]:
dt = DecisionTreeClassifier(random_state=28)
param_grid = {
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'log2', None]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_downsampled, y_downsampled)

# Print the best hyperparameters
print(grid_search.best_params_)

# Print the best score
print(grid_search.best_score_)

{'max_depth': 5, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
0.5978245614035088


In [None]:
dtree = DecisionTreeClassifier(max_depth = 5,  min_samples_split = 2, min_samples_leaf = 2, max_features = None, random_state = 28)
dtree.fit(X_downsampled,y_downsampled)
y_pred = dtree.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  33.69 %
Misclassification rate of this model:  66.31 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.845     0.254     0.391       236
           1      0.166     0.761     0.272        46

    accuracy                          0.337       282
   macro avg      0.505     0.508     0.332       282
weighted avg      0.734     0.337     0.372       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                   60 |                  176 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   11 |                   35 |
+-----------------+----------------------+----------------------+


### Random Forest

In [None]:
np.random.seed(28)

rf = RandomForestClassifier(random_state=28)
param_grid = {
    'n_estimators': [50, 80, 100, 120, 150],
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'log2',None]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_downsampled, y_downsampled)

# Print the best hyperparameters
print(grid_search.best_params_)

# Print the best score
print(grid_search.best_score_)

{'max_depth': 4, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 80}
0.6190175438596491


In [None]:
rf = RandomForestClassifier(n_estimators=80, max_depth=4, max_features='log2', min_samples_leaf=1, min_samples_split=3, random_state=28)
rf.fit(X_downsampled,y_downsampled)
y_pred = rf.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  45.39 %
Misclassification rate of this model:  54.61 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.853     0.419     0.562       236
           1      0.175     0.630     0.274        46

    accuracy                          0.454       282
   macro avg      0.514     0.525     0.418       282
weighted avg      0.743     0.454     0.515       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                   99 |                  137 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   17 |                   29 |
+-----------------+----------------------+----------------------+


### AdaBoosting

In [None]:
np.random.seed(28)

ada_classifier = AdaBoostClassifier(random_state=28)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
        'algorithm': ['SAMME', 'SAMME.R']
}

grid_search = GridSearchCV(estimator=ada_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

grid_search.fit(X_downsampled, y_downsampled)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'algorithm': 'SAMME.R', 'learning_rate': 0.1, 'n_estimators': 50}
Best Score: 0.5634385964912282


In [None]:
ada_classifier = AdaBoostClassifier(n_estimators=50, learning_rate=0.1, algorithm='SAMME.R', random_state=28)

ada_classifier.fit(X_downsampled, y_downsampled)

y_pred_ada = ada_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred_ada) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_ada) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_ada, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred_ada)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  42.2 %
Misclassification rate of this model:  57.8 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.841     0.381     0.525       236
           1      0.166     0.630     0.262        46

    accuracy                          0.422       282
   macro avg      0.503     0.506     0.394       282
weighted avg      0.731     0.422     0.482       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                   90 |                  146 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   17 |                   29 |
+-----------------+----------------------+----------------------+


### Gradient Boosting

In [None]:
np.random.seed(28)

gb_classifier = GradientBoostingClassifier(random_state=28)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150, 200,None],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, None],
}

grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_downsampled, y_downsampled)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}
Best Score: 0.5794385964912281


In [None]:
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth = 4, random_state=28)
gb_classifier.fit(X_downsampled, y_downsampled)
y_pred = gb_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  51.42 %
Misclassification rate of this model:  48.58 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.878     0.487     0.627       236
           1      0.199     0.652     0.305        46

    accuracy                          0.514       282
   macro avg      0.538     0.570     0.466       282
weighted avg      0.767     0.514     0.574       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  115 |                  121 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   16 |                   30 |
+-----------------+----------------------+----------------------+


### MLP

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_downsampled)
X_test_scaled = scaler.transform(X_test)

In [None]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam',alpha=0.0001, max_iter=1000, random_state=42)
mlp.fit(X_train_scaled, y_downsampled)
y_pred = mlp.predict(X_test_scaled)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  53.19 %
Misclassification rate of this model:  46.81 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.851     0.534     0.656       236
           1      0.179     0.522     0.267        46

    accuracy                          0.532       282
   macro avg      0.515     0.528     0.461       282
weighted avg      0.742     0.532     0.593       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  126 |                  110 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   22 |                   24 |
+-----------------+----------------------+----------------------+


### kNN

In [None]:
np.random.seed(28)

# Initialize KNN classifier
knn_classifier = KNeighborsClassifier()

# Define parameter grid for KNN
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=knn_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model
grid_search.fit(X_downsampled, y_downsampled)


# Display best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Best Score: 0.5528771929824561


In [None]:
# Use the best estimator from GridSearchCV to predict on test data
knn_classifier = grid_search.best_estimator_
knn_classifier.fit(X_downsampled, y_downsampled)

y_pred_knn = knn_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_knn) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_knn) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_knn, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_knn)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  53.55 %
Misclassification rate of this model:  46.45 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.889     0.508     0.647       236
           1      0.211     0.674     0.321        46

    accuracy                          0.535       282
   macro avg      0.550     0.591     0.484       282
weighted avg      0.778     0.535     0.594       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  120 |                  116 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   15 |                   31 |
+-----------------+----------------------+----------------------+


### SVM

In [None]:
np.random.seed(28)

g = [0.0001, 0.001, 0.01, 0.1]
hist = []
for val in g:
  clf = SVC(gamma=val)
  cross_val = cross_val_score(clf, X_downsampled, y_downsampled, cv=5)
  hist.append(np.mean(cross_val))
print(hist)

[0.5556140350877193, 0.5529122807017544, 0.5634385964912282, 0.5634385964912282]


In [None]:
# Use the best estimator from GridSearchCV to predict on test data
svm_classifier = SVC(gamma=0.1)
svm_classifier.fit(X_downsampled, y_downsampled)

y_pred_svm = svm_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_svm) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_svm) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_svm, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_svm)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  83.69 %
Misclassification rate of this model:  16.31 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.837     1.000     0.911       236
           1      0.000     0.000     0.000        46

    accuracy                          0.837       282
   macro avg      0.418     0.500     0.456       282
weighted avg      0.700     0.837     0.763       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  236 |                    0 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   46 |                    0 |
+-----------------+----------------------+----------------------+


### Naive Bayes

In [None]:
np.random.seed(28)

# Initialize Naive Bayes classifier
nb_classifier = GaussianNB()

# Define parameter grid for Naive Bayes (even though GaussianNB has few hyperparameters)
param_grid = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=nb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model
grid_search.fit(X_downsampled, y_downsampled)

# Display best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'var_smoothing': 0.657933224657568}
Best Score: 0.5527368421052632


In [None]:
# Use the best estimator from GridSearchCV to predict on test data
nb_classifier = grid_search.best_estimator_
nb_classifier.fit(X_downsampled, y_downsampled)

y_pred_nb = nb_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_nb) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_nb) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_nb, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_nb)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  37.59 %
Misclassification rate of this model:  62.41 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.841     0.314     0.457       236
           1      0.165     0.696     0.267        46

    accuracy                          0.376       282
   macro avg      0.503     0.505     0.362       282
weighted avg      0.731     0.376     0.426       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                   74 |                  162 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   14 |                   32 |
+-----------------+----------------------+----------------------+
Accuracy score of this model:  37.59 %
Misclassification rate of this model:  62.41 %

Report card of this model: 
              precision    re

### XGBoost

In [13]:
np.random.seed(28)

xgb_classifier = XGBClassifier(random_state=28)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150, 200,None],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, None],
}

grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_downsampled, y_downsampled)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 150}
Best Score: 0.5767017543859649


In [14]:
xgb_classifier = XGBClassifier(n_estimators=150, learning_rate=0.05, max_depth = 3, random_state=28)
xgb_classifier.fit(X_downsampled, y_downsampled)
y_pred = xgb_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  46.81 %
Misclassification rate of this model:  53.19 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.841     0.449     0.586       236
           1      0.167     0.565     0.257        46

    accuracy                          0.468       282
   macro avg      0.504     0.507     0.422       282
weighted avg      0.731     0.468     0.532       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  106 |                  130 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   20 |                   26 |
+-----------------+----------------------+----------------------+
