In [1]:
# Standard libraries
import sys  # System-specific parameters and functions
import os   # Miscellaneous operating system interfaces
import warnings  # Warning control
warnings.filterwarnings("ignore")

# Data manipulation
import pandas as pd  # Data manipulation and analysis
import numpy as np  # Numerical operations

# Visualization
import matplotlib.pyplot as plt  # Plotting library
import seaborn as sns  # Statistical data visualization
from matplotlib.colors import ListedColormap  # Colormap utilities

# Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler  # Preprocessing tools
from sklearn import model_selection, metrics, preprocessing  # Model selection, evaluation, and preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV  # Model selection and evaluation
from sklearn.tree import DecisionTreeClassifier  # Decision tree classifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier  # Ensemble methods
from xgboost import XGBClassifier  # Extreme Gradient Boosting
from sklearn.neighbors import KNeighborsClassifier  # k-Nearest Neighbors
from sklearn.neural_network import MLPClassifier  # Multi-layer Perceptron

# Statistical analysis
from statsmodels.stats.outliers_influence import variance_inflation_factor  # Variance inflation factor
from scipy.stats import pointbiserialr, chi2_contingency, spearmanr, entropy  # Statistical functions
from statsmodels.graphics.gofplots import qqplot  # Q-Q plot
from collections import Counter  # Container datatypes

# Tabulate
from tabulate import tabulate

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

# Balancing techniques
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

In [2]:
import gdown
import pandas as pd

# New file ID from the provided Google Drive link
file_id = '167qMEho9DmCL8Vo-GYjImsCtgFbI2nrs'
url = f'https://drive.google.com/uc?id={file_id}'

# Download the file
output_file = 'dataset.csv'
gdown.download(url, output=output_file, quiet=False)

# Read the downloaded CSV file into a pandas DataFrame
data = pd.read_csv(output_file)

# Display the first few rows of the dataframe
print(data.head())


Downloading...
From: https://drive.google.com/uc?id=167qMEho9DmCL8Vo-GYjImsCtgFbI2nrs
To: /content/dataset.csv
100%|██████████| 151k/151k [00:00<00:00, 33.5MB/s]

   Ville_id  sex  Age  Married  education_level  total_members  gained_asset  \
0        91    1   28        1               10              5      28912201   
1        57    1   23        1                8              5      28912201   
2       115    1   22        1                9              5      28912201   
3        97    1   27        1               10              4      52667108   
4        42    0   59        0               10              6      82606287   

   durable_asset  save_asset  living_expenses  other_expenses  \
0       22861940    23399979         26692283        28203066   
1       22861940    23399979         26692283        28203066   
2       22861940    23399979         26692283        28203066   
3       19698904    49647648           397715        44042267   
4       17352654    23399979         80877619        74503502   

   incoming_own_farm  incoming_business  incoming_no_business  \
0                  0                  0                     0  




## dropped Survey ID

Income own

Number Children

In [4]:
df=data.copy()
df

Unnamed: 0,Ville_id,sex,Age,Married,education_level,total_members,gained_asset,durable_asset,save_asset,living_expenses,other_expenses,incoming_own_farm,incoming_business,incoming_no_business,incoming_agricultural,farm_expenses,labor_primary,lasting_investment,no_lasting_investmen,depressed
0,91,1,28,1,10,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0
1,57,1,23,1,8,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,1
2,115,1,22,1,9,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0
3,97,1,27,1,10,4,52667108,19698904,49647648,397715,44042267,1,0,1,22288055,18751329,0,7781123,69219765.0,0
4,42,0,59,0,10,6,82606287,17352654,23399979,80877619,74503502,0,0,0,53384566,20731006,1,20100562,43419447.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1404,22,1,25,1,7,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0
1405,69,1,28,1,10,6,15711078,24023054,15506558,10476722,71588707,1,0,0,23022095,1021536,0,1823477,47384361.0,0
1406,184,1,66,0,1,1,42440731,22861940,22562605,12545372,56534257,1,0,0,12545373,10454478,0,46444572,10454478.0,1
1407,75,1,51,1,12,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0


In [5]:
X=df.drop(columns=['depressed'])
y=df['depressed']

#20% allocated for test data and 80% for train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=28)

In [6]:
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [7]:
continuous_vars = df.select_dtypes(include=['int64', 'float64']).columns
Continuous_Variables_in_X_train = continuous_vars

categorical_vars = df.select_dtypes(include=['object', 'category']).columns
Categorical_Variables_in_X_train = [var for var in categorical_vars if var != 'depressed']

Date_Month_Year= df.select_dtypes(include=['int32']).columns

# Display the variables
print("Continuous Variables in X_train:")
print(Continuous_Variables_in_X_train)
print("\nCategorical Variables in X train:")
print(Categorical_Variables_in_X_train)
print("\nDate_Month_Year:")
print(Date_Month_Year)

Continuous Variables in X_train:
Index(['Ville_id', 'sex', 'Age', 'Married', 'education_level', 'total_members',
       'gained_asset', 'durable_asset', 'save_asset', 'living_expenses',
       'other_expenses', 'incoming_own_farm', 'incoming_business',
       'incoming_no_business', 'incoming_agricultural', 'farm_expenses',
       'labor_primary', 'lasting_investment', 'no_lasting_investmen',
       'depressed'],
      dtype='object')

Categorical Variables in X train:
[]

Date_Month_Year:
Index([], dtype='object')


#### checking imbalance of response

In [9]:
print ('Total not depressed :  {} and its percentage is {} %'.format(df.depressed.value_counts()[0], round(df.depressed.value_counts()[0]/df.depressed.value_counts().sum()*100,2)) )
print ('Total deperessed :  {} and its percentage is {} %'.format(df.depressed.value_counts()[1], round(df.depressed.value_counts()[1]/df.depressed.value_counts().sum()*100,2)) )

Total not depressed :  1174 and its percentage is 83.32 %
Total deperessed :  235 and its percentage is 16.68 %


## --------------------------------------------------------------------------------------------------------

In [10]:
df_backup = df.copy()
df_backup.head()

Unnamed: 0,Ville_id,sex,Age,Married,education_level,total_members,gained_asset,durable_asset,save_asset,living_expenses,other_expenses,incoming_own_farm,incoming_business,incoming_no_business,incoming_agricultural,farm_expenses,labor_primary,lasting_investment,no_lasting_investmen,depressed
0,91,1,28,1,10,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0
1,57,1,23,1,8,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,1
2,115,1,22,1,9,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0
3,97,1,27,1,10,4,52667108,19698904,49647648,397715,44042267,1,0,1,22288055,18751329,0,7781123,69219765.0,0
4,42,0,59,0,10,6,82606287,17352654,23399979,80877619,74503502,0,0,0,53384566,20731006,1,20100562,43419447.0,0


## OG Data

### Logistic Regression

In [11]:
lr = LogisticRegression()
result = lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  83.69 %
Misclassification rate of this model:  16.31 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.837     1.000     0.911       236
           1      0.000     0.000     0.000        46

    accuracy                          0.837       282
   macro avg      0.418     0.500     0.456       282
weighted avg      0.700     0.837     0.763       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  236 |                    0 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   46 |                    0 |
+-----------------+----------------------+----------------------+


### Decision Trees

In [12]:
dt = DecisionTreeClassifier(random_state=28)
param_grid = {
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'log2', None]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print(grid_search.best_params_)

# Print the best score
print(grid_search.best_score_)

{'max_depth': 3, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 2}
0.8322949852507374


In [13]:
dtree = DecisionTreeClassifier(max_depth = 3,  min_samples_split = 2, min_samples_leaf = 3, max_features = 'log2', random_state = 28)
dtree.fit(X_train,y_train)
y_pred = dtree.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  82.27 %
Misclassification rate of this model:  17.73 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.837     0.979     0.902       236
           1      0.167     0.022     0.038        46

    accuracy                          0.823       282
   macro avg      0.502     0.500     0.470       282
weighted avg      0.728     0.823     0.761       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  231 |                    5 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   45 |                    1 |
+-----------------+----------------------+----------------------+


### Random Forest

In [14]:
np.random.seed(28)

rf = RandomForestClassifier(random_state=28)
param_grid = {
    'n_estimators': [50, 80, 100, 120, 150],
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'log2',None]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print(grid_search.best_params_)

# Print the best score
print(grid_search.best_score_)

{'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
0.8340766961651918


In [None]:
rf = RandomForestClassifier(n_estimators=80, max_depth=4, max_features=None, min_samples_leaf=1, min_samples_split=4, random_state=28)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  83.69 %
Misclassification rate of this model:  16.31 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.837     1.000     0.911       236
           1      0.000     0.000     0.000        46

    accuracy                          0.837       282
   macro avg      0.418     0.500     0.456       282
weighted avg      0.700     0.837     0.763       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  236 |                    0 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   46 |                    0 |
+-----------------+----------------------+----------------------+


### Ada boosting

In [None]:
np.random.seed(28)

ada_classifier = AdaBoostClassifier(random_state=28)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
        'algorithm': ['SAMME', 'SAMME.R']
}

grid_search = GridSearchCV(estimator=ada_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'algorithm': 'SAMME', 'learning_rate': 0.01, 'n_estimators': 50}
Best Score: 0.8322989183874141


In [None]:
ada_classifier = AdaBoostClassifier(n_estimators=50, learning_rate=0.01, algorithm='SAMME', random_state=28)

ada_classifier.fit(X_train, y_train)

y_pred_ada = ada_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred_ada) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_ada) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_ada, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred_ada)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  83.69 %
Misclassification rate of this model:  16.31 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.837     1.000     0.911       236
           1      0.000     0.000     0.000        46

    accuracy                          0.837       282
   macro avg      0.418     0.500     0.456       282
weighted avg      0.700     0.837     0.763       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  236 |                    0 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   46 |                    0 |
+-----------------+----------------------+----------------------+


### Gradient Boosting

In [None]:
np.random.seed(28)

gb_classifier = GradientBoostingClassifier(random_state=28)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150, 200,None],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, None],
}

grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 50}
Best Score: 0.8269813176007867


In [None]:
gb_classifier = GradientBoostingClassifier(n_estimators=50, learning_rate=0.05, max_depth = 3, random_state=28)
gb_classifier.fit(X_train, y_train)
y_pred = gb_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  83.69 %
Misclassification rate of this model:  16.31 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.839     0.996     0.911       236
           1      0.500     0.022     0.042        46

    accuracy                          0.837       282
   macro avg      0.670     0.509     0.476       282
weighted avg      0.784     0.837     0.769       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  235 |                    1 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   45 |                    1 |
+-----------------+----------------------+----------------------+


### Multilayer Perceptron

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam',alpha=0.0001, max_iter=1000, random_state=42)
mlp.fit(X_train_scaled, y_train)
y_pred = mlp.predict(X_test_scaled)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  77.3 %
Misclassification rate of this model:  22.7 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.841     0.898     0.869       236
           1      0.200     0.130     0.158        46

    accuracy                          0.773       282
   macro avg      0.521     0.514     0.513       282
weighted avg      0.737     0.773     0.753       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  212 |                   24 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   40 |                    6 |
+-----------------+----------------------+----------------------+


### k Nearest Neighbours

In [None]:
np.random.seed(28)

# Initialize KNN classifier
knn_classifier = KNeighborsClassifier()

# Define parameter grid for KNN
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=knn_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)


# Display best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'uniform'}
Best Score: 0.8323028515240904


In [None]:
# Use the best estimator from GridSearchCV to predict on test data
knn_classifier = grid_search.best_estimator_
knn_classifier.fit(X_train, y_train)

y_pred_knn = knn_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_knn) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_knn) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_knn, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_knn)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  83.69 %
Misclassification rate of this model:  16.31 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.837     1.000     0.911       236
           1      0.000     0.000     0.000        46

    accuracy                          0.837       282
   macro avg      0.418     0.500     0.456       282
weighted avg      0.700     0.837     0.763       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  236 |                    0 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   46 |                    0 |
+-----------------+----------------------+----------------------+


### Support Vector Machines

In [None]:
np.random.seed(28)

g = [0.0001, 0.001, 0.01, 0.1]
hist = []
for val in g:
  clf = SVC(gamma=val)
  cross_val = cross_val_score(clf, X_train, y_train, cv=5)
  hist.append(np.mean(cross_val))
print(hist)

[0.8322989183874141, 0.8252035398230089, 0.8305250737463128, 0.8322989183874141]


In [None]:
# Use the best estimator from GridSearchCV to predict on test data
svm_classifier = SVC(gamma=0.1)
svm_classifier.fit(X_train, y_train)

y_pred_svm = svm_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_svm) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_svm) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_svm, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_svm)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  83.69 %
Misclassification rate of this model:  16.31 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.837     1.000     0.911       236
           1      0.000     0.000     0.000        46

    accuracy                          0.837       282
   macro avg      0.418     0.500     0.456       282
weighted avg      0.700     0.837     0.763       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  236 |                    0 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   46 |                    0 |
+-----------------+----------------------+----------------------+


### Naive Bias

In [None]:
np.random.seed(28)

# Initialize Naive Bayes classifier
nb_classifier = GaussianNB()

# Define parameter grid for Naive Bayes (even though GaussianNB has few hyperparameters)
param_grid = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=nb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Display best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'var_smoothing': 0.4328761281083058}
Best Score: 0.8323028515240904


In [None]:
# Use the best estimator from GridSearchCV to predict on test data
nb_classifier = grid_search.best_estimator_
nb_classifier.fit(X_train, y_train)

y_pred_nb = nb_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_nb) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_nb) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_nb, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_nb)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  83.69 %
Misclassification rate of this model:  16.31 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.837     1.000     0.911       236
           1      0.000     0.000     0.000        46

    accuracy                          0.837       282
   macro avg      0.418     0.500     0.456       282
weighted avg      0.700     0.837     0.763       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  236 |                    0 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   46 |                    0 |
+-----------------+----------------------+----------------------+


## SMOTE

In [None]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

resampled_counts = Counter(y_train_resampled)
print(f"Resampled class counts: Class 0 = {resampled_counts[0]}, Class 1 = {resampled_counts[1]}")

Resampled class counts: Class 0 = 938, Class 1 = 938


### Logistic regreesion

In [None]:
lr = LogisticRegression()
result = lr.fit(X_train_resampled, y_train_resampled)
y_pred = lr.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  54.61 %
Misclassification rate of this model:  45.39 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.838     0.568     0.677       236
           1      0.164     0.435     0.238        46

    accuracy                          0.546       282
   macro avg      0.501     0.501     0.457       282
weighted avg      0.728     0.546     0.605       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  134 |                  102 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   26 |                   20 |
+-----------------+----------------------+----------------------+


### Decision tree

In [None]:
dt = DecisionTreeClassifier(random_state=28)
param_grid = {
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'log2', None]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train_resampled, y_train_resampled)

# Print the best hyperparameters
print(grid_search.best_params_)

# Print the best score
print(grid_search.best_score_)

{'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2}
0.7516468085106384


In [None]:
dtree = DecisionTreeClassifier(max_depth = None,  min_samples_split = 2, min_samples_leaf = 1, max_features = 'log2', random_state = 28)
dtree.fit(X_train_resampled, y_train_resampled)
y_pred = dtree.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  62.41 %
Misclassification rate of this model:  37.59 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.835     0.686     0.753       236
           1      0.159     0.304     0.209        46

    accuracy                          0.624       282
   macro avg      0.497     0.495     0.481       282
weighted avg      0.725     0.624     0.665       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  162 |                   74 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   32 |                   14 |
+-----------------+----------------------+----------------------+


### Random Forest

In [None]:
np.random.seed(28)

rf = RandomForestClassifier(random_state=28)
param_grid = {
    'n_estimators': [50, 80, 100, 120, 150],
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'log2',None]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train_resampled, y_train_resampled)

# Print the best hyperparameters
print(grid_search.best_params_)

# Print the best score
print(grid_search.best_score_)

{'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 150}
0.869490780141844


In [None]:
rf = RandomForestClassifier(n_estimators=150, max_depth=None, max_features='log2', min_samples_leaf=1, min_samples_split=3, random_state=28)
rf.fit(X_train_resampled, y_train_resampled)
y_pred = rf.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  75.89 %
Misclassification rate of this model:  24.11 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.841     0.877     0.859       236
           1      0.194     0.152     0.171        46

    accuracy                          0.759       282
   macro avg      0.518     0.515     0.515       282
weighted avg      0.736     0.759     0.747       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  207 |                   29 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   39 |                    7 |
+-----------------+----------------------+----------------------+


### AdaBoosting

In [None]:
np.random.seed(28)

ada_classifier = AdaBoostClassifier(random_state=28)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
        'algorithm': ['SAMME', 'SAMME.R']
}

grid_search = GridSearchCV(estimator=ada_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

grid_search.fit(X_train_resampled, y_train_resampled)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 150}
Best Score: 0.75437304964539


In [None]:
ada_classifier = AdaBoostClassifier(n_estimators=150, learning_rate=1.0, algorithm='SAMME.R', random_state=28)

ada_classifier.fit(X_train_resampled, y_train_resampled)

y_pred_ada = ada_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred_ada) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_ada) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_ada, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred_ada)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  68.79 %
Misclassification rate of this model:  31.21 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.833     0.784     0.808       236
           1      0.150     0.196     0.170        46

    accuracy                          0.688       282
   macro avg      0.492     0.490     0.489       282
weighted avg      0.722     0.688     0.704       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  185 |                   51 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   37 |                    9 |
+-----------------+----------------------+----------------------+


### Gradient Boosting

In [None]:
np.random.seed(28)

gb_classifier = GradientBoostingClassifier(random_state=28)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150, 200,None],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, None],
}

grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 100}
Best Score: 0.8465673758865251


In [None]:
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.2, max_depth = 5, random_state=28)
gb_classifier.fit(X_train_resampled, y_train_resampled)
y_pred = gb_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  75.18 %
Misclassification rate of this model:  24.82 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.837     0.873     0.855       236
           1      0.167     0.130     0.146        46

    accuracy                          0.752       282
   macro avg      0.502     0.502     0.501       282
weighted avg      0.728     0.752     0.739       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  206 |                   30 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   40 |                    6 |
+-----------------+----------------------+----------------------+


### MLP

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

In [None]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam',alpha=0.0001, max_iter=1000, random_state=42)
mlp.fit(X_train_scaled, y_train_resampled)
y_pred = mlp.predict(X_test_scaled)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  68.79 %
Misclassification rate of this model:  31.21 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.839     0.775     0.806       236
           1      0.172     0.239     0.200        46

    accuracy                          0.688       282
   macro avg      0.506     0.507     0.503       282
weighted avg      0.731     0.688     0.707       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  183 |                   53 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   35 |                   11 |
+-----------------+----------------------+----------------------+


### kNN

In [None]:
np.random.seed(28)

# Initialize KNN classifier
knn_classifier = KNeighborsClassifier()

# Define parameter grid for KNN
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=knn_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model
grid_search.fit(X_train_resampled, y_train_resampled)


# Display best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
Best Score: 0.767059574468085


In [None]:
# Use the best estimator from GridSearchCV to predict on test data
knn_classifier = grid_search.best_estimator_
knn_classifier.fit(X_train_resampled, y_train_resampled)

y_pred_knn = knn_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_knn) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_knn) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_knn, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_knn)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  55.32 %
Misclassification rate of this model:  44.68 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.816     0.602     0.693       236
           1      0.130     0.304     0.182        46

    accuracy                          0.553       282
   macro avg      0.473     0.453     0.437       282
weighted avg      0.704     0.553     0.609       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  142 |                   94 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   32 |                   14 |
+-----------------+----------------------+----------------------+


### SVM

In [None]:
np.random.seed(28)

g = [0.0001, 0.001, 0.01, 0.1]
hist = []
for val in g:
  clf = SVC(gamma=val)
  cross_val = cross_val_score(clf, X_train_resampled, y_train_resampled, cv=5)
  hist.append(np.mean(cross_val))
print(hist)

[0.5538382978723404, 0.5788921985815602, 0.603936170212766, 0.5815390070921985]


In [None]:
# Use the best estimator from GridSearchCV to predict on test data
svm_classifier = SVC(gamma=0.01)
svm_classifier.fit(X_train_resampled, y_train_resampled)

y_pred_svm = svm_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_svm) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_svm) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_svm, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_svm)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  80.5 %
Misclassification rate of this model:  19.5 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.832     0.962     0.892       236
           1      0.000     0.000     0.000        46

    accuracy                          0.805       282
   macro avg      0.416     0.481     0.446       282
weighted avg      0.696     0.805     0.746       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  227 |                    9 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   46 |                    0 |
+-----------------+----------------------+----------------------+


### Naive Bayes

In [None]:
np.random.seed(28)

# Initialize Naive Bayes classifier
nb_classifier = GaussianNB()

# Define parameter grid for Naive Bayes (even though GaussianNB has few hyperparameters)
param_grid = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=nb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model
grid_search.fit(X_train_resampled, y_train_resampled)

# Display best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'var_smoothing': 9.999999999999999e-10}
Best Score: 0.5479815602836879


In [None]:
# Use the best estimator from GridSearchCV to predict on test data
nb_classifier = grid_search.best_estimator_
nb_classifier.fit(X_train_resampled, y_train_resampled)

y_pred_nb = nb_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_nb) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_nb) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_nb, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_nb)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  41.49 %
Misclassification rate of this model:  58.51 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.809     0.394     0.530       236
           1      0.144     0.522     0.225        46

    accuracy                          0.415       282
   macro avg      0.476     0.458     0.378       282
weighted avg      0.700     0.415     0.480       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                   93 |                  143 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   22 |                   24 |
+-----------------+----------------------+----------------------+
