In [3]:
# Standard libraries
import sys  # System-specific parameters and functions
import os   # Miscellaneous operating system interfaces
import warnings  # Warning control
warnings.filterwarnings("ignore")

# Data manipulation
import pandas as pd  # Data manipulation and analysis
import numpy as np  # Numerical operations

# Visualization
import matplotlib.pyplot as plt  # Plotting library
import seaborn as sns  # Statistical data visualization
from matplotlib.colors import ListedColormap  # Colormap utilities

# Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler  # Preprocessing tools
from sklearn import model_selection, metrics, preprocessing  # Model selection, evaluation, and preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV  # Model selection and evaluation
from sklearn.tree import DecisionTreeClassifier  # Decision tree classifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier  # Ensemble methods
from xgboost import XGBClassifier  # Extreme Gradient Boosting
from sklearn.neighbors import KNeighborsClassifier  # k-Nearest Neighbors
from sklearn.neural_network import MLPClassifier  # Multi-layer Perceptron

# Statistical analysis
from statsmodels.stats.outliers_influence import variance_inflation_factor  # Variance inflation factor
from scipy.stats import pointbiserialr, chi2_contingency, spearmanr, entropy  # Statistical functions
from statsmodels.graphics.gofplots import qqplot  # Q-Q plot
from collections import Counter  # Container datatypes

# Tabulate
from tabulate import tabulate

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

# Balancing techniques
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

In [4]:
#import encoded dataset
import io

df = pd.read_csv('Cleaned2.csv')
df.head()

Unnamed: 0,Ville_id,sex,Age,Married,education_level,total_members,gained_asset,durable_asset,save_asset,living_expenses,other_expenses,incoming_own_farm,incoming_business,incoming_no_business,incoming_agricultural,farm_expenses,labor_primary,lasting_investment,no_lasting_investmen,depressed
0,91,1,28,1,10,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0
1,57,1,23,1,8,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,1
2,115,1,22,1,9,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0
3,97,1,27,1,10,4,52667108,19698904,49647648,397715,44042267,1,0,1,22288055,18751329,0,7781123,69219765.0,0
4,42,0,59,0,10,6,82606287,17352654,23399979,80877619,74503502,0,0,0,53384566,20731006,1,20100562,43419447.0,0


In [5]:
df.dtypes

Unnamed: 0,0
Ville_id,int64
sex,int64
Age,int64
Married,int64
education_level,int64
total_members,int64
gained_asset,int64
durable_asset,int64
save_asset,int64
living_expenses,int64


In [6]:
#change dtypes sex,Village_id,Married,education_level,incoming_own_farm,incoming_business,incoming_no_business,labor_primary,depressed into catergory
df['sex'] = df['sex'].astype('category')
df['Ville_id'] = df['Ville_id'].astype('category')
df['Married'] = df['Married'].astype('category')
df['education_level'] = df['education_level'].astype('category')
df['incoming_own_farm'] = df['incoming_own_farm'].astype('category')
df['incoming_business'] = df['incoming_business'].astype('category')
df['incoming_no_business'] = df['incoming_no_business'].astype('category')
df['labor_primary'] = df['labor_primary'].astype('category')
df['depressed'] = df['depressed'].astype('category')


In [7]:
df

Unnamed: 0,Ville_id,sex,Age,Married,education_level,total_members,gained_asset,durable_asset,save_asset,living_expenses,other_expenses,incoming_own_farm,incoming_business,incoming_no_business,incoming_agricultural,farm_expenses,labor_primary,lasting_investment,no_lasting_investmen,depressed
0,91,1,28,1,10,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0
1,57,1,23,1,8,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,1
2,115,1,22,1,9,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0
3,97,1,27,1,10,4,52667108,19698904,49647648,397715,44042267,1,0,1,22288055,18751329,0,7781123,69219765.0,0
4,42,0,59,0,10,6,82606287,17352654,23399979,80877619,74503502,0,0,0,53384566,20731006,1,20100562,43419447.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1404,22,1,25,1,7,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0
1405,69,1,28,1,10,6,15711078,24023054,15506558,10476722,71588707,1,0,0,23022095,1021536,0,1823477,47384361.0,0
1406,184,1,66,0,1,1,42440731,22861940,22562605,12545372,56534257,1,0,0,12545373,10454478,0,46444572,10454478.0,1
1407,75,1,51,1,12,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0


In [8]:
X=df.drop(columns=['depressed'])
y=df['depressed']

#20% allocated for test data and 80% for train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=28)

In [9]:
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [10]:
continuous_vars = df.select_dtypes(include=['int64', 'float64']).columns
Continuous_Variables_in_X_train = continuous_vars

categorical_vars = df.select_dtypes(include=['object', 'category']).columns
Categorical_Variables_in_X_train = [var for var in categorical_vars if var != 'depressed']

Date_Month_Year= df.select_dtypes(include=['int32']).columns

# Display the variables
print("Continuous Variables in X_train:")
print(Continuous_Variables_in_X_train)
print("\nCategorical Variables in X train:")
print(Categorical_Variables_in_X_train)
print("\nDate_Month_Year:")
print(Date_Month_Year)

Continuous Variables in X_train:
Index(['Age', 'total_members', 'gained_asset', 'durable_asset', 'save_asset',
       'living_expenses', 'other_expenses', 'incoming_agricultural',
       'farm_expenses', 'lasting_investment', 'no_lasting_investmen'],
      dtype='object')

Categorical Variables in X train:
['Ville_id', 'sex', 'Married', 'education_level', 'incoming_own_farm', 'incoming_business', 'incoming_no_business', 'labor_primary']

Date_Month_Year:
Index([], dtype='object')


#### checking imbalance of response

In [11]:
print ('Total not committed fraud :  {} and its percentage is {} %'.format(df.depressed.value_counts()[0], round(df.depressed.value_counts()[0]/df.depressed.value_counts().sum()*100,2)) )
print ('Total committed fraud :  {} and its percentage is {} %'.format(df.depressed.value_counts()[1], round(df.depressed.value_counts()[1]/df.depressed.value_counts().sum()*100,2)) )

Total not committed fraud :  1174 and its percentage is 83.32 %
Total committed fraud :  235 and its percentage is 16.68 %


## --------------------------------------------------------------------------------------------------------

In [12]:
df_backup = df.copy()
df_backup.head()

Unnamed: 0,Ville_id,sex,Age,Married,education_level,total_members,gained_asset,durable_asset,save_asset,living_expenses,other_expenses,incoming_own_farm,incoming_business,incoming_no_business,incoming_agricultural,farm_expenses,labor_primary,lasting_investment,no_lasting_investmen,depressed
0,91,1,28,1,10,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0
1,57,1,23,1,8,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,1
2,115,1,22,1,9,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0
3,97,1,27,1,10,4,52667108,19698904,49647648,397715,44042267,1,0,1,22288055,18751329,0,7781123,69219765.0,0
4,42,0,59,0,10,6,82606287,17352654,23399979,80877619,74503502,0,0,0,53384566,20731006,1,20100562,43419447.0,0


## OG Data

### Logistic Regression

In [None]:
lr = LogisticRegression()
result = lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  83.69 %
Misclassification rate of this model:  16.31 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.837     1.000     0.911       236
           1      0.000     0.000     0.000        46

    accuracy                          0.837       282
   macro avg      0.418     0.500     0.456       282
weighted avg      0.700     0.837     0.763       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  236 |                    0 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   46 |                    0 |
+-----------------+----------------------+----------------------+


### Decision Trees

In [None]:
dt = DecisionTreeClassifier(random_state=28)
param_grid = {
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'log2', None]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print(grid_search.best_params_)

# Print the best score
print(grid_search.best_score_)

{'max_depth': 3, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 2}
0.8322949852507374


In [None]:
dtree = DecisionTreeClassifier(max_depth = 3,  min_samples_split = 2, min_samples_leaf = 3, max_features = 'log2', random_state = 28)
dtree.fit(X_train,y_train)
y_pred = dtree.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  82.27 %
Misclassification rate of this model:  17.73 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.837     0.979     0.902       236
           1      0.167     0.022     0.038        46

    accuracy                          0.823       282
   macro avg      0.502     0.500     0.470       282
weighted avg      0.728     0.823     0.761       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  231 |                    5 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   45 |                    1 |
+-----------------+----------------------+----------------------+


*italicized text*### Random Forest

In [None]:
np.random.seed(28)

rf = RandomForestClassifier(random_state=28)
param_grid = {
    'n_estimators': [50, 80, 100, 120, 150],
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'log2',None]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print(grid_search.best_params_)

# Print the best score
print(grid_search.best_score_)

{'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
0.8340766961651918


In [None]:
rf = RandomForestClassifier(n_estimators=80, max_depth=None, max_features="log2", min_samples_leaf=2, min_samples_split=2, random_state=28)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  84.04 %
Misclassification rate of this model:  15.96 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.842     0.996     0.913       236
           1      0.667     0.043     0.082        46

    accuracy                          0.840       282
   macro avg      0.754     0.520     0.497       282
weighted avg      0.814     0.840     0.777       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  235 |                    1 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   44 |                    2 |
+-----------------+----------------------+----------------------+


### Ada boosting

In [None]:
np.random.seed(28)

ada_classifier = AdaBoostClassifier(random_state=28)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
        'algorithm': ['SAMME', 'SAMME.R']
}

grid_search = GridSearchCV(estimator=ada_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'algorithm': 'SAMME', 'learning_rate': 0.01, 'n_estimators': 50}
Best Score: 0.8322989183874141


In [None]:
ada_classifier = AdaBoostClassifier(n_estimators=50, learning_rate=0.01, algorithm='SAMME', random_state=28)

ada_classifier.fit(X_train, y_train)

y_pred_ada = ada_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred_ada) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_ada) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_ada, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred_ada)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  83.69 %
Misclassification rate of this model:  16.31 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.837     1.000     0.911       236
           1      0.000     0.000     0.000        46

    accuracy                          0.837       282
   macro avg      0.418     0.500     0.456       282
weighted avg      0.700     0.837     0.763       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  236 |                    0 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   46 |                    0 |
+-----------------+----------------------+----------------------+


### Gradient Boosting

In [13]:
np.random.seed(28)

gb_classifier = GradientBoostingClassifier(random_state=28)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150, 200,None],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, None],
}

grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 80}
Best Score: 0.8296401179941004


In [19]:
gb_classifier = GradientBoostingClassifier(n_estimators=80, learning_rate=0.05, max_depth = 3, random_state=28)
gb_classifier.fit(X_train, y_train)
y_pred = gb_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  84.4 %
Misclassification rate of this model:  15.6 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.845     0.996     0.914       236
           1      0.750     0.065     0.120        46

    accuracy                          0.844       282
   macro avg      0.798     0.530     0.517       282
weighted avg      0.830     0.844     0.785       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  235 |                    1 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   43 |                    3 |
+-----------------+----------------------+----------------------+


### Multilayer Perceptron

In [14]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [15]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam',alpha=0.0001, max_iter=1000, random_state=42)
mlp.fit(X_train_scaled, y_train)
y_pred = mlp.predict(X_test_scaled)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  79.43 %
Misclassification rate of this model:  20.57 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.848     0.919     0.882       236
           1      0.269     0.152     0.194        46

    accuracy                          0.794       282
   macro avg      0.558     0.536     0.538       282
weighted avg      0.753     0.794     0.770       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  217 |                   19 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   39 |                    7 |
+-----------------+----------------------+----------------------+


### k Nearest Neighbours

In [16]:
np.random.seed(28)

# Initialize KNN classifier
knn_classifier = KNeighborsClassifier()

# Define parameter grid for KNN
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=knn_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)


# Display best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'uniform'}
Best Score: 0.8323028515240904


In [20]:
# Use the best estimator from GridSearchCV to predict on test data
knn_classifier = grid_search.best_estimator_
knn_classifier.fit(X_train, y_train)

y_pred_knn = knn_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_knn) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_knn) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_knn, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_knn)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  83.69 %
Misclassification rate of this model:  16.31 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.837     1.000     0.911       236
           1      0.000     0.000     0.000        46

    accuracy                          0.837       282
   macro avg      0.418     0.500     0.456       282
weighted avg      0.700     0.837     0.763       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  236 |                    0 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   46 |                    0 |
+-----------------+----------------------+----------------------+


### Support Vector Machines

In [17]:
np.random.seed(28)

g = [0.0001, 0.001, 0.01, 0.1]
hist = []
for val in g:
  clf = SVC(gamma=val)
  cross_val = cross_val_score(clf, X_train, y_train, cv=5)
  hist.append(np.mean(cross_val))
print(hist)

[0.8322989183874141, 0.8322989183874141, 0.8305250737463128, 0.831410029498525]


In [21]:
# Use the best estimator from GridSearchCV to predict on test data
svm_classifier = SVC(gamma=0.01)
svm_classifier.fit(X_train, y_train)

y_pred_svm = svm_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_svm) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_svm) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_svm, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_svm)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  83.69 %
Misclassification rate of this model:  16.31 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.837     1.000     0.911       236
           1      0.000     0.000     0.000        46

    accuracy                          0.837       282
   macro avg      0.418     0.500     0.456       282
weighted avg      0.700     0.837     0.763       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  236 |                    0 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   46 |                    0 |
+-----------------+----------------------+----------------------+


### Naive Bias

In [18]:
np.random.seed(28)

# Initialize Naive Bayes classifier
nb_classifier = GaussianNB()

# Define parameter grid for Naive Bayes (even though GaussianNB has few hyperparameters)
param_grid = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=nb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Display best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'var_smoothing': 0.4328761281083058}
Best Score: 0.8323028515240904


In [22]:
# Use the best estimator from GridSearchCV to predict on test data
nb_classifier = grid_search.best_estimator_
nb_classifier.fit(X_train, y_train)

y_pred_nb = nb_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_nb) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_nb) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_nb, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_nb)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  83.69 %
Misclassification rate of this model:  16.31 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.837     1.000     0.911       236
           1      0.000     0.000     0.000        46

    accuracy                          0.837       282
   macro avg      0.418     0.500     0.456       282
weighted avg      0.700     0.837     0.763       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  236 |                    0 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   46 |                    0 |
+-----------------+----------------------+----------------------+


## SMOTE

In [None]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

resampled_counts = Counter(y_train_resampled)
print(f"Resampled class counts: Class 0 = {resampled_counts[0]}, Class 1 = {resampled_counts[1]}")

### Logistic regreesion

In [None]:
lr = LogisticRegression()
result = lr.fit(X_train_resampled, y_train_resampled)
y_pred = lr.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

### Decision tree

In [None]:
dt = DecisionTreeClassifier(random_state=28)
param_grid = {
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'log2', None]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train_resampled, y_train_resampled)

# Print the best hyperparameters
print(grid_search.best_params_)

# Print the best score
print(grid_search.best_score_)

In [None]:
dtree = DecisionTreeClassifier(max_depth = None,  min_samples_split = 2, min_samples_leaf = 1, max_features = 'log2', random_state = 28)
dtree.fit(X_train_resampled, y_train_resampled)
y_pred = dtree.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

### Random Forest

In [None]:
np.random.seed(28)

rf = RandomForestClassifier(random_state=28)
param_grid = {
    'n_estimators': [50, 80, 100, 120, 150],
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'log2',None]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train_resampled, y_train_resampled)

# Print the best hyperparameters
print(grid_search.best_params_)

# Print the best score
print(grid_search.best_score_)

In [None]:
rf = RandomForestClassifier(n_estimators=150, max_depth=None, max_features='log2', min_samples_leaf=1, min_samples_split=3, random_state=28)
rf.fit(X_train_resampled, y_train_resampled)
y_pred = rf.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

### AdaBoosting

In [None]:
np.random.seed(28)

ada_classifier = AdaBoostClassifier(random_state=28)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
        'algorithm': ['SAMME', 'SAMME.R']
}

grid_search = GridSearchCV(estimator=ada_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

grid_search.fit(X_train_resampled, y_train_resampled)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

In [None]:
ada_classifier = AdaBoostClassifier(n_estimators=150, learning_rate=1.0, algorithm='SAMME.R', random_state=28)

ada_classifier.fit(X_train_resampled, y_train_resampled)

y_pred_ada = ada_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred_ada) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_ada) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_ada, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred_ada)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

### Gradient Boosting

In [None]:
np.random.seed(28)

gb_classifier = GradientBoostingClassifier(random_state=28)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150, 200,None],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, None],
}

grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

In [None]:
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.2, max_depth = 5, random_state=28)
gb_classifier.fit(X_train_resampled, y_train_resampled)
y_pred = gb_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

### MLP

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

In [None]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam',alpha=0.0001, max_iter=1000, random_state=42)
mlp.fit(X_train_scaled, y_train_resampled)
y_pred = mlp.predict(X_test_scaled)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

### kNN

In [None]:
np.random.seed(28)

# Initialize KNN classifier
knn_classifier = KNeighborsClassifier()

# Define parameter grid for KNN
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=knn_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model
grid_search.fit(X_train_resampled, y_train_resampled)


# Display best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:
# Use the best estimator from GridSearchCV to predict on test data
knn_classifier = grid_search.best_estimator_
knn_classifier.fit(X_train_resampled, y_train_resampled)

y_pred_knn = knn_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_knn) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_knn) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_knn, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_knn)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

### SVM

In [None]:
np.random.seed(28)

g = [0.0001, 0.001, 0.01, 0.1]
hist = []
for val in g:
  clf = SVC(gamma=val)
  cross_val = cross_val_score(clf, X_train_resampled, y_train_resampled, cv=5)
  hist.append(np.mean(cross_val))
print(hist)

In [None]:
# Use the best estimator from GridSearchCV to predict on test data
svm_classifier = SVC(gamma=0.01)
svm_classifier.fit(X_train_resampled, y_train_resampled)

y_pred_svm = svm_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_svm) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_svm) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_svm, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_svm)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

### Naive Bayes

In [None]:
np.random.seed(28)

# Initialize Naive Bayes classifier
nb_classifier = GaussianNB()

# Define parameter grid for Naive Bayes (even though GaussianNB has few hyperparameters)
param_grid = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=nb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model
grid_search.fit(X_train_resampled, y_train_resampled)

# Display best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:
# Use the best estimator from GridSearchCV to predict on test data
nb_classifier = grid_search.best_estimator_
nb_classifier.fit(X_train_resampled, y_train_resampled)

y_pred_nb = nb_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_nb) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_nb) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_nb, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_nb)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

### XGBoost

In [None]:
np.random.seed(28)

xgb_classifier = XGBClassifier(random_state=28)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150, 200,None],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, None],
}

grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

In [None]:
xgb_classifier = XGBClassifier(n_estimators=200, learning_rate=0.2, max_depth = None, random_state=28)
xgb_classifier.fit(X_train_resampled, y_train_resampled)
y_pred = xgb_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  74.11 %
Misclassification rate of this model:  25.89 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.844     0.847     0.846       236
           1      0.200     0.196     0.198        46

    accuracy                          0.741       282
   macro avg      0.522     0.522     0.522       282
weighted avg      0.739     0.741     0.740       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  200 |                   36 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   37 |                    9 |
+-----------------+----------------------+----------------------+
