In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
#Import the libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, make_scorer, accuracy_score, recall_score, precision_score
from sklearn.metrics import confusion_matrix
from sklearn.utils import class_weight
from sklearn.feature_selection import RFE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import plot_confusion_matrix
from keras.models import Sequential
from tensorflow.keras.models import Sequential
from imblearn.over_sampling import SMOTE
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout


import random
import itertools

import scipy

import warnings
warnings.filterwarnings("ignore")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1- Data overview

In [None]:
# Load the bankruptcy dataset from the Kaggle input directory into a DataFrame
data = pd.read_csv('/kaggle/input/company-bankruptcy-prediction/data.csv')

In [None]:
# Preview the first rows of the dataset
data.head()

In [None]:
# Report the dataset dimensions: the (rows, columns) tuple first, then each count on its own line
r, c = data.shape
print("Shape of Data:", (r, c))
print("Number of Rows:", r)
print("Number of Columns:", c)

In [None]:
# Column names, non-null counts and dtypes of the dataset
print("Information about the Dataset")
data.info()

In [None]:
# Missing values per column (this prints a Series with one entry per column, not a single total)
print("Number of Null Values:",data.isnull().sum())

In [None]:
# Number of rows that repeat an earlier row
print("Number of Duplicate Values: ",data.duplicated().sum())

In [None]:
# Summary statistics of the columns
print("Descriptive Stastices")
data.describe()

# 2- Target variable analysis

The target is a dichotomous variable, I am going to have a look at the distribution of the two classes.

In [None]:
# Class counts of the target, followed by a bar chart of the same counts
target_counts = data["Bankrupt?"].value_counts()
print(target_counts)
fig, ax = plt.subplots()
sns.countplot(data=data, x='Bankrupt?', ax=ax)
plt.show()

**Note: There is a huge imbalance between the two categories. Only 3.2% of the companies in this dataset went bankrupt.**

# 3-  Variable-target analysis

Some companies went bankrupt and some did not. Before proceeding with the analysis, I would like to see at least some evidence that the variables have an effect on bankruptcy.

> **3.1 Non-statistical test**

Plotting the relative difference between the means of the features for both categories (bankrupted and not bankrupted).

In [None]:
#Variables' effect on class

features = data.columns[1:] #from now on "features" are interchangeable with "columns"

X = data[features]
y = data["Bankrupt?"]

X_0 = X.loc[y==0,:] #not bankrupt
X_1 = X.loc[y==1,:] #bankrupt

#Random subsample of the non-bankrupt companies with as many rows as there are bankrupt ones.
#Seeded so that the plot and significant_cols are reproducible on "Restart & Run All".
X_0_test = X_0.sample(n=len(X_1), random_state=42)

#Relative difference between the class means, computed for all features at once.
#A feature whose subsample mean is 0 yields inf/nan here rather than raising.
bankrupt_feature_means = X_1.mean()
reference_feature_means = X_0_test.mean()
relative_diffs = (bankrupt_feature_means - reference_feature_means) / reference_feature_means

difs = [[col, relative_diffs[col]] for col in X.columns] #differences between means
#features that have "very different" means: threshold is a mean at least 50% greater/smaller
significant_cols = [col for col in X.columns if abs(relative_diffs[col]) > 0.5]


sns.barplot(x=list(range(len(difs))),y=[e[1] for e in difs])
plt.ylim((-1,5)) #this controls the size of the window displayed
plt.xlabel("Features")
plt.ylabel("Relative difference between means")
plt.show()

There are a few features with really big differences and overall around 20 features whose means are more than 50% apart in these two categories.

> **3.2 Monte Carlo Hypothesis Test**

**HYPOTHESIS**: There is a difference between bankrupt and non-bankrupt companies.
(Null hypothesis: There is no difference between the bankrupted and not-bankrupted companies.)

I am going to generate 1000 samples, each containing 220 datapoints drawn from X (= all datapoints), and obtain the sampling distribution of the sample mean for each feature. From the observed data (= the 220 datapoints of bankrupt companies) and the sampling distribution I am going to determine the p-value.

**p-value** for each feature: percentage of sample means that are more extreme than the bankrupt companies mean

In [None]:
#MONTE CARLO HYPOTESIS TEST

from statistics import mean

sampling_distribution = {feature: [] for feature in features} #SAMPLING DISTRIBUTION OF SAMPLE MEANS for each feature
bankrupt_means = {feature: X_1[feature].mean() for feature in features} #MEAN of each feature (observed data = bankrupt companies)

n_bankrupt = len(X_1) #each random sample has as many rows as there are bankrupt companies

for i in range(1000): #sampling from the data 1000 times
    #random_state=i gives every iteration its own, reproducible, draw from X
    X_sample = X.sample(n=n_bankrupt, random_state=i)
    sample_means = X_sample[features].mean() #all feature means of this sample in one pass
    for feature in features:
        sampling_distribution[feature].append(sample_means[feature])

pvalues = {feature: None for feature in features}

def get_p_value(sampling_distribution, observed):
    """Return the two-sided Monte Carlo p-value of ``observed``.

    The p-value is the proportion of values in ``sampling_distribution`` that lie
    strictly farther from the distribution's mean than ``observed`` does.

    Parameters
    ----------
    sampling_distribution : sequence of numbers
        Sample means obtained by repeated sampling. Must be non-empty
        (``statistics.mean`` raises ``StatisticsError`` on empty input).
    observed : number
        The observed statistic to compare against the distribution.

    Returns
    -------
    float
        Fraction, between 0 and 1, of sample means more extreme than ``observed``.
    """
    #the centre is computed once; recomputing it for every element made the function quadratic
    center = mean(sampling_distribution)
    observed_distance = abs(observed - center) #distance of observed from the centre
    more_extreme = sum(abs(sample_mean - center) > observed_distance for sample_mean in sampling_distribution)
    return more_extreme / len(sampling_distribution) #the proportion of data more extreme than observed
               
#Compute the p-value of every feature and store it under the feature's key in pvalues
pvalues.update(
    (feature_name, get_p_value(sampling_distribution[feature_name], bankrupt_means[feature_name]))
    for feature_name in list(pvalues)
)

In [None]:
#A feature is significantly different when its p-value is SMALL, i.e. few random samples
#are as extreme as the bankrupt mean, so count p-values below the 0.05 level
n_significant = (np.array(list(pvalues.values())) < 0.05).sum()
print("Number of significantly different features: %d" % n_significant)
dict(itertools.islice(pvalues.items(),10)) #look at the first 10 features and associated p-values

In [None]:
#Plotting some features and their distribution of sample means + red line with the mean of the observed data (= data of bankrupt companies)

plotted_features = [
    " Operating Gross Margin",
    " Interest-bearing debt interest rate",
    " Inventory/Current Liability",
    " No-credit Interval",
]

fig, axes = plt.subplots(2,2, figsize=(15,8))

#axes.ravel() walks the grid row by row: [0,0], [0,1], [1,0], [1,1]
for ax, feature in zip(axes.ravel(), plotted_features):
    #histplot(kde=True, stat="density") is the documented replacement for the deprecated distplot
    sns.histplot(sampling_distribution[feature], kde=True, stat="density", ax=ax)
    ax.axvline(x=bankrupt_means[feature], label="observation - pvalue %.2f" % pvalues[feature], c="r")
    ax.set_title("Sampling distribution of the mean:" + feature)
    ax.legend(loc='upper left')

plt.tight_layout()
plt.show()

I only examined the variables independently, while there are probably many dependencies between them, so I am not going to draw conclusions or perform feature selection based on these p-values.

# 4- Multicollinearity

I am going to find features with correlation coefficient greater than 0.9 and drop them.

In [None]:
#MULTICOLLINEARITY (CORRELATION BETWEEN PREDICTOR VARIABLES)

#The target column is excluded: multicollinearity concerns predictors only, and keeping it in
#the matrix would let a feature be dropped merely for being highly correlated with the target
cor_matrix = data.drop(columns="Bankrupt?").corr().abs()
cor_matrix.style.background_gradient(sns.light_palette('red', as_cmap=True))

In [None]:
#Dropping correlated data

#Strict upper triangle (k=1): every pair of columns is inspected once and the diagonal is ignored.
#The builtin bool is used because the np.bool alias was removed in NumPy 1.24.
upper_mask = np.triu(np.ones(cor_matrix.shape, dtype=bool), k=1)
upper_tri = cor_matrix.where(upper_mask) #upper triangle of the correlation matrix

#A column is dropped when its correlation with any column that precedes it exceeds 0.9
dropped_cols = set(upper_tri.columns[(upper_tri > 0.9).any()])

print("There are %d dropped columns" %len(dropped_cols))

#errors="ignore" keeps the cell re-runnable once the columns have already been removed from X
X = X.drop(columns=sorted(dropped_cols), errors="ignore")
X.head()

**PCA** is a way to decorrelate and reduce the dimensionality of the data through the change of the basis. I am going to try if the method helps to decorrelate the data.

In [None]:
#PCA

scaler = StandardScaler()
X_for_pca = pd.DataFrame(data=scaler.fit_transform(X),index=X.index,columns=X.columns) #standardized dataset

n_components = 10

#random_state fixes the result if scikit-learn picks its randomized SVD solver
pca = PCA(n_components=n_components, random_state=42)
principal_components = pca.fit_transform(X_for_pca)
X_pc = pd.DataFrame(data=principal_components, columns=['PC %d'%d for d in range(n_components)])

#the message follows n_components instead of hard-coding the number
print("Explained variance by %d components %.2f" %(n_components, sum(pca.explained_variance_ratio_)))

With 10 principal components the explained variance is still very low, so I do not find the PCA transformation useful for this data.

# 5- Data Imbalance

There is a huge imbalance between the data (only 3.2% companies from the dataset bankrupted). Before training a model I need to deal with this problem, otherwise the model would just predict every company to not bankrupt.

I decided to try two ways:

1- **Introducing weights** \ Every datapoint from the minority class is considered "more important" than from the majority class, the weights for the two classes are inversely proportional to the number of datapoints in that class. Implemented within the SVM in next section.

2- **SMOTE** \ The Synthetic Minority Over-sampling TEchnique. \ Creates new synthetic datapoints using the k-nearest neighbor algorithm. \ With this method I am going to obtain the dataset where the value counts for both categories are the same.

In [None]:
#DATA IMBALANCE
#SMOTE

sm = SMOTE(random_state=42)

#Oversample the minority class with synthetic points until both classes have the same size
X_sm, y_sm = sm.fit_resample(X, y)

print('New balance of 1 and 0 classes (%):')
#normalize=True turns the counts into class shares, matching the "(%)" in the message above
y_sm.value_counts(normalize=True) * 100

In [None]:
# Result accumulators: model label, accuracy, recall, precision and F1, one list each
mod, accuracy = [], []
Recall, Precision, F1_measure = [], [], []

In [None]:
#Split BEFORE oversampling: hold out an untouched 20% test set (stratified on the real class ratio),
#then apply SMOTE to the training portion only. Resampling the whole dataset first would let
#synthetic training points be interpolated from rows that end up in the test set (data leakage).
X_train_raw, X_test, y_train_raw, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train_raw, y_train_raw)
    

In [None]:
# Nested feature subsets: the first 10, 50 and 70 columns of the train/test split.
# (.iloc slicing silently returns fewer columns when the frame is narrower than the requested width.)
X_train_10, X_test_10 = X_train.iloc[:, :10], X_test.iloc[:, :10]
X_train_50, X_test_50 = X_train.iloc[:, :50], X_test.iloc[:, :50]
X_train_70, X_test_70 = X_train.iloc[:, :70], X_test.iloc[:, :70]

# 1- SVM

I am going to train an SVM model: first with the SMOTE dataset, then without the SMOTE data, and lastly with the SMOTE dataset reduced to 10% of the data.

The function train_test_SVM(X,y) has multiple steps:

1- Splitting the data               
2- Assigning the weights                         
3- Creating a Pipeline                                 
4- Using GridSearchCV to find the optimal hyperparameters \ Train the model                            
5- Score                                         
6- Confusion matrix

The SVM training takes quite long (around 4 minutes for me).

* big amount of datapoints (perhaps too many for a SVM)
* GridSearchCV using cross validation for different (C, gamma) combinations
* training 'rbf' kernel is slower than linear kernel

In [None]:
#SVM

def train_test_SVM(X,X_test_):
    """Grid-search an RBF-kernel SVM pipeline, evaluate it on the test data and record its metrics.

    Relies on the notebook-level globals ``y_train`` and ``y_test`` for the labels and
    appends one entry to each of the result lists ``mod``, ``accuracy``, ``F1_measure``,
    ``Recall`` and ``Precision``.

    Parameters
    ----------
    X : training feature matrix; its rows must line up with ``y_train``.
    X_test_ : test feature matrix with the same columns as ``X``; its rows must line up with ``y_test``.

    Returns
    -------
    The best pipeline (``StandardScaler`` + ``SVC``) found by the grid search.
    """
    #per-sample weights inversely proportional to class frequency (all equal when y_train is balanced)
    sw_train = class_weight.compute_sample_weight(class_weight = 'balanced', y = y_train)

    steps = [('scaler', StandardScaler()), ('SVM', SVC(cache_size=7000))]
    pipeline = Pipeline(steps)

    #parameters' names must match the 'SVM' name in Pipeline followed by two underscores!
    #standard SVM hyperparameters
    param_grid = {
    'SVM__C':[0.01,0.1,1,10],
    'SVM__gamma':[0.1,0.01,0.001,0.0001],
    'SVM__kernel':['rbf']
    }

    #model selection uses the macro-averaged F1 over 5 cross-validation folds
    f1 = make_scorer(f1_score , average='macro')
    grid = GridSearchCV(pipeline,param_grid=param_grid, cv=5, scoring=f1, verbose=0) #verbose controls the training progression display!
    grid.fit(X, y_train, SVM__sample_weight = sw_train)

    print("best parameters: ")
    print(grid.best_params_)

    model = grid.best_estimator_
    y_pred = model.predict(X_test_)

    #each test metric is computed once and reused for both printing and recording
    f1_test = f1_score(y_test, y_pred)
    precision_test = precision_score(y_test, y_pred)
    recall_test = recall_score(y_test, y_pred)
    acc2 = accuracy_score(y_test, y_pred)

    print("f1 score is %.2f "%f1_test)
    print("Precision: %.2f" %precision_test)
    print("Recall: %.2f" %recall_test)
    print("Accuracy score for SVM Model: {:.2f} %".format(acc2*100))
    plot_confusion_matrix(model,
                         X_test_,
                         y_test,
                         values_format='d')
    mod.append('SVM')
    accuracy.append(acc2)
    F1_measure.append(f1_test)
    Recall.append(recall_test)
    Precision.append(precision_test)
    return model

In [None]:
#SVM trained and tested on the 10-column feature subset

model = train_test_SVM(X_train_10,X_test_10)

Note: Without SMOTE the performance is far worse. The model is not "meaningless" as it would be without the weights; however, I suppose the weights are simply "not enough" for such a big imbalance.

In [None]:
#SVM trained and tested on the 50-column feature subset

model = train_test_SVM(X_train_50,X_test_50)

In [None]:
#SVM trained and tested on the 70-column feature subset

model = train_test_SVM(X_train_70,X_test_70)

An SVM does not use all the data to determine the decision boundary (only the support vectors matter), which is why the model works quite well with only 10% of the data. The training is also much faster.

# 2- Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, plot_confusion_matrix
from sklearn.utils import class_weight
from sklearn.utils.class_weight import compute_sample_weight

In [None]:
def train_test_LogisticRegression(X_train,X_test_):
    """Grid-search a logistic-regression pipeline, evaluate it on the test data and record its metrics.

    Relies on the notebook-level globals ``y_train`` and ``y_test`` for the labels and
    appends one entry to each of the result lists ``mod``, ``accuracy``, ``F1_measure``,
    ``Recall`` and ``Precision``.

    Parameters
    ----------
    X_train : training feature matrix; its rows must line up with ``y_train``.
    X_test_ : test feature matrix with the same columns as ``X_train``; its rows must line up with ``y_test``.

    Returns
    -------
    The best pipeline (``StandardScaler`` + ``LogisticRegression``) found by the grid search.
    """
    #per-sample weights inversely proportional to class frequency (all equal when y_train is balanced)
    sw_train = compute_sample_weight(class_weight = 'balanced', y = y_train)

    steps = [('scaler', StandardScaler()), ('LogisticRegression', LogisticRegression(solver='lbfgs', max_iter=10000))]
    pipeline = Pipeline(steps)

    #parameters' names must match the 'LogisticRegression' name in Pipeline followed by two underscores!
    #The 'lbfgs' solver supports only the l2 penalty, so searching l1 with it made every l1 candidate
    #fail to fit. The grid is split so that each penalty is paired with a solver that supports it.
    param_grid = [
        {
            'LogisticRegression__solver': ['lbfgs'],
            'LogisticRegression__penalty': ['l2'],
            'LogisticRegression__C': [0.01, 0.1, 1, 10],
            'LogisticRegression__class_weight': ['balanced', None],
        },
        {
            'LogisticRegression__solver': ['liblinear'],
            'LogisticRegression__penalty': ['l1'],
            'LogisticRegression__C': [0.01, 0.1, 1, 10],
            'LogisticRegression__class_weight': ['balanced', None],
        },
    ]

    #model selection uses the macro-averaged F1 over 5 cross-validation folds
    f1 = make_scorer(f1_score , average='macro')
    grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring=f1, verbose=0) #verbose controls the training progression display!
    grid.fit(X_train, y_train, LogisticRegression__sample_weight=sw_train)

    print("best parameters: ")
    print(grid.best_params_)

    model = grid.best_estimator_
    y_pred = model.predict(X_test_)

    #each test metric is computed once and reused for both printing and recording
    f1_test = f1_score(y_test, y_pred)
    precision_test = precision_score(y_test, y_pred)
    recall_test = recall_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)

    print("f1 score is %.2f "%f1_test)
    print("Precision: %.2f" %precision_test)
    print("Recall: %.2f" %recall_test)
    print("Accuracy score for Logistic Regression Model: {:.2f} %".format(acc*100))
    plot_confusion_matrix(model,
                         X_test_,
                         y_test,
                         values_format='d')
    mod.append('LR')
    accuracy.append(acc)
    F1_measure.append(f1_test)
    Recall.append(recall_test)
    Precision.append(precision_test)
    return model


In [None]:
#Logistic regression trained and tested on the 10-column feature subset

model = train_test_LogisticRegression(X_train_10,X_test_10)

In [None]:
#Logistic regression trained and tested on the 50-column feature subset
model = train_test_LogisticRegression(X_train_50,X_test_50)

In [None]:
#Logistic regression trained and tested on the 70-column feature subset
model = train_test_LogisticRegression(X_train_70,X_test_70)

# 3. RandomForest 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def train_test_RandomForest(X, X_test_):
    """Grid-search a random-forest pipeline, evaluate it on the test data and record its metrics.

    Relies on the notebook-level globals ``y_train`` and ``y_test`` for the labels and
    appends one entry to each of the result lists ``mod``, ``accuracy``, ``F1_measure``,
    ``Recall`` and ``Precision``.

    Parameters
    ----------
    X : training feature matrix; its rows must line up with ``y_train``.
    X_test_ : test feature matrix with the same columns as ``X``; its rows must line up with ``y_test``.

    Returns
    -------
    The best pipeline (``StandardScaler`` + ``RandomForestClassifier``) found by the grid search.
    """
    #per-sample weights inversely proportional to class frequency (all equal when y_train is balanced)
    sw_train = compute_sample_weight(class_weight='balanced', y=y_train)

    #random_state makes the forest, and therefore the selected hyperparameters and scores,
    #reproducible, consistent with the seeded tree-based models elsewhere in this notebook
    steps = [('scaler', StandardScaler()), ('RandomForest', RandomForestClassifier(random_state=42))]
    pipeline = Pipeline(steps)

    # Random Forest Classifier hyperparameters
    param_grid = {
        'RandomForest__n_estimators': [10, 20, 30],
        'RandomForest__max_depth': [None, 10, 20],
        'RandomForest__min_samples_split': [2, 5],
        'RandomForest__class_weight': ['balanced', None]
    }

    #model selection uses the macro-averaged F1 over 5 cross-validation folds
    f1 = make_scorer(f1_score, average='macro')
    grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring=f1, verbose=0)
    grid.fit(X, y_train, RandomForest__sample_weight=sw_train)

    print("best parameters: ")
    print(grid.best_params_)

    model = grid.best_estimator_
    y_pred = model.predict(X_test_)

    #each test metric is computed once and reused for both printing and recording
    f1_test = f1_score(y_test, y_pred)
    precision_test = precision_score(y_test, y_pred)
    recall_test = recall_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)

    print("f1 score is %.2f" % f1_test)
    print("Precision: %.2f" % precision_test)
    print("Recall: %.2f" % recall_test)
    print("Accuracy score for Random Forest Model: {:.2f} %".format(acc*100))
    plot_confusion_matrix(model, X_test_, y_test, values_format='d')
    mod.append('RF')
    accuracy.append(acc)
    F1_measure.append(f1_test)
    Recall.append(recall_test)
    Precision.append(precision_test)
    return model


In [None]:
#Random forest trained and tested on the 10-column feature subset

model = train_test_RandomForest(X_train_10,X_test_10)

In [None]:
#Random forest trained and tested on the 50-column feature subset

model = train_test_RandomForest(X_train_50,X_test_50)

In [None]:
#Random forest trained and tested on the 70-column feature subset

model = train_test_RandomForest(X_train_70,X_test_70)

# 4- NaiveBayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, plot_confusion_matrix

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, plot_confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

def train_test_NaiveBayes(X, X_test_):
    """Tune a Gaussian Naive Bayes pipeline, evaluate it on the test data and record its metrics.

    Searches ``var_smoothing`` with 5-fold cross-validation scored by macro F1. Reads the
    notebook-level globals ``y_train`` and ``y_test`` and appends one entry to each of the
    result lists ``mod``, ``accuracy``, ``F1_measure``, ``Recall`` and ``Precision``.

    X is the training feature matrix (rows aligned with ``y_train``) and X_test_ the test
    feature matrix (rows aligned with ``y_test``). Returns the best fitted pipeline.
    """
    # 'balanced' sample weights are all equal when y_train is already balanced
    balanced_weights = compute_sample_weight(class_weight='balanced', y=y_train)

    nb_pipeline = Pipeline([('scaler', StandardScaler()), ('NaiveBayes', GaussianNB())])
    smoothing_grid = {'NaiveBayes__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}

    search = GridSearchCV(
        nb_pipeline,
        param_grid=smoothing_grid,
        cv=5,
        scoring=make_scorer(f1_score, average='macro'),
        verbose=0,  # verbose controls the training progression display
    )
    search.fit(X, y_train, NaiveBayes__sample_weight=balanced_weights)

    print("best parameters: ")
    print(search.best_params_)

    best_pipeline = search.best_estimator_
    predictions = best_pipeline.predict(X_test_)

    f1_value = f1_score(y_test, predictions)
    precision_value = precision_score(y_test, predictions)
    recall_value = recall_score(y_test, predictions)
    accuracy_value = accuracy_score(y_test, predictions)

    print("f1 score is %.2f "%f1_value)
    print("Precision: %.2f" %precision_value)
    print("Recall: %.2f" %recall_value)
    print("Accuracy score for Naive Bayes Model: {:.2f} %".format(accuracy_value*100))
    plot_confusion_matrix(best_pipeline, X_test_, y_test, values_format='d')

    # record this run in the shared result lists
    for results, value in (
        (mod, 'NB'),
        (accuracy, accuracy_value),
        (F1_measure, f1_value),
        (Recall, recall_value),
        (Precision, precision_value),
    ):
        results.append(value)
    return best_pipeline


In [None]:
#Naive Bayes trained and tested on the 10-column feature subset

model = train_test_NaiveBayes(X_train_10,X_test_10)

In [None]:
#Naive Bayes trained and tested on the 50-column feature subset

model = train_test_NaiveBayes(X_train_50,X_test_50)

In [None]:
#Naive Bayes trained and tested on the 70-column feature subset

model = train_test_NaiveBayes(X_train_70,X_test_70)

# 5- DecisionTree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

def train_test_DecisionTree(X, X_test_):
    """Tune a decision-tree pipeline, evaluate it on the test data and record its metrics.

    Searches depth and split/leaf sizes with 5-fold cross-validation scored by macro F1.
    Reads the notebook-level globals ``y_train`` and ``y_test`` and appends one entry to
    each of the result lists ``mod``, ``accuracy``, ``F1_measure``, ``Recall`` and ``Precision``.

    X is the training feature matrix (rows aligned with ``y_train``) and X_test_ the test
    feature matrix (rows aligned with ``y_test``). Returns the best fitted pipeline.
    """
    balanced_weights = compute_sample_weight(class_weight='balanced', y=y_train)

    tree_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('DecisionTree', DecisionTreeClassifier(random_state=42)),
    ])
    search_space = {
        'DecisionTree__max_depth': [5, 10, 15, 20, 25],
        'DecisionTree__min_samples_split': [2, 5, 10, 15, 20],
        'DecisionTree__min_samples_leaf': [1, 2, 5, 10, 15],
    }

    macro_f1 = make_scorer(f1_score, average='macro')
    search = GridSearchCV(tree_pipeline, param_grid=search_space, cv=5, scoring=macro_f1, verbose=0)
    search.fit(X, y_train, DecisionTree__sample_weight=balanced_weights)

    print("best parameters: ")
    print(search.best_params_)

    best_pipeline = search.best_estimator_
    predictions = best_pipeline.predict(X_test_)

    f1_value = f1_score(y_test, predictions)
    precision_value = precision_score(y_test, predictions)
    recall_value = recall_score(y_test, predictions)
    accuracy_value = accuracy_score(y_test, predictions)

    print("f1 score is %.2f "%f1_value)
    print("Precision: %.2f" %precision_value)
    print("Recall: %.2f" %recall_value)
    print("Accuracy score for Decision Tree Model: {:.2f} %".format(accuracy_value*100))
    plot_confusion_matrix(best_pipeline, X_test_, y_test, values_format='d')

    # record this run in the shared result lists
    mod.append('DT')
    accuracy.append(accuracy_value)
    F1_measure.append(f1_value)
    Recall.append(recall_value)
    Precision.append(precision_value)
    return best_pipeline


In [None]:
#Decision tree trained and tested on the 10-column feature subset

model = train_test_DecisionTree(X_train_10,X_test_10)

In [None]:
#Decision tree trained and tested on the 50-column feature subset
model = train_test_DecisionTree(X_train_50,X_test_50)

In [None]:
#Decision tree trained and tested on the 70-column feature subset
model = train_test_DecisionTree(X_train_70,X_test_70)

# 6.AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
def train_test_AdaBoost(X,X_test_):
    """Grid-search an AdaBoost pipeline, evaluate it on the test data and record its metrics.

    Relies on the notebook-level globals ``y_train`` and ``y_test`` for the labels and
    appends one entry to each of the result lists ``mod``, ``accuracy``, ``F1_measure``,
    ``Recall`` and ``Precision``.

    Parameters
    ----------
    X : training feature matrix; its rows must line up with ``y_train``.
    X_test_ : test feature matrix with the same columns as ``X``; its rows must line up with ``y_test``.

    Returns
    -------
    The best pipeline (``StandardScaler`` + ``AdaBoostClassifier``) found by the grid search.
    """
    #per-sample weights inversely proportional to class frequency (all equal when y_train is balanced)
    sw_train = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)

    #random_state makes the ensemble reproducible, consistent with the other seeded models in this notebook
    pipeline = Pipeline([('scaler', StandardScaler()), ('AdaBoost', AdaBoostClassifier(random_state=42))])

    # AdaBoost hyperparameters
    param_grid = {
        'AdaBoost__n_estimators': [50, 100, 200],
        'AdaBoost__learning_rate': [0.01, 0.1, 1]
    }

    #model selection uses the macro-averaged F1 over 5 cross-validation folds
    f1 = make_scorer(f1_score, average='macro')
    grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring=f1, verbose=0) # verbose controls the training progression display!
    grid.fit(X, y_train, AdaBoost__sample_weight=sw_train)

    print("best parameters: ")
    print(grid.best_params_)

    model = grid.best_estimator_
    y_pred = model.predict(X_test_)

    #each test metric is computed once and reused for both printing and recording
    f1_test = f1_score(y_test, y_pred)
    precision_test = precision_score(y_test, y_pred)
    recall_test = recall_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)

    print("f1 score is %.2f" % f1_test)
    print("Precision: %.2f" % precision_test)
    print("Recall: %.2f" % recall_test)
    print("Accuracy: %.2f" % acc)

    plot_confusion_matrix(model, X_test_, y_test, values_format='d')

    mod.append('AdaBoost')
    accuracy.append(acc)
    F1_measure.append(f1_test)
    Recall.append(recall_test)
    Precision.append(precision_test)

    return model


In [None]:
#AdaBoost trained and tested on the 10-column feature subset
model = train_test_AdaBoost(X_train_10,X_test_10)

In [None]:
#AdaBoost trained and tested on the 50-column feature subset
model = train_test_AdaBoost(X_train_50,X_test_50)

In [None]:
#AdaBoost trained and tested on the 70-column feature subset
model = train_test_AdaBoost(X_train_70,X_test_70)

# 7. GradientBoostingClassifier


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

def train_test_GBT(X, X_test_):
    """Tune a gradient-boosted-trees pipeline, evaluate it on the test data and record its metrics.

    Searches the number of estimators, learning rate and tree depth with 5-fold
    cross-validation scored by macro F1. Reads the notebook-level globals ``y_train`` and
    ``y_test`` and appends one entry to each of the result lists ``mod``, ``accuracy``,
    ``F1_measure``, ``Recall`` and ``Precision``.

    X is the training feature matrix (rows aligned with ``y_train``) and X_test_ the test
    feature matrix (rows aligned with ``y_test``). Returns the best fitted pipeline.
    """
    balanced_weights = compute_sample_weight(class_weight='balanced', y=y_train)

    gbt_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('GBT', GradientBoostingClassifier(random_state=42)),
    ])
    search_space = {
        'GBT__n_estimators': [50, 100, 200],
        'GBT__learning_rate': [0.1, 0.05, 0.01],
        'GBT__max_depth': [3, 5, 7],
    }

    macro_f1 = make_scorer(f1_score, average='macro')
    search = GridSearchCV(gbt_pipeline, param_grid=search_space, cv=5, scoring=macro_f1, verbose=0)
    search.fit(X, y_train, GBT__sample_weight=balanced_weights)

    print("best parameters: ")
    print(search.best_params_)

    best_pipeline = search.best_estimator_
    predictions = best_pipeline.predict(X_test_)

    f1_value = f1_score(y_test, predictions)
    precision_value = precision_score(y_test, predictions)
    recall_value = recall_score(y_test, predictions)
    accuracy_value = accuracy_score(y_test, predictions)

    print("f1 score is %.2f "%f1_value)
    print("Precision: %.2f" %precision_value)
    print("Recall: %.2f" %recall_value)
    print("Accuracy score for Gradient Boosted Trees Model: {:.2f} %".format(accuracy_value*100))
    plot_confusion_matrix(best_pipeline, X_test_, y_test, values_format='d')

    # record this run in the shared result lists
    mod.append('GBT')
    accuracy.append(accuracy_value)
    F1_measure.append(f1_value)
    Recall.append(recall_value)
    Precision.append(precision_value)
    return best_pipeline


In [None]:
#Gradient boosted trees trained and tested on the 10-column feature subset
model = train_test_GBT(X_train_10,X_test_10)

In [None]:
#Gradient boosted trees trained and tested on the 50-column feature subset
model = train_test_GBT(X_train_50,X_test_50)

In [None]:
#Gradient boosted trees trained and tested on the 70-column feature subset
model = train_test_GBT(X_train_70,X_test_70)

In [None]:
# Collect the recorded results into one table: one row per entry of the result lists
df = pd.DataFrame(
    {
        'Classifier': mod,
        'Accuracy': accuracy,
        'F1-Measure': F1_measure,
        'Recall': Recall,
        'Precision': Precision,
    }
)


In [None]:
# Display the results table
df

In [None]:
# Work on an independent copy: a plain `df2 = df` only aliases the same object,
# so in-place edits made through df2 would also change df
df2 = df.copy()

In [None]:
# Tag every result row with the feature subset it was trained on.
# This relies on each model having been run in the order 10, 50, 70 features,
# so the row position modulo 3 identifies the subset.
feature_suffixes = ['_10', '_50', '_70']
suffix_by_row = pd.Series([feature_suffixes[i % 3] for i in range(len(df))], index=df.index)

# Build df2 from df without mutating any frame in place, so re-running this cell
# never appends the suffix a second time
df2 = df.assign(Classifier=df['Classifier'] + suffix_by_row)
# Show the updated dataframe
df2


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Split the dataframe into consecutive groups of 3 rows (one group per model type).
# Grouping by row position rather than by index value keeps this working even if
# df2 no longer has its default integer index.
groups = df2.groupby(np.arange(len(df2)) // 3)

# Iterate over the groups and create one bar chart per group
for i, group in groups:
    ax = group.plot(kind='bar', figsize=(10,6))
    ax.set_title('Classifier Comparison')
    ax.set_xlabel('Classifier')
    ax.set_ylabel('Score')
    ax.set_xticklabels(group["Classifier"],rotation=0)
    plt.tight_layout()  # adjust spacing so labels are not clipped
    plt.show()



In [None]:
import pandas as pd
import matplotlib.pyplot as plt



# Index a derived frame by classifier name instead of using set_index(..., inplace=True):
# df2 keeps its 'Classifier' column, so this cell and the per-group chart cell can be re-run
metrics_by_classifier = df2.set_index('Classifier')

# Plot the bar chart
ax = metrics_by_classifier.plot(kind='bar', figsize=(10,6))

# Set the title, x-label, and y-label of the plot
ax.set_title('Classifier Comparison')
ax.set_xlabel('Classifier')
ax.set_ylabel('Score')

# Show the plot
plt.show()
