In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import time
import os

from datetime import datetime

import shap
import lime
from lime import lime_tabular

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn import metrics
from sklearn.metrics import confusion_matrix, cohen_kappa_score

from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve

from sklearn.cluster import KMeans

import missingno as msno

from fancyimpute import IterativeImputer as MICE
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam 


from sklearn.cluster import DBSCAN
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import NearestNeighbors
from collections import Counter

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

from imblearn.over_sampling import KMeansSMOTE
from sklearn.mixture import GaussianMixture


from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score, roc_curve, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from joblib import dump, load
import logging


In [2]:

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def split_dataset(dataset, target_column, test_size=0.2):
    
    X = dataset.drop(columns=[target_column])
    y = dataset[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

    logging.info("Dataset has been split and returned")
    return X_train, X_test, y_train, y_test

# This is given separately becuase, the other models are trained using prebuilt libraries
def train_ann(X_train, y_train):
    
    start_time = time.time()
    model = Sequential([
        Input(shape=(X_train.shape[1],)),
        Dense(12, activation='relu'),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=150, batch_size=10, verbose=0)
    end_time = time.time()

    logging.info(f"ANN has been trained in {end_time - start_time:.2f} seconds")
    return model

def train_models(X_train, y_train):
    
    models = {}
    param_grids = {
        'RandomForest': {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5]
        },
        'XGBoost': {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 6],
            'learning_rate': [0.01, 0.1]
        },
        'SVM': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        },
        'LogisticRegression': {
            'C': [0.1, 1, 10],
            'penalty': ['l2']
        },
        'GradientBoosting': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5, 7]
        },
        'KNN': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance']
        }
    }

    models['ANN'] = train_ann(X_train, y_train)

    for model_name, param_grid in param_grids.items():
        start_time = time.time()
        try:
            if model_name == 'RandomForest':
                model = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
            elif model_name == 'XGBoost':
                model = GridSearchCV(XGBClassifier(), param_grid, cv=5)
            elif model_name == 'SVM':
                model = GridSearchCV(SVC(probability=True), param_grid, cv=5)
            elif model_name == 'LogisticRegression':
                model = GridSearchCV(LogisticRegression(), param_grid, cv=5)
            elif model_name == 'GradientBoosting':
                model = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=5)
            elif model_name == 'KNN':
                model = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)

            model.fit(X_train, y_train)
            models[model_name] = model.best_estimator_
            end_time = time.time()
            logging.info(f"{model_name} has been trained in {end_time - start_time:.2f} seconds")
        except Exception as e:
            logging.error(f"Error training {model_name}: {e}")

    try:
        start_time = time.time()
        nb = GaussianNB()
        nb.fit(X_train, y_train)
        models['NaiveBayes'] = nb
        end_time = time.time()
        logging.info(f"Naive Bayes has been trained in {end_time - start_time:.2f} seconds")
    except Exception as e:
        logging.error(f"Error training Naive Bayes: {e}")

    return models

def test_models(models, X_test):
    
    start_time = time.time()
    predictions = {}
    for name, model in models.items():
        try:
            if name == 'ANN':
                predictions[name] = (model.predict(X_test) > 0.5).astype("int32")
            else:
                predictions[name] = model.predict(X_test)
        except Exception as e:
            logging.error(f"Error testing {name}: {e}")
    end_time = time.time()

    logging.info(f"Models have been tested in {end_time - start_time:.2f} seconds")
    return predictions


def evaluate_models(models, predictions, y_test, X_test):
    
    start_time = time.time()
    metrics = {}
    
    for name, y_pred in predictions.items():
        try:
            accuracy = accuracy_score(y_test, y_pred)
            cm = confusion_matrix(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            
            if hasattr(models[name], "predict_proba"):
                auc = roc_auc_score(y_test, models[name].predict_proba(X_test)[:, 1])
            else:
                auc = roc_auc_score(y_test, models[name].predict(X_test))
            
            # Calculate specificity
            if cm.shape == (2, 2):
                tn, fp, fn, tp = cm.ravel()
                specificity = tn / (tn + fp)
            else:
                specificity = 0  # or handle the case appropriately
            
            # Calculate G-mean
            g_mean = np.sqrt(recall * specificity)
            
            # Calculate Kappa statistic
            kappa = cohen_kappa_score(y_test, y_pred)
            
            metrics[name] = {
                'accuracy': accuracy,
                'confusion_matrix': cm,
                'f1_score': f1,
                'precision': precision,
                'recall': recall,
                'auc_roc': auc,
                'g_mean': g_mean,
                'kappa': kappa
            }
        except Exception as e:
            logging.error(f"Error evaluating {name}: {e}")
    
    end_time = time.time()
    logging.info(f"Models have been evaluated in {end_time - start_time:.2f} seconds")
    
    return metrics


def explainability_shap(models, df_name, X_test, feature_names):

    
    # Ensure X_test is a DataFrame with named columns
    X_test = pd.DataFrame(X_test, columns=feature_names).reset_index(drop=True)
    
    for name, model in models.items():
        if name == 'ANN':
            continue
        try:
            if name in ['RandomForest', 'XGBoost', 'GradientBoosting']:
                explainer = shap.TreeExplainer(model)
            
            # No existing methods to analyse other models using SHAP, so only these three models.
            
            shap_values = explainer.shap_values(X_test)
            
            plt.figure(figsize=(10, 6))
            shap.summary_plot(shap_values[1] if isinstance(shap_values, list) else shap_values, 
                              X_test, plot_type="bar", show=False, max_display=10)
            plt.title(f"Top 10 Most Important Features - {name}")
            plt.tight_layout()
            plt.savefig(f"C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Lime and shap graphs\\{df_name}_shap_importance_{name}.png")
            plt.close()
            logging.info(f"SHAP explanations for {name} created and saved")
        except Exception as e:
            logging.error(f"Error generating SHAP explanations for {name}: {e}")



def explainability_lime(models, df_name, X_train, X_test, feature_names):
    
    
    # Ensure X_train and X_test are DataFrames with named columns
    X_train = pd.DataFrame(X_train, columns=feature_names).reset_index(drop=True)
    X_test = pd.DataFrame(X_test, columns=feature_names).reset_index(drop=True)
    
    explainer = lime.lime_tabular.LimeTabularExplainer(
        X_train.values,  # Use .values to get numpy array
        feature_names=feature_names, 
        class_names=['Negative', 'Positive'], 
        mode='classification'
    )
    for name, model in models.items():
        if name == 'ANN':
            continue
        try:
            i = np.random.randint(0, X_test.shape[0])
            exp = explainer.explain_instance(
                X_test.iloc[i].values,  # Use .iloc[i].values to get numpy array
                model.predict_proba, 
                num_features=6
            )
            feature_importance = pd.DataFrame(exp.as_list(), columns=['Feature', 'Importance'])
            feature_importance['Absolute Importance'] = abs(feature_importance['Importance'])
            feature_importance = feature_importance.sort_values('Absolute Importance', ascending=True)
            plt.figure(figsize=(10, 6))
            colors = ['red' if imp < 0 else 'green' for imp in feature_importance['Importance']]
            plt.barh(feature_importance['Feature'], feature_importance['Importance'], color=colors)
            plt.title(f"LIME Explanation for {name}\nTop 6 Features' Impact on Prediction")
            plt.xlabel('Impact on Prediction (Red = Negative, Green = Positive)')
            plt.tight_layout()
            plt.savefig(f"C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Lime and shap graphs\\{df_name}_lime_explanation_{name}.png")
            plt.close()
            logging.info(f"LIME explanation for {name} created and saved")
        except Exception as e:
            logging.error(f"Error generating LIME explanations for {name}: {e}")



def interpret_results(models, X_test, feature_names):

    
    summary = "Model Interpretation Summary:\n\n"
    for name, model in models.items():
        if name == 'ANN':
            continue
        summary += f"{name} Model:\n"
        summary += f"Feature Importance from {name} Model:\n"
        try:
            if name in ['RandomForest', 'XGBoost', 'GradientBoosting']:
                importances = model.feature_importances_
                importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
                importance_df = importance_df.sort_values('Importance', ascending=False).head(10)
            else:
                importances = model.coef_[0] if hasattr(model, 'coef_') else None
                importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
                importance_df = importance_df.sort_values('Importance', ascending=False).head(10)
            summary += importance_df.to_string(index=False)
            summary += "\n\n"
        except Exception as e:
            logging.error(f"Error interpreting results for {name}: {e}")
    logging.info("Model interpretation summary created")
    return summary


def save_models(models, directory='models'):
    
    if not os.path.exists(directory):
        os.makedirs(directory)
    for name, model in models.items():
        try:
            if name == 'ANN':
                model.save(os.path.join(directory, f'{name}_model.h5'))
            else:
                dump(model, os.path.join(directory, f'{name}_model.joblib'))
            logging.info(f"{name} model saved")
        except Exception as e:
            logging.error(f"Error saving {name} model: {e}")


# Use only if needed to run back with best models
def load_models(directory='models'):
    
    models = {}
    for filename in os.listdir(directory):
        model_name, ext = os.path.splitext(filename)
        try:
            if ext == '.h5':
                models[model_name] = load_model(os.path.join(directory, filename))
            elif ext == '.joblib':
                models[model_name] = load(os.path.join(directory, filename))
            logging.info(f"{model_name} model loaded")
        except Exception as e:
            logging.error(f"Error loading {model_name} model: {e}")
    return models


def main(dataset, target_column, name):
    
    X_train, X_test, y_train, y_test = split_dataset(dataset, target_column)

    # Standardization
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    logging.info("Data has been standardized")

    models = train_models(X_train, y_train)
    predictions = test_models(models, X_test)
    metrics = evaluate_models(models, predictions, y_test, X_test)

    explainability_shap(models, name, X_test, feature_names=dataset.drop(columns=[target_column]).columns)
    explainability_lime(models, name, X_train, X_test, feature_names=dataset.drop(columns=[target_column]).columns)

    # save_models(models)
    logging.info("Models have been saved")

    # Interpret results
    summary = interpret_results(models, X_test, feature_names=dataset.drop(columns=[target_column]).columns)
    print(summary)

    return metrics


def modelling_gs(df, name):
    
    target_column = 'LABEL'  # Replace with your target column
    results = main(df, target_column, name)
    logging.info("Results have been documented.")
    return results

# To run the modelling function with a dataset 'df':
# results = modelling_gs(df)

In [3]:
file_paths = [
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\ADASYN_AE_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\ADASYN_MICE_RF_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\KMSMOTE_AE_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\KMSMOTE_MICE_RF_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\SVMSMOTE_AE_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\SVMSMOTE_MICE_RF_3_PCA.xlsx"
]

# Read the Excel files into dataframes
dfs = [pd.read_excel(file_path) for file_path in file_paths]

print("Datasets are read into dataframes")

tot_start_time = time.time()
start_time = time.time()
# Store results in variables
results_ADASYN_AE_3_PCA = modelling_gs(dfs[0], "ADASYN_AE_3_PCA" )
end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by ADASYN_AE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_ADASYN_MICE_3_PCA = modelling_gs(dfs[1], "ADASYN_MICE_RF_3_PCA")

end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by ADASYN_MICE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_KMSMOTE_AE_3_PCA = modelling_gs(dfs[2], "KMSMOTE_AE_3_PCA")

end_time = time.time()  #End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by KMSMOTE_AE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_KMSMOTE_MICE_3_PCA = modelling_gs(dfs[3], "KMSMOTE_MICE_RF_3_PCA")

end_time = time.time()  # end timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by KMSMOTE_MICE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_SVMSMOTE_AE_3_PCA = modelling_gs(dfs[4], "SVMSMOTE_AE_3_PCA")

end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by SVMSMOTE_AE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_SVMSMOTE_MICE_3_PCA = modelling_gs(dfs[5], "SVMSMOTE_MICE_RF_3_PCA")

end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by SVMSMOTE_MICE_3_PCA: {elapsed_time:.2f} mins")


print(" ")
print("_______________________________________________________________________________")
tot_end_time = time.time()  # End timing
tot_elapsed_time = (tot_end_time - tot_start_time) / 60
print(f" Total time taken by all the models : {tot_elapsed_time:.2f} mins")

# Print the all final results with variable names
print("Results for ADASYN_AE_3_PCA:", results_ADASYN_AE_3_PCA)
print("Results for ADASYN_MICE_3_PCA:", results_ADASYN_MICE_3_PCA)
print("Results for KMSMOTE_AE_3_PCA:", results_KMSMOTE_AE_3_PCA)
print("Results for KMSMOTE_MICE_3_PCA:", results_KMSMOTE_MICE_3_PCA)
print("Results for SVMSMOTE_AE_3_PCA:", results_SVMSMOTE_AE_3_PCA)
print("Results for SVMSMOTE_MICE_3_PCA:", results_SVMSMOTE_MICE_3_PCA)

2024-07-22 19:38:30,825 - INFO - Dataset has been split and returned
2024-07-22 19:38:30,833 - INFO - Data has been standardized


Datasets are read into dataframes


2024-07-22 19:41:26,504 - INFO - ANN has been trained in 175.67 seconds
2024-07-22 20:00:24,599 - INFO - RandomForest has been trained in 1138.09 seconds
2024-07-22 20:00:39,535 - INFO - XGBoost has been trained in 14.94 seconds
2024-07-22 20:20:01,899 - INFO - SVM has been trained in 1162.36 seconds
2024-07-22 20:20:02,548 - INFO - LogisticRegression has been trained in 0.65 seconds
2024-07-22 21:10:33,321 - INFO - GradientBoosting has been trained in 3030.77 seconds
2024-07-22 21:10:36,352 - INFO - KNN has been trained in 3.03 seconds
2024-07-22 21:10:36,364 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 648us/step


2024-07-22 21:10:39,627 - INFO - Models have been tested in 3.26 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 531us/step


2024-07-22 21:10:42,969 - INFO - Models have been evaluated in 3.34 seconds
2024-07-22 21:12:27,140 - INFO - SHAP explanations for RandomForest created and saved
2024-07-22 21:12:28,644 - INFO - SHAP explanations for XGBoost created and saved
2024-07-22 21:12:30,096 - INFO - SHAP explanations for SVM created and saved
2024-07-22 21:12:31,551 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-22 21:12:52,401 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-22 21:13:13,315 - INFO - SHAP explanations for KNN created and saved
2024-07-22 21:13:34,049 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-22 21:13:34,455 - INFO - LIME explanation for RandomForest created and saved
2024-07-22 21:13:34,761 - INFO - LIME explanation for XGBoost created and saved
2024-07-22 21:13:37,726 - INFO - LIME explanation for SVM created and saved
2024-07-22 21:13:38,009 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
Liquidity_and_Coverage_Ratios_PC1    0.200358
              Leverage_Ratios_PC1    0.194320
      Cost_and_Expense_Ratios_PC1    0.177273
      Cost_and_Expense_Ratios_PC2    0.097519
Liquidity_and_Coverage_Ratios_PC2    0.056907
         Profitability_Ratios_PC1    0.036241
              Activity_Ratios_PC1    0.030568
             Cash_Flow_Ratios_PC1    0.028284
             Cash_Flow_Ratios_PC2    0.026895
         Profitability_Ratios_PC2    0.025529

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.416653
      Cost_and_Expense_Ratios_PC1    0.090932
      Cost_and_Expense_Ratios_PC2    0.081417
Liquidity_and_Coverage_Ratios_PC2    0.051538
Liquidity_and_Coverage_Ratios_PC1    0.051497
              Activity_Ratios_PC1    0.039150
         Profitability

2024-07-22 21:16:32,506 - INFO - ANN has been trained in 173.30 seconds
2024-07-22 21:36:01,842 - INFO - RandomForest has been trained in 1169.34 seconds
2024-07-22 21:36:16,381 - INFO - XGBoost has been trained in 14.54 seconds
2024-07-22 21:56:35,709 - INFO - SVM has been trained in 1219.33 seconds
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2024-07-22 21:56:36,572 - INFO - LogisticRegression has been trained in 0.86 seconds
2024-07-22 22:46:27,442 - INFO - GradientBoosting has been trained in 2990.87 seconds
2024-07-22 22:46:30,243 - INFO - KNN has been trained in 2.80 seconds
2024-07-22 22:46:30,251 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 654us/step


2024-07-22 22:46:33,344 - INFO - Models have been tested in 3.09 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 548us/step


2024-07-22 22:46:36,545 - INFO - Models have been evaluated in 3.20 seconds
2024-07-22 22:50:29,353 - INFO - SHAP explanations for RandomForest created and saved
2024-07-22 22:50:31,246 - INFO - SHAP explanations for XGBoost created and saved
2024-07-22 22:50:32,648 - INFO - SHAP explanations for SVM created and saved
2024-07-22 22:50:34,052 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-22 22:50:54,889 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-22 22:51:15,677 - INFO - SHAP explanations for KNN created and saved
2024-07-22 22:51:36,484 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-22 22:51:36,905 - INFO - LIME explanation for RandomForest created and saved
2024-07-22 22:51:37,210 - INFO - LIME explanation for XGBoost created and saved
2024-07-22 22:51:39,884 - INFO - LIME explanation for SVM created and saved
2024-07-22 22:51:40,158 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.222312
      Cost_and_Expense_Ratios_PC1    0.197523
Liquidity_and_Coverage_Ratios_PC1    0.136176
      Cost_and_Expense_Ratios_PC2    0.097559
Liquidity_and_Coverage_Ratios_PC2    0.050811
              Leverage_Ratios_PC2    0.044592
         Profitability_Ratios_PC1    0.035438
              Activity_Ratios_PC1    0.028526
             Cash_Flow_Ratios_PC1    0.026148
              Activity_Ratios_PC2    0.026077

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.434680
      Cost_and_Expense_Ratios_PC1    0.155829
Liquidity_and_Coverage_Ratios_PC2    0.058382
Liquidity_and_Coverage_Ratios_PC1    0.056347
             Cash_Flow_Ratios_PC1    0.035663
              Activity_Ratios_PC1    0.030532
             Per_Share

2024-07-22 22:54:35,629 - INFO - ANN has been trained in 174.28 seconds
2024-07-22 23:13:08,003 - INFO - RandomForest has been trained in 1112.37 seconds
2024-07-22 23:13:21,551 - INFO - XGBoost has been trained in 13.55 seconds
2024-07-22 23:27:39,188 - INFO - SVM has been trained in 857.64 seconds
2024-07-22 23:27:39,874 - INFO - LogisticRegression has been trained in 0.69 seconds
2024-07-23 00:17:40,787 - INFO - GradientBoosting has been trained in 3000.91 seconds
2024-07-23 00:17:43,589 - INFO - KNN has been trained in 2.80 seconds
2024-07-23 00:17:43,597 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 658us/step


2024-07-23 00:17:46,242 - INFO - Models have been tested in 2.64 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 519us/step


2024-07-23 00:17:48,949 - INFO - Models have been evaluated in 2.71 seconds
2024-07-23 00:21:03,378 - INFO - SHAP explanations for RandomForest created and saved
2024-07-23 00:21:04,802 - INFO - SHAP explanations for XGBoost created and saved
2024-07-23 00:21:06,133 - INFO - SHAP explanations for SVM created and saved
2024-07-23 00:21:07,486 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-23 00:21:28,684 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-23 00:21:49,765 - INFO - SHAP explanations for KNN created and saved
2024-07-23 00:22:10,790 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-23 00:22:11,208 - INFO - LIME explanation for RandomForest created and saved
2024-07-23 00:22:11,508 - INFO - LIME explanation for XGBoost created and saved
2024-07-23 00:22:13,615 - INFO - LIME explanation for SVM created and saved
2024-07-23 00:22:13,894 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.272774
Liquidity_and_Coverage_Ratios_PC1    0.189410
      Cost_and_Expense_Ratios_PC1    0.161514
Liquidity_and_Coverage_Ratios_PC2    0.076516
      Cost_and_Expense_Ratios_PC2    0.074826
         Profitability_Ratios_PC1    0.036442
             Cash_Flow_Ratios_PC2    0.030916
             Cash_Flow_Ratios_PC1    0.023327
              Activity_Ratios_PC2    0.021369
              Activity_Ratios_PC1    0.021331

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.578870
      Cost_and_Expense_Ratios_PC1    0.096509
Liquidity_and_Coverage_Ratios_PC1    0.043744
              Activity_Ratios_PC1    0.035837
Liquidity_and_Coverage_Ratios_PC2    0.034951
              Activity_Ratios_PC2    0.032172
             Cash_Flow

2024-07-23 00:25:08,859 - INFO - ANN has been trained in 173.62 seconds
2024-07-23 00:43:55,599 - INFO - RandomForest has been trained in 1126.74 seconds
2024-07-23 00:44:09,147 - INFO - XGBoost has been trained in 13.55 seconds
2024-07-23 00:58:37,938 - INFO - SVM has been trained in 868.79 seconds
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2024-07-23 00:58:38,761 - INFO - LogisticRegression has been trained in 0.82 seconds
2024-07-23 01:48:43,407 - INFO - GradientBoosting has been trained in 3004.65 seconds
2024-07-23 01:48:46,234 - INFO - KNN has been trained in 2.83 seconds
2024-07-23 01:48:46,241 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 671us/step


2024-07-23 01:48:48,633 - INFO - Models have been tested in 2.39 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 531us/step


2024-07-23 01:48:51,082 - INFO - Models have been evaluated in 2.45 seconds
2024-07-23 01:51:57,967 - INFO - SHAP explanations for RandomForest created and saved
2024-07-23 01:51:59,397 - INFO - SHAP explanations for XGBoost created and saved
2024-07-23 01:52:00,751 - INFO - SHAP explanations for SVM created and saved
2024-07-23 01:52:02,109 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-23 01:52:22,769 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-23 01:52:43,522 - INFO - SHAP explanations for KNN created and saved
2024-07-23 01:53:04,118 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-23 01:53:04,539 - INFO - LIME explanation for RandomForest created and saved
2024-07-23 01:53:04,843 - INFO - LIME explanation for XGBoost created and saved
2024-07-23 01:53:06,838 - INFO - LIME explanation for SVM created and saved
2024-07-23 01:53:07,110 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.295880
Liquidity_and_Coverage_Ratios_PC1    0.179006
      Cost_and_Expense_Ratios_PC1    0.138214
Liquidity_and_Coverage_Ratios_PC2    0.066711
      Cost_and_Expense_Ratios_PC2    0.063595
         Profitability_Ratios_PC1    0.045965
              Activity_Ratios_PC2    0.043580
             Cash_Flow_Ratios_PC2    0.029392
              Activity_Ratios_PC1    0.027914
         Profitability_Ratios_PC2    0.019790

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.623866
      Cost_and_Expense_Ratios_PC1    0.082495
              Activity_Ratios_PC2    0.037848
Liquidity_and_Coverage_Ratios_PC1    0.035664
Liquidity_and_Coverage_Ratios_PC2    0.029252
              Activity_Ratios_PC1    0.026884
             Cash_Flow

2024-07-23 01:56:02,407 - INFO - ANN has been trained in 174.12 seconds
2024-07-23 02:13:29,891 - INFO - RandomForest has been trained in 1047.48 seconds
2024-07-23 02:13:43,355 - INFO - XGBoost has been trained in 13.46 seconds
2024-07-23 02:26:38,493 - INFO - SVM has been trained in 775.14 seconds
2024-07-23 02:26:39,092 - INFO - LogisticRegression has been trained in 0.60 seconds
2024-07-23 03:16:30,991 - INFO - GradientBoosting has been trained in 2991.90 seconds
2024-07-23 03:16:33,806 - INFO - KNN has been trained in 2.81 seconds
2024-07-23 03:16:33,813 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 719us/step


2024-07-23 03:16:36,310 - INFO - Models have been tested in 2.50 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 542us/step


2024-07-23 03:16:38,764 - INFO - Models have been evaluated in 2.45 seconds
2024-07-23 03:19:19,905 - INFO - SHAP explanations for RandomForest created and saved
2024-07-23 03:19:21,376 - INFO - SHAP explanations for XGBoost created and saved
2024-07-23 03:19:22,741 - INFO - SHAP explanations for SVM created and saved
2024-07-23 03:19:24,102 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-23 03:19:43,511 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-23 03:20:02,962 - INFO - SHAP explanations for KNN created and saved
2024-07-23 03:20:22,446 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-23 03:20:22,862 - INFO - LIME explanation for RandomForest created and saved
2024-07-23 03:20:23,160 - INFO - LIME explanation for XGBoost created and saved
2024-07-23 03:20:25,196 - INFO - LIME explanation for SVM created and saved
2024-07-23 03:20:25,470 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.274142
Liquidity_and_Coverage_Ratios_PC1    0.205798
      Cost_and_Expense_Ratios_PC1    0.147311
Liquidity_and_Coverage_Ratios_PC2    0.081293
      Cost_and_Expense_Ratios_PC2    0.064461
         Profitability_Ratios_PC1    0.039987
             Cash_Flow_Ratios_PC2    0.031349
             Cash_Flow_Ratios_PC1    0.029869
              Activity_Ratios_PC2    0.018811
         Profitability_Ratios_PC2    0.018784

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.505651
Liquidity_and_Coverage_Ratios_PC1    0.090656
      Cost_and_Expense_Ratios_PC1    0.069397
Liquidity_and_Coverage_Ratios_PC2    0.061130
             Cash_Flow_Ratios_PC1    0.036352
             Per_Share_Ratios_PC2    0.031088
      Cost_and_Expense

2024-07-23 03:23:21,109 - INFO - ANN has been trained in 174.27 seconds
2024-07-23 03:40:55,334 - INFO - RandomForest has been trained in 1054.23 seconds
2024-07-23 03:41:08,770 - INFO - XGBoost has been trained in 13.44 seconds
2024-07-23 03:53:50,976 - INFO - SVM has been trained in 762.20 seconds
2024-07-23 03:53:51,713 - INFO - LogisticRegression has been trained in 0.74 seconds
2024-07-23 04:43:44,535 - INFO - GradientBoosting has been trained in 2992.82 seconds
2024-07-23 04:43:47,360 - INFO - KNN has been trained in 2.82 seconds
2024-07-23 04:43:47,369 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 650us/step


2024-07-23 04:43:49,701 - INFO - Models have been tested in 2.33 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 556us/step


2024-07-23 04:43:52,133 - INFO - Models have been evaluated in 2.43 seconds
2024-07-23 04:47:47,510 - INFO - SHAP explanations for RandomForest created and saved
2024-07-23 04:47:48,966 - INFO - SHAP explanations for XGBoost created and saved
2024-07-23 04:47:50,296 - INFO - SHAP explanations for SVM created and saved
2024-07-23 04:47:51,705 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-23 04:48:10,533 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-23 04:48:29,406 - INFO - SHAP explanations for KNN created and saved
2024-07-23 04:48:48,117 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-23 04:48:48,566 - INFO - LIME explanation for RandomForest created and saved
2024-07-23 04:48:48,871 - INFO - LIME explanation for XGBoost created and saved
2024-07-23 04:48:50,785 - INFO - LIME explanation for SVM created and saved
2024-07-23 04:48:51,066 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.250686
Liquidity_and_Coverage_Ratios_PC1    0.230570
      Cost_and_Expense_Ratios_PC1    0.138832
Liquidity_and_Coverage_Ratios_PC2    0.088658
      Cost_and_Expense_Ratios_PC2    0.056759
         Profitability_Ratios_PC1    0.037599
             Cash_Flow_Ratios_PC2    0.035791
             Cash_Flow_Ratios_PC1    0.029805
              Activity_Ratios_PC1    0.023575
              Activity_Ratios_PC2    0.021451

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.511008
Liquidity_and_Coverage_Ratios_PC1    0.088647
      Cost_and_Expense_Ratios_PC1    0.070526
Liquidity_and_Coverage_Ratios_PC2    0.058024
             Cash_Flow_Ratios_PC1    0.039331
             Cash_Flow_Ratios_PC2    0.036763
             Per_Share

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

Note: the above are MICE_RF datasets results