In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import time
import os

from datetime import datetime

import shap
import lime
from lime import lime_tabular

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn import metrics
from sklearn.metrics import confusion_matrix

from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve

from sklearn.cluster import KMeans

import missingno as msno

from fancyimpute import IterativeImputer as MICE
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam 


from sklearn.cluster import DBSCAN
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import NearestNeighbors
from collections import Counter

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

from imblearn.over_sampling import KMeansSMOTE
from sklearn.mixture import GaussianMixture


from xgboost import XGBClassifier
from rgf.sklearn import RGFClassifier  # Regularized Greedy Forest
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from joblib import dump, load
import logging


In [2]:

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def split_dataset(dataset, target_column, test_size=0.2):
    """
    Split dataset into training and testing sets.
    """
    X = dataset.drop(columns=[target_column])
    y = dataset[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

    logging.info("Dataset has been split and returned")
    return X_train, X_test, y_train, y_test

def train_ann(X_train, y_train):
    """
    Train an Artificial Neural Network (ANN) on the training data.
    """
    start_time = time.time()
    model = Sequential([
        Input(shape=(X_train.shape[1],)),
        Dense(12, activation='relu'),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=150, batch_size=10, verbose=0)
    end_time = time.time()

    logging.info(f"ANN has been trained in {end_time - start_time:.2f} seconds")
    return model

def train_models(X_train, y_train):
    """
    Train multiple models on the training data.
    """
    models = {}
    param_grids = {
        'RandomForest': {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5]
        },
        'XGBoost': {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 6],
            'learning_rate': [0.01, 0.1]
        },
        'SVM': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        },
        'LogisticRegression': {
            'C': [0.1, 1, 10],
            'penalty': ['l2']
        },
        'GradientBoosting': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5, 7]
        },
        'KNN': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance']
        }
    }

    models['ANN'] = train_ann(X_train, y_train)

    for model_name, param_grid in param_grids.items():
        start_time = time.time()
        try:
            if model_name == 'RandomForest':
                model = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
            elif model_name == 'XGBoost':
                model = GridSearchCV(XGBClassifier(), param_grid, cv=5)
            elif model_name == 'SVM':
                model = GridSearchCV(SVC(probability=True), param_grid, cv=5)
            elif model_name == 'LogisticRegression':
                model = GridSearchCV(LogisticRegression(), param_grid, cv=5)
            elif model_name == 'GradientBoosting':
                model = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=5)
            elif model_name == 'KNN':
                model = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)

            model.fit(X_train, y_train)
            models[model_name] = model.best_estimator_
            end_time = time.time()
            logging.info(f"{model_name} has been trained in {end_time - start_time:.2f} seconds")
        except Exception as e:
            logging.error(f"Error training {model_name}: {e}")

    try:
        start_time = time.time()
        nb = GaussianNB()
        nb.fit(X_train, y_train)
        models['NaiveBayes'] = nb
        end_time = time.time()
        logging.info(f"Naive Bayes has been trained in {end_time - start_time:.2f} seconds")
    except Exception as e:
        logging.error(f"Error training Naive Bayes: {e}")

    return models

def test_models(models, X_test):
    """
    Test trained models on the test data.
    """
    start_time = time.time()
    predictions = {}
    for name, model in models.items():
        try:
            if name == 'ANN':
                predictions[name] = (model.predict(X_test) > 0.5).astype("int32")
            else:
                predictions[name] = model.predict(X_test)
        except Exception as e:
            logging.error(f"Error testing {name}: {e}")
    end_time = time.time()

    logging.info(f"Models have been tested in {end_time - start_time:.2f} seconds")
    return predictions

def evaluate_models(models, predictions, y_test, X_test):
    """
    Evaluate the performance of models.
    """
    start_time = time.time()
    metrics = {}
    for name, y_pred in predictions.items():
        try:
            accuracy = accuracy_score(y_test, y_pred)
            cm = confusion_matrix(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            auc = roc_auc_score(y_test, models[name].predict_proba(X_test)[:, 1]) if name != 'ANN' else roc_auc_score(y_test, models[name].predict(X_test))
            metrics[name] = {
                'accuracy': accuracy,
                'confusion_matrix': cm,
                'f1_score': f1,
                'auc_roc': auc
            }
        except Exception as e:
            logging.error(f"Error evaluating {name}: {e}")
    end_time = time.time()

    logging.info(f"Models have been evaluated in {end_time - start_time:.2f} seconds")
    return metrics


def explainability_shap(models, df_name, X_test, feature_names):

    """
    
    """
    # Ensure X_test is a DataFrame with named columns
    X_test = pd.DataFrame(X_test, columns=feature_names).reset_index(drop=True)
    
    for name, model in models.items():
        if name == 'ANN':
            continue
        try:
            if name in ['RandomForest', 'XGBoost', 'GradientBoosting']:
                explainer = shap.TreeExplainer(model)
            
            # No existing methods to analyse other models using SHAP, so only these three models.
            
            shap_values = explainer.shap_values(X_test)
            
            plt.figure(figsize=(10, 6))
            shap.summary_plot(shap_values[1] if isinstance(shap_values, list) else shap_values, 
                              X_test, plot_type="bar", show=False, max_display=10)
            plt.title(f"Top 10 Most Important Features - {name}")
            plt.tight_layout()
            plt.savefig(f"C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Lime and shap graphs\\{df_name}_shap_importance_{name}.png")
            plt.close()
            logging.info(f"SHAP explanations for {name} created and saved")
        except Exception as e:
            logging.error(f"Error generating SHAP explanations for {name}: {e}")



def explainability_lime(models, df_name, X_train, X_test, feature_names):
    
    """
    
    """
    # Ensure X_train and X_test are DataFrames with named columns
    X_train = pd.DataFrame(X_train, columns=feature_names).reset_index(drop=True)
    X_test = pd.DataFrame(X_test, columns=feature_names).reset_index(drop=True)
    
    explainer = lime.lime_tabular.LimeTabularExplainer(
        X_train.values,  # Use .values to get numpy array
        feature_names=feature_names, 
        class_names=['Negative', 'Positive'], 
        mode='classification'
    )
    for name, model in models.items():
        if name == 'ANN':
            continue
        try:
            i = np.random.randint(0, X_test.shape[0])
            exp = explainer.explain_instance(
                X_test.iloc[i].values,  # Use .iloc[i].values to get numpy array
                model.predict_proba, 
                num_features=6
            )
            feature_importance = pd.DataFrame(exp.as_list(), columns=['Feature', 'Importance'])
            feature_importance['Absolute Importance'] = abs(feature_importance['Importance'])
            feature_importance = feature_importance.sort_values('Absolute Importance', ascending=True)
            plt.figure(figsize=(10, 6))
            colors = ['red' if imp < 0 else 'green' for imp in feature_importance['Importance']]
            plt.barh(feature_importance['Feature'], feature_importance['Importance'], color=colors)
            plt.title(f"LIME Explanation for {name}\nTop 6 Features' Impact on Prediction")
            plt.xlabel('Impact on Prediction (Red = Negative, Green = Positive)')
            plt.tight_layout()
            plt.savefig(f"C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Lime and shap graphs\\{df_name}_lime_explanation_{name}.png")
            plt.close()
            logging.info(f"LIME explanation for {name} created and saved")
        except Exception as e:
            logging.error(f"Error generating LIME explanations for {name}: {e}")



def interpret_results(models, X_test, feature_names):
    summary = "Model Interpretation Summary:\n\n"
    for name, model in models.items():
        if name == 'ANN':
            continue
        summary += f"{name} Model:\n"
        summary += f"Feature Importance from {name} Model:\n"
        try:
            if name in ['RandomForest', 'XGBoost', 'GradientBoosting']:
                importances = model.feature_importances_
                importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
                importance_df = importance_df.sort_values('Importance', ascending=False).head(10)
            else:
                importances = model.coef_[0] if hasattr(model, 'coef_') else None
                importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
                importance_df = importance_df.sort_values('Importance', ascending=False).head(10)
            summary += importance_df.to_string(index=False)
            summary += "\n\n"
        except Exception as e:
            logging.error(f"Error interpreting results for {name}: {e}")
    logging.info("Model interpretation summary created")
    return summary


def save_models(models, directory='models'):
    """
    Save trained models to disk.
    """
    if not os.path.exists(directory):
        os.makedirs(directory)
    for name, model in models.items():
        try:
            if name == 'ANN':
                model.save(os.path.join(directory, f'{name}_model.h5'))
            else:
                dump(model, os.path.join(directory, f'{name}_model.joblib'))
            logging.info(f"{name} model saved")
        except Exception as e:
            logging.error(f"Error saving {name} model: {e}")


# Use only if needed to run back with best models
def load_models(directory='models'):
    """
    Load trained models from disk.
    """
    models = {}
    for filename in os.listdir(directory):
        model_name, ext = os.path.splitext(filename)
        try:
            if ext == '.h5':
                models[model_name] = load_model(os.path.join(directory, filename))
            elif ext == '.joblib':
                models[model_name] = load(os.path.join(directory, filename))
            logging.info(f"{model_name} model loaded")
        except Exception as e:
            logging.error(f"Error loading {model_name} model: {e}")
    return models


def main(dataset, target_column, name):
    """
    Main function to train, test, evaluate, and explain models.
    """
    X_train, X_test, y_train, y_test = split_dataset(dataset, target_column)

    # Standardization
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    logging.info("Data has been standardized")

    models = train_models(X_train, y_train)
    predictions = test_models(models, X_test)
    metrics = evaluate_models(models, predictions, y_test, X_test)

    explainability_shap(models, name, X_test, feature_names=dataset.drop(columns=[target_column]).columns)
    explainability_lime(models, name, X_train, X_test, feature_names=dataset.drop(columns=[target_column]).columns)

    save_models(models)
    logging.info("Models have been saved")

    # Interpret results
    summary = interpret_results(models, X_test, feature_names=dataset.drop(columns=[target_column]).columns)
    print(summary)

    return metrics


def modelling_gs(df, name):
    """
    Function to run the main pipeline with the given dataset.
    """
    target_column = 'LABEL'  # Replace with your target column
    results = main(df, target_column, name)
    logging.info("Results have been documented.")
    return results

# To run the modelling function with a dataset 'df':
# results = modelling_gs(df)

In [3]:
file_paths = [
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\ADASYN_AE_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\ADASYN_MICE_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\KMSMOTE_AE_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\KMSMOTE_MICE_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\SVMSMOTE_AE_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\SVMSMOTE_MICE_3_PCA.xlsx"
]

# Read the Excel files into dataframes
dfs = [pd.read_excel(file_path) for file_path in file_paths]

print("Datasets are read into dataframes")

tot_start_time = time.time()
start_time = time.time()
# Store results in variables
results_ADASYN_AE_3_PCA = modelling_gs(dfs[0], "ADASYN_AE_3_PCA" )
end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by ADASYN_AE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_ADASYN_MICE_3_PCA = modelling_gs(dfs[1], "ADASYN_MICE_3_PCA")

end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by ADASYN_MICE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_KMSMOTE_AE_3_PCA = modelling_gs(dfs[2], "KMSMOTE_AE_3_PCA")

end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by KMSMOTE_AE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_KMSMOTE_MICE_3_PCA = modelling_gs(dfs[3], "KMSMOTE_MICE_3_PCA")

end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by KMSMOTE_MICE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_SVMSMOTE_AE_3_PCA = modelling_gs(dfs[4], "SVMSMOTE_AE_3_PCA")

end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by SVMSMOTE_AE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_SVMSMOTE_MICE_3_PCA = modelling_gs(dfs[5], "SVMSMOTE_MICE_3_PCA")

end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by SVMSMOTE_MICE_3_PCA: {elapsed_time:.2f} mins")


print(" ")
print("_______________________________________________________________________________")
tot_end_time = time.time()  # End timing
tot_elapsed_time = (tot_end_time - tot_start_time) / 60
print(f" Total time taken by all the models : {tot_elapsed_time:.2f} mins")

# Print the results with variable names
print("Results for ADASYN_AE_3_PCA:", results_ADASYN_AE_3_PCA)
print("Results for ADASYN_MICE_3_PCA:", results_ADASYN_MICE_3_PCA)
print("Results for KMSMOTE_AE_3_PCA:", results_KMSMOTE_AE_3_PCA)
print("Results for KMSMOTE_MICE_3_PCA:", results_KMSMOTE_MICE_3_PCA)
print("Results for SVMSMOTE_AE_3_PCA:", results_SVMSMOTE_AE_3_PCA)
print("Results for SVMSMOTE_MICE_3_PCA:", results_SVMSMOTE_MICE_3_PCA)

2024-07-10 07:50:50,799 - INFO - Dataset has been split and returned
2024-07-10 07:50:50,808 - INFO - Data has been standardized


Datasets are read into dataframes


2024-07-10 07:55:38,244 - INFO - ANN has been trained in 287.44 seconds
2024-07-10 08:22:56,567 - INFO - RandomForest has been trained in 1638.32 seconds
2024-07-10 08:23:16,786 - INFO - XGBoost has been trained in 20.22 seconds
2024-07-10 08:43:42,409 - INFO - SVM has been trained in 1225.62 seconds
2024-07-10 08:43:43,109 - INFO - LogisticRegression has been trained in 0.70 seconds
2024-07-10 09:34:17,529 - INFO - GradientBoosting has been trained in 3034.42 seconds
2024-07-10 09:34:20,656 - INFO - KNN has been trained in 3.13 seconds
2024-07-10 09:34:20,664 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 700us/step


2024-07-10 09:34:24,097 - INFO - Models have been tested in 3.43 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 519us/step


2024-07-10 09:34:27,274 - INFO - Models have been evaluated in 3.18 seconds
2024-07-10 09:36:22,453 - INFO - SHAP explanations for RandomForest created and saved
2024-07-10 09:36:23,903 - INFO - SHAP explanations for XGBoost created and saved
2024-07-10 09:36:25,245 - INFO - SHAP explanations for SVM created and saved
2024-07-10 09:36:26,582 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-10 09:36:49,104 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-10 09:37:11,822 - INFO - SHAP explanations for KNN created and saved
2024-07-10 09:37:34,887 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-10 09:37:35,301 - INFO - LIME explanation for RandomForest created and saved
2024-07-10 09:37:35,615 - INFO - LIME explanation for XGBoost created and saved
2024-07-10 09:37:38,474 - INFO - LIME explanation for SVM created and saved
2024-07-10 09:37:38,756 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.252600
      Cost_and_Expense_Ratios_PC1    0.171016
Liquidity_and_Coverage_Ratios_PC1    0.169842
      Cost_and_Expense_Ratios_PC2    0.077200
Liquidity_and_Coverage_Ratios_PC2    0.050963
              Leverage_Ratios_PC2    0.035417
         Profitability_Ratios_PC1    0.033553
             Cash_Flow_Ratios_PC1    0.028474
         Profitability_Ratios_PC2    0.027393
              Activity_Ratios_PC2    0.026083

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.429280
      Cost_and_Expense_Ratios_PC1    0.142012
Liquidity_and_Coverage_Ratios_PC1    0.055873
Liquidity_and_Coverage_Ratios_PC2    0.055459
              Activity_Ratios_PC2    0.033337
              Activity_Ratios_PC1    0.033284
             Cash_Flow

2024-07-10 09:40:45,859 - INFO - ANN has been trained in 185.79 seconds
2024-07-10 10:00:57,735 - INFO - RandomForest has been trained in 1211.88 seconds
2024-07-10 10:01:13,907 - INFO - XGBoost has been trained in 16.17 seconds
2024-07-10 10:23:59,269 - INFO - SVM has been trained in 1365.36 seconds
2024-07-10 10:24:00,032 - INFO - LogisticRegression has been trained in 0.76 seconds
2024-07-10 11:17:09,697 - INFO - GradientBoosting has been trained in 3189.66 seconds
2024-07-10 11:17:12,948 - INFO - KNN has been trained in 3.25 seconds
2024-07-10 11:17:12,957 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 651us/step


2024-07-10 11:17:16,521 - INFO - Models have been tested in 3.56 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 544us/step


2024-07-10 11:17:20,039 - INFO - Models have been evaluated in 3.52 seconds
2024-07-10 11:21:21,214 - INFO - SHAP explanations for RandomForest created and saved
2024-07-10 11:21:23,189 - INFO - SHAP explanations for XGBoost created and saved
2024-07-10 11:21:24,554 - INFO - SHAP explanations for SVM created and saved
2024-07-10 11:21:25,937 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-10 11:21:47,193 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-10 11:22:09,529 - INFO - SHAP explanations for KNN created and saved
2024-07-10 11:22:31,446 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-10 11:22:31,853 - INFO - LIME explanation for RandomForest created and saved
2024-07-10 11:22:32,161 - INFO - LIME explanation for XGBoost created and saved
2024-07-10 11:22:34,956 - INFO - LIME explanation for SVM created and saved
2024-07-10 11:22:35,218 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.252688
      Cost_and_Expense_Ratios_PC1    0.170858
Liquidity_and_Coverage_Ratios_PC1    0.166247
      Cost_and_Expense_Ratios_PC2    0.079246
Liquidity_and_Coverage_Ratios_PC2    0.051848
             Per_Share_Ratios_PC2    0.036114
         Profitability_Ratios_PC1    0.035755
              Activity_Ratios_PC1    0.030111
                Growth_Ratios_PC1    0.026349
             Cash_Flow_Ratios_PC2    0.025510

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.401111
      Cost_and_Expense_Ratios_PC1    0.139944
Liquidity_and_Coverage_Ratios_PC1    0.059314
Liquidity_and_Coverage_Ratios_PC2    0.046645
                Growth_Ratios_PC1    0.043735
             Per_Share_Ratios_PC1    0.042803
              Activity

2024-07-10 11:25:34,134 - INFO - ANN has been trained in 177.77 seconds
2024-07-10 11:44:41,020 - INFO - RandomForest has been trained in 1146.88 seconds
2024-07-10 11:44:56,939 - INFO - XGBoost has been trained in 15.92 seconds
2024-07-10 11:59:54,684 - INFO - SVM has been trained in 897.74 seconds
2024-07-10 11:59:55,374 - INFO - LogisticRegression has been trained in 0.69 seconds
2024-07-10 12:50:22,639 - INFO - GradientBoosting has been trained in 3027.27 seconds
2024-07-10 12:50:25,756 - INFO - KNN has been trained in 3.12 seconds
2024-07-10 12:50:25,765 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 647us/step


2024-07-10 12:50:28,395 - INFO - Models have been tested in 2.63 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 525us/step


2024-07-10 12:50:30,949 - INFO - Models have been evaluated in 2.55 seconds
2024-07-10 12:53:41,108 - INFO - SHAP explanations for RandomForest created and saved
2024-07-10 12:53:42,558 - INFO - SHAP explanations for XGBoost created and saved
2024-07-10 12:53:43,922 - INFO - SHAP explanations for SVM created and saved
2024-07-10 12:53:45,251 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-10 12:54:06,451 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-10 12:54:27,656 - INFO - SHAP explanations for KNN created and saved
2024-07-10 12:54:48,801 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-10 12:54:49,216 - INFO - LIME explanation for RandomForest created and saved
2024-07-10 12:54:49,520 - INFO - LIME explanation for XGBoost created and saved
2024-07-10 12:54:51,601 - INFO - LIME explanation for SVM created and saved
2024-07-10 12:54:51,874 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.324666
Liquidity_and_Coverage_Ratios_PC1    0.185440
      Cost_and_Expense_Ratios_PC1    0.125207
Liquidity_and_Coverage_Ratios_PC2    0.074478
      Cost_and_Expense_Ratios_PC2    0.057918
         Profitability_Ratios_PC1    0.036823
             Cash_Flow_Ratios_PC2    0.034836
             Cash_Flow_Ratios_PC1    0.025570
             Per_Share_Ratios_PC1    0.021765
         Profitability_Ratios_PC2    0.021655

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.606076
      Cost_and_Expense_Ratios_PC1    0.075337
Liquidity_and_Coverage_Ratios_PC1    0.039747
             Cash_Flow_Ratios_PC2    0.032893
              Activity_Ratios_PC2    0.032756
Liquidity_and_Coverage_Ratios_PC2    0.032008
              Activity

2024-07-10 12:57:47,792 - INFO - ANN has been trained in 174.46 seconds
2024-07-10 13:16:57,593 - INFO - RandomForest has been trained in 1149.80 seconds
2024-07-10 13:17:11,201 - INFO - XGBoost has been trained in 13.61 seconds
2024-07-10 13:32:07,988 - INFO - SVM has been trained in 896.79 seconds
2024-07-10 13:32:08,647 - INFO - LogisticRegression has been trained in 0.66 seconds
2024-07-10 14:22:35,167 - INFO - GradientBoosting has been trained in 3026.52 seconds
2024-07-10 14:22:38,291 - INFO - KNN has been trained in 3.12 seconds
2024-07-10 14:22:38,298 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 677us/step


2024-07-10 14:22:40,942 - INFO - Models have been tested in 2.64 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 542us/step


2024-07-10 14:22:43,510 - INFO - Models have been evaluated in 2.57 seconds
2024-07-10 14:26:00,188 - INFO - SHAP explanations for RandomForest created and saved
2024-07-10 14:26:01,571 - INFO - SHAP explanations for XGBoost created and saved
2024-07-10 14:26:02,885 - INFO - SHAP explanations for SVM created and saved
2024-07-10 14:26:04,193 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-10 14:26:25,919 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-10 14:26:47,457 - INFO - SHAP explanations for KNN created and saved
2024-07-10 14:27:09,135 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-10 14:27:09,546 - INFO - LIME explanation for RandomForest created and saved
2024-07-10 14:27:09,852 - INFO - LIME explanation for XGBoost created and saved
2024-07-10 14:27:12,024 - INFO - LIME explanation for SVM created and saved
2024-07-10 14:27:12,294 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.288058
Liquidity_and_Coverage_Ratios_PC1    0.197665
      Cost_and_Expense_Ratios_PC1    0.150055
Liquidity_and_Coverage_Ratios_PC2    0.061691
      Cost_and_Expense_Ratios_PC2    0.060221
             Per_Share_Ratios_PC2    0.036808
             Cash_Flow_Ratios_PC2    0.032950
         Profitability_Ratios_PC1    0.030599
              Activity_Ratios_PC1    0.026453
              Activity_Ratios_PC2    0.022211

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.604689
      Cost_and_Expense_Ratios_PC1    0.093743
Liquidity_and_Coverage_Ratios_PC1    0.035761
             Per_Share_Ratios_PC1    0.031384
             Cash_Flow_Ratios_PC2    0.030782
Liquidity_and_Coverage_Ratios_PC2    0.028183
             Per_Share

2024-07-10 14:30:10,964 - INFO - ANN has been trained in 177.45 seconds
2024-07-10 14:47:43,861 - INFO - RandomForest has been trained in 1052.90 seconds
2024-07-10 14:47:57,799 - INFO - XGBoost has been trained in 13.94 seconds
2024-07-10 15:01:23,834 - INFO - SVM has been trained in 806.03 seconds
2024-07-10 15:01:24,497 - INFO - LogisticRegression has been trained in 0.66 seconds
2024-07-10 15:52:01,697 - INFO - GradientBoosting has been trained in 3037.20 seconds
2024-07-10 15:52:04,817 - INFO - KNN has been trained in 3.12 seconds
2024-07-10 15:52:04,825 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 642us/step


2024-07-10 15:52:07,360 - INFO - Models have been tested in 2.53 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 604us/step


2024-07-10 15:52:09,814 - INFO - Models have been evaluated in 2.45 seconds
2024-07-10 15:54:50,463 - INFO - SHAP explanations for RandomForest created and saved
2024-07-10 15:54:51,834 - INFO - SHAP explanations for XGBoost created and saved
2024-07-10 15:54:53,131 - INFO - SHAP explanations for SVM created and saved
2024-07-10 15:54:54,410 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-10 15:55:15,046 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-10 15:55:35,467 - INFO - SHAP explanations for KNN created and saved
2024-07-10 15:55:55,856 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-10 15:55:56,267 - INFO - LIME explanation for RandomForest created and saved
2024-07-10 15:55:56,573 - INFO - LIME explanation for XGBoost created and saved
2024-07-10 15:55:58,617 - INFO - LIME explanation for SVM created and saved
2024-07-10 15:55:58,884 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.274345
Liquidity_and_Coverage_Ratios_PC1    0.210996
      Cost_and_Expense_Ratios_PC1    0.122159
Liquidity_and_Coverage_Ratios_PC2    0.089381
      Cost_and_Expense_Ratios_PC2    0.062731
         Profitability_Ratios_PC1    0.042956
             Cash_Flow_Ratios_PC2    0.040014
             Cash_Flow_Ratios_PC1    0.029972
              Activity_Ratios_PC1    0.021367
              Activity_Ratios_PC2    0.021063

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.523971
Liquidity_and_Coverage_Ratios_PC1    0.079233
      Cost_and_Expense_Ratios_PC1    0.074354
Liquidity_and_Coverage_Ratios_PC2    0.045701
             Cash_Flow_Ratios_PC1    0.039600
             Cash_Flow_Ratios_PC2    0.033878
      Cost_and_Expense

2024-07-10 15:58:55,457 - INFO - ANN has been trained in 175.37 seconds
2024-07-10 16:16:25,649 - INFO - RandomForest has been trained in 1050.19 seconds
2024-07-10 16:16:39,513 - INFO - XGBoost has been trained in 13.86 seconds
2024-07-10 16:30:08,632 - INFO - SVM has been trained in 809.12 seconds
2024-07-10 16:30:09,455 - INFO - LogisticRegression has been trained in 0.82 seconds
2024-07-10 17:23:08,900 - INFO - GradientBoosting has been trained in 3179.44 seconds
2024-07-10 17:23:12,168 - INFO - KNN has been trained in 3.26 seconds
2024-07-10 17:23:12,179 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 700us/step


2024-07-10 17:23:14,814 - INFO - Models have been tested in 2.63 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 533us/step


2024-07-10 17:23:17,400 - INFO - Models have been evaluated in 2.58 seconds
2024-07-10 17:24:42,165 - INFO - SHAP explanations for RandomForest created and saved
2024-07-10 17:24:43,569 - INFO - SHAP explanations for XGBoost created and saved
2024-07-10 17:24:44,848 - INFO - SHAP explanations for SVM created and saved
2024-07-10 17:24:46,156 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-10 17:25:06,802 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-10 17:25:27,402 - INFO - SHAP explanations for KNN created and saved
2024-07-10 17:25:48,236 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-10 17:25:48,633 - INFO - LIME explanation for RandomForest created and saved
2024-07-10 17:25:48,940 - INFO - LIME explanation for XGBoost created and saved
2024-07-10 17:25:50,979 - INFO - LIME explanation for SVM created and saved
2024-07-10 17:25:51,260 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.303832
Liquidity_and_Coverage_Ratios_PC1    0.207459
      Cost_and_Expense_Ratios_PC1    0.119858
Liquidity_and_Coverage_Ratios_PC2    0.091779
      Cost_and_Expense_Ratios_PC2    0.042357
         Profitability_Ratios_PC1    0.037489
             Cash_Flow_Ratios_PC2    0.035461
             Cash_Flow_Ratios_PC1    0.028253
             Per_Share_Ratios_PC2    0.023071
              Activity_Ratios_PC1    0.022960

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.515313
Liquidity_and_Coverage_Ratios_PC1    0.094884
      Cost_and_Expense_Ratios_PC1    0.071955
Liquidity_and_Coverage_Ratios_PC2    0.053566
             Cash_Flow_Ratios_PC1    0.041987
             Per_Share_Ratios_PC2    0.033123
             Cash_Flow

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>