In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import time
import os

from datetime import datetime

import shap
import lime
from lime import lime_tabular

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn import metrics
from sklearn.metrics import confusion_matrix

from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve

from sklearn.cluster import KMeans

import missingno as msno

from fancyimpute import IterativeImputer as MICE
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam 


from sklearn.cluster import DBSCAN
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import NearestNeighbors
from collections import Counter

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

from imblearn.over_sampling import KMeansSMOTE
from sklearn.mixture import GaussianMixture


from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score, roc_curve, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from joblib import dump, load
import logging


In [4]:

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def split_dataset(dataset, target_column, test_size=0.2):
    """
    Split dataset into training and testing sets.
    """
    X = dataset.drop(columns=[target_column])
    y = dataset[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

    logging.info("Dataset has been split and returned")
    return X_train, X_test, y_train, y_test

def train_ann(X_train, y_train):
    """
    Train an Artificial Neural Network (ANN) on the training data.
    """
    start_time = time.time()
    model = Sequential([
        Input(shape=(X_train.shape[1],)),
        Dense(12, activation='relu'),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=150, batch_size=10, verbose=0)
    end_time = time.time()

    logging.info(f"ANN has been trained in {end_time - start_time:.2f} seconds")
    return model

def train_models(X_train, y_train):
    """
    Train multiple models on the training data.
    """
    models = {}
    param_grids = {
        'RandomForest': {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5]
        },
        'XGBoost': {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 6],
            'learning_rate': [0.01, 0.1]
        },
        'SVM': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        },
        'LogisticRegression': {
            'C': [0.1, 1, 10],
            'penalty': ['l2']
        },
        'GradientBoosting': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5, 7]
        },
        'KNN': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance']
        }
    }

    models['ANN'] = train_ann(X_train, y_train)

    for model_name, param_grid in param_grids.items():
        start_time = time.time()
        try:
            if model_name == 'RandomForest':
                model = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
            elif model_name == 'XGBoost':
                model = GridSearchCV(XGBClassifier(), param_grid, cv=5)
            elif model_name == 'SVM':
                model = GridSearchCV(SVC(probability=True), param_grid, cv=5)
            elif model_name == 'LogisticRegression':
                model = GridSearchCV(LogisticRegression(), param_grid, cv=5)
            elif model_name == 'GradientBoosting':
                model = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=5)
            elif model_name == 'KNN':
                model = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)

            model.fit(X_train, y_train)
            models[model_name] = model.best_estimator_
            end_time = time.time()
            logging.info(f"{model_name} has been trained in {end_time - start_time:.2f} seconds")
        except Exception as e:
            logging.error(f"Error training {model_name}: {e}")

    try:
        start_time = time.time()
        nb = GaussianNB()
        nb.fit(X_train, y_train)
        models['NaiveBayes'] = nb
        end_time = time.time()
        logging.info(f"Naive Bayes has been trained in {end_time - start_time:.2f} seconds")
    except Exception as e:
        logging.error(f"Error training Naive Bayes: {e}")

    return models

def test_models(models, X_test):
    """
    Test trained models on the test data.
    """
    start_time = time.time()
    predictions = {}
    for name, model in models.items():
        try:
            if name == 'ANN':
                predictions[name] = (model.predict(X_test) > 0.5).astype("int32")
            else:
                predictions[name] = model.predict(X_test)
        except Exception as e:
            logging.error(f"Error testing {name}: {e}")
    end_time = time.time()

    logging.info(f"Models have been tested in {end_time - start_time:.2f} seconds")
    return predictions


def evaluate_models(models, predictions, y_test, X_test):
    """
    Evaluate the performance of models.
    """
    start_time = time.time()
    metrics = {}
    for name, y_pred in predictions.items():
        try:
            accuracy = accuracy_score(y_test, y_pred)
            cm = confusion_matrix(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            auc = roc_auc_score(y_test, models[name].predict_proba(X_test)[:, 1]) if name != 'ANN' else roc_auc_score(y_test, models[name].predict(X_test))
            metrics[name] = {
                'accuracy': accuracy,
                'confusion_matrix': cm,
                'f1_score': f1,
                'precision': precision,
                'recall': recall,
                'auc_roc': auc
            }
        except Exception as e:
            logging.error(f"Error evaluating {name}: {e}")
    end_time = time.time()

    logging.info(f"Models have been evaluated in {end_time - start_time:.2f} seconds")
    return metrics


def explainability_shap(models, df_name, X_test, feature_names):

    """
    
    """
    # Ensure X_test is a DataFrame with named columns
    X_test = pd.DataFrame(X_test, columns=feature_names).reset_index(drop=True)
    
    for name, model in models.items():
        if name == 'ANN':
            continue
        try:
            if name in ['RandomForest', 'XGBoost', 'GradientBoosting']:
                explainer = shap.TreeExplainer(model)
            
            # No existing methods to analyse other models using SHAP, so only these three models.
            
            shap_values = explainer.shap_values(X_test)
            
            plt.figure(figsize=(10, 6))
            shap.summary_plot(shap_values[1] if isinstance(shap_values, list) else shap_values, 
                              X_test, plot_type="bar", show=False, max_display=10)
            plt.title(f"Top 10 Most Important Features - {name}")
            plt.tight_layout()
            plt.savefig(f"C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Lime and shap graphs\\{df_name}_shap_importance_{name}.png")
            plt.close()
            logging.info(f"SHAP explanations for {name} created and saved")
        except Exception as e:
            logging.error(f"Error generating SHAP explanations for {name}: {e}")



def explainability_lime(models, df_name, X_train, X_test, feature_names):
    
    """
    
    """
    # Ensure X_train and X_test are DataFrames with named columns
    X_train = pd.DataFrame(X_train, columns=feature_names).reset_index(drop=True)
    X_test = pd.DataFrame(X_test, columns=feature_names).reset_index(drop=True)
    
    explainer = lime.lime_tabular.LimeTabularExplainer(
        X_train.values,  # Use .values to get numpy array
        feature_names=feature_names, 
        class_names=['Negative', 'Positive'], 
        mode='classification'
    )
    for name, model in models.items():
        if name == 'ANN':
            continue
        try:
            i = np.random.randint(0, X_test.shape[0])
            exp = explainer.explain_instance(
                X_test.iloc[i].values,  # Use .iloc[i].values to get numpy array
                model.predict_proba, 
                num_features=6
            )
            feature_importance = pd.DataFrame(exp.as_list(), columns=['Feature', 'Importance'])
            feature_importance['Absolute Importance'] = abs(feature_importance['Importance'])
            feature_importance = feature_importance.sort_values('Absolute Importance', ascending=True)
            plt.figure(figsize=(10, 6))
            colors = ['red' if imp < 0 else 'green' for imp in feature_importance['Importance']]
            plt.barh(feature_importance['Feature'], feature_importance['Importance'], color=colors)
            plt.title(f"LIME Explanation for {name}\nTop 6 Features' Impact on Prediction")
            plt.xlabel('Impact on Prediction (Red = Negative, Green = Positive)')
            plt.tight_layout()
            plt.savefig(f"C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Lime and shap graphs\\{df_name}_lime_explanation_{name}.png")
            plt.close()
            logging.info(f"LIME explanation for {name} created and saved")
        except Exception as e:
            logging.error(f"Error generating LIME explanations for {name}: {e}")



def interpret_results(models, X_test, feature_names):
    summary = "Model Interpretation Summary:\n\n"
    for name, model in models.items():
        if name == 'ANN':
            continue
        summary += f"{name} Model:\n"
        summary += f"Feature Importance from {name} Model:\n"
        try:
            if name in ['RandomForest', 'XGBoost', 'GradientBoosting']:
                importances = model.feature_importances_
                importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
                importance_df = importance_df.sort_values('Importance', ascending=False).head(10)
            else:
                importances = model.coef_[0] if hasattr(model, 'coef_') else None
                importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
                importance_df = importance_df.sort_values('Importance', ascending=False).head(10)
            summary += importance_df.to_string(index=False)
            summary += "\n\n"
        except Exception as e:
            logging.error(f"Error interpreting results for {name}: {e}")
    logging.info("Model interpretation summary created")
    return summary


def save_models(models, directory='models'):
    """
    Save trained models to disk.
    """
    if not os.path.exists(directory):
        os.makedirs(directory)
    for name, model in models.items():
        try:
            if name == 'ANN':
                model.save(os.path.join(directory, f'{name}_model.h5'))
            else:
                dump(model, os.path.join(directory, f'{name}_model.joblib'))
            logging.info(f"{name} model saved")
        except Exception as e:
            logging.error(f"Error saving {name} model: {e}")


# Use only if needed to run back with best models
def load_models(directory='models'):
    """
    Load trained models from disk.
    """
    models = {}
    for filename in os.listdir(directory):
        model_name, ext = os.path.splitext(filename)
        try:
            if ext == '.h5':
                models[model_name] = load_model(os.path.join(directory, filename))
            elif ext == '.joblib':
                models[model_name] = load(os.path.join(directory, filename))
            logging.info(f"{model_name} model loaded")
        except Exception as e:
            logging.error(f"Error loading {model_name} model: {e}")
    return models


def main(dataset, target_column, name):
    """
    Main function to train, test, evaluate, and explain models.
    """
    X_train, X_test, y_train, y_test = split_dataset(dataset, target_column)

    # Standardization
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    logging.info("Data has been standardized")

    models = train_models(X_train, y_train)
    predictions = test_models(models, X_test)
    metrics = evaluate_models(models, predictions, y_test, X_test)

    explainability_shap(models, name, X_test, feature_names=dataset.drop(columns=[target_column]).columns)
    explainability_lime(models, name, X_train, X_test, feature_names=dataset.drop(columns=[target_column]).columns)

    save_models(models)
    logging.info("Models have been saved")

    # Interpret results
    summary = interpret_results(models, X_test, feature_names=dataset.drop(columns=[target_column]).columns)
    print(summary)

    return metrics


def modelling_gs(df, name):
    """
    Function to run the main pipeline with the given dataset.
    """
    target_column = 'LABEL'  # Replace with your target column
    results = main(df, target_column, name)
    logging.info("Results have been documented.")
    return results

# To run the modelling function with a dataset 'df':
# results = modelling_gs(df)

In [5]:
file_paths = [
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\ADASYN_AE_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\ADASYN_MICE_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\KMSMOTE_AE_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\KMSMOTE_MICE_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\SVMSMOTE_AE_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\SVMSMOTE_MICE_3_PCA.xlsx"
]

# Read the Excel files into dataframes
dfs = [pd.read_excel(file_path) for file_path in file_paths]

print("Datasets are read into dataframes")

tot_start_time = time.time()
start_time = time.time()
# Store results in variables
results_ADASYN_AE_3_PCA = modelling_gs(dfs[0], "ADASYN_AE_3_PCA" )
end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by ADASYN_AE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_ADASYN_MICE_3_PCA = modelling_gs(dfs[1], "ADASYN_MICE_3_PCA")

end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by ADASYN_MICE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_KMSMOTE_AE_3_PCA = modelling_gs(dfs[2], "KMSMOTE_AE_3_PCA")

end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by KMSMOTE_AE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_KMSMOTE_MICE_3_PCA = modelling_gs(dfs[3], "KMSMOTE_MICE_3_PCA")

end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by KMSMOTE_MICE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_SVMSMOTE_AE_3_PCA = modelling_gs(dfs[4], "SVMSMOTE_AE_3_PCA")

end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by SVMSMOTE_AE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_SVMSMOTE_MICE_3_PCA = modelling_gs(dfs[5], "SVMSMOTE_MICE_3_PCA")

end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by SVMSMOTE_MICE_3_PCA: {elapsed_time:.2f} mins")


print(" ")
print("_______________________________________________________________________________")
tot_end_time = time.time()  # End timing
tot_elapsed_time = (tot_end_time - tot_start_time) / 60
print(f" Total time taken by all the models : {tot_elapsed_time:.2f} mins")

# Print the results with variable names
print("Results for ADASYN_AE_3_PCA:", results_ADASYN_AE_3_PCA)
print("Results for ADASYN_MICE_3_PCA:", results_ADASYN_MICE_3_PCA)
print("Results for KMSMOTE_AE_3_PCA:", results_KMSMOTE_AE_3_PCA)
print("Results for KMSMOTE_MICE_3_PCA:", results_KMSMOTE_MICE_3_PCA)
print("Results for SVMSMOTE_AE_3_PCA:", results_SVMSMOTE_AE_3_PCA)
print("Results for SVMSMOTE_MICE_3_PCA:", results_SVMSMOTE_MICE_3_PCA)

2024-07-11 08:31:11,199 - INFO - Dataset has been split and returned
2024-07-11 08:31:11,205 - INFO - Data has been standardized


Datasets are read into dataframes


2024-07-11 08:34:06,839 - INFO - ANN has been trained in 175.63 seconds
2024-07-11 08:53:11,627 - INFO - RandomForest has been trained in 1144.79 seconds
2024-07-11 08:53:25,576 - INFO - XGBoost has been trained in 13.95 seconds
2024-07-11 09:12:31,895 - INFO - SVM has been trained in 1146.32 seconds
2024-07-11 09:12:32,570 - INFO - LogisticRegression has been trained in 0.68 seconds
2024-07-11 10:04:09,153 - INFO - GradientBoosting has been trained in 3096.58 seconds
2024-07-11 10:04:12,375 - INFO - KNN has been trained in 3.22 seconds
2024-07-11 10:04:12,382 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 624us/step


2024-07-11 10:04:15,766 - INFO - Models have been tested in 3.38 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 496us/step


2024-07-11 10:04:19,158 - INFO - Models have been evaluated in 3.39 seconds
2024-07-11 10:10:01,674 - INFO - SHAP explanations for RandomForest created and saved
2024-07-11 10:10:03,183 - INFO - SHAP explanations for XGBoost created and saved
2024-07-11 10:10:04,567 - INFO - SHAP explanations for SVM created and saved
2024-07-11 10:10:05,978 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-11 10:10:28,596 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-11 10:10:51,199 - INFO - SHAP explanations for KNN created and saved
2024-07-11 10:11:14,685 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-11 10:11:15,338 - INFO - LIME explanation for RandomForest created and saved
2024-07-11 10:11:15,769 - INFO - LIME explanation for XGBoost created and saved
2024-07-11 10:11:19,169 - INFO - LIME explanation for SVM created and saved
2024-07-11 10:11:19,511 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.234203
      Cost_and_Expense_Ratios_PC1    0.170083
Liquidity_and_Coverage_Ratios_PC1    0.156748
      Cost_and_Expense_Ratios_PC2    0.100231
Liquidity_and_Coverage_Ratios_PC2    0.051738
              Leverage_Ratios_PC2    0.041782
         Profitability_Ratios_PC1    0.033791
             Cash_Flow_Ratios_PC1    0.029567
         Profitability_Ratios_PC2    0.026615
              Activity_Ratios_PC1    0.026388

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.429280
      Cost_and_Expense_Ratios_PC1    0.142012
Liquidity_and_Coverage_Ratios_PC1    0.055873
Liquidity_and_Coverage_Ratios_PC2    0.055459
              Activity_Ratios_PC2    0.033337
              Activity_Ratios_PC1    0.033284
             Cash_Flow

2024-07-11 10:14:19,285 - INFO - ANN has been trained in 178.20 seconds
2024-07-11 10:34:25,161 - INFO - RandomForest has been trained in 1205.88 seconds
2024-07-11 10:34:41,048 - INFO - XGBoost has been trained in 15.89 seconds
2024-07-11 10:58:19,925 - INFO - SVM has been trained in 1418.88 seconds
2024-07-11 10:58:20,618 - INFO - LogisticRegression has been trained in 0.69 seconds
2024-07-11 11:49:17,087 - INFO - GradientBoosting has been trained in 3056.47 seconds
2024-07-11 11:49:20,226 - INFO - KNN has been trained in 3.14 seconds
2024-07-11 11:49:20,235 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 682us/step


2024-07-11 11:49:24,164 - INFO - Models have been tested in 3.93 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 528us/step


2024-07-11 11:49:27,624 - INFO - Models have been evaluated in 3.46 seconds
2024-07-11 11:55:20,267 - INFO - SHAP explanations for RandomForest created and saved
2024-07-11 11:55:21,661 - INFO - SHAP explanations for XGBoost created and saved
2024-07-11 11:55:22,993 - INFO - SHAP explanations for SVM created and saved
2024-07-11 11:55:24,332 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-11 11:55:45,181 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-11 11:56:06,101 - INFO - SHAP explanations for KNN created and saved
2024-07-11 11:56:26,945 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-11 11:56:27,387 - INFO - LIME explanation for RandomForest created and saved
2024-07-11 11:56:27,704 - INFO - LIME explanation for XGBoost created and saved
2024-07-11 11:56:30,411 - INFO - LIME explanation for SVM created and saved
2024-07-11 11:56:30,673 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.236691
Liquidity_and_Coverage_Ratios_PC1    0.182210
      Cost_and_Expense_Ratios_PC1    0.171348
      Cost_and_Expense_Ratios_PC2    0.070511
Liquidity_and_Coverage_Ratios_PC2    0.053309
             Per_Share_Ratios_PC2    0.037249
         Profitability_Ratios_PC1    0.035276
              Activity_Ratios_PC1    0.031092
                Growth_Ratios_PC1    0.026876
             Cash_Flow_Ratios_PC2    0.025863

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.401111
      Cost_and_Expense_Ratios_PC1    0.139944
Liquidity_and_Coverage_Ratios_PC1    0.059314
Liquidity_and_Coverage_Ratios_PC2    0.046645
                Growth_Ratios_PC1    0.043735
             Per_Share_Ratios_PC1    0.042803
              Activity

2024-07-11 11:59:23,819 - INFO - ANN has been trained in 171.92 seconds
2024-07-11 12:17:58,999 - INFO - RandomForest has been trained in 1115.18 seconds
2024-07-11 12:18:12,724 - INFO - XGBoost has been trained in 13.72 seconds
2024-07-11 12:33:01,294 - INFO - SVM has been trained in 888.57 seconds
2024-07-11 12:33:01,994 - INFO - LogisticRegression has been trained in 0.70 seconds
2024-07-11 13:23:38,074 - INFO - GradientBoosting has been trained in 3036.08 seconds
2024-07-11 13:23:41,140 - INFO - KNN has been trained in 3.07 seconds
2024-07-11 13:23:41,147 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 881us/step


2024-07-11 13:23:43,739 - INFO - Models have been tested in 2.59 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 531us/step


2024-07-11 13:23:46,391 - INFO - Models have been evaluated in 2.65 seconds
2024-07-11 13:26:56,668 - INFO - SHAP explanations for RandomForest created and saved
2024-07-11 13:26:58,125 - INFO - SHAP explanations for XGBoost created and saved
2024-07-11 13:26:59,509 - INFO - SHAP explanations for SVM created and saved
2024-07-11 13:27:00,886 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-11 13:27:21,886 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-11 13:27:42,893 - INFO - SHAP explanations for KNN created and saved
2024-07-11 13:28:03,824 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-11 13:28:04,239 - INFO - LIME explanation for RandomForest created and saved
2024-07-11 13:28:04,538 - INFO - LIME explanation for XGBoost created and saved
2024-07-11 13:28:06,698 - INFO - LIME explanation for SVM created and saved
2024-07-11 13:28:07,156 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.307824
Liquidity_and_Coverage_Ratios_PC1    0.183045
      Cost_and_Expense_Ratios_PC1    0.132522
      Cost_and_Expense_Ratios_PC2    0.075325
Liquidity_and_Coverage_Ratios_PC2    0.061870
         Profitability_Ratios_PC1    0.035834
             Cash_Flow_Ratios_PC2    0.033610
             Cash_Flow_Ratios_PC1    0.027903
              Activity_Ratios_PC2    0.026189
             Per_Share_Ratios_PC1    0.023271

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.606076
      Cost_and_Expense_Ratios_PC1    0.075337
Liquidity_and_Coverage_Ratios_PC1    0.039747
             Cash_Flow_Ratios_PC2    0.032893
              Activity_Ratios_PC2    0.032756
Liquidity_and_Coverage_Ratios_PC2    0.032008
              Activity

2024-07-11 13:31:05,462 - INFO - ANN has been trained in 177.10 seconds
2024-07-11 13:50:36,387 - INFO - RandomForest has been trained in 1170.92 seconds
2024-07-11 13:50:50,082 - INFO - XGBoost has been trained in 13.69 seconds
2024-07-11 14:06:06,416 - INFO - SVM has been trained in 916.33 seconds
2024-07-11 14:06:07,081 - INFO - LogisticRegression has been trained in 0.66 seconds
2024-07-11 14:57:28,303 - INFO - GradientBoosting has been trained in 3081.22 seconds
2024-07-11 14:57:31,468 - INFO - KNN has been trained in 3.16 seconds
2024-07-11 14:57:31,479 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 723us/step


2024-07-11 14:57:34,237 - INFO - Models have been tested in 2.76 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 513us/step


2024-07-11 14:57:36,949 - INFO - Models have been evaluated in 2.71 seconds
2024-07-11 15:02:28,396 - INFO - SHAP explanations for RandomForest created and saved
2024-07-11 15:02:29,777 - INFO - SHAP explanations for XGBoost created and saved
2024-07-11 15:02:31,095 - INFO - SHAP explanations for SVM created and saved
2024-07-11 15:02:32,418 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-11 15:02:54,163 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-11 15:03:15,874 - INFO - SHAP explanations for KNN created and saved
2024-07-11 15:03:37,514 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-11 15:03:37,961 - INFO - LIME explanation for RandomForest created and saved
2024-07-11 15:03:38,266 - INFO - LIME explanation for XGBoost created and saved
2024-07-11 15:03:40,400 - INFO - LIME explanation for SVM created and saved
2024-07-11 15:03:40,671 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.287167
Liquidity_and_Coverage_Ratios_PC1    0.205343
      Cost_and_Expense_Ratios_PC1    0.139605
Liquidity_and_Coverage_Ratios_PC2    0.061355
      Cost_and_Expense_Ratios_PC2    0.060887
             Per_Share_Ratios_PC2    0.035925
         Profitability_Ratios_PC1    0.035466
             Cash_Flow_Ratios_PC2    0.033167
              Activity_Ratios_PC1    0.025172
             Cash_Flow_Ratios_PC1    0.021792

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.604689
      Cost_and_Expense_Ratios_PC1    0.093743
Liquidity_and_Coverage_Ratios_PC1    0.035761
             Per_Share_Ratios_PC1    0.031384
             Cash_Flow_Ratios_PC2    0.030782
Liquidity_and_Coverage_Ratios_PC2    0.028183
             Per_Share

2024-07-11 15:06:34,493 - INFO - ANN has been trained in 172.55 seconds
2024-07-11 15:23:55,027 - INFO - RandomForest has been trained in 1040.53 seconds
2024-07-11 15:24:08,573 - INFO - XGBoost has been trained in 13.54 seconds
2024-07-11 15:37:29,695 - INFO - SVM has been trained in 801.12 seconds
2024-07-11 15:37:30,340 - INFO - LogisticRegression has been trained in 0.64 seconds
2024-07-11 16:28:03,706 - INFO - GradientBoosting has been trained in 3033.36 seconds
2024-07-11 16:28:06,836 - INFO - KNN has been trained in 3.13 seconds
2024-07-11 16:28:06,844 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 618us/step


2024-07-11 16:28:09,236 - INFO - Models have been tested in 2.39 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 531us/step


2024-07-11 16:28:11,819 - INFO - Models have been evaluated in 2.58 seconds
2024-07-11 16:29:31,045 - INFO - SHAP explanations for RandomForest created and saved
2024-07-11 16:29:32,408 - INFO - SHAP explanations for XGBoost created and saved
2024-07-11 16:29:33,720 - INFO - SHAP explanations for SVM created and saved
2024-07-11 16:29:35,037 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-11 16:29:55,617 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-11 16:30:16,003 - INFO - SHAP explanations for KNN created and saved
2024-07-11 16:30:36,376 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-11 16:30:36,741 - INFO - LIME explanation for RandomForest created and saved
2024-07-11 16:30:37,043 - INFO - LIME explanation for XGBoost created and saved
2024-07-11 16:30:39,509 - INFO - LIME explanation for SVM created and saved
2024-07-11 16:30:39,770 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.256740
Liquidity_and_Coverage_Ratios_PC1    0.239777
      Cost_and_Expense_Ratios_PC1    0.111281
Liquidity_and_Coverage_Ratios_PC2    0.084551
      Cost_and_Expense_Ratios_PC2    0.071137
             Cash_Flow_Ratios_PC2    0.037640
             Cash_Flow_Ratios_PC1    0.036886
         Profitability_Ratios_PC1    0.036248
              Activity_Ratios_PC1    0.020348
              Activity_Ratios_PC2    0.020314

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.523971
Liquidity_and_Coverage_Ratios_PC1    0.079233
      Cost_and_Expense_Ratios_PC1    0.074354
Liquidity_and_Coverage_Ratios_PC2    0.045701
             Cash_Flow_Ratios_PC1    0.039600
             Cash_Flow_Ratios_PC2    0.033878
      Cost_and_Expense

2024-07-11 16:33:33,160 - INFO - ANN has been trained in 172.25 seconds
2024-07-11 16:50:54,337 - INFO - RandomForest has been trained in 1041.18 seconds
2024-07-11 16:51:07,548 - INFO - XGBoost has been trained in 13.21 seconds
2024-07-11 17:04:32,283 - INFO - SVM has been trained in 804.73 seconds
2024-07-11 17:04:33,091 - INFO - LogisticRegression has been trained in 0.81 seconds
2024-07-11 17:54:30,627 - INFO - GradientBoosting has been trained in 2997.53 seconds
2024-07-11 17:54:33,744 - INFO - KNN has been trained in 3.12 seconds
2024-07-11 17:54:33,753 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 659us/step


2024-07-11 17:54:36,206 - INFO - Models have been tested in 2.45 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 502us/step


2024-07-11 17:54:38,736 - INFO - Models have been evaluated in 2.53 seconds
2024-07-11 17:56:01,491 - INFO - SHAP explanations for RandomForest created and saved
2024-07-11 17:56:02,823 - INFO - SHAP explanations for XGBoost created and saved
2024-07-11 17:56:04,094 - INFO - SHAP explanations for SVM created and saved
2024-07-11 17:56:05,346 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-11 17:56:25,363 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-11 17:56:45,483 - INFO - SHAP explanations for KNN created and saved
2024-07-11 17:57:05,563 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-11 17:57:05,942 - INFO - LIME explanation for RandomForest created and saved
2024-07-11 17:57:06,242 - INFO - LIME explanation for XGBoost created and saved
2024-07-11 17:57:08,228 - INFO - LIME explanation for SVM created and saved
2024-07-11 17:57:08,492 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.275064
Liquidity_and_Coverage_Ratios_PC1    0.242968
      Cost_and_Expense_Ratios_PC1    0.125846
Liquidity_and_Coverage_Ratios_PC2    0.078226
      Cost_and_Expense_Ratios_PC2    0.047307
         Profitability_Ratios_PC1    0.037334
             Cash_Flow_Ratios_PC1    0.033399
             Cash_Flow_Ratios_PC2    0.030279
              Activity_Ratios_PC2    0.023490
             Per_Share_Ratios_PC2    0.020633

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.515313
Liquidity_and_Coverage_Ratios_PC1    0.094884
      Cost_and_Expense_Ratios_PC1    0.071955
Liquidity_and_Coverage_Ratios_PC2    0.053566
             Cash_Flow_Ratios_PC1    0.041987
             Per_Share_Ratios_PC2    0.033123
             Cash_Flow

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>