In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import time
import os

from datetime import datetime

import shap
import lime
from lime import lime_tabular

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn import metrics
from sklearn.metrics import confusion_matrix

from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve

from sklearn.cluster import KMeans

import missingno as msno

from fancyimpute import IterativeImputer as MICE
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam 


from sklearn.cluster import DBSCAN
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import NearestNeighbors
from collections import Counter

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

from imblearn.over_sampling import KMeansSMOTE
from sklearn.mixture import GaussianMixture


from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score, roc_curve, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from joblib import dump, load
import logging


In [2]:

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def split_dataset(dataset, target_column, test_size=0.2):
    """
    Split dataset into training and testing sets.
    """
    X = dataset.drop(columns=[target_column])
    y = dataset[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

    logging.info("Dataset has been split and returned")
    return X_train, X_test, y_train, y_test

def train_ann(X_train, y_train):
    """
    Train an Artificial Neural Network (ANN) on the training data.
    """
    start_time = time.time()
    model = Sequential([
        Input(shape=(X_train.shape[1],)),
        Dense(12, activation='relu'),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=150, batch_size=10, verbose=0)
    end_time = time.time()

    logging.info(f"ANN has been trained in {end_time - start_time:.2f} seconds")
    return model

def train_models(X_train, y_train):
    """
    Train multiple models on the training data.
    """
    models = {}
    param_grids = {
        'RandomForest': {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5]
        },
        'XGBoost': {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 6],
            'learning_rate': [0.01, 0.1]
        },
        'SVM': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        },
        'LogisticRegression': {
            'C': [0.1, 1, 10],
            'penalty': ['l2']
        },
        'GradientBoosting': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5, 7]
        },
        'KNN': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance']
        }
    }

    models['ANN'] = train_ann(X_train, y_train)

    for model_name, param_grid in param_grids.items():
        start_time = time.time()
        try:
            if model_name == 'RandomForest':
                model = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
            elif model_name == 'XGBoost':
                model = GridSearchCV(XGBClassifier(), param_grid, cv=5)
            elif model_name == 'SVM':
                model = GridSearchCV(SVC(probability=True), param_grid, cv=5)
            elif model_name == 'LogisticRegression':
                model = GridSearchCV(LogisticRegression(), param_grid, cv=5)
            elif model_name == 'GradientBoosting':
                model = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=5)
            elif model_name == 'KNN':
                model = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)

            model.fit(X_train, y_train)
            models[model_name] = model.best_estimator_
            end_time = time.time()
            logging.info(f"{model_name} has been trained in {end_time - start_time:.2f} seconds")
        except Exception as e:
            logging.error(f"Error training {model_name}: {e}")

    try:
        start_time = time.time()
        nb = GaussianNB()
        nb.fit(X_train, y_train)
        models['NaiveBayes'] = nb
        end_time = time.time()
        logging.info(f"Naive Bayes has been trained in {end_time - start_time:.2f} seconds")
    except Exception as e:
        logging.error(f"Error training Naive Bayes: {e}")

    return models

def test_models(models, X_test):
    """
    Test trained models on the test data.
    """
    start_time = time.time()
    predictions = {}
    for name, model in models.items():
        try:
            if name == 'ANN':
                predictions[name] = (model.predict(X_test) > 0.5).astype("int32")
            else:
                predictions[name] = model.predict(X_test)
        except Exception as e:
            logging.error(f"Error testing {name}: {e}")
    end_time = time.time()

    logging.info(f"Models have been tested in {end_time - start_time:.2f} seconds")
    return predictions


def evaluate_models(models, predictions, y_test, X_test):
    """
    Evaluate the performance of models.
    """
    start_time = time.time()
    metrics = {}
    for name, y_pred in predictions.items():
        try:
            accuracy = accuracy_score(y_test, y_pred)
            cm = confusion_matrix(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            auc = roc_auc_score(y_test, models[name].predict_proba(X_test)[:, 1]) if name != 'ANN' else roc_auc_score(y_test, models[name].predict(X_test))
            metrics[name] = {
                'accuracy': accuracy,
                'confusion_matrix': cm,
                'f1_score': f1,
                'precision': precision,
                'recall': recall,
                'auc_roc': auc
            }
        except Exception as e:
            logging.error(f"Error evaluating {name}: {e}")
    end_time = time.time()

    logging.info(f"Models have been evaluated in {end_time - start_time:.2f} seconds")
    return metrics


def explainability_shap(models, df_name, X_test, feature_names):

    """
    Generate SHAP graphs for each of the models
    - It indicates the contributions of variables for the prediction of each of the models
    - It shows how variabels / features affect the model performance
    
    """
    # Ensure X_test is a DataFrame with named columns
    X_test = pd.DataFrame(X_test, columns=feature_names).reset_index(drop=True)
    
    for name, model in models.items():
        if name == 'ANN':
            continue
        try:
            if name in ['RandomForest', 'XGBoost', 'GradientBoosting']:
                explainer = shap.TreeExplainer(model)
            
            # No existing methods to analyse other models using SHAP, so only these three models.
            
            shap_values = explainer.shap_values(X_test)
            
            plt.figure(figsize=(10, 6))
            shap.summary_plot(shap_values[1] if isinstance(shap_values, list) else shap_values, 
                              X_test, plot_type="bar", show=False, max_display=10)
            plt.title(f"Top 10 Most Important Features - {name}")
            plt.tight_layout()
            plt.savefig(f"C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Lime and shap graphs\\{df_name}_shap_importance_{name}.png")
            plt.close()
            logging.info(f"SHAP explanations for {name} created and saved")
        except Exception as e:
            logging.error(f"Error generating SHAP explanations for {name}: {e}")



def explainability_lime(models, df_name, X_train, X_test, feature_names):
    
    """
    Generates LIME graphs for each of the models
    - This shows the influence of features for the model in classifying the instances
    - Unlike SHAP, this also shows the direction / influence of the variables on each of the classes
    
    """
    # Ensure X_train and X_test are DataFrames with named columns
    X_train = pd.DataFrame(X_train, columns=feature_names).reset_index(drop=True)
    X_test = pd.DataFrame(X_test, columns=feature_names).reset_index(drop=True)
    
    explainer = lime.lime_tabular.LimeTabularExplainer(
        X_train.values,  # Use .values to get numpy array
        feature_names=feature_names, 
        class_names=['Negative', 'Positive'], 
        mode='classification'
    )
    for name, model in models.items():
        if name == 'ANN':
            continue
        try:
            i = np.random.randint(0, X_test.shape[0])
            exp = explainer.explain_instance(
                X_test.iloc[i].values,  # Use .iloc[i].values to get numpy array
                model.predict_proba, 
                num_features=6
            )
            feature_importance = pd.DataFrame(exp.as_list(), columns=['Feature', 'Importance'])
            feature_importance['Absolute Importance'] = abs(feature_importance['Importance'])
            feature_importance = feature_importance.sort_values('Absolute Importance', ascending=True)
            plt.figure(figsize=(10, 6))
            colors = ['red' if imp < 0 else 'green' for imp in feature_importance['Importance']]
            plt.barh(feature_importance['Feature'], feature_importance['Importance'], color=colors)
            plt.title(f"LIME Explanation for {name}\nTop 6 Features' Impact on Prediction")
            plt.xlabel('Impact on Prediction (Red = Negative, Green = Positive)')
            plt.tight_layout()
            plt.savefig(f"C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Lime and shap graphs\\{df_name}_lime_explanation_{name}.png")
            plt.close()
            logging.info(f"LIME explanation for {name} created and saved")
        except Exception as e:
            logging.error(f"Error generating LIME explanations for {name}: {e}")



def interpret_results(models, X_test, feature_names):

    """
    This shows the importance and the influence of the features in predictions of each of the models
    """
    
    summary = "Model Interpretation Summary:\n\n"
    for name, model in models.items():
        if name == 'ANN':
            continue
        summary += f"{name} Model:\n"
        summary += f"Feature Importance from {name} Model:\n"
        try:
            if name in ['RandomForest', 'XGBoost', 'GradientBoosting']:
                importances = model.feature_importances_
                importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
                importance_df = importance_df.sort_values('Importance', ascending=False).head(10)
            else:
                importances = model.coef_[0] if hasattr(model, 'coef_') else None
                importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
                importance_df = importance_df.sort_values('Importance', ascending=False).head(10)
            summary += importance_df.to_string(index=False)
            summary += "\n\n"
        except Exception as e:
            logging.error(f"Error interpreting results for {name}: {e}")
    logging.info("Model interpretation summary created")
    return summary


def save_models(models, directory='models'):
    """
    Save trained models to disk.
    """
    if not os.path.exists(directory):
        os.makedirs(directory)
    for name, model in models.items():
        try:
            if name == 'ANN':
                model.save(os.path.join(directory, f'{name}_model.h5'))
            else:
                dump(model, os.path.join(directory, f'{name}_model.joblib'))
            logging.info(f"{name} model saved")
        except Exception as e:
            logging.error(f"Error saving {name} model: {e}")


# Use only if needed to run back with best models
def load_models(directory='models'):
    """
    Load trained models from disk.
    """
    models = {}
    for filename in os.listdir(directory):
        model_name, ext = os.path.splitext(filename)
        try:
            if ext == '.h5':
                models[model_name] = load_model(os.path.join(directory, filename))
            elif ext == '.joblib':
                models[model_name] = load(os.path.join(directory, filename))
            logging.info(f"{model_name} model loaded")
        except Exception as e:
            logging.error(f"Error loading {model_name} model: {e}")
    return models


def main(dataset, target_column, name):
    """
    Main function to train, test, evaluate, and explain models.
    """
    X_train, X_test, y_train, y_test = split_dataset(dataset, target_column)

    # Standardization
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    logging.info("Data has been standardized")

    models = train_models(X_train, y_train)
    predictions = test_models(models, X_test)
    metrics = evaluate_models(models, predictions, y_test, X_test)

    explainability_shap(models, name, X_test, feature_names=dataset.drop(columns=[target_column]).columns)
    explainability_lime(models, name, X_train, X_test, feature_names=dataset.drop(columns=[target_column]).columns)

    save_models(models)
    logging.info("Models have been saved")

    # Interpret results
    summary = interpret_results(models, X_test, feature_names=dataset.drop(columns=[target_column]).columns)
    print(summary)

    return metrics


def modelling_gs(df, name):
    """
    Function to run the main pipeline with the given dataset.
    """
    target_column = 'LABEL'  # Replace with your target column
    results = main(df, target_column, name)
    logging.info("Results have been documented.")
    return results

# To run the modelling function with a dataset 'df':
# results = modelling_gs(df)

In [3]:
file_paths = [
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\ADASYN_AE_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\ADASYN_MICE_RF_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\KMSMOTE_AE_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\KMSMOTE_MICE_RF_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\SVMSMOTE_AE_3_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\SVMSMOTE_MICE_RF_3_PCA.xlsx"
]

# Read the Excel files into dataframes
dfs = [pd.read_excel(file_path) for file_path in file_paths]

print("Datasets are read into dataframes")

tot_start_time = time.time()
start_time = time.time()
# Store results in variables
results_ADASYN_AE_3_PCA = modelling_gs(dfs[0], "ADASYN_AE_3_PCA" )
end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by ADASYN_AE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_ADASYN_MICE_3_PCA = modelling_gs(dfs[1], "ADASYN_MICE_RF_3_PCA")

end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by ADASYN_MICE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_KMSMOTE_AE_3_PCA = modelling_gs(dfs[2], "KMSMOTE_AE_3_PCA")

end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by KMSMOTE_AE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_KMSMOTE_MICE_3_PCA = modelling_gs(dfs[3], "KMSMOTE_MICE_RF_3_PCA")

end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by KMSMOTE_MICE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_SVMSMOTE_AE_3_PCA = modelling_gs(dfs[4], "SVMSMOTE_AE_3_PCA")

end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by SVMSMOTE_AE_3_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_SVMSMOTE_MICE_3_PCA = modelling_gs(dfs[5], "SVMSMOTE_MICE_RF_3_PCA")

end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by SVMSMOTE_MICE_3_PCA: {elapsed_time:.2f} mins")


print(" ")
print("_______________________________________________________________________________")
tot_end_time = time.time()  # End timing
tot_elapsed_time = (tot_end_time - tot_start_time) / 60
print(f" Total time taken by all the models : {tot_elapsed_time:.2f} mins")

# Print the results with variable names
print("Results for ADASYN_AE_3_PCA:", results_ADASYN_AE_3_PCA)
print("Results for ADASYN_MICE_3_PCA:", results_ADASYN_MICE_3_PCA)
print("Results for KMSMOTE_AE_3_PCA:", results_KMSMOTE_AE_3_PCA)
print("Results for KMSMOTE_MICE_3_PCA:", results_KMSMOTE_MICE_3_PCA)
print("Results for SVMSMOTE_AE_3_PCA:", results_SVMSMOTE_AE_3_PCA)
print("Results for SVMSMOTE_MICE_3_PCA:", results_SVMSMOTE_MICE_3_PCA)

2024-07-17 14:38:46,181 - INFO - Dataset has been split and returned
2024-07-17 14:38:46,189 - INFO - Data has been standardized


Datasets are read into dataframes


2024-07-17 14:42:05,117 - INFO - ANN has been trained in 198.93 seconds
2024-07-17 15:01:21,385 - INFO - RandomForest has been trained in 1156.27 seconds
2024-07-17 15:01:37,186 - INFO - XGBoost has been trained in 15.80 seconds
2024-07-17 15:21:45,852 - INFO - SVM has been trained in 1208.66 seconds
2024-07-17 15:21:46,519 - INFO - LogisticRegression has been trained in 0.67 seconds
2024-07-17 16:13:41,153 - INFO - GradientBoosting has been trained in 3114.63 seconds
2024-07-17 16:13:44,085 - INFO - KNN has been trained in 2.93 seconds
2024-07-17 16:13:44,095 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 658us/step


2024-07-17 16:13:47,484 - INFO - Models have been tested in 3.39 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 525us/step


2024-07-17 16:13:50,806 - INFO - Models have been evaluated in 3.32 seconds
2024-07-17 16:15:33,458 - INFO - SHAP explanations for RandomForest created and saved
2024-07-17 16:15:34,873 - INFO - SHAP explanations for XGBoost created and saved
2024-07-17 16:15:36,217 - INFO - SHAP explanations for SVM created and saved
2024-07-17 16:15:37,559 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-17 16:15:57,601 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-17 16:16:17,580 - INFO - SHAP explanations for KNN created and saved
2024-07-17 16:16:37,611 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-17 16:16:38,018 - INFO - LIME explanation for RandomForest created and saved
2024-07-17 16:16:38,319 - INFO - LIME explanation for XGBoost created and saved
2024-07-17 16:16:41,110 - INFO - LIME explanation for SVM created and saved
2024-07-17 16:16:41,379 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.255254
Liquidity_and_Coverage_Ratios_PC1    0.190513
      Cost_and_Expense_Ratios_PC1    0.129747
      Cost_and_Expense_Ratios_PC2    0.094890
Liquidity_and_Coverage_Ratios_PC2    0.056234
         Profitability_Ratios_PC1    0.034530
              Activity_Ratios_PC1    0.030779
             Cash_Flow_Ratios_PC1    0.030629
         Profitability_Ratios_PC2    0.028792
             Cash_Flow_Ratios_PC2    0.026788

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.416653
      Cost_and_Expense_Ratios_PC1    0.090932
      Cost_and_Expense_Ratios_PC2    0.081417
Liquidity_and_Coverage_Ratios_PC2    0.051538
Liquidity_and_Coverage_Ratios_PC1    0.051497
              Activity_Ratios_PC1    0.039150
         Profitability

2024-07-17 16:19:36,872 - INFO - ANN has been trained in 174.24 seconds
2024-07-17 16:38:27,084 - INFO - RandomForest has been trained in 1130.21 seconds
2024-07-17 16:38:40,761 - INFO - XGBoost has been trained in 13.68 seconds
2024-07-17 16:58:09,063 - INFO - SVM has been trained in 1168.30 seconds
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2024-07-17 16:58:09,922 - INFO - LogisticRegression has been trained in 0.86 seconds
2024-07-17 17:48:08,553 - INFO - GradientBoosting has been trained in 2998.63 seconds
2024-07-17 17:48:11,497 - INFO - KNN has been trained in 2.94 seconds
2024-07-17 17:48:11,504 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 636us/step


2024-07-17 17:48:14,518 - INFO - Models have been tested in 3.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 531us/step


2024-07-17 17:48:17,619 - INFO - Models have been evaluated in 3.10 seconds
2024-07-17 17:50:15,900 - INFO - SHAP explanations for RandomForest created and saved
2024-07-17 17:50:17,762 - INFO - SHAP explanations for XGBoost created and saved
2024-07-17 17:50:19,193 - INFO - SHAP explanations for SVM created and saved
2024-07-17 17:50:20,597 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-17 17:50:41,217 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-17 17:51:01,852 - INFO - SHAP explanations for KNN created and saved
2024-07-17 17:51:22,421 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-17 17:51:22,797 - INFO - LIME explanation for RandomForest created and saved
2024-07-17 17:51:23,099 - INFO - LIME explanation for XGBoost created and saved
2024-07-17 17:51:25,745 - INFO - LIME explanation for SVM created and saved
2024-07-17 17:51:26,015 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
      Cost_and_Expense_Ratios_PC1    0.222406
              Leverage_Ratios_PC1    0.193117
Liquidity_and_Coverage_Ratios_PC1    0.136213
      Cost_and_Expense_Ratios_PC2    0.084318
              Leverage_Ratios_PC2    0.059743
Liquidity_and_Coverage_Ratios_PC2    0.054549
         Profitability_Ratios_PC1    0.034414
              Activity_Ratios_PC1    0.029158
              Activity_Ratios_PC2    0.027616
             Cash_Flow_Ratios_PC1    0.026494

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.434680
      Cost_and_Expense_Ratios_PC1    0.155829
Liquidity_and_Coverage_Ratios_PC2    0.058382
Liquidity_and_Coverage_Ratios_PC1    0.056347
             Cash_Flow_Ratios_PC1    0.035663
              Activity_Ratios_PC1    0.030532
             Per_Share

2024-07-17 17:54:21,671 - INFO - ANN has been trained in 174.47 seconds
2024-07-17 18:12:49,182 - INFO - RandomForest has been trained in 1107.51 seconds
2024-07-17 18:13:02,937 - INFO - XGBoost has been trained in 13.75 seconds
2024-07-17 18:27:18,355 - INFO - SVM has been trained in 855.42 seconds
2024-07-17 18:27:19,049 - INFO - LogisticRegression has been trained in 0.69 seconds
2024-07-17 19:18:01,047 - INFO - GradientBoosting has been trained in 3042.00 seconds
2024-07-17 19:18:03,948 - INFO - KNN has been trained in 2.90 seconds
2024-07-17 19:18:03,958 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 661us/step


2024-07-17 19:18:06,496 - INFO - Models have been tested in 2.54 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 548us/step


2024-07-17 19:18:09,141 - INFO - Models have been evaluated in 2.65 seconds
2024-07-17 19:19:50,562 - INFO - SHAP explanations for RandomForest created and saved
2024-07-17 19:19:52,058 - INFO - SHAP explanations for XGBoost created and saved
2024-07-17 19:19:53,443 - INFO - SHAP explanations for SVM created and saved
2024-07-17 19:19:54,987 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-17 19:20:16,827 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-17 19:20:38,694 - INFO - SHAP explanations for KNN created and saved
2024-07-17 19:21:01,746 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-17 19:21:02,160 - INFO - LIME explanation for RandomForest created and saved
2024-07-17 19:21:02,488 - INFO - LIME explanation for XGBoost created and saved
2024-07-17 19:21:04,755 - INFO - LIME explanation for SVM created and saved
2024-07-17 19:21:05,062 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.283118
Liquidity_and_Coverage_Ratios_PC1    0.187846
      Cost_and_Expense_Ratios_PC1    0.161780
      Cost_and_Expense_Ratios_PC2    0.073090
Liquidity_and_Coverage_Ratios_PC2    0.068924
         Profitability_Ratios_PC1    0.033355
             Cash_Flow_Ratios_PC2    0.032262
             Cash_Flow_Ratios_PC1    0.024272
              Activity_Ratios_PC2    0.022533
              Activity_Ratios_PC1    0.022351

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.578870
      Cost_and_Expense_Ratios_PC1    0.096509
Liquidity_and_Coverage_Ratios_PC1    0.043744
              Activity_Ratios_PC1    0.035837
Liquidity_and_Coverage_Ratios_PC2    0.034951
              Activity_Ratios_PC2    0.032172
             Cash_Flow

2024-07-17 19:24:03,842 - INFO - ANN has been trained in 177.50 seconds
2024-07-17 19:43:05,836 - INFO - RandomForest has been trained in 1141.99 seconds
2024-07-17 19:43:20,112 - INFO - XGBoost has been trained in 14.28 seconds
2024-07-17 19:58:06,883 - INFO - SVM has been trained in 886.77 seconds
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2024-07-17 19:58:07,694 - INFO - LogisticRegression has been trained in 0.81 seconds
2024-07-17 20:49:13,382 - INFO - GradientBoosting has been trained in 3065.69 seconds
2024-07-17 20:49:16,384 - INFO - KNN has been trained in 3.00 seconds
2024-07-17 20:49:16,394 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 651us/step


2024-07-17 20:49:18,980 - INFO - Models have been tested in 2.59 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 531us/step


2024-07-17 20:49:21,496 - INFO - Models have been evaluated in 2.51 seconds
2024-07-17 20:52:26,632 - INFO - SHAP explanations for RandomForest created and saved
2024-07-17 20:52:28,033 - INFO - SHAP explanations for XGBoost created and saved
2024-07-17 20:52:29,367 - INFO - SHAP explanations for SVM created and saved
2024-07-17 20:52:30,722 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-17 20:52:50,901 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-17 20:53:10,969 - INFO - SHAP explanations for KNN created and saved
2024-07-17 20:53:31,052 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-17 20:53:31,486 - INFO - LIME explanation for RandomForest created and saved
2024-07-17 20:53:31,796 - INFO - LIME explanation for XGBoost created and saved
2024-07-17 20:53:33,793 - INFO - LIME explanation for SVM created and saved
2024-07-17 20:53:34,069 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.311899
Liquidity_and_Coverage_Ratios_PC1    0.191697
      Cost_and_Expense_Ratios_PC1    0.108300
Liquidity_and_Coverage_Ratios_PC2    0.074541
      Cost_and_Expense_Ratios_PC2    0.072452
              Activity_Ratios_PC2    0.042610
         Profitability_Ratios_PC1    0.035484
             Cash_Flow_Ratios_PC2    0.028783
              Activity_Ratios_PC1    0.027260
             Cash_Flow_Ratios_PC1    0.020179

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.623866
      Cost_and_Expense_Ratios_PC1    0.082495
              Activity_Ratios_PC2    0.037848
Liquidity_and_Coverage_Ratios_PC1    0.035664
Liquidity_and_Coverage_Ratios_PC2    0.029252
              Activity_Ratios_PC1    0.026884
             Cash_Flow

2024-07-17 20:56:29,466 - INFO - ANN has been trained in 174.15 seconds
2024-07-17 21:14:08,666 - INFO - RandomForest has been trained in 1059.20 seconds
2024-07-17 21:14:22,401 - INFO - XGBoost has been trained in 13.73 seconds
2024-07-17 21:27:23,802 - INFO - SVM has been trained in 781.40 seconds
2024-07-17 21:27:24,399 - INFO - LogisticRegression has been trained in 0.60 seconds
2024-07-17 22:18:09,556 - INFO - GradientBoosting has been trained in 3045.16 seconds
2024-07-17 22:18:12,347 - INFO - KNN has been trained in 2.79 seconds
2024-07-17 22:18:12,354 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 625us/step


2024-07-17 22:18:14,730 - INFO - Models have been tested in 2.37 seconds


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 504us/step


2024-07-17 22:18:17,164 - INFO - Models have been evaluated in 2.43 seconds
2024-07-17 22:22:08,809 - INFO - SHAP explanations for RandomForest created and saved
2024-07-17 22:22:10,151 - INFO - SHAP explanations for XGBoost created and saved
2024-07-17 22:22:11,434 - INFO - SHAP explanations for SVM created and saved
2024-07-17 22:22:12,702 - INFO - SHAP explanations for LogisticRegression created and saved
2024-07-17 22:22:31,538 - INFO - SHAP explanations for GradientBoosting created and saved
2024-07-17 22:22:50,487 - INFO - SHAP explanations for KNN created and saved
2024-07-17 22:23:09,328 - INFO - SHAP explanations for NaiveBayes created and saved
2024-07-17 22:23:09,764 - INFO - LIME explanation for RandomForest created and saved
2024-07-17 22:23:10,058 - INFO - LIME explanation for XGBoost created and saved
2024-07-17 22:23:12,083 - INFO - LIME explanation for SVM created and saved
2024-07-17 22:23:12,344 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.272939
Liquidity_and_Coverage_Ratios_PC1    0.224027
      Cost_and_Expense_Ratios_PC1    0.125460
Liquidity_and_Coverage_Ratios_PC2    0.082071
      Cost_and_Expense_Ratios_PC2    0.061652
         Profitability_Ratios_PC1    0.042210
             Cash_Flow_Ratios_PC2    0.035113
             Cash_Flow_Ratios_PC1    0.031949
              Activity_Ratios_PC2    0.018981
              Activity_Ratios_PC1    0.018617

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.505651
Liquidity_and_Coverage_Ratios_PC1    0.090656
      Cost_and_Expense_Ratios_PC1    0.069397
Liquidity_and_Coverage_Ratios_PC2    0.061130
             Cash_Flow_Ratios_PC1    0.036352
             Per_Share_Ratios_PC2    0.031088
      Cost_and_Expense

2024-07-17 22:26:07,730 - INFO - ANN has been trained in 173.68 seconds
2024-07-17 22:49:07,644 - INFO - RandomForest has been trained in 1379.91 seconds
2024-07-17 22:49:26,555 - INFO - XGBoost has been trained in 18.91 seconds
