In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import time
import os

from datetime import datetime

import shap
import lime
from lime import lime_tabular

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn import metrics
from sklearn.metrics import confusion_matrix, cohen_kappa_score

from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve

from sklearn.cluster import KMeans

import missingno as msno

from fancyimpute import IterativeImputer as MICE
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam 


from sklearn.cluster import DBSCAN
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import NearestNeighbors
from collections import Counter

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

from imblearn.over_sampling import KMeansSMOTE
from sklearn.mixture import GaussianMixture


from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score, roc_curve, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from joblib import dump, load
import logging


In [2]:

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def split_dataset(dataset, target_column, test_size=0.2):
    
    X = dataset.drop(columns=[target_column])
    y = dataset[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

    logging.info("Dataset has been split and returned")
    return X_train, X_test, y_train, y_test

def train_ann(X_train, y_train):
    
    start_time = time.time()
    model = Sequential([
        Input(shape=(X_train.shape[1],)),
        Dense(12, activation='relu'),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=150, batch_size=10, verbose=0)
    end_time = time.time()

    logging.info(f"ANN has been trained in {end_time - start_time:.2f} seconds")
    return model

def train_models(X_train, y_train):
    
    models = {}
    param_grids = {
        'RandomForest': {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5]
        },
        'XGBoost': {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 6],
            'learning_rate': [0.01, 0.1]
        },
        'SVM': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        },
        'LogisticRegression': {
            'C': [0.1, 1, 10],
            'penalty': ['l2']
        },
        'GradientBoosting': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5, 7]
        },
        'KNN': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance']
        }
    }

    models['ANN'] = train_ann(X_train, y_train)

    for model_name, param_grid in param_grids.items():
        start_time = time.time()
        try:
            if model_name == 'RandomForest':
                model = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
            elif model_name == 'XGBoost':
                model = GridSearchCV(XGBClassifier(), param_grid, cv=5)
            elif model_name == 'SVM':
                model = GridSearchCV(SVC(probability=True), param_grid, cv=5)
            elif model_name == 'LogisticRegression':
                model = GridSearchCV(LogisticRegression(), param_grid, cv=5)
            elif model_name == 'GradientBoosting':
                model = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=5)
            elif model_name == 'KNN':
                model = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)

            model.fit(X_train, y_train)
            models[model_name] = model.best_estimator_
            end_time = time.time()
            logging.info(f"{model_name} has been trained in {end_time - start_time:.2f} seconds")
        except Exception as e:
            logging.error(f"Error training {model_name}: {e}")

    try:
        start_time = time.time()
        nb = GaussianNB()
        nb.fit(X_train, y_train)
        models['NaiveBayes'] = nb
        end_time = time.time()
        logging.info(f"Naive Bayes has been trained in {end_time - start_time:.2f} seconds")
    except Exception as e:
        logging.error(f"Error training Naive Bayes: {e}")

    return models

def test_models(models, X_test):
    
    start_time = time.time()
    predictions = {}
    for name, model in models.items():
        try:
            if name == 'ANN':
                predictions[name] = (model.predict(X_test) > 0.5).astype("int32")
            else:
                predictions[name] = model.predict(X_test)
        except Exception as e:
            logging.error(f"Error testing {name}: {e}")
    end_time = time.time()

    logging.info(f"Models have been tested in {end_time - start_time:.2f} seconds")
    return predictions


def evaluate_models(models, predictions, y_test, X_test):
    
    start_time = time.time()
    metrics = {}
    
    for name, y_pred in predictions.items():
        try:
            accuracy = accuracy_score(y_test, y_pred)
            cm = confusion_matrix(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            
            if hasattr(models[name], "predict_proba"):
                auc = roc_auc_score(y_test, models[name].predict_proba(X_test)[:, 1])
            else:
                auc = roc_auc_score(y_test, models[name].predict(X_test))
            
            # Calculate specificity
            if cm.shape == (2, 2):
                tn, fp, fn, tp = cm.ravel()
                specificity = tn / (tn + fp)
            else:
                specificity = 0  # or handle the case appropriately
            
            # Calculate G-mean
            g_mean = np.sqrt(recall * specificity)
            
            # Calculate Kappa statistic
            kappa = cohen_kappa_score(y_test, y_pred)
            
            metrics[name] = {
                'accuracy': accuracy,
                'confusion_matrix': cm,
                'f1_score': f1,
                'precision': precision,
                'recall': recall,
                'auc_roc': auc,
                'g_mean': g_mean,
                'kappa': kappa
            }
        except Exception as e:
            logging.error(f"Error evaluating {name}: {e}")
    
    end_time = time.time()
    logging.info(f"Models have been evaluated in {end_time - start_time:.2f} seconds")
    
    return metrics


def explainability_shap(models, df_name, X_test, feature_names):

    # Ensure X_test is a DataFrame with named columns
    X_test = pd.DataFrame(X_test, columns=feature_names).reset_index(drop=True)
    
    for name, model in models.items():
        if name == 'ANN':
            continue
        try:
            if name in ['RandomForest', 'XGBoost', 'GradientBoosting']:
                explainer = shap.TreeExplainer(model)
            
            # No existing methods to analyse other models using SHAP, so only these three models.
            
            shap_values = explainer.shap_values(X_test)
            
            plt.figure(figsize=(10, 6))
            shap.summary_plot(shap_values[1] if isinstance(shap_values, list) else shap_values, 
                              X_test, plot_type="bar", show=False, max_display=10)
            plt.title(f"Top 10 Most Important Features - {name}")
            plt.tight_layout()
            plt.savefig(f"C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Lime and shap graphs\\{df_name}_shap_importance_{name}.png")
            plt.close()
            logging.info(f"SHAP explanations for {name} created and saved")
        except Exception as e:
            logging.error(f"Error generating SHAP explanations for {name}: {e}")



def explainability_lime(models, df_name, X_train, X_test, feature_names):
    
    
    # Ensure X_train and X_test are DataFrames with named columns
    X_train = pd.DataFrame(X_train, columns=feature_names).reset_index(drop=True)
    X_test = pd.DataFrame(X_test, columns=feature_names).reset_index(drop=True)
    
    explainer = lime.lime_tabular.LimeTabularExplainer(
        X_train.values,  # Use .values to get numpy array
        feature_names=feature_names, 
        class_names=['Negative', 'Positive'], 
        mode='classification'
    )
    for name, model in models.items():
        if name == 'ANN':
            continue
        try:
            i = np.random.randint(0, X_test.shape[0])
            exp = explainer.explain_instance(
                X_test.iloc[i].values,  # Use .iloc[i].values to get numpy array
                model.predict_proba, 
                num_features=6
            )
            feature_importance = pd.DataFrame(exp.as_list(), columns=['Feature', 'Importance'])
            feature_importance['Absolute Importance'] = abs(feature_importance['Importance'])
            feature_importance = feature_importance.sort_values('Absolute Importance', ascending=True)
            plt.figure(figsize=(10, 6))
            colors = ['red' if imp < 0 else 'green' for imp in feature_importance['Importance']]
            plt.barh(feature_importance['Feature'], feature_importance['Importance'], color=colors)
            plt.title(f"LIME Explanation for {name}\nTop 6 Features' Impact on Prediction")
            plt.xlabel('Impact on Prediction (Red = Negative, Green = Positive)')
            plt.tight_layout()
            plt.savefig(f"C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Lime and shap graphs\\{df_name}_lime_explanation_{name}.png")
            plt.close()
            logging.info(f"LIME explanation for {name} created and saved")
        except Exception as e:
            logging.error(f"Error generating LIME explanations for {name}: {e}")



def interpret_results(models, X_test, feature_names):

    
    summary = "Model Interpretation Summary:\n\n"
    for name, model in models.items():
        if name == 'ANN':
            continue
        summary += f"{name} Model:\n"
        summary += f"Feature Importance from {name} Model:\n"
        try:
            if name in ['RandomForest', 'XGBoost', 'GradientBoosting']:
                importances = model.feature_importances_
                importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
                importance_df = importance_df.sort_values('Importance', ascending=False).head(10)
            else:
                importances = model.coef_[0] if hasattr(model, 'coef_') else None
                importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
                importance_df = importance_df.sort_values('Importance', ascending=False).head(10)
            summary += importance_df.to_string(index=False)
            summary += "\n\n"
        except Exception as e:
            logging.error(f"Error interpreting results for {name}: {e}")
    logging.info("Model interpretation summary created")
    return summary


def save_models(models, directory='models'):
    
    if not os.path.exists(directory):
        os.makedirs(directory)
    for name, model in models.items():
        try:
            if name == 'ANN':
                model.save(os.path.join(directory, f'{name}_model.h5'))
            else:
                dump(model, os.path.join(directory, f'{name}_model.joblib'))
            logging.info(f"{name} model saved")
        except Exception as e:
            logging.error(f"Error saving {name} model: {e}")


# Use only if needed to run back with best models
def load_models(directory='models'):
    
    models = {}
    for filename in os.listdir(directory):
        model_name, ext = os.path.splitext(filename)
        try:
            if ext == '.h5':
                models[model_name] = load_model(os.path.join(directory, filename))
            elif ext == '.joblib':
                models[model_name] = load(os.path.join(directory, filename))
            logging.info(f"{model_name} model loaded")
        except Exception as e:
            logging.error(f"Error loading {model_name} model: {e}")
    return models


def main(dataset, target_column, name):
    
    X_train, X_test, y_train, y_test = split_dataset(dataset, target_column)

    # Standardization
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    logging.info("Data has been standardized")

    models = train_models(X_train, y_train)
    predictions = test_models(models, X_test)
    metrics = evaluate_models(models, predictions, y_test, X_test)

    explainability_shap(models, name, X_test, feature_names=dataset.drop(columns=[target_column]).columns)
    explainability_lime(models, name, X_train, X_test, feature_names=dataset.drop(columns=[target_column]).columns)

    # save_models(models)
    logging.info("Models have been saved")

    # Interpret results
    summary = interpret_results(models, X_test, feature_names=dataset.drop(columns=[target_column]).columns)
    print(summary)

    return metrics


def modelling_gs(df, name):
    
    target_column = 'LABEL'  # Replace with your target column
    results = main(df, target_column, name)
    logging.info("Results have been documented.")
    return results

# To run the modelling function with a dataset 'df':
# results = modelling_gs(df)

In [3]:
file_paths = [
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\AE_PCA.xlsx",
    "C:\\Users\\dev\\Desktop\\MSC thesis\\Code\\final_codes\\Processed Datasets\\MICE_PCA.xlsx"
]

# Read the Excel files into dataframes
dfs = [pd.read_excel(file_path) for file_path in file_paths]

print("Datasets are read into dataframes")

tot_start_time = time.time()
start_time = time.time()
# Store results 
results_AE_PCA = modelling_gs(dfs[0], "Primary_AE_PCA" )
end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by AE_PCA: {elapsed_time:.2f} mins")

start_time = time.time()
results_MICE_PCA = modelling_gs(dfs[1], "Primary_MICE_PCA")

end_time = time.time()  # End timing
elapsed_time = (end_time - start_time) / 60
print("_______________________________________________________________________________")
print(f" Total time taken by MICE_PCA: {elapsed_time:.2f} mins")



print(" ")
print("_______________________________________________________________________________")
tot_end_time = time.time()  # End timing
tot_elapsed_time = (tot_end_time - tot_start_time) / 60
print(f" Total time taken by all the models : {tot_elapsed_time:.2f} mins")

# Results
print("Results for AE_3_PCA:", results_AE_PCA)
print("Results for MICE_3_PCA:", results_MICE_PCA)


2024-09-03 04:56:41,640 - INFO - Dataset has been split and returned
2024-09-03 04:56:41,646 - INFO - Data has been standardized


Datasets are read into dataframes


2024-09-03 04:58:23,881 - INFO - ANN has been trained in 102.24 seconds
2024-09-03 05:05:18,810 - INFO - RandomForest has been trained in 414.93 seconds
2024-09-03 05:05:29,904 - INFO - XGBoost has been trained in 11.09 seconds
2024-09-03 05:16:20,895 - INFO - SVM has been trained in 650.99 seconds
2024-09-03 05:16:21,161 - INFO - LogisticRegression has been trained in 0.27 seconds
2024-09-03 05:43:08,757 - INFO - GradientBoosting has been trained in 1607.59 seconds
2024-09-03 05:43:10,212 - INFO - KNN has been trained in 1.45 seconds
2024-09-03 05:43:10,223 - INFO - Naive Bayes has been trained in 0.01 seconds


[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  


2024-09-03 05:43:10,888 - INFO - Models have been tested in 0.66 seconds


[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 699us/step


2024-09-03 05:43:11,452 - INFO - Models have been evaluated in 0.56 seconds
2024-09-03 05:43:33,331 - INFO - SHAP explanations for RandomForest created and saved
2024-09-03 05:43:33,909 - INFO - SHAP explanations for XGBoost created and saved
2024-09-03 05:43:34,409 - INFO - SHAP explanations for SVM created and saved
2024-09-03 05:43:34,907 - INFO - SHAP explanations for LogisticRegression created and saved
2024-09-03 05:43:42,294 - INFO - SHAP explanations for GradientBoosting created and saved
2024-09-03 05:43:49,898 - INFO - SHAP explanations for KNN created and saved
2024-09-03 05:43:57,132 - INFO - SHAP explanations for NaiveBayes created and saved
2024-09-03 05:43:57,539 - INFO - LIME explanation for RandomForest created and saved
2024-09-03 05:43:57,841 - INFO - LIME explanation for XGBoost created and saved
2024-09-03 05:43:58,368 - INFO - LIME explanation for SVM created and saved
2024-09-03 05:43:58,661 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
Liquidity_and_Coverage_Ratios_PC1    0.144505
              Leverage_Ratios_PC1    0.120815
Liquidity_and_Coverage_Ratios_PC2    0.106659
         Profitability_Ratios_PC2    0.064441
      Cost_and_Expense_Ratios_PC1    0.060198
         Profitability_Ratios_PC1    0.058815
             Cash_Flow_Ratios_PC2    0.056977
             Cash_Flow_Ratios_PC1    0.056600
      Cost_and_Expense_Ratios_PC2    0.052720
              Activity_Ratios_PC1    0.043983

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
Liquidity_and_Coverage_Ratios_PC1    0.246575
              Leverage_Ratios_PC1    0.128845
      Cost_and_Expense_Ratios_PC1    0.068272
Liquidity_and_Coverage_Ratios_PC2    0.062383
      Cost_and_Expense_Ratios_PC2    0.060024
              Activity_Ratios_PC1    0.055051
             Cash_Flow

2024-09-03 05:45:40,797 - INFO - ANN has been trained in 100.93 seconds
2024-09-03 05:53:05,573 - INFO - RandomForest has been trained in 444.78 seconds
2024-09-03 05:53:17,055 - INFO - XGBoost has been trained in 11.48 seconds
2024-09-03 06:05:44,348 - INFO - SVM has been trained in 747.29 seconds
2024-09-03 06:05:44,675 - INFO - LogisticRegression has been trained in 0.33 seconds
2024-09-03 06:33:15,400 - INFO - GradientBoosting has been trained in 1650.72 seconds
2024-09-03 06:33:16,556 - INFO - KNN has been trained in 1.16 seconds
2024-09-03 06:33:16,560 - INFO - Naive Bayes has been trained in 0.00 seconds


[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  


2024-09-03 06:33:17,075 - INFO - Models have been tested in 0.52 seconds


[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 708us/step


2024-09-03 06:33:17,604 - INFO - Models have been evaluated in 0.53 seconds
2024-09-03 06:33:29,748 - INFO - SHAP explanations for RandomForest created and saved
2024-09-03 06:33:30,792 - INFO - SHAP explanations for XGBoost created and saved
2024-09-03 06:33:31,451 - INFO - SHAP explanations for SVM created and saved
2024-09-03 06:33:32,040 - INFO - SHAP explanations for LogisticRegression created and saved
2024-09-03 06:33:41,096 - INFO - SHAP explanations for GradientBoosting created and saved
2024-09-03 06:33:50,144 - INFO - SHAP explanations for KNN created and saved
2024-09-03 06:33:59,221 - INFO - SHAP explanations for NaiveBayes created and saved
2024-09-03 06:33:59,621 - INFO - LIME explanation for RandomForest created and saved
2024-09-03 06:33:59,982 - INFO - LIME explanation for XGBoost created and saved
2024-09-03 06:34:00,569 - INFO - LIME explanation for SVM created and saved
2024-09-03 06:34:00,911 - INFO - LIME explanation for LogisticRegression created and saved
2024-

Model Interpretation Summary:

RandomForest Model:
Feature Importance from RandomForest Model:
                          Feature  Importance
Liquidity_and_Coverage_Ratios_PC1    0.138159
              Leverage_Ratios_PC1    0.118763
Liquidity_and_Coverage_Ratios_PC2    0.108096
         Profitability_Ratios_PC1    0.062745
             Cash_Flow_Ratios_PC2    0.061892
      Cost_and_Expense_Ratios_PC1    0.058386
      Cost_and_Expense_Ratios_PC2    0.057437
         Profitability_Ratios_PC2    0.054886
             Cash_Flow_Ratios_PC1    0.054583
              Activity_Ratios_PC1    0.048425

XGBoost Model:
Feature Importance from XGBoost Model:
                          Feature  Importance
              Leverage_Ratios_PC1    0.222067
Liquidity_and_Coverage_Ratios_PC1    0.192537
      Cost_and_Expense_Ratios_PC1    0.073834
Liquidity_and_Coverage_Ratios_PC2    0.064120
              Activity_Ratios_PC1    0.053594
      Cost_and_Expense_Ratios_PC2    0.051142
             Per_Share

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>