In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import time
import os

import shap
from lime import lime_tabular

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn import metrics
from sklearn.metrics import confusion_matrix

from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve

from sklearn.cluster import KMeans

import missingno as msno

from fancyimpute import IterativeImputer as MICE
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam 


from sklearn.cluster import DBSCAN
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import NearestNeighbors
from collections import Counter

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

from imblearn.over_sampling import KMeansSMOTE
from sklearn.mixture import GaussianMixture


from xgboost import XGBClassifier
from rgf.sklearn import RGFClassifier  # Regularized Greedy Forest
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from joblib import dump, load
import logging


In [6]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def split_dataset(dataset, target_column, test_size=0.2):
    """
    Split dataset into training and testing sets.

    Parameters:
    - dataset: pandas DataFrame
    - target_column: str, name of the target column
    - test_size: float, proportion of the dataset to include in the test split

    Returns:
    - X_train, X_test, y_train, y_test
    """
    X = dataset.drop(columns=[target_column])
    y = dataset[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

    logging.info("Dataset has been split and returned")
    return X_train, X_test, y_train, y_test

def train_ann(X_train, y_train):
    """
    Train an Artificial Neural Network (ANN) on the training data.

    Parameters:
    - X_train: numpy array, training features
    - y_train: numpy array, training labels

    Returns:
    - model: trained ANN model
    """
    start_time = time.time()
    model = Sequential([
        Input(shape=(X_train.shape[1],)),
        Dense(12, activation='relu'),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=150, batch_size=10, verbose=0)
    end_time = time.time()

    logging.info(f"ANN has been trained in {end_time - start_time:.2f} seconds")
    return model

def train_models(X_train, y_train):
    """
    Train multiple models on the training data.

    Parameters:
    - X_train: numpy array, training features
    - y_train: numpy array, training labels

    Returns:
    - models: dictionary of trained models
    """
    models = {}
    param_grids = {
        'RandomForest': {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5]
        },
        'XGBoost': {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 6],
            'learning_rate': [0.01, 0.1]
        },
        'SVM': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        },
        'LogisticRegression': {
            'C': [0.1, 1, 10],
            'penalty': ['l2']
        },
        'GradientBoosting': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5, 7]
        },
        'KNN': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance']
        }
    }

    models['ANN'] = train_ann(X_train, y_train)

    for model_name, param_grid in param_grids.items():
        start_time = time.time()
        try:
            if model_name == 'RandomForest':
                model = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
            elif model_name == 'XGBoost':
                model = GridSearchCV(XGBClassifier(), param_grid, cv=5)
            elif model_name == 'SVM':
                model = GridSearchCV(SVC(probability=True), param_grid, cv=5)
            elif model_name == 'LogisticRegression':
                model = GridSearchCV(LogisticRegression(), param_grid, cv=5)
            elif model_name == 'GradientBoosting':
                model = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=5)
            elif model_name == 'KNN':
                model = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)

            model.fit(X_train, y_train)
            models[model_name] = model.best_estimator_
            logging.info(f"{model_name} has been trained in {end_time - start_time:.2f} seconds")
        except Exception as e:
            logging.error(f"Error training {model_name}: {e}")
        end_time = time.time()

    try:
        start_time = time.time()
        nb = GaussianNB()
        nb.fit(X_train, y_train)
        models['NaiveBayes'] = nb
        end_time = time.time()
        logging.info(f"Naive Bayes has been trained in {end_time - start_time:.2f} seconds")
    except Exception as e:
        logging.error(f"Error training Naive Bayes: {e}")

    return models

def test_models(models, X_test):
    """
    Test trained models on the test data.

    Parameters:
    - models: dictionary of trained models
    - X_test: numpy array, test features

    Returns:
    - predictions: dictionary of model predictions
    """
    start_time = time.time()
    predictions = {}
    for name, model in models.items():
        try:
            if name == 'ANN':
                predictions[name] = (model.predict(X_test) > 0.5).astype("int32")
            else:
                predictions[name] = model.predict(X_test)
        except Exception as e:
            logging.error(f"Error testing {name}: {e}")
    end_time = time.time()

    logging.info(f"Models have been tested in {end_time - start_time:.2f} seconds")
    return predictions

def evaluate_models(models, predictions, y_test, X_test):
    """
    Evaluate the performance of models.

    Parameters:
    - models: dictionary of trained models
    - predictions: dictionary of model predictions
    - y_test: numpy array, test labels
    - X_test: numpy array, test features

    Returns:
    - metrics: dictionary of evaluation metrics
    """
    start_time = time.time()
    metrics = {}
    for name, y_pred in predictions.items():
        try:
            accuracy = accuracy_score(y_test, y_pred)
            cm = confusion_matrix(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            auc = roc_auc_score(y_test, models[name].predict_proba(X_test)[:, 1]) if name != 'ANN' else roc_auc_score(y_test, models[name].predict(X_test))
            metrics[name] = {
                'accuracy': accuracy,
                'confusion_matrix': cm,
                'f1_score': f1,
                'auc_roc': auc
            }
        except Exception as e:
            logging.error(f"Error evaluating {name}: {e}")
    end_time = time.time()

    logging.info(f"Models have been evaluated in {end_time - start_time:.2f} seconds")
    return metrics

def explainability_shap(models, X_test, feature_names):
    """
    Generate SHAP explanations for models.

    Parameters:
    - models: dictionary of trained models
    - X_test: numpy array, test features
    - feature_names: list of feature names
    """
    shap.initjs()
    for name, model in models.items():
        if name == 'ANN':
            continue
        try:
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X_test)
            shap.summary_plot(shap_values, X_test, feature_names=feature_names)
            logging.info(f"SHAP summary plot for {name} created")
        except Exception as e:
            logging.error(f"Error generating SHAP explanations for {name}: {e}")

def explainability_lime(models, X_train, X_test, feature_names):
    """
    Generate LIME explanations for models.

    Parameters:
    - models: dictionary of trained models
    - X_train: numpy array, training features
    - X_test: numpy array, test features
    - feature_names: list of feature names
    """
    explainer = lime_tabular.LimeTabularExplainer(X_train, feature_names=feature_names, class_names=['class1', 'class2'], mode='classification')
    for name, model in models.items():
        if name == 'ANN':
            continue
        try:
            i = np.random.randint(0, X_test.shape[0])
            exp = explainer.explain_instance(X_test.iloc[i], model.predict_proba)
            exp.show_in_notebook(show_table=True)
            logging.info(f"LIME explanation for a sample of {name} created")
        except Exception as e:
            logging.error(f"Error generating LIME explanations for {name}: {e}")

def save_models(models, directory='models'):
    """
    Save trained models to disk.

    Parameters:
    - models: dictionary of trained models
    - directory: str, directory to save models
    """
    if not os.path.exists(directory):
        os.makedirs(directory)
    for name, model in models.items():
        try:
            if name == 'ANN':
                model.save(os.path.join(directory, f'{name}_model.h5'))
            else:
                dump(model, os.path.join(directory, f'{name}_model.joblib'))
            logging.info(f"{name} model saved")
        except Exception as e:
            logging.error(f"Error saving {name} model: {e}")

def load_models(directory='models'):
    """
    Load trained models from disk.

    Parameters:
    - directory: str, directory to load models from

    Returns:
    - models: dictionary of loaded models
    """
    models = {}
    for filename in os.listdir(directory):
        model_name, ext = os.path.splitext(filename)
        try:
            if ext == '.h5':
                models[model_name] = load_model(os.path.join(directory, filename))
            elif ext == '.joblib':
                models[model_name] = load(os.path.join(directory, filename))
            logging.info(f"{model_name} model loaded")
        except Exception as e:
            logging.error(f"Error loading {model_name} model: {e}")
    return models

def main(dataset, target_column):
    """
    Main function to train, test, evaluate, and explain models.

    Parameters:
    - dataset: pandas DataFrame
    - target_column: str, name of the target column

    Returns:
    - metrics: dictionary of evaluation metrics
    """
    X_train, X_test, y_train, y_test = split_dataset(dataset, target_column)

    # Standardization
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    logging.info("Data has been standardized")

    models = train_models(X_train, y_train)
    predictions = test_models(models, X_test)
    metrics = evaluate_models(models, predictions, y_test, X_test)

    explainability_shap(models, X_test, feature_names=dataset.drop(columns=[target_column]).columns)
    explainability_lime(models, X_train, X_test, feature_names=dataset.drop(columns=[target_column]).columns)

    save_models(models)
    logging.info("Models have been saved")

    return metrics

def modelling_gs(df):
    """
    Function to run the main pipeline with the given dataset.

    Parameters:
    - df: pandas DataFrame

    Returns:
    - results: dictionary of evaluation metrics
    """
    target_column = 'LABEL'  # Replace with your target column
    results = main(df, target_column)
    logging.info(results)
    return results

# To run the modelling function with a dataset 'df':
# results = modelling_gs(df)

In [7]:
df_mice = pd.read_excel("C:\\Users\\dev\\Desktop\\Msc thesis Prior RS\\ML training\\df_mice_labeled_after_PCA.xlsx")
df_AE = pd.read_excel("C:\\Users\\dev\\Desktop\\Msc thesis Prior RS\\ML training\\df_autoencoder_labeled_after_PCA.xlsx")

In [8]:
results_mice = modelling_gs(df_mice)
results_ae = modelling_gs(df_AE)

print("Results for df_mice")
print(f"{results_mice}")
print(" ")
print("__________________________________________________________________")
print(" ")
print("Results for df_AE")
print(f"{results_ae}")

2024-06-28 11:53:39,748 - INFO - Dataset has been split and returned


2024-06-28 11:53:39,759 - INFO - Data has been standardized
2024-06-28 11:56:20,861 - INFO - ANN has been trained in 161.10 seconds
2024-06-28 12:04:05,679 - ERROR - Error training RandomForest: cannot access local variable 'end_time' where it is not associated with a value
2024-06-28 12:04:18,450 - INFO - XGBoost has been trained in 0.00 seconds
2024-06-28 12:06:49,938 - INFO - SVM has been trained in 0.00 seconds
2024-06-28 12:06:50,340 - INFO - LogisticRegression has been trained in 0.00 seconds


KeyboardInterrupt: 