In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import time

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn import metrics
from sklearn.metrics import confusion_matrix

from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve

from sklearn.cluster import KMeans

import missingno as msno

from fancyimpute import IterativeImputer as MICE
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam 


from sklearn.cluster import DBSCAN
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import NearestNeighbors
from collections import Counter

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

from imblearn.over_sampling import KMeansSMOTE
from sklearn.mixture import GaussianMixture


from xgboost import XGBClassifier
from rgf.sklearn import RGFClassifier  # Regularized Greedy Forest
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from joblib import dump, load


In [7]:
def split_dataset(dataset, target_column, test_size=0.2, random_state=42):
    """Split a labelled DataFrame into stratified train/test feature and target sets.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing the feature columns and the target column.
    target_column : str
        Name of the column holding the class labels.
    test_size : float, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split``. The default keeps the split
        identical to the previously hard-coded seed.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``dataset``.
    """
    # Fail early with a readable message instead of the generic error that
    # ``drop`` would raise for a missing column.
    if target_column not in dataset.columns:
        raise KeyError(f"Target column {target_column!r} not found in dataset")

    X = dataset.drop(columns=[target_column])
    y = dataset[target_column]

    # Stratify on the labels so both partitions keep the class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )

    print("Dataset has been split and returned")
    print(" ")
    return X_train, X_test, y_train, y_test

def train_ann(X_train, y_train, epochs=150, batch_size=10, random_state=None):
    """Build and fit a small fully connected binary classifier.

    The network is Input -> Dense(12, relu) -> Dense(8, relu) -> Dense(1, sigmoid),
    compiled with binary cross-entropy loss and the Adam optimizer.

    Parameters
    ----------
    X_train : array-like with a ``shape`` attribute
        Training features; ``X_train.shape[1]`` sets the input width.
    y_train : array-like
        Training labels (the sigmoid output and binary cross-entropy loss
        imply 0/1 labels).
    epochs : int, default 150
        Number of passes over the training data.
    batch_size : int, default 10
        Mini-batch size used during fitting.
    random_state : int or None, default None
        When given, seeds the Python, NumPy and TensorFlow random generators
        through ``tf.keras.utils.set_random_seed`` so repeated runs start from
        the same weights. ``None`` leaves the global seeds untouched.

    Returns
    -------
    The fitted Keras ``Sequential`` model.
    """
    if random_state is not None:
        # Seeding is opt-in because it mutates global RNG state.
        tf.keras.utils.set_random_seed(random_state)

    start_time = time.perf_counter()
    model = Sequential([
        Input(shape=(X_train.shape[1],)),
        Dense(12, activation='relu'),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
    elapsed = time.perf_counter() - start_time

    print(f"ANN has been trained in {elapsed:.2f} seconds")
    print(" ")

    return model

def train_models(X_train, y_train, cv=5, n_jobs=None, random_state=42):
    """Train the ANN, six grid-searched classifiers and a Gaussian Naive Bayes model.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels, passed unchanged to every model.
    cv : int, default 5
        Number of cross-validation folds used by each ``GridSearchCV``.
    n_jobs : int or None, default None
        Forwarded to ``GridSearchCV``; ``-1`` evaluates grid candidates on all
        CPU cores. ``None`` keeps the previous sequential behaviour.
    random_state : int or None, default 42
        Seed given to every grid-searched estimator that accepts one, so
        repeated runs select and fit the same models. KNN and Naive Bayes
        take no seed.

    Returns
    -------
    dict
        Maps model name to fitted model, in this order: ``'ANN'``,
        ``'RandomForest'``, ``'XGBoost'``, ``'SVM'``, ``'LogisticRegression'``,
        ``'GradientBoosting'``, ``'KNN'``, ``'NaiveBayes'``. Grid-searched
        entries hold the search's ``best_estimator_``.
    """
    # Each name is paired with its own estimator and grid. The previous
    # if/elif chain had no fallback branch, so an unmatched name would have
    # silently re-fitted whatever ``model`` was left over from the last pass.
    search_space = {
        'RandomForest': (
            RandomForestClassifier(random_state=random_state),
            {
                'n_estimators': [100, 200, 300],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5]
            },
        ),
        'XGBoost': (
            XGBClassifier(random_state=random_state),
            {
                'n_estimators': [100, 200, 300],
                'max_depth': [3, 6],
                'learning_rate': [0.01, 0.1]
            },
        ),
        'SVM': (
            # probability=True is required because evaluation calls predict_proba.
            SVC(probability=True, random_state=random_state),
            {
                'C': [0.1, 1, 10],
                'kernel': ['linear', 'rbf']
            },
        ),
        'LogisticRegression': (
            LogisticRegression(random_state=random_state),
            {
                'C': [0.1, 1, 10],
                'penalty': ['l2']
            },
        ),
        'GradientBoosting': (
            GradientBoostingClassifier(random_state=random_state),
            {
                'n_estimators': [100, 200, 300],
                'learning_rate': [0.01, 0.1],
                'max_depth': [3, 5, 7]
            },
        ),
        'KNN': (
            KNeighborsClassifier(),
            {
                'n_neighbors': [3, 5, 7],
                'weights': ['uniform', 'distance']
            },
        ),
    }

    models = {}
    models['ANN'] = train_ann(X_train, y_train)

    for model_name, (estimator, param_grid) in search_space.items():
        start_time = time.perf_counter()
        search = GridSearchCV(estimator, param_grid, cv=cv, n_jobs=n_jobs)
        search.fit(X_train, y_train)
        models[model_name] = search.best_estimator_
        elapsed = time.perf_counter() - start_time

        print(f"{model_name} has been trained in {elapsed:.2f} seconds")
        print(" ")

    # Naive Bayes has no hyper-parameters searched here, so it is fitted directly.
    start_time = time.perf_counter()
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    models['NaiveBayes'] = nb
    elapsed = time.perf_counter() - start_time

    print(f"Naive Bayes has been trained in {elapsed:.2f} seconds")
    print(" ")

    return models

def test_models(models, X_test):
    start_time = time.time()
    predictions = {}
    for name, model in models.items():
        if name == 'ANN':
            predictions[name] = (model.predict(X_test) > 0.5).astype("int32")
        else:
            predictions[name] = model.predict(X_test)
    end_time = time.time()

    print(f"Models have been tested in {end_time - start_time:.2f} seconds")
    print(" ")

    return predictions

def _positive_class_scores(name, model, X_test):
    """Return a 1-D array of continuous positive-class scores for ROC AUC."""
    if name == 'ANN':
        # The network's predict() already yields sigmoid outputs; flatten the
        # column vector so every model hands a 1-D array to the metric.
        return np.ravel(model.predict(X_test))
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(X_test)[:, 1]
    # Margin classifiers without probability estimates still rank samples.
    return model.decision_function(X_test)


def evaluate_models(models, predictions, y_test, X_test):
    """Compute accuracy, confusion matrix, F1 and ROC AUC for every model.

    Parameters
    ----------
    models : dict
        Maps model name to fitted model. The ``'ANN'`` entry is scored with
        ``predict``; other entries with ``predict_proba`` (second column) or,
        when that is unavailable, ``decision_function``.
    predictions : dict
        Maps model name to predicted class labels for ``X_test``.
    y_test : array-like
        True labels. The F1 and AUC calls use their default settings, so this
        assumes a binary problem.
    X_test : array-like
        Test features, used only to obtain continuous scores for ROC AUC.

    Returns
    -------
    dict
        Maps model name to a dict with keys ``'accuracy'``,
        ``'confusion_matrix'``, ``'f1_score'`` and ``'auc_roc'``.

    Raises
    ------
    AttributeError
        If a non-ANN model offers neither ``predict_proba`` nor
        ``decision_function``.
    """
    start_time = time.perf_counter()
    # Named ``results`` rather than ``metrics`` so the imported
    # ``sklearn.metrics`` module is not shadowed inside this function.
    results = {}
    for name, y_pred in predictions.items():
        scores = _positive_class_scores(name, models[name], X_test)
        results[name] = {
            'accuracy': accuracy_score(y_test, y_pred),
            'confusion_matrix': confusion_matrix(y_test, y_pred),
            'f1_score': f1_score(y_test, y_pred),
            'auc_roc': roc_auc_score(y_test, scores)
        }
    elapsed = time.perf_counter() - start_time

    print(f"Models have been evaluated in {elapsed:.2f} seconds")
    print(" ")

    return results

def main(dataset, target_column, model_prefix=""):
    """Run the full pipeline: split, standardize, train, predict, evaluate, save.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Labelled data containing the features and ``target_column``.
    target_column : str
        Name of the label column.
    model_prefix : str, default ""
        Text prepended to every saved file name. Pass a distinct prefix per
        dataset; with the default, consecutive runs write the same file names
        and the later run overwrites the earlier one's models.

    Returns
    -------
    dict
        Per-model evaluation metrics as returned by ``evaluate_models``.

    Side effects
    ------------
    Writes ``{model_prefix}{name}_model.joblib`` for every non-ANN model and
    ``{model_prefix}scaler.joblib`` for the fitted scaler into the current
    working directory.
    """
    X_train, X_test, y_train, y_test = split_dataset(dataset, target_column)

    # Standardization: fit on the training split only, then reuse those
    # statistics for the test split so no test information leaks into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    print("Data has been standardized")
    print(" ")

    models = train_models(X_train, y_train)
    predictions = test_models(models, X_test)
    metrics = evaluate_models(models, predictions, y_test, X_test)

    # Save the models
    for name, model in models.items():
        if name != 'ANN':  # the ANN is not written with joblib here
            dump(model, f'{model_prefix}{name}_model.joblib')

    # The models were trained on standardized features, so the fitted scaler
    # is needed to apply them to new raw data later.
    dump(scaler, f'{model_prefix}scaler.joblib')

    print("Models have been saved")
    print(" ")

    return metrics

def modelling_gs(df, target_column='LABEL', model_prefix=""):
    """Run ``main`` on ``df``, print the resulting metrics and return them.

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled dataset to model.
    target_column : str, default 'LABEL'
        Name of the label column in ``df``.
    model_prefix : str, default ""
        Forwarded to ``main`` to keep the saved files of different datasets apart.

    Returns
    -------
    dict
        Per-model evaluation metrics.
    """
    results = main(df, target_column, model_prefix=model_prefix)
    print(" ")
    print(results)
    return results

In [8]:
# Folder holding the labelled, PCA-reduced datasets. Change this one constant
# to re-point the notebook at another machine or directory.
DATA_DIR = "C:\\Users\\dev\\Desktop\\Msc thesis Prior RS\\ML training"

# Two variants of the dataset (file names suggest MICE- and autoencoder-based
# imputation; confirm against the preprocessing notebook).
df_mice = pd.read_excel(DATA_DIR + "\\df_mice_labeled_after_PCA.xlsx")
df_AE = pd.read_excel(DATA_DIR + "\\df_autoencoder_labeled_after_PCA.xlsx")

In [10]:
# Run the whole pipeline once per dataset variant.
# NOTE: both runs save their models under the same default file names in the
# working directory, so the files left on disk come from the second run.
results_mice = modelling_gs(df_mice)
results_ae = modelling_gs(df_AE)

# One print call with a newline separator emits the same report, line for line.
print(
    "Results for df_mice",
    results_mice,
    " ",
    "__________________________________________________________________",
    " ",
    "Results for df_AE",
    results_ae,
    sep="\n",
)

Dataset has been split and returned
 
Data has been standardized
 
ANN has been trained in 326.71 seconds
 
RandomForest has been trained in 796.05 seconds
 
XGBoost has been trained in 16.22 seconds
 
SVM has been trained in 278.60 seconds
 
LogisticRegression has been trained in 0.65 seconds
 
GradientBoosting has been trained in 4850.38 seconds
 
KNN has been trained in 2.75 seconds
 
Naive Bayes has been trained in 0.01 seconds
 
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
Models have been tested in 1.11 seconds
 
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 861us/step
Models have been evaluated in 1.07 seconds
 
Models have been saved
 
 
{'ANN': {'accuracy': 0.986583850931677, 'confusion_matrix': array([[1982,   30],
       [  24, 1989]], dtype=int64), 'f1_score': 0.9866071428571429, 'auc_roc': 0.998012916045703}, 'RandomForest': {'accuracy': 0.9935403726708074, 'confusion_matrix': array([[2007,    5],
       [  21, 1992]], dt