### Import modules

In [None]:
""" All modules for this step of the pipeline are imported here. """


from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from PIL import Image
import optuna
import random
from tensorflow.keras.metrics import AUC

from _3_DataPrep_and_Cleaning_Part2 import run_data_prep_part2


In [None]:
# Path of the best trained model; assigned in the __main__ cell from the value
# returned by ModelBuildAndTrain(). A `global` statement at module level has no
# effect, so a plain assignment is sufficient.
best_model_path = None

### Check GPU Availability

In [None]:
# Ask TensorFlow which GPU devices it can see and report the result
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("No GPUs found. Training will use CPU.")
else:
    print("GPUs available:", gpus)

# Turn on memory growth for every detected GPU (no-op when the list is empty)
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

### Import Paths

In [None]:
# Seeds: seed the Python, NumPy and TensorFlow random number generators
seed = 42
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# Load environment variables from the .env file
load_dotenv()

# Path settings. Each one is None when the variable is not set. Other cells
# concatenate file names directly onto these values.
base_path: str = os.getenv('base_path')
trials: str = os.getenv('trials')
trial_models: str = os.getenv('trial_models')
results: str = os.getenv('results')
model: str = os.getenv('model')

# Number of Optuna trials. A missing or blank variable is treated as 0 so that
# it reaches the default below instead of crashing inside int().
_raw_amount_of_trials = (os.getenv('amount_of_trials') or '').strip()
amount_of_trials: int = int(_raw_amount_of_trials) if _raw_amount_of_trials else 0

# Loss / metric names for the two model heads (defaults are applied below)
autoencoder_loss = os.getenv('autoencoder_Loss')
autoencoder_metric = os.getenv('autoencoder_Metric')
mlp_loss = os.getenv('mlp_Loss')
mlp_metric = os.getenv('mlp_Metric')

# Print to check if paths are loaded correctly
print(f"Base Path: {base_path}")
print(f"Trials Path: {trials}")
print(f"Trials Models Path: {trial_models}")
print(f"Results Path: {results}")
print(f"Model Path: {model}")

# 0 (or unset) means "use the default number of trials"
if amount_of_trials == 0:
    amount_of_trials = 500

# Fallback to default if they are empty or None
autoencoder_loss = autoencoder_loss.strip() if autoencoder_loss else 'mse'
autoencoder_metric = autoencoder_metric.strip() if autoencoder_metric else 'mse'
mlp_loss = mlp_loss.strip() if mlp_loss else 'binary_crossentropy'
mlp_metric = mlp_metric.strip() if mlp_metric else 'accuracy'

# Convert comma-separated strings to lists. Splitting a value that contains no
# comma already yields a one-element list, so no special case is needed.
autoencoder_metric = [m.strip() for m in autoencoder_metric.split(',')]
mlp_metric = [m.strip() for m in mlp_metric.split(',')]

# Replace the 'auc' keyword (any letter case) with a Keras AUC metric object
# named 'auc'; every other entry is passed through unchanged.
mlp_metric = [AUC(name='auc') if m.lower() == 'auc' else m for m in mlp_metric]



In [None]:
""" Function to load training, validation, and testing data from CSV files."""

def data():
    """Read the six train/validation/test CSV files found under ``base_path``.

    The file names are appended directly to ``base_path`` (no separator is
    inserted). The shape of every loaded frame is printed as a quick check.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)`` of pandas
        DataFrames.
    """
    # (label used in the debug output, file name) in load and return order
    splits = (
        ('X_train', 'x_train.csv'),
        ('X_val', 'x_val.csv'),
        ('X_test', 'x_test.csv'),
        ('y_train', 'y_train.csv'),
        ('y_val', 'y_val.csv'),
        ('y_test', 'y_test.csv'),
    )

    # Features first, then targets, all resolved against the base path
    frames = [pd.read_csv(f'{base_path}{file_name}') for _, file_name in splits]

    # Debug output: one shape line per loaded frame
    for (label, _), frame in zip(splits, frames):
        print(f'{label} shape:', frame.shape)

    return tuple(frames)

In [None]:
""" Function to plot and save model architecture and training results. """

def plot_results(history, supervised_autoencoder, location, trial_number):
    """Save the model diagram and the training curves of one trial.

    Writes ``Model.png``, ``Loss.png``, ``MLP_Acc.png`` and ``MSE.png`` into
    ``<location>/Trial_<trial_number>``, creating that folder when needed.

    Args:
        history: Object returned by ``Model.fit``. Its ``history`` dict must
            contain ``loss``, ``MLP_Output_accuracy``,
            ``val_MLP_Output_accuracy``, ``Autoencoder_Output_mse`` and
            ``val_Autoencoder_Output_mse``; ``val_loss`` is plotted when present.
        supervised_autoencoder: Keras model passed to ``plot_model``.
        location: Parent directory for the per-trial folder.
        trial_number: Trial identifier used in the folder name.

    Raises:
        KeyError: If one of the required history keys is missing.
    """
    # Per-trial output folder; exist_ok avoids a separate existence check
    new_folder_path = os.path.join(location, "Trial_" + str(trial_number))
    os.makedirs(new_folder_path, exist_ok=True)

    # Architecture diagram
    plot_model(supervised_autoencoder, to_file=os.path.join(new_folder_path, 'Model.png'), show_shapes=True, show_layer_names=True, expand_nested=True)
    print("Model saved to: ", new_folder_path)

    # Model training (and, when recorded, validation) loss
    plt.figure()
    plt.plot(history.history['loss'])
    loss_labels = ['Train']
    if 'val_loss' in history.history:
        plt.plot(history.history['val_loss'])
        loss_labels.append('Validation')
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    # Labels must be a list: a bare string would be split into characters
    plt.legend(loss_labels, loc='upper left')
    plt.savefig(os.path.join(new_folder_path, 'Loss.png'))
    plt.close()  # release the figure so repeated calls do not accumulate them

    # Classifier head training and validation accuracy
    plt.figure()
    plt.plot(history.history['MLP_Output_accuracy'])
    plt.plot(history.history['val_MLP_Output_accuracy'])
    plt.title('Model accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.savefig(os.path.join(new_folder_path, 'MLP_Acc.png'))
    plt.close()

    # Autoencoder head training and validation MSE
    plt.figure()
    plt.plot(history.history['Autoencoder_Output_mse'])
    plt.plot(history.history['val_Autoencoder_Output_mse'])
    plt.title('Autoencoder MSE')
    plt.ylabel('MSE')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.savefig(os.path.join(new_folder_path, 'MSE.png'))
    plt.close()

In [None]:
""" Function to save hyperparameters of the model trial. """

def save_hyperparameters(trial_number, learning_rate, num_epochs, num_layers_auto, num_layers_mlp, 
                         nodes_per_layer_autoencoder, nodes_per_layer_mlp, use_dropout, dropout_rate, codeNodes):
    """Write the hyperparameters of one trial to ``hyperparameters.txt``.

    The file is placed in ``<trial_models>/Model_<trial_number>/`` (created
    when missing) and holds one ``Label: value`` line per hyperparameter.
    """
    # Folder that collects every artefact of this trial
    model_folder = f'{trial_models}/Model_{trial_number}/'
    os.makedirs(model_folder, exist_ok=True)

    # Label/value pairs in the order they appear in the file
    entries = (
        ("Trial Number", trial_number),
        ("Learning Rate", learning_rate),
        ("Number of Epochs", num_epochs),
        ("Number of Layers (Autoencoder)", num_layers_auto),
        ("Nodes per Layer (Autoencoder)", nodes_per_layer_autoencoder),
        ("Number of Layers (MLP)", num_layers_mlp),
        ("Nodes per Layer (MLP)", nodes_per_layer_mlp),
        ("Use Dropout", use_dropout),
        ("Dropout Rate", dropout_rate),
        ("Code Layer Nodes", codeNodes),
    )

    hyperparams_file = os.path.join(model_folder, 'hyperparameters.txt')
    with open(hyperparams_file, 'w') as f:
        f.writelines(f"{label}: {value}\n" for label, value in entries)
    print(f"Saved hyperparameters as {hyperparams_file}")

In [None]:
""" Function to save the model architecture as a PNG image. """

def save_model_architecture(supervised_autoencoder, trial_number):
    """Render the model graph to ``model_architecture.png`` for one trial.

    The image is written to ``<trial_models>/Model_<trial_number>/``, which is
    created when it does not exist yet.
    """
    # Per-trial artefact folder
    model_folder = f'{trial_models}/Model_{trial_number}/'
    os.makedirs(model_folder, exist_ok=True)

    # Draw the layer graph with tensor shapes and layer names
    architecture_file = os.path.join(model_folder, 'model_architecture.png')
    plot_model(
        supervised_autoencoder,
        to_file=architecture_file,
        show_shapes=True,
        show_layer_names=True,
    )
    print(f"Saved model architecture as {architecture_file}")


In [None]:
""" Function to plot and save training and validation accuracy and loss. """

def save_plots(history, trial_number, model_type):
    """Save accuracy and loss curves (training vs. validation) for one trial.

    Two PNG files, ``<model_type>_accuracy_plot.png`` and
    ``<model_type>_loss_plot.png``, are written to
    ``<trial_models>/Model_<trial_number>/``.

    Raises:
        KeyError: If ``history.history`` lacks ``MLP_Output_accuracy``,
            ``val_MLP_Output_accuracy``, ``loss`` or ``val_loss``.
    """
    model_folder = f'{trial_models}/Model_{trial_number}/'
    os.makedirs(model_folder, exist_ok=True)

    # Fetch every series before drawing, so a missing key fails up front
    recorded = history.history
    series = {
        'accuracy': (recorded['MLP_Output_accuracy'], recorded['val_MLP_Output_accuracy']),
        'loss': (recorded['loss'], recorded['val_loss']),
    }

    # One figure per quantity: draw, save, close
    written = {}
    for key, caption in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
        train_values, val_values = series[key]
        plt.figure(figsize=(8, 6))
        plt.plot(train_values, label=f'Training {caption}')
        plt.plot(val_values, label=f'Validation {caption}')
        plt.title(f'Training and Validation {caption} - Model {trial_number}')
        plt.xlabel('Epochs')
        plt.ylabel(caption)
        plt.legend()
        target = f'{model_folder}{model_type}_{key}_plot.png'
        plt.savefig(target)
        plt.close()
        written[key] = target

    accuracy_plot_path = written['accuracy']
    loss_plot_path = written['loss']
    print(f"Saved accuracy plot as {accuracy_plot_path}")
    print(f"Saved loss plot as {loss_plot_path}")

In [None]:
# Build, train and persist one supervised autoencoder: a shared encoder whose
# code layer feeds both a reconstruction decoder and a binary classifier
def build_supervised_autoencoder(X_train, X_val, y_train, y_val, learning_rate, num_epochs, num_layers_auto, num_layers_mlp, 
                                 nodes_per_layer_autoencoder, nodes_per_layer_mlp, trial_number, use_dropout, dropout_rate, codeNodes):
    """Build, fit and save a two-headed (reconstruction + classifier) model.

    Args:
        X_train, X_val: Feature tables; ``X_train.shape[1]`` sets the input
            and reconstruction width.
        y_train, y_val: Targets for the sigmoid classifier head.
        learning_rate: Adam learning rate.
        num_epochs: Maximum number of epochs (early stopping may end sooner).
        num_layers_auto: Number of encoder layers (mirrored in the decoder).
        num_layers_mlp: Number of hidden layers in the classifier head.
        nodes_per_layer_autoencoder: Encoder layer widths; at least
            ``num_layers_auto`` entries.
        nodes_per_layer_mlp: Classifier layer widths; at least
            ``num_layers_mlp`` entries.
        trial_number: Identifier used in the output folder and file names.
        use_dropout: When true, a Dropout layer follows every third
            encoder/decoder layer (indices 0, 3, 6, ...).
        dropout_rate: Rate of those Dropout layers.
        codeNodes: Width of the bottleneck (code) layer.

    Returns:
        ``(history, supervised_autoencoder)``: the ``fit`` history object and
        the trained Keras model.

    Side effects:
        Writes ``history_keys.txt`` to the current working directory and, under
        ``<trial_models>/Model_<trial_number>/``, the ``.h5`` model, the
        architecture image, the accuracy/loss plots and the hyperparameter file.
    """
    # NOTE: nothing is reshaped here; the message text is kept unchanged
    print("After reshaping:")
    print("X_train shape:", X_train.shape)
    print("y_train shape:", y_train.shape)
    print("X_val shape:", X_val.shape)
    print("y_val shape:", y_val.shape)
    
    input_layer = Input(shape=(X_train.shape[1],), name="Input")

    # Encoder: stack of ReLU layers, optional dropout after every third layer
    encoder_layer = input_layer
    for i in range(num_layers_auto):
        encoder_layer = Dense(nodes_per_layer_autoencoder[i], activation="relu", name="Encode_layer_" + str(i))(encoder_layer)
        if use_dropout and i % 3 == 0:
            encoder_layer = Dropout(dropout_rate, name="dropout_layer_encoder_" + str(i))(encoder_layer)

    # Bottleneck shared by the decoder and the classifier
    code_layer = Dense(codeNodes, activation="relu", name="Code_Layer")(encoder_layer)

    # Decoder: the encoder widths in reverse order
    decoder_layer = code_layer
    for i in reversed(range(num_layers_auto)):
        decoder_layer = Dense(nodes_per_layer_autoencoder[i], activation="relu", name="Decode_layer_" + str(i))(decoder_layer)
        if use_dropout and i % 3 == 0:
            decoder_layer = Dropout(dropout_rate, name="dropout_layer_decoder_" + str(i))(decoder_layer)

    # Reconstruction output, same width as the input
    output_layer = Dense(X_train.shape[1], activation="linear", name="Autoencoder_Output")(decoder_layer)

    # Classifier branch, fed from the code layer
    MLP_Layer = code_layer
    for i in range(num_layers_mlp):
        MLP_Layer = Dense(nodes_per_layer_mlp[i], activation="relu", name="MLP_" + str(i))(MLP_Layer)
        
    # Classifier output for binary classification
    output = Dense(1, activation="sigmoid", name="MLP_Output")(MLP_Layer)

    supervised_autoencoder = Model(input_layer, (output_layer, output))

    print("Using the following loss for Autoencoder: ", autoencoder_loss)
    print("Using the following metric for Autoencoder: ", autoencoder_metric)
    print("Using the following loss for MLP: ", mlp_loss)
    print("Using the following metric for MLP: ", mlp_metric)

    # Per-output losses and metrics keyed by output-layer name. The metric
    # settings are already lists, so they are passed as flat lists (copied)
    # rather than wrapped in another list; each loss is a single identifier.
    supervised_autoencoder.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss={
            'Autoencoder_Output': autoencoder_loss,
            'MLP_Output': mlp_loss,
        },
        metrics={
            'Autoencoder_Output': list(autoencoder_metric),
            'MLP_Output': list(mlp_metric),
        },
    )

    # NOTE(review): the monitored names assume the default metrics ('mse' for
    # the autoencoder head, 'accuracy' for the classifier head) — confirm when
    # other metrics are configured. Both callbacks restore their own best weights.
    early_stop_ae = EarlyStopping(monitor='val_Autoencoder_Output_mse', patience=30, mode='min', restore_best_weights=True)
    val_MLP_Output_accuracy = EarlyStopping(monitor='val_MLP_Output_accuracy', patience=30, mode='max', restore_best_weights=True)

    # Fit with named outputs so it's clear which target goes to which head
    history = supervised_autoencoder.fit(
        x=X_train,
        y={
            'Autoencoder_Output': X_train,
            'MLP_Output':         y_train
        },
        validation_data=(
            X_val,
            {
                'Autoencoder_Output': X_val,
                'MLP_Output':         y_val
            }
        ),
        epochs=num_epochs,
        batch_size=32,
        shuffle=True,
        callbacks=[early_stop_ae, val_MLP_Output_accuracy],
        verbose=1
    )

    # Dump the recorded history keys (one per line) to the working directory
    with open('history_keys.txt', 'w') as file:
        for key in history.history.keys():
            file.write(f"{key}\n")

    print("Keys have been saved to 'history_keys.txt'.")

    # Create a folder for this trial's model
    model_folder = f'{trial_models}/Model_{trial_number}/'
    os.makedirs(model_folder, exist_ok=True)

    # Save the model; a distinct local name avoids shadowing the module-level
    # `model` path setting
    model_path = f'{model_folder}{trial_number}_Supervised_autoencoder_Churn.h5'
    supervised_autoencoder.save(model_path)
    print(f"Saved model as {model_path}")
    
    # Save the model architecture as an image
    save_model_architecture(supervised_autoencoder, trial_number)

    # Save the accuracy and loss plots
    save_plots(history, trial_number, "Supervised_autoencoder")

    # Save the hyperparameters used in a text file
    save_hyperparameters(trial_number, learning_rate, num_epochs, num_layers_auto, num_layers_mlp, 
                         nodes_per_layer_autoencoder, nodes_per_layer_mlp, use_dropout, dropout_rate, codeNodes)

    return history, supervised_autoencoder


In [None]:
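""" Optuna objective function: samples hyperparameters, trains one supervised autoencoder and returns a single combined validation loss (single-objective, despite the function name). """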

def multi_objective(trial, x_shape):
    """Optuna objective: sample hyperparameters, train one model, score it.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        x_shape: Not used inside this function; kept so existing callers
            keep working.

    Returns:
        The sum of the mean per-epoch validation loss of the classifier head
        and the mean per-epoch validation loss of the autoencoder head.

    Side effects:
        Trains and saves a model via ``build_supervised_autoencoder``, appends
        a summary to ``<trials>Churn_Trials.txt`` and stores the layer-width
        lists and the validation accuracy/loss as trial user attributes.
    """
    # Suggest hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-3, log=True)
    num_epochs = trial.suggest_int("num_epochs", 1, 500)
    num_layers_auto = trial.suggest_int("num_layers", 1, 10)
    num_layers_mlp = trial.suggest_int("num_layers_mlp", 1, 10)
    use_dropout = trial.suggest_categorical("use_dropout", [True, False])
    dropout_rate = trial.suggest_float("dropout_rate", 0.0, 0.5)
    
    # Encoder widths: each layer may be at most as wide as the previous one
    nodes_per_layer_autoencoder = []
    upper_bound = 512
    
    for i in range(num_layers_auto):
        nodes = trial.suggest_int(f"nodes_per_layer_autoencoder_{i}", 2, upper_bound, log=True)
        nodes_per_layer_autoencoder.append(nodes)
        upper_bound = nodes  # the next layer cannot be wider than this one

    # Classifier widths: sampled independently per layer
    nodes_per_layer_mlp = []
    for i in range(num_layers_mlp):
        nodes = trial.suggest_int(f"nodes_per_layer_mlp_{i}", 2, 512, log=True)
        nodes_per_layer_mlp.append(nodes)

    # The code layer is strictly narrower than the narrowest encoder layer
    minNodes = min(nodes_per_layer_autoencoder)
    codeNodes = trial.suggest_int('nodes_in_code', 1, minNodes-1, step=1, log=True)

    # `trial.params` hands back a copy, so assigning into it is lost; keep the
    # assembled width lists as user attributes instead.
    trial.set_user_attr('nodes_per_layer_autoencoder', nodes_per_layer_autoencoder)
    trial.set_user_attr('nodes_per_layer_mlp', nodes_per_layer_mlp)

    X_train, X_val, X_test, y_train, y_val, y_test = data()

    # Build supervised autoencoder with suggested hyperparameters
    history, supervised_autoencoder = build_supervised_autoencoder(X_train, X_val, y_train, y_val, learning_rate, num_epochs, num_layers_auto, num_layers_mlp ,nodes_per_layer_autoencoder, nodes_per_layer_mlp ,trial.number, use_dropout, dropout_rate, codeNodes)

    # Scores are averaged over all recorded epochs. The local name differs from
    # the module-level `autoencoder_loss` setting to avoid shadowing it.
    val_accuracy = np.mean(history.history['val_MLP_Output_accuracy'])
    MLP_loss = np.mean(history.history['val_MLP_Output_loss'])
    autoencoder_val_loss = np.mean(history.history['val_Autoencoder_Output_loss'])

    # Append the trial number, hyperparameters and scores to the trials log
    with open(trials + "Churn_Trials.txt", "a") as f:
        f.write("Trial number:" + str(trial.number) + "\n")
        f.write("Learning rate:" + str(learning_rate) + "\n")
        f.write("Number of epochs:" + str(num_epochs) + "\n")
        f.write("Number of Autoencoder layers:" + str(num_layers_auto) + "\n")
        f.write("Nodes per layer Autoencoder:" + str(nodes_per_layer_autoencoder) + "\n")
        f.write("Nodes in code layer:" + str(codeNodes) + "\n")
        f.write("Number of MLP layers:" + str(num_layers_mlp) + "\n")
        f.write("Nodes per layer MLP:" + str(nodes_per_layer_mlp) + "\n")
        f.write("Use dropout:" + str(use_dropout) + "\n")
        f.write("Dropout rate:" + str(dropout_rate) + "\n")
        f.write("Validation accuracy:" + str(val_accuracy) + "\n")
        f.write("MLP_loss:" + str(MLP_loss) + "\n")
        f.write("autoencoder_loss:" + str(autoencoder_val_loss) + "\n")
        f.write("\n")

    overall_Loss = MLP_loss + autoencoder_val_loss

    # Store them in user attributes for later plotting
    trial.set_user_attr("val_MLP_Output_accuracy", val_accuracy)
    trial.set_user_attr("val_MLP_Output_loss", MLP_loss)
    
    return overall_Loss

In [None]:
""" Main function to build and train the model with hyperparameter optimization. """

def ModelBuildAndTrain():
    """Run data preparation, the Optuna study, and report the best model.

    Steps: call ``run_data_prep_part2()``, load the data, minimise
    ``multi_objective`` over ``amount_of_trials`` trials, write the best trial
    to ``<results>Best_Trials.txt`` and save per-trial validation accuracy and
    loss charts as ``<model>study_results_accuracy.png`` and
    ``<model>study_results_loss.png``.

    Returns:
        Path of the ``.h5`` file saved for the best trial, built the same way
        as in ``build_supervised_autoencoder``.
    """
    run_data_prep_part2()

    # Only the feature count of the training set is needed here
    X_train, *_ = data()
    x_shape = X_train.shape[1]

    # Create a single-objective Optuna study
    study = optuna.create_study(direction='minimize', study_name="Supervised Autoencoder")
    study.optimize(lambda trial: multi_objective(trial, x_shape), n_trials=amount_of_trials)
    print("Number of finished trials: ", len(study.trials))

    # Retrieve the best trial
    best_trial = study.best_trial

    # Save best trial data to a text file
    with open(results + 'Best_Trials.txt', 'w') as file:
        file.write(f"Best Trial Number: {best_trial.number}\n")
        file.write(f"Best Trial Value: {best_trial.value}\n")
        file.write("Best Trial Parameters:\n")
        for key, value in best_trial.params.items():
            file.write(f"  {key}: {value}\n")

    # Extract validation accuracy and loss from user_attrs
    val_accuracies = [t.user_attrs.get("val_MLP_Output_accuracy") for t in study.trials]
    val_losses = [t.user_attrs.get("val_MLP_Output_loss") for t in study.trials]

    # Plot Validation Accuracy
    fig_acc = plt.figure(figsize=(10, 5))
    plt.plot(range(len(val_accuracies)), val_accuracies, marker='o')
    plt.title('Optuna Study - Validation Accuracy per Trial')
    plt.xlabel('Trial Number')
    plt.ylabel('Validation Accuracy')
    plt.grid(True)
    fig_acc.savefig(model + 'study_results_accuracy.png')
    plt.close(fig_acc)  # release the figure once it is on disk

    # Plot Validation Loss
    fig_loss = plt.figure(figsize=(10, 5))
    plt.plot(range(len(val_losses)), val_losses, marker='o')
    plt.title('Optuna Study - Validation Loss per Trial')
    plt.xlabel('Trial Number')
    plt.ylabel('Validation Loss')
    plt.grid(True)
    fig_loss.savefig(model + 'study_results_loss.png')
    plt.close(fig_loss)

    # Same expression the model was saved with (forward slashes, separator
    # after trial_models), so the returned path points at the written file.
    best_model_path = (
        f'{trial_models}/Model_{best_trial.number}/'
        f'{best_trial.number}_Supervised_autoencoder_Churn.h5'
    )

    return best_model_path


In [None]:
if __name__ == "__main__":
    # Script entry point: run the whole pipeline, then report where the
    # model of the best trial was stored.
    best_model_path = ModelBuildAndTrain()
    print("Best model path:", best_model_path)