### 0. Imports

In [None]:
%load_ext autoreload
%autoreload 2

# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np
import math

# Visualizaciones
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt

# Vigilar progreso bucles
# -----------------------------------------------------------------------
from tqdm import tqdm

# Gestionar los warnings
# -----------------------------------------------------------------------
import warnings

# modificar el path
# -----------------------------------------------------------------------
import sys
sys.path.append("..")

# # importar funciones de soporte
# # -----------------------------------------------------------------------
import src.soporte_eda as se
import src.data_preparation as dp
# import src.soporte_ajuste_clasificacion as sac
import src.data_visualization_support as dvs
# import src.model_evaluation_support as mes


# preprocesamiento
# -----------------------------------------------------------------------
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# encoding
# -----------------------------------------------------------------------
from sklearn.preprocessing import  OneHotEncoder
import category_encoders as ce
# set category encoder compatibility
import sklearn
sklearn.set_config(transform_output="pandas")


# evaluar objetos literales
# -----------------------------------------------------------------------
from ast import literal_eval 

# statistics functions
# -----------------------------------------------------------------------
from scipy.stats import pearsonr, spearmanr, pointbiserialr


# definir semilla random
# -----------------------------------------------------------------------
seed = 42

import tensorflow as tf
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier, KerasRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, BatchNormalization, Dropout, Dense
from keras_tuner import RandomSearch, Objective

2025-01-09 11:09:23.670720: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-09 11:09:24.363522: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736417364.530240   72940 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736417364.564781   72940 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-09 11:09:24.892564: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

# 1. Introduction - Model Evaluation - Neural Network

In this notebook, the goal is to extend the model evaluation carried out in `notebooks/4_model_evaluation.ipynb` by adding an evaluation of neural networks for this task.

# 2. Data import and preparation

## 2.1 Data preparation

The loading, cleaning and error-correction steps developed throughout the 1_exploration_cleaning, 2_EDA and 3_preprocessing notebooks have been integrated into a single load-and-clean function, available at ``src/data_preparation.py``.

In [2]:
# Relative paths of the three raw CSV files handed to the loader.
list_of_paths = ["../data/general_data.csv","../data/employee_survey_data.csv","../data/manager_survey_data.csv"]

# Load and clean through the project helper. "EmployeeID" is passed as the second
# argument -- presumably the key used to combine the files; confirm in src/data_preparation.py.
employee_attrition = dp.load_and_clean(list_of_paths, "EmployeeID")
employee_attrition

  employee_attrition = employee_attrition.replace(repl_dict)


Unnamed: 0,age,attrition,businesstravel,department,distancefromhome,education,educationfield,gender,joblevel,jobrole,...,totalworkingyears,trainingtimeslastyear,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,environmentsatisfaction,jobsatisfaction,worklifebalance,jobinvolvement,performancerating
0,51,0,Travel_Rarely,Sales,6,2,Life Sciences,Female,1,Healthcare Representative,...,1.0,6,1.0,0,0,3.0,4.0,2.0,3,3
1,31,1,Travel_Frequently,Research & Development,10,1,Life Sciences,Female,1,Research Scientist,...,6.0,3,5.0,1,4,3.0,2.0,4.0,2,4
2,32,0,Travel_Frequently,Research & Development,17,4,Other,Male,4,Sales Executive,...,5.0,2,5.0,0,3,2.0,2.0,1.0,3,3
3,38,0,Non-Travel,Research & Development,2,5,Life Sciences,Male,3,Human Resources,...,13.0,5,8.0,7,5,4.0,4.0,3.0,2,3
4,32,0,Travel_Rarely,Research & Development,10,1,Medical,Male,1,Sales Executive,...,9.0,2,6.0,0,4,4.0,1.0,3.0,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4226,36,0,Travel_Rarely,Research & Development,2,3,Life Sciences,Male,2,Sales Representative,...,5.0,2,1.0,0,0,2.0,4.0,3.0,3,3
4305,45,0,Travel_Rarely,Sales,1,4,Technical Degree,Male,1,Research Scientist,...,20.0,3,17.0,0,10 or more,2.0,2.0,,3,3
4332,31,0,Travel_Rarely,Sales,2,5,Life Sciences,Male,2,Sales Representative,...,7.0,3,4.0,0,2,,4.0,3.0,4,3
4395,40,0,Travel_Rarely,Research & Development,2,3,Life Sciences,Male,1,Manufacturing Director,...,9.0,4,9.0,4,7,1.0,4.0,3.0,2,3


## 2.2 Define target and predictors

Define target and predictors, perform train-test split.

In [3]:
# The target is the "attrition" column; every remaining column is a predictor.
y = employee_attrition["attrition"]
X = employee_attrition.drop(columns="attrition")

# Hold out 20% of the rows as the test split, stratified on the target and seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed)

With the train/test split done, define a stratified k-fold cross-validation scheme to make the metrics more robust to the target imbalance. The metric to optimize (maximize) is `average_precision`, which summarizes the area under the Precision-Recall curve and is more informative than ROC AUC for imbalanced datasets.

In [4]:
# 5-fold stratified cross-validation, shuffled with the notebook-wide seed.
cross_val = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Name of the metric to optimize (presumably used as a scikit-learn scoring string;
# NOTE(review): not referenced in the cells visible here).
score = "average_precision"

# Verbosity level (NOTE(review): not referenced in the cells visible here).
verbosity = 2

# 3. Model evaluation

## 3.1 Experiment 1 - baseline

### 3.1.1 Data preprocessing

- Missing: Median imputation for numerical features. Mode for categorical.
- Outliers: False.
- Encoding: One hot encoder for <= 3 categories. Target for others.
- Scaling: False. (Only tree based models will be used)

Setting the conditions for data preprocessing:

In [5]:
# Columns with object dtype are treated as categorical features.
categorical = X.select_dtypes("object").columns.to_list()

# current numpy version doesn't work well with np.number for this:
numerical = X.select_dtypes(["int64","float64"]).columns.to_list()

all_columns = X.columns.to_list()

# Split the categorical columns by cardinality:
# at most 3 distinct values -> one-hot encoding, more than 3 -> target encoding.
columns_one_hot = [col for col in categorical if X[col].nunique() <= 3]
columns_target =[col for col in categorical if X[col].nunique() > 3]


# Median imputation for numerical columns, most-frequent imputation for categorical ones.
# Columns that are in neither list are dropped (remainder='drop').
imputer = ColumnTransformer(
    transformers=[
        ('num_imputer', SimpleImputer(strategy='median'), numerical),
        ('cat_imputer', SimpleImputer(strategy='most_frequent'), categorical),
    ],
    remainder='drop',
    verbose_feature_names_out=False  # Prevents prefixing
)

# NOTE(review): not used by any active pipeline step in the cells visible here.
scaler = MinMaxScaler()

# Combine into a pipeline to later append the model
def build_pipeline():
    """Build a fresh, unfitted preprocessing pipeline.

    Steps, in order:
      1. ``imputer``         -- a clone of the module-level ``imputer`` ColumnTransformer.
      2. ``one-hot-encoder`` -- one-hot encoding of ``columns_one_hot``.
      3. ``target-encoder``  -- target encoding of ``columns_target``.

    The imputer is cloned because ``Pipeline`` stores the step objects it is given
    without copying them: without the clone, every pipeline returned by this function
    would share (and refit) the very same ``imputer`` instance.

    Returns:
        sklearn.pipeline.Pipeline: the unfitted preprocessing pipeline.
    """
    # Local import so the function does not depend on an extra top-of-notebook import.
    from sklearn.base import clone

    return Pipeline([
        ('imputer', clone(imputer)),
        ('one-hot-encoder', ce.OneHotEncoder(cols=columns_one_hot, drop_invariant=True, return_df=True,use_cat_names=True)),
        ('target-encoder', ce.TargetEncoder(cols=columns_target, drop_invariant=False, return_df=True))
        # Scaling step intentionally disabled: ('scaler', scaler)
    ])




Checking the preprocessing pipeline:

In [6]:
# Instantiate the preprocessing pipeline (imputation -> one-hot -> target encoding).
preprocessing_pipeline_target = build_pipeline()

In [7]:
# Visual check only: fit on the full data to inspect the transformed output.
# The same pipeline is refit on X_train alone before the models are trained.
preprocessing_pipeline_target.fit_transform(X, y)

Unnamed: 0,age,distancefromhome,monthlyincome,totalworkingyears,yearsatcompany,businesstravel_Travel_Rarely,businesstravel_Travel_Frequently,businesstravel_Non-Travel,department_Sales,department_Research & Development,...,stockoptionlevel,trainingtimeslastyear,yearssincelastpromotion,yearswithcurrmanager,environmentsatisfaction,jobsatisfaction,worklifebalance,jobinvolvement,performancerating_3,performancerating_4
0,51.0,6.0,131160.0,1.0,1.0,1,0,0,1,0,...,0.16568,0.057833,0.192,0.313793,0.139004,0.112245,0.170868,0.154341,1,0
1,31.0,10.0,41890.0,6.0,5.0,0,1,0,0,1,...,0.151326,0.172414,0.137931,0.116516,0.139004,0.165541,0.179487,0.156962,0,1
2,32.0,17.0,193280.0,5.0,5.0,0,1,0,0,1,...,0.166661,0.174873,0.192,0.14094,0.148026,0.165541,0.324999,0.154341,1,0
3,38.0,2.0,83210.0,13.0,8.0,0,0,1,0,1,...,0.166661,0.131783,0.19994,0.136817,0.133333,0.112245,0.140127,0.156962,1,0
4,32.0,10.0,23420.0,9.0,6.0,1,0,0,0,1,...,0.168675,0.174873,0.192,0.116516,0.133333,0.229236,0.140127,0.154341,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4226,36.0,2.0,43200.0,5.0,1.0,1,0,0,0,1,...,0.16568,0.174873,0.192,0.313793,0.148026,0.112245,0.140127,0.154341,1,0
4305,45.0,1.0,25800.0,20.0,17.0,1,0,0,1,0,...,0.166661,0.172414,0.192,0.057715,0.148026,0.165541,0.122662,0.154341,1,0
4332,31.0,2.0,27280.0,7.0,4.0,1,0,0,1,0,...,0.16568,0.172414,0.192,0.145604,0.168079,0.112245,0.140127,0.174194,1,0
4395,40.0,2.0,27180.0,9.0,9.0,1,0,0,0,1,...,0.151326,0.12782,0.073112,0.133047,0.247492,0.112245,0.140127,0.156962,1,0


### 3.1.2 Models to compare

The model to train and compare with previous metrics will be a Deep Neural Network. An evaluation of different configurations will be made.

### 3.1.3 Run experiment

Hyperparameters to tune, ordered by importance according to Andrew Ng in the Deep Learning Specialization (I believe for the Adam optimizer):
 Most important:
- Learning rate $\alpha$: the most important to tune.
  
 Second in importance:
- Momentum $\beta$: It can be set at $\beta \approx 0.9$.
- Mini-batch size
- n of hidden units

Third in importance:
- n of layers (I am unsure about this, as he said that deepening the layers is equivalent to using a huge number of hidden units)
- learning rate decay
- $\beta_1, \beta_2, \epsilon$ (0.9,0.999,$10^{-8}$): almost never touched.

In [9]:
#things to do
# [X] find a way to introduce regularization
# [X] add AUC-PR as a metric
# [X] tune hyperparameters for train/val/test
#   [X] regularization beta
#   [X] learning rate
#   [X] number of units
# [] tune hyperparameters in cross-validation scheme
# [] inspect the plots of AUC-PR and loss function of the hyperparameter tuning
# [] convert to numpy arrays or tf tensors after transformation
# [] document about the behaviour of the tuner with respect to early stopping and epoch score
# [] document snippets
# [] investigate the hidden units architecture

def reduce_num_precision(X, y):
    """Downcast numeric data to 32-bit precision without touching the inputs.

    Float columns of ``X`` are cast to ``float32`` and integer columns to ``int32``;
    columns of any other dtype are left as they are. ``y`` is cast to ``int32``.

    Unlike a column-by-column in-place assignment, this returns new objects, so the
    frame passed in by the caller is never modified.

    Args:
        X (pd.DataFrame): feature matrix.
        y (pd.Series): target values, castable to ``int32``.

    Returns:
        tuple[pd.DataFrame, pd.Series]: downcast copies of ``X`` and ``y``.
    """
    float_cols = X.select_dtypes(include=["float"]).columns
    int_cols = X.select_dtypes(include=["int"]).columns

    # One mapping, one astype call: returns a new frame and keeps the column order.
    dtype_map = {col: "float32" for col in float_cols}
    dtype_map.update({col: "int32" for col in int_cols})

    X = X.astype(dtype_map)
    y = y.astype("int32")

    return X, y


# Fit the preprocessing on the training split only (the target is needed by the target encoder).
X_train_transformed = preprocessing_pipeline_target.fit_transform(X_train,y_train)

# Downcast the transformed training features and the training target to 32-bit dtypes.
X_train_transformed, y_train = reduce_num_precision(X_train_transformed, y_train)

# Apply the already-fitted preprocessing to the test split (no refit).
X_test_transformed = preprocessing_pipeline_target.transform(X_test)

# Same 32-bit downcast for the test features and target.
X_test_transformed, y_test = reduce_num_precision(X_test_transformed, y_test)


def build_model():
    """Build and compile the baseline fully-connected binary classifier.

    Architecture: input layer sized from ``X_train_transformed`` followed by a
    BatchNormalization, then six hidden blocks (Dense ReLU with L2(0.5) kernel
    regularization -> BatchNormalization -> Dropout(0.2)) of widths
    128, 64, 32, 32, 16 and 8, and a single sigmoid output unit.

    Compiled with Adam (learning rate 1e-3), binary cross-entropy loss and the
    area under the Precision-Recall curve as metric (reported as ``auc_pr``).

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    hidden_widths = (128, 64, 32, 32, 16, 8)

    # Input followed by a normalization of the raw features.
    stack = [
        Input(shape=(X_train_transformed.shape[1],)),
        BatchNormalization(),
    ]

    # One Dense -> BatchNormalization -> Dropout block per hidden width.
    for width in hidden_widths:
        stack += [
            Dense(units=width, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.5)),
            BatchNormalization(),
            Dropout(0.2),
        ]

    # Single sigmoid unit for the binary target.
    stack.append(Dense(units=1, activation="sigmoid"))

    network = Sequential(stack)
    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss="binary_crossentropy",
        metrics=[tf.keras.metrics.AUC(curve="PR", name="auc_pr")],  # AUC-PR metric
    )
    return network

# Build the baseline network.
model = build_model()

# Train for 100 epochs with mini-batches of 64 samples.
# NOTE(review): the held-out test split is passed as validation_data, so every
# val_* value in the history is computed on the test set.
history = model.fit(X_train_transformed,y_train, validation_data=(X_test_transformed, y_test), epochs=100, batch_size=64)

I0000 00:00:1736417371.699213   72940 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3539 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


Epoch 1/100


I0000 00:00:1736417377.259935   73047 service.cc:148] XLA service 0x7ff7b4014a10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1736417377.260538   73047 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 4050 Laptop GPU, Compute Capability 8.9
2025-01-09 11:09:37.451737: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1736417378.105296   73047 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 8/20[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m0s[0m 8ms/step - auc_pr: 0.1533 - loss: 120.7923 

I0000 00:00:1736417382.306418   73047 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 402ms/step - auc_pr: 0.1595 - loss: 116.0910 - val_auc_pr: 0.1255 - val_loss: 95.0365
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - auc_pr: 0.1705 - loss: 89.4407 - val_auc_pr: 0.1676 - val_loss: 72.7040
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - auc_pr: 0.1790 - loss: 68.5195 - val_auc_pr: 0.1379 - val_loss: 55.6176
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - auc_pr: 0.1777 - loss: 52.4843 - val_auc_pr: 0.1298 - val_loss: 42.6603
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - auc_pr: 0.2642 - loss: 40.2743 - val_auc_pr: 0.1382 - val_loss: 32.8088
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - auc_pr: 0.1828 - loss: 31.0819 - val_auc_pr: 0.1522 - val_loss: 25.3307
Epoch 7/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━

In [58]:
def build_model_tuner(hp):
    """Build and compile a tunable version of the baseline network.

    Tuned hyperparameters (registered on ``hp`` in this order):
      - ``l2_regularization``: L2 factor, log-sampled in [1e-6, 1e-1].
      - ``units1`` .. ``units6``: widths of the six hidden Dense layers.
      - ``learning_rate``: Adam learning rate, log-sampled in [1e-6, 1e-1].

    The layer layout is the same as the baseline: Input -> BatchNormalization,
    six (Dense ReLU -> BatchNormalization -> Dropout(0.2)) blocks and a sigmoid
    output unit. The input width is read from ``X_train_transformed``.

    Args:
        hp: hyperparameter container exposing ``Float`` and ``Int``.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    # Logarithmic scale for L2; a single regularizer object is reused by all hidden layers.
    l2_factor = hp.Float("l2_regularization", min_value=1e-6, max_value=1e-1, sampling="log")
    weight_penalty = tf.keras.regularizers.l2(l2_factor)

    # (name, min, max, step) of each hidden layer width, in layer order.
    width_ranges = (
        ("units1", 16, 256, 16),
        ("units2", 16, 128, 16),
        ("units3", 8, 64, 8),
        ("units4", 8, 64, 8),
        ("units5", 4, 32, 4),
        ("units6", 2, 8, 2),
    )
    widths = [
        hp.Int(name, min_value=low, max_value=high, step=step)
        for name, low, high, step in width_ranges
    ]

    # Logarithmic scale for the learning rate (registered after the layer widths).
    lr = hp.Float("learning_rate", min_value=1e-6, max_value=1e-1, sampling="log")

    stack = [
        Input(shape=(X_train_transformed.shape[1],)),
        BatchNormalization(),
    ]
    for width in widths:
        stack.extend([
            Dense(units=width, activation="relu", kernel_regularizer=weight_penalty),
            BatchNormalization(),
            Dropout(0.2),
        ])
    stack.append(Dense(units=1, activation="sigmoid"))

    candidate = Sequential(stack)
    candidate.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss="binary_crossentropy",
        metrics=[tf.keras.metrics.AUC(curve="PR", name="auc_pr")],  # AUC-PR metric
    )
    return candidate


# Configure the random search
tuner = RandomSearch(
    build_model_tuner,  # Function that builds the model
    objective=Objective("val_auc_pr", direction="max"),  # maximize the validation AUC-PR
    executions_per_trial=3,  # Number of times each combination is trained
    directory="my_tuner",  # Directory where the results are stored
    project_name="grid_search_demo"  # Project name
)

# Search for the best hyperparameters
# NOTE(review): validation_data is the held-out test split, so the hyperparameters
# are selected against the test set.
tuner.search(X_train_transformed, 
             y_train,
            validation_data=(X_test_transformed, y_test),
            epochs=50,
            batch_size=64)
#
# Retrieve the best hyperparameters (first entry of the 10 best trials)
best_hps = tuner.get_best_hyperparameters(num_trials=10)[0]

print(f"Mejor número de neuronas capa 1: {best_hps.get('units1')}")
print(f"Mejor número de neuronas capa 2: {best_hps.get('units2')}")
print(f"Mejor número de neuronas capa 3: {best_hps.get('units3')}")
print(f"Mejor número de neuronas capa 4: {best_hps.get('units4')}")
print(f"Mejor número de neuronas capa 5: {best_hps.get('units5')}")
print(f"Mejor número de neuronas capa 6: {best_hps.get('units6')}")

print(f"Mejor learning rate: {best_hps.get('learning_rate')}")
print(f"Mejor regularizacion l2: {best_hps.get('l2_regularization')}")

Reloading Tuner from my_tuner/grid_search_demo/tuner0.json
Mejor número de neuronas capa 1: 48
Mejor número de neuronas capa 2: 128
Mejor número de neuronas capa 3: 8
Mejor número de neuronas capa 4: 16
Mejor número de neuronas capa 5: 12
Mejor número de neuronas capa 6: 8
Mejor learning rate: 0.002557540247616812
Mejor regularizacion l2: 0.01270923424139016


### Cross-validation scheme

Convert pandas dataframes to numpy or tensorflow format.

In [55]:
# tensorflow, from numpy arrays
# NOTE(review): these tensors are not referenced in the cells visible here;
# the numpy arrays below are the ones passed to the cross-validation tuner.
X_train_tensor = tf.convert_to_tensor(X_train_transformed.values, dtype=tf.float32)
y_train_tensor = tf.convert_to_tensor(y_train.values, dtype=tf.float32)

# numpy arrays from pandas df (positional indexing with the CV fold indices needs arrays)
X_train_np = X_train_transformed.values
y_train_np = y_train.values

Define a custom Cross-Validation tuner.

In [None]:
from keras_tuner.oracles import RandomSearchOracle
from keras_tuner.engine.tuner import Tuner

class CVTuner(Tuner):
    """KerasTuner ``Tuner`` that scores every trial with cross-validation.

    For each trial, one model is built and trained per fold of ``cv_scheme``; the
    trial's objective value is the mean, over the folds, of the best per-epoch
    ``val_auc_pr`` reached on the fold's validation part.

    Args:
        oracle: KerasTuner oracle driving the search.
        hypermodel: model-building function (or HyperModel) taking the trial's
            hyperparameters.
        cv_scheme: splitter exposing ``split(X, y)`` that yields
            ``(train_indices, validation_indices)`` pairs.
        **kwargs: forwarded to ``Tuner.__init__`` (e.g. ``directory``, ``project_name``).
    """

    def __init__(self, oracle, hypermodel, cv_scheme, **kwargs):
        super().__init__(oracle=oracle, hypermodel=hypermodel, **kwargs)
        # The base class already stores the (wrapped) hypermodel as self.hypermodel;
        # it must not be overwritten with the raw function, otherwise code that relies
        # on self.hypermodel.build(...) stops working.
        self.cv_scheme = cv_scheme

    def run_trial(self, trial, X, y, batch_size=None, epochs=10):
        """Train one model per CV fold and report the mean validation AUC-PR.

        Args:
            trial: the KerasTuner trial being evaluated.
            X: feature array, indexable with the integer index arrays of the splitter.
            y: target array, indexable the same way.
            batch_size: fixed batch size; when falsy, a ``batch_size`` hyperparameter
                (16 to 128, step 16) is tuned instead.
            epochs: number of training epochs per fold.

        Raises:
            RuntimeError: if ``X`` or ``y`` cannot be indexed with the fold indices
                (e.g. a pandas object was passed instead of a numpy array).
        """
        hp = trial.hyperparameters

        # Dynamically decide batch size
        batch_size = batch_size or hp.Int("batch_size", min_value=16, max_value=128, step=16)

        val_scores = []
        model = None

        for train_idx, val_idx in self.cv_scheme.split(X, y):
            try:
                X_train, X_val = X[train_idx], X[val_idx]
                y_train, y_val = y[train_idx], y[val_idx]
            except (IndexError, KeyError, TypeError) as exc:
                # Keep the original error chained instead of swallowing it.
                raise RuntimeError(
                    "Could not index X/y with the cross-validation fold indices; "
                    "pass numpy arrays rather than pandas objects. "
                    f"Train indices: {train_idx} | Validation indices: {val_idx}"
                ) from exc

            # A fresh model per fold, built from the trial's hyperparameters.
            model = self.hypermodel.build(hp)
            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=epochs,
                batch_size=batch_size,
                verbose=0
            )
            # Best validation AUC-PR reached during this fold's training.
            val_scores.append(max(history.history['val_auc_pr']))

        # `mean` was never imported in the notebook; use numpy and report a plain float.
        mean_val_score = float(np.mean(val_scores))
        self.oracle.update_trial(trial.trial_id, {'val_auc_pr': mean_val_score})
        # Only the model trained on the last fold is saved for the trial.
        self.save_model(trial.trial_id, model)


# Initialize the custom tuner
# (this rebinds `tuner`, replacing the RandomSearch tuner created earlier in the notebook)
tuner = CVTuner(
    oracle=RandomSearchOracle(
        objective=Objective("val_auc_pr", direction="max"),
        max_trials=10,
        seed=42
    ),
    hypermodel=build_model_tuner,
    directory='cv_tuner_dir',
    project_name='cv_tuner_project',
    cv_scheme=cross_val
)

# Start the hyperparameter search
# A fixed batch_size is passed, so CVTuner.run_trial does not tune the batch size.
tuner.search(X_train_np, y_train_np, epochs=40, batch_size=64)


Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
0.0015745         |0.0015745         |l2_regularization
16                |16                |units1
64                |64                |units2
24                |24                |units3
64                |64                |units4
12                |12                |units5
6                 |6                 |units6
2.1598e-06        |2.1598e-06        |learning_rate

