In [22]:
import os
import h2o
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as ss

from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators import H2ONaiveBayesEstimator
from h2o.estimators import H2ORandomForestEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from h2o.automl import H2OAutoML

# Carga de datos

In [23]:
df=pd.read_csv("../dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [24]:
df = df.drop(columns=["EmployeeCount", "StandardHours","EmployeeNumber", "Over18"])

In [25]:
# Columns with outliers (6)
cols_robust = [
    'MonthlyIncome', 'NumCompaniesWorked', 'YearsAtCompany',
    'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager'
]

# Categorical columns (19)
cols_categorical = [ 
    'Education', 'EnvironmentSatisfaction', 'JobInvolvement',
    'JobLevel', 'JobSatisfaction', 'PerformanceRating',
    'RelationshipSatisfaction', 'StockOptionLevel', 'TrainingTimesLastYear',
    'WorkLifeBalance',
    'BusinessTravel', 'Department', 'EducationField',
    'Gender', 'JobRole', 'MaritalStatus',
    'OverTime',
]
# Numerical columns (8)
cols_numerical = [
    'Age', 'DailyRate', 'DistanceFromHome',
    'HourlyRate', 'MonthlyRate',
    'PercentSalaryHike', 'TotalWorkingYears'
]

In [26]:
preprocessor = ColumnTransformer(transformers=[
    ('robust', RobustScaler(), list(cols_robust)),
    ('standard', StandardScaler(), list(cols_numerical)),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), list(cols_categorical))
])

In [27]:
# Aplicar transformación
X = df.drop(columns=["Attrition"])
y = df["Attrition"]

X_transformed = preprocessor.fit_transform(X)

# Obtener nombres de columnas codificadas
ohe_names = preprocessor.named_transformers_['onehot'].get_feature_names_out(cols_categorical)
final_feature_names = cols_robust + cols_numerical + list(ohe_names)

# Reconstruir DataFrame
X_processed = pd.DataFrame(X_transformed, columns=final_feature_names)
X_processed["Attrition"] = y.reset_index(drop=True)

### MODELADO

In [28]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,26 mins 50 secs
H2O_cluster_timezone:,America/Lima
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,3 months and 15 days
H2O_cluster_name:,H2O_from_python_Administrador_kx33pf
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.597 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [29]:
hf = h2o.H2OFrame(X_processed)
hf["Attrition"] = hf["Attrition"].asfactor()  # Convertir la clase objetivo

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [30]:
train, test = hf.split_frame(ratios=[0.8], seed=123)
x = hf.columns[:-1]  # todas menos Attrition
y = "Attrition"


## Regresion Logistica

In [31]:
glm = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, seed=123)
glm.train(x=x, y=y, training_frame=train)

print("Regresión Logística")
glm.model_performance(test_data=test).show()


glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Regresión Logística


Unnamed: 0,No,Yes,Error,Rate
No,235.0,22.0,0.0856,(22.0/257.0)
Yes,14.0,28.0,0.3333,(14.0/42.0)
Total,249.0,50.0,0.1204,(36.0/299.0)

metric,threshold,value,idx
max f1,0.3608948,0.6086957,49.0
max f2,0.2261977,0.6680162,78.0
max f0point5,0.5925575,0.6538462,21.0
max accuracy,0.5925575,0.8996656,21.0
max precision,0.9871924,1.0,0.0
max recall,0.0053585,1.0,263.0
max specificity,0.9871924,1.0,0.0
max absolute_mcc,0.5250825,0.5450193,31.0
max min_per_class_accuracy,0.2261977,0.7857143,78.0
max mean_per_class_accuracy,0.2261977,0.803363,78.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100334,0.9224387,7.1190476,7.1190476,1.0,0.9488109,1.0,0.9488109,0.0714286,0.0714286,611.9047619,611.9047619,0.0714286
2,0.0200669,0.8373026,7.1190476,7.1190476,1.0,0.8812802,1.0,0.9150455,0.0714286,0.1428571,611.9047619,611.9047619,0.1428571
3,0.0301003,0.7700294,4.7460317,6.3280423,0.6666667,0.8189588,0.8888889,0.8830166,0.047619,0.1904762,374.6031746,532.8042328,0.1865851
4,0.0401338,0.7458409,4.7460317,5.9325397,0.6666667,0.7591135,0.8333333,0.8520408,0.047619,0.2380952,374.6031746,493.2539683,0.2303131
5,0.0501672,0.7224994,7.1190476,6.1698413,1.0,0.7330793,0.8666667,0.8282485,0.0714286,0.3095238,611.9047619,516.984127,0.3017417
6,0.1003344,0.5409416,3.3222222,4.7460317,0.4666667,0.6052041,0.6666667,0.7167263,0.1666667,0.4761905,232.2222222,374.6031746,0.43728
7,0.1505017,0.3915092,2.3730159,3.9550265,0.3333333,0.4678646,0.5555556,0.6337724,0.1190476,0.5952381,137.3015873,295.5026455,0.5174171
8,0.2006689,0.3104532,2.3730159,3.5595238,0.3333333,0.3447797,0.5,0.5615242,0.1190476,0.7142857,137.3015873,255.952381,0.5975542
9,0.3010033,0.1972198,0.7119048,2.6103175,0.1,0.2450048,0.3666667,0.4560178,0.0714286,0.7857143,-28.8095238,161.031746,0.5639244
10,0.4013378,0.1215009,0.7119048,2.1357143,0.1,0.1534902,0.3,0.3803859,0.0714286,0.8571429,-28.8095238,113.5714286,0.5302946


## Random Forest

In [32]:
rf = H2ORandomForestEstimator(ntrees=100, max_depth=20, seed=123)
rf.train(x=x, y=y, training_frame=train)

print("Random Forest")
rf.model_performance(test_data=test).show()


drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Random Forest


Unnamed: 0,No,Yes,Error,Rate
No,228.0,29.0,0.1128,(29.0/257.0)
Yes,17.0,25.0,0.4048,(17.0/42.0)
Total,245.0,54.0,0.1538,(46.0/299.0)

metric,threshold,value,idx
max f1,0.28,0.5208333,31.0
max f2,0.21,0.6563707,39.0
max f0point5,0.4,0.5519481,18.0
max accuracy,0.645,0.8829431,6.0
max precision,0.86,1.0,0.0
max recall,0.06,1.0,62.0
max specificity,0.86,1.0,0.0
max absolute_mcc,0.22,0.4448331,38.0
max min_per_class_accuracy,0.22,0.7857143,38.0
max mean_per_class_accuracy,0.21,0.793867,39.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100334,0.7153,7.1190476,7.1190476,1.0,0.79,1.0,0.79,0.0714286,0.0714286,611.9047619,611.9047619,0.0714286
2,0.0200669,0.647,7.1190476,7.1190476,1.0,0.7033333,1.0,0.7466667,0.0714286,0.1428571,611.9047619,611.9047619,0.1428571
3,0.0301003,0.5715,4.7460317,6.3280423,0.6666667,0.6133333,0.8888889,0.7022222,0.047619,0.1904762,374.6031746,532.8042328,0.1865851
4,0.0401338,0.4962,0.0,4.7460317,0.0,0.5366667,0.6666667,0.6608333,0.0,0.1904762,-100.0,374.6031746,0.174912
5,0.0568562,0.49,2.847619,4.1876751,0.4,0.492,0.5882353,0.6111765,0.047619,0.2380952,184.7619048,318.767507,0.2108579
6,0.1036789,0.38,3.5595238,3.9039939,0.5,0.4110714,0.5483871,0.5208065,0.1666667,0.4047619,255.952381,290.3993856,0.3502872
7,0.1505017,0.313,2.542517,3.4804233,0.3571429,0.3435714,0.4888889,0.4656667,0.1190476,0.5238095,154.2517007,248.042328,0.4343154
8,0.2006689,0.254,1.8984127,3.0849206,0.2666667,0.2833333,0.4333333,0.4200833,0.0952381,0.6190476,89.8412698,208.4920635,0.4867519
9,0.3043478,0.21,1.8371736,2.6598639,0.2580645,0.2346774,0.3736264,0.3569231,0.1904762,0.8095238,83.7173579,165.9863946,0.5877339
10,0.4147157,0.16,0.4314574,2.0668203,0.0606061,0.1765152,0.2903226,0.3089113,0.047619,0.8571429,-56.8542569,106.6820276,0.5147304


## Gradient Boosting

In [33]:
gbm = H2OGradientBoostingEstimator(ntrees=100, max_depth=5, learn_rate=0.1, seed=123)
gbm.train(x=x, y=y, training_frame=train)

print("Gradient Boosting")
gbm.model_performance(test_data=test).show()


gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Gradient Boosting


Unnamed: 0,No,Yes,Error,Rate
No,235.0,22.0,0.0856,(22.0/257.0)
Yes,19.0,23.0,0.4524,(19.0/42.0)
Total,254.0,45.0,0.1371,(41.0/299.0)

metric,threshold,value,idx
max f1,0.2449739,0.5287356,44.0
max f2,0.1039719,0.5928854,84.0
max f0point5,0.6886444,0.5909091,16.0
max accuracy,0.6886444,0.8896321,16.0
max precision,0.9973278,1.0,0.0
max recall,0.0108625,1.0,253.0
max specificity,0.9973278,1.0,0.0
max absolute_mcc,0.2449739,0.448978,44.0
max min_per_class_accuracy,0.0809151,0.7159533,103.0
max mean_per_class_accuracy,0.1039719,0.750139,84.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100334,0.9775282,7.1190476,7.1190476,1.0,0.9864805,1.0,0.9864805,0.0714286,0.0714286,611.9047619,611.9047619,0.0714286
2,0.0200669,0.8501521,7.1190476,7.1190476,1.0,0.9681371,1.0,0.9773088,0.0714286,0.1428571,611.9047619,611.9047619,0.1428571
3,0.0301003,0.7993207,4.7460317,6.3280423,0.6666667,0.8350605,0.8888889,0.9298927,0.047619,0.1904762,374.6031746,532.8042328,0.1865851
4,0.0401338,0.7465127,2.3730159,5.3392857,0.3333333,0.7757111,0.75,0.8913473,0.0238095,0.2142857,137.3015873,433.9285714,0.2026126
5,0.0501672,0.6982827,4.7460317,5.2206349,0.6666667,0.7305619,0.7333333,0.8591902,0.047619,0.2619048,374.6031746,422.0634921,0.2463406
6,0.1003344,0.3682986,2.847619,4.034127,0.4,0.5349802,0.5666667,0.6970852,0.1428571,0.4047619,184.7619048,303.4126984,0.3541782
7,0.1505017,0.2446214,2.847619,3.6386243,0.4,0.2867487,0.5111111,0.5603064,0.1428571,0.547619,184.7619048,263.8624339,0.4620159
8,0.2006689,0.1748157,0.9492063,2.9662698,0.1333333,0.2124909,0.4166667,0.4733525,0.047619,0.5952381,-5.0793651,196.6269841,0.4590513
9,0.3010033,0.100194,1.1865079,2.3730159,0.1666667,0.1281893,0.3333333,0.3582981,0.1190476,0.7142857,18.6507937,137.3015873,0.4808227
10,0.4013378,0.0621291,0.4746032,1.8984127,0.0666667,0.0784557,0.2666667,0.2883375,0.047619,0.7619048,-52.5396825,89.8412698,0.4194923


## DeepLearning

In [34]:
dl = H2ODeepLearningEstimator(hidden=[64, 32], epochs=20, seed=123)
dl.train(x=x, y=y, training_frame=train)

print("Deep Learning")
dl.model_performance(test_data=test).show()


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%
Deep Learning


Unnamed: 0,No,Yes,Error,Rate
No,244.0,13.0,0.0506,(13.0/257.0)
Yes,20.0,22.0,0.4762,(20.0/42.0)
Total,264.0,35.0,0.1104,(33.0/299.0)

metric,threshold,value,idx
max f1,0.6415803,0.5714286,34.0
max f2,0.0838473,0.6150794,83.0
max f0point5,0.9342378,0.6153846,21.0
max accuracy,0.9342378,0.8929766,21.0
max precision,0.9999186,1.0,0.0
max recall,0.0001426,1.0,230.0
max specificity,0.9999186,1.0,0.0
max absolute_mcc,0.6415803,0.5114742,34.0
max min_per_class_accuracy,0.0838473,0.7380952,83.0
max mean_per_class_accuracy,0.0838473,0.7659348,83.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100334,0.9998801,7.1190476,7.1190476,1.0,0.9999152,1.0,0.9999152,0.0714286,0.0714286,611.9047619,611.9047619,0.0714286
2,0.0200669,0.9981652,4.7460317,5.9325397,0.6666667,0.9993425,0.8333333,0.9996289,0.047619,0.1190476,374.6031746,493.2539683,0.1151566
3,0.0301003,0.9954596,2.3730159,4.7460317,0.3333333,0.997526,0.6666667,0.9989279,0.0238095,0.1428571,137.3015873,374.6031746,0.131184
4,0.0401338,0.9915618,7.1190476,5.3392857,1.0,0.9940529,0.75,0.9977092,0.0714286,0.2142857,611.9047619,433.9285714,0.2026126
5,0.0501672,0.9807682,4.7460317,5.2206349,0.6666667,0.9897744,0.7333333,0.9961222,0.047619,0.2619048,374.6031746,422.0634921,0.2463406
6,0.1003344,0.7774673,3.7968254,4.5087302,0.5333333,0.9002647,0.6333333,0.9481935,0.1904762,0.452381,279.6825397,350.8730159,0.4095794
7,0.1505017,0.4746033,1.8984127,3.6386243,0.2666667,0.5929675,0.5111111,0.8297848,0.0952381,0.547619,89.8412698,263.8624339,0.4620159
8,0.2006689,0.2384453,0.9492063,2.9662698,0.1333333,0.3475733,0.4166667,0.7092319,0.047619,0.5952381,-5.0793651,196.6269841,0.4590513
9,0.3010033,0.0686676,1.4238095,2.4521164,0.2,0.1350679,0.3444444,0.5178439,0.1428571,0.7380952,42.3809524,145.2116402,0.5085233
10,0.4013378,0.0187689,0.4746032,1.9577381,0.0666667,0.0345699,0.275,0.3970254,0.047619,0.7857143,-52.5396825,95.7738095,0.4471929


## AutoML

In [35]:
aml = H2OAutoML(max_models=10, seed=123, sort_metric="AUC", balance_classes=True)
aml.train(x=x, y=y, training_frame=train)

print("AutoML Leaderboard")
aml.leaderboard.head(rows=5)


AutoML progress: |
23:46:34.348: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%
AutoML Leaderboard


model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GLM_1_AutoML_2_20250712_234634,0.844543,0.319693,0.649783,0.216359,0.303827,0.092311
StackedEnsemble_BestOfFamily_1_AutoML_2_20250712_234634,0.841047,0.322873,0.649641,0.210714,0.30522,0.0931594
StackedEnsemble_AllModels_1_AutoML_2_20250712_234634,0.838538,0.323627,0.645122,0.207114,0.305604,0.0933939
GBM_1_AutoML_2_20250712_234634,0.791438,0.368031,0.548929,0.263538,0.331975,0.110208
GBM_3_AutoML_2_20250712_234634,0.764068,0.416668,0.483812,0.314305,0.350841,0.123089


In [36]:
print("Mejor modelo de AutoML:")
aml.leader.model_performance(test_data=test).show()


Mejor modelo de AutoML:


Unnamed: 0,No,Yes,Error,Rate
No,239.0,18.0,0.07,(18.0/257.0)
Yes,16.0,26.0,0.381,(16.0/42.0)
Total,255.0,44.0,0.1137,(34.0/299.0)

metric,threshold,value,idx
max f1,0.3681356,0.6046512,43.0
max f2,0.2624113,0.6779661,67.0
max f0point5,0.5241066,0.6506849,25.0
max accuracy,0.5241066,0.8996656,25.0
max precision,0.9697436,1.0,0.0
max recall,0.0092347,1.0,268.0
max specificity,0.9697436,1.0,0.0
max absolute_mcc,0.4784588,0.5414136,34.0
max min_per_class_accuracy,0.2397872,0.7857143,76.0
max mean_per_class_accuracy,0.2624113,0.8109135,67.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100334,0.8814708,7.1190476,7.1190476,1.0,0.9187521,1.0,0.9187521,0.0714286,0.0714286,611.9047619,611.9047619,0.0714286
2,0.0200669,0.8031327,7.1190476,7.1190476,1.0,0.8296197,1.0,0.8741859,0.0714286,0.1428571,611.9047619,611.9047619,0.1428571
3,0.0301003,0.7095426,4.7460317,6.3280423,0.6666667,0.785232,0.8888889,0.8445346,0.047619,0.1904762,374.6031746,532.8042328,0.1865851
4,0.0401338,0.6739822,7.1190476,6.5257937,1.0,0.7018128,0.9166667,0.8088541,0.0714286,0.2619048,611.9047619,552.5793651,0.2580137
5,0.0501672,0.6303084,4.7460317,6.1698413,0.6666667,0.6588457,0.8666667,0.7788525,0.047619,0.3095238,374.6031746,516.984127,0.3017417
6,0.1003344,0.492595,2.847619,4.5087302,0.4,0.5569538,0.6333333,0.6679031,0.1428571,0.452381,184.7619048,350.8730159,0.4095794
7,0.1505017,0.350679,3.3222222,4.1132275,0.4666667,0.4361811,0.5777778,0.5906625,0.1666667,0.6190476,232.2222222,311.3227513,0.5451177
8,0.2006689,0.3082302,1.8984127,3.5595238,0.2666667,0.3268459,0.5,0.5247083,0.0952381,0.7142857,89.8412698,255.952381,0.5975542
9,0.3010033,0.2017372,0.7119048,2.6103175,0.1,0.2464821,0.3666667,0.4319662,0.0714286,0.7857143,-28.8095238,161.031746,0.5639244
10,0.4013378,0.137161,0.9492063,2.1950397,0.1333333,0.1639208,0.3083333,0.3649549,0.0952381,0.8809524,-5.0793651,119.5039683,0.5579952


In [37]:
models = {
    "GLM": glm,
    "RandomForest": rf,
    "GBM": gbm,
    "DeepLearning": dl,
    "AutoML Leader": aml.leader
}

In [38]:
metricas = []

for nombre, modelo in models.items():
    perf = modelo.model_performance(test_data=test)
    
    acc    = perf.accuracy()[0][1]     # umbral óptimo
    f1     = perf.F1()[0][1]
    recall = perf.recall()[0][1]
    prec   = perf.precision()[0][1]
    auc    = perf.auc()

    metricas.append({
        "Modelo": nombre,
        "Accuracy": acc,
        "F1-score": f1,
        "Recall": recall,
        "Precision": prec,
        "AUC": auc
    })

In [39]:
df_metricas = pd.DataFrame(metricas)
print(df_metricas.sort_values("AUC", ascending=False).reset_index(drop=True))

          Modelo  Accuracy  F1-score  Recall  Precision       AUC
0  AutoML Leader  0.899666  0.604651     1.0        1.0  0.867241
1            GLM  0.899666  0.608696     1.0        1.0  0.860941
2   DeepLearning  0.892977  0.571429     1.0        1.0  0.838892
3   RandomForest  0.882943  0.520833     1.0        1.0  0.830091
4            GBM  0.889632  0.528736     1.0        1.0  0.818881
