In [1]:
#import all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
#load the following packages
library(tidyquant)  # loads tidyverse and several other pkgs
library(readxl)     # super simple excel reader
library(h2o)        # professional grade ML pkg
library(lime)       # Explain complex black-box ML models

In [None]:
# Read the raw HR attrition workbook from the relative data/ folder.
hr_data_raw <- read_excel(
    path = "data/WA_Fn-UseC_-HR_Employee-Attrition.xlsx"
)

In [None]:
# Render rows 1-10 of the raw data as a captioned table.
knitr::kable(hr_data_raw[1:10, ], caption = "first 10 rows")

In [None]:
# Convert every character column to a factor, then move Attrition to be the
# first column while keeping all remaining columns in their existing order.
hr_data <- select(
    mutate_if(hr_data_raw, is.character, as.factor),
    Attrition,
    everything()
)

In [None]:
# Compact overview of the prepared data (one line per column).
hr_data %>% glimpse()

In [None]:
# Initialize the H2O JVM used by the h2o.* calls in the cells below.
h2o.init()

In [None]:
h2o.no_progress()  # turn off progress-bar output for subsequent H2O calls

In [None]:
# Split the data into train/validation/test sets.
hr_data_h2o <- as.h2o(hr_data)

# R is case-sensitive: the function is h2o.splitFrame (capital F).
# Two ratios yield three frames: ~70% train, ~15% validation, and the
# remainder as test. The fixed seed keeps the split repeatable.
split_h2o <- h2o.splitFrame(hr_data_h2o, ratios = c(0.7, 0.15), seed = 1234)

# Give each split a readable key on the H2O side.
train_h2o <- h2o.assign(split_h2o[[1]], "train")
valid_h2o <- h2o.assign(split_h2o[[2]], "valid")
test_h2o  <- h2o.assign(split_h2o[[3]], "test")

In [None]:
# Response column name, and every other column of the training frame as
# the predictor set.
y <- "Attrition"
x <- setdiff(names(train_h2o), y)

In [None]:
# Run the automated machine learning search.
# - x / y: predictor and response column names set in the previous cell.
# - training_frame: data the candidate models are fit on.
# - leaderboard_frame: data used to rank the candidate models.
# - max_runtime_secs: time budget for the search; because the search is
#   time-limited, the models tried may differ between runs.
automl_models_h2o <- h2o.automl(
    x = x,
    y = y,
    training_frame = train_h2o,
    leaderboard_frame = valid_h2o,
    max_runtime_secs = 30
)

In [None]:
# Pull the top-ranked model out of the AutoML result's `leader` slot.
automl_leader <- slot(automl_models_h2o, "leader")

In [None]:
# Score the hold-out frame (test_h2o) with the leader model.
pred_h2o <- h2o.predict(automl_leader, newdata = test_h2o)

In [None]:
# Build a two-column table for performance assessment: the actual Attrition
# value from the test frame next to the model's predicted label.
test_performance <- test_h2o %>%
    tibble::as_tibble() %>%
    select(Attrition) %>%
    add_column(
        pred = as.vector(pred_h2o$predict)
    ) %>%
    # as.vector() may hand back character labels; make them factors.
    mutate_if(is.character, as.factor)

test_performance

In [None]:
# Cross-tabulate actual Attrition (rows) against pred (columns).
confusion_matrix <- table(test_performance)
confusion_matrix

In [None]:
# Performance analysis.
# confusion_matrix comes from table() on (Attrition, pred): rows are actual
# values, columns are predictions, and single-index access walks the cells
# column by column. Assumes a 2x2 table with the negative class first in both
# dimensions -- TODO confirm (a model that predicts only one class would
# produce a 2x1 table and make cells 3 and 4 NA).
tn <- confusion_matrix[1]  # actual negative, predicted negative
fn <- confusion_matrix[2]  # actual positive, predicted negative
fp <- confusion_matrix[3]  # actual negative, predicted positive
tp <- confusion_matrix[4]  # actual positive, predicted positive

total <- tp + tn + fp + fn

accuracy <- (tp + tn) / total
misclassification_rate <- 1 - accuracy
recall <- tp / (tp + fn)
precision <- tp / (tp + fp)
# NOTE(review): kept as originally written (tn / total). A conventional null
# error rate would be the minority-class share of the actual values, i.e.
# (tp + fn) / total -- confirm which definition is intended.
null_error_rate <- tn / total

# tibble() is a function call (parentheses, not braces); each bare name
# becomes a column named after the variable.
tibble(
    accuracy,
    misclassification_rate,
    recall,
    precision,
    null_error_rate
) %>%
    transpose()

In [None]:
# Show the leader model's class; the S3 methods defined in the following cells
# are named for H2OBinomialModel, so this is the class they dispatch on.
class(automl_leader)

In [None]:
# lime::model_type() S3 method for H2O binomial models.
#
# Args:
#   x:   the model object (unused; dispatch alone identifies the model kind).
#   ...: ignored.
#
# Returns: the string "classification". lime matches this value exactly and
# only recognises lowercase "classification" / "regression", so the casing
# matters.
model_type.H2OBinomialModel <- function(x, ...) {
    return("classification")
}

In [None]:
# lime::predict_model() S3 method for H2O binomial models.
#
# Args:
#   x:       the fitted H2O model to score with.
#   newdata: rows to score; converted to an H2OFrame before prediction.
#   type:    accepted to match the generic's signature; not used here.
#   ...:     ignored.
#
# Returns: a data.frame holding the h2o.predict() output with its first column
# removed -- presumably the predicted-label column, leaving only the per-class
# probabilities (verify against the h2o.predict output for this model).
predict_model.H2OBinomialModel <- function(x, newdata, type, ...) {
    pred <- h2o.predict(x, as.h2o(newdata))
    # Return probabilities only.
    return(as.data.frame(pred[, -1]))
}

In [None]:
# Smoke-test predict_model() on the test frame with its first column removed
# (Attrition was moved to the front when hr_data was built).
tibble::as_tibble(
    predict_model(
        x = automl_leader,
        newdata = as.data.frame(test_h2o[, -1]),
        type = "raw"
    )
)

In [None]:
# Build the lime explainer from the training data (first column, the
# Attrition response, removed) and the leader model.
explainer <- lime::lime(
    as.data.frame(train_h2o[, -1]),
    # Named argument: `model = ...`, not a subtraction.
    model = automl_leader,
    # Correct spelling is required; a misspelled name would not reach the
    # bin_continuous parameter and binning would stay at its default.
    bin_continuous = FALSE
)

In [None]:
# Run explain() on the explainer for the first ten test rows.
# Row and column selectors are separated by a comma: rows 1:10, and every
# column except the first (the Attrition response).
# NOTE: the variable name `explination` is kept as-is in case later cells
# reference it.
explination <- lime::explain(
    as.data.frame(test_h2o[1:10, -1]),
    explainer = explainer,
    n_labels = 1,        # explain one label per case
    n_features = 4,      # number of features used in each explanation
    kernel_width = 0.5
)

In [None]:
# Focus on critical features of attrition.
# The column is TrainingTimesLastYear in the IBM HR attrition data
# (select() errors on a column name that does not exist).
attrition_critical_features <- hr_data %>%
    tibble::as_tibble() %>%
    select(Attrition, TrainingTimesLastYear, JobRole, OverTime) %>%
    # Add a sequential row id column named "case".
    rowid_to_column(var = "case")
attrition_critical_features