# Data Splitting

In [1]:
# Run this cell before continuing.
library(tidyverse)
library(repr)
library(infer)
library(gridExtra)
library(caret)
library(pROC)
library(boot)
library(glmnet)
library(broom)
library(leaps)
library(repr)
library(faraway)
library(mltools)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: ‘gridExtra’


The following object is masked from ‘package:dplyr’:

    combine


Loading required package: lattice


Attaching package: ‘caret’


Th

In [2]:
set.seed(123)
employee_dat <- read_csv("https://raw.githubusercontent.com/jtan29/stat-301-project/main/Employee.csv")
employee_dat <- employee_dat %>%
mutate(PaymentTier = as.factor(PaymentTier))
head(employee_dat)

[1mRows: [22m[34m4653[39m [1mColumns: [22m[34m9[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (4): Education, City, Gender, EverBenched
[32mdbl[39m (5): JoiningYear, PaymentTier, Age, ExperienceInCurrentDomain, LeaveOrNot

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
<chr>,<dbl>,<chr>,<fct>,<dbl>,<chr>,<chr>,<dbl>,<dbl>
Bachelors,2017,Bangalore,3,34,Male,No,0,0
Bachelors,2013,Pune,1,28,Female,No,3,1
Bachelors,2014,New Delhi,3,38,Female,No,2,0
Masters,2016,Bangalore,3,27,Male,No,5,1
Masters,2017,Pune,3,24,Male,Yes,2,1
Bachelors,2016,Bangalore,3,22,Male,No,0,0


In [3]:
employee_dat$id <- 1:nrow(employee_dat) # add a new ID column

# 70% into train split
employee_train <- 
    employee_dat %>% 
    slice_sample(prop = 0.7)

# 30% into test split
employee_test <- 
    employee_dat %>% 
    anti_join(employee_train, by = "id") 

# drop the ID column
employee_train <- 
    employee_train %>% 
    select(-id)

employee_test <-
    employee_test %>%
    select(-id)

head(employee_train, 5)
head(employee_test, 5)

Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
<chr>,<dbl>,<chr>,<fct>,<dbl>,<chr>,<chr>,<dbl>,<dbl>
Bachelors,2015,Pune,3,28,Male,No,1,0
Bachelors,2014,Bangalore,3,29,Male,No,1,0
Masters,2017,New Delhi,3,26,Male,No,4,1
Bachelors,2015,Bangalore,3,25,Male,No,3,0
Masters,2017,New Delhi,2,31,Male,No,4,0


Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
<chr>,<dbl>,<chr>,<fct>,<dbl>,<chr>,<chr>,<dbl>,<dbl>
Bachelors,2014,New Delhi,3,38,Female,No,2,0
Bachelors,2016,Bangalore,3,22,Male,No,0,0
Bachelors,2016,Bangalore,3,34,Female,No,2,1
Masters,2017,New Delhi,2,37,Male,No,2,0
Bachelors,2016,Bangalore,3,39,Male,No,2,0


In [4]:
model_matrix_X_train <- 
    model.matrix(LeaveOrNot ~ ., employee_train)
model_matrix_X_train <- model_matrix_X_train[,-1]

matrix_Y_train <- 
    as.matrix(employee_train %>% select(LeaveOrNot), ncol = 1)

model_matrix_X_test <- 
    model.matrix(LeaveOrNot ~ ., employee_test)
model_matrix_X_test <- model_matrix_X_test[,-1]

matrix_Y_test <- 
    as.matrix(employee_test %>% select(LeaveOrNot), ncol = 1)

## LASSO Model

In [5]:
lasso_model = cv.glmnet(x = model_matrix_X_train,
                        y = matrix_Y_train,
                        family = "binomial",
                        type.measure = "auc",
                        nfolds = 10,
                        alpha = 1)

lasso_model


Call:  cv.glmnet(x = model_matrix_X_train, y = matrix_Y_train, type.measure = "auc",      nfolds = 10, family = "binomial", alpha = 1) 

Measure: AUC 

     Lambda Index Measure       SE Nonzero
min 0.00153    48  0.7314 0.010253      11
1se 0.01887    21  0.7219 0.009725       9

In [6]:
selected_variables <- as_tibble(as.matrix(coef(lasso_model, s = "lambda.1se")),
                                rownames='covariate')
selected_variables

covariate,s1
<chr>,<dbl>
(Intercept),-278.05479776
EducationMasters,0.30505363
EducationPHD,0.0
JoiningYear,0.13788571
CityNew Delhi,-0.09848657
CityPune,0.48510959
PaymentTier2,0.45049384
PaymentTier3,-0.15989502
Age,-0.01033995
GenderMale,-0.60691865


In [7]:
data_train <- employee_train |>
            select(-EverBenched, -ExperienceInCurrentDomain)
data_test <- employee_test |>
            select(-EverBenched, -ExperienceInCurrentDomain)
head(data_train)

Education,JoiningYear,City,PaymentTier,Age,Gender,LeaveOrNot
<chr>,<dbl>,<chr>,<fct>,<dbl>,<chr>,<dbl>
Bachelors,2015,Pune,3,28,Male,0
Bachelors,2014,Bangalore,3,29,Male,0
Masters,2017,New Delhi,3,26,Male,1
Bachelors,2015,Bangalore,3,25,Male,0
Masters,2017,New Delhi,2,31,Male,0
Bachelors,2013,Bangalore,3,26,Male,0


In [8]:
final_model <- glm(LeaveOrNot ~ ., family = "binomial", data = data_train)
tidy(final_model)

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),-399.75863022,46.058984158,-8.6792759,3.9829650000000004e-18
EducationMasters,0.63589156,0.112440947,5.655338,1.555399e-08
EducationPHD,0.06636736,0.220656257,0.3007726,0.7635879
JoiningYear,0.19870885,0.022869555,8.6887939,3.663075e-18
CityNew Delhi,-0.53912149,0.116679603,-4.620529,3.827629e-06
CityPune,0.52281738,0.101765642,5.1374646,2.7847e-07
PaymentTier2,0.52491356,0.188188716,2.7892935,0.005282318
PaymentTier3,-0.19688926,0.172263193,-1.1429561,0.2530568
Age,-0.03308748,0.008403318,-3.9374306,8.235875e-05
GenderMale,-0.85100011,0.084473731,-10.074139,7.188766e-24


In [9]:
prediction = round(predict(final_model, newdata = data_test |> select(-LeaveOrNot), type = "response"), 0)
head(prediction)

In [10]:
conf_matrix <- 
    confusionMatrix(
    data = as.factor(prediction),
    reference = data_test |> select(LeaveOrNot) |> unlist() |> as.factor(),
    positive = "1"
)
conf_matrix

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 820 289
         1  87 200
                                          
               Accuracy : 0.7307          
                 95% CI : (0.7066, 0.7538)
    No Information Rate : 0.6497          
    P-Value [Acc > NIR] : 5.891e-11       
                                          
                  Kappa : 0.346           
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.4090          
            Specificity : 0.9041          
         Pos Pred Value : 0.6969          
         Neg Pred Value : 0.7394          
             Prevalence : 0.3503          
         Detection Rate : 0.1433          
   Detection Prevalence : 0.2056          
      Balanced Accuracy : 0.6565          
                                          
       'Positive' Class : 1               
                              

In [11]:
lasso_acc <- conf_matrix$overall['Accuracy']
lasso_acc

In [12]:
results <- data.frame(model = c("LASSO Selection OLS"),  accuracy = c(lasso_acc))
results

Unnamed: 0_level_0,model,accuracy
Unnamed: 0_level_1,<chr>,<dbl>
Accuracy,LASSO Selection OLS,0.730659


# LASSO Direct

In [13]:
employee_lasso_max_AUC <- 
  glmnet(
  x = model_matrix_X_train, y = matrix_Y_train,
  alpha = 1,
  family = "binomial",
  type.measure = "auc",
  lambda = lasso_model$lambda.1se
)

coef(employee_lasso_max_AUC)

12 x 1 sparse Matrix of class "dgCMatrix"
                                     s0
(Intercept)               -278.11649213
EducationMasters             0.30519184
EducationPHD                 .         
JoiningYear                  0.13791673
CityNew Delhi               -0.09851312
CityPune                     0.48513188
PaymentTier2                 0.44943798
PaymentTier3                -0.16064489
Age                         -0.01034093
GenderMale                  -0.60697109
EverBenchedYes               0.28418565
ExperienceInCurrentDomain    .         

In [14]:
employee_class <- 
  predict(employee_lasso_max_AUC, model_matrix_X_test, type = "response") %>% round(0)
head(employee_class, 10)

Unnamed: 0,s0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0
10,0


In [15]:
conf_matrix <- 
    confusionMatrix(
    data = as.factor(employee_class),
    reference = as.factor(employee_test$LeaveOrNot),
    positive = "1"
)
conf_matrix

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 839 329
         1  68 160
                                          
               Accuracy : 0.7156          
                 95% CI : (0.6912, 0.7392)
    No Information Rate : 0.6497          
    P-Value [Acc > NIR] : 9.155e-08       
                                          
                  Kappa : 0.2876          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.3272          
            Specificity : 0.9250          
         Pos Pred Value : 0.7018          
         Neg Pred Value : 0.7183          
             Prevalence : 0.3503          
         Detection Rate : 0.1146          
   Detection Prevalence : 0.1633          
      Balanced Accuracy : 0.6261          
                                          
       'Positive' Class : 1               
                              

In [16]:
lasso_dir_acc <- conf_matrix$overall['Accuracy']
lasso_dir_acc

In [17]:
results <- results %>% add_row(model = "LASSO Direct", accuracy = lasso_dir_acc)
results

Unnamed: 0_level_0,model,accuracy
Unnamed: 0_level_1,<chr>,<dbl>
Accuracy,LASSO Selection OLS,0.730659
...2,LASSO Direct,0.715616


## OLS with Forward Selection Model

In [18]:
##Since some categorical variables are present numeric ways,use factor() to transform them into categorical form.
ly_employee_train <- data.frame(employee_train)
ly_employee_test <- data.frame(employee_test)
ly_employee_train$PaymentTier<-factor(ly_employee_train$PaymentTier, levels = c(1, 2, 3), labels = c("Low", "Median", "High"),ordered = TRUE)
ly_employee_train$LeaveOrNot<-factor(ly_employee_train$LeaveOrNot, levels = c(0, 1), labels = c("Not Leave", "Leave"), ordered = TRUE)
head(ly_employee_train)
ly_employee_test$PaymentTier<-factor(ly_employee_test$PaymentTier, levels = c(1, 2, 3), labels = c("Low", "Median", "High"),ordered = TRUE)
ly_employee_test$LeaveOrNot<-factor(ly_employee_test$LeaveOrNot, levels = c(0, 1), labels = c("Not Leave", "Leave"), ordered = TRUE)
head(ly_employee_test)


Unnamed: 0_level_0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<ord>,<dbl>,<chr>,<chr>,<dbl>,<ord>
1,Bachelors,2015,Pune,High,28,Male,No,1,Not Leave
2,Bachelors,2014,Bangalore,High,29,Male,No,1,Not Leave
3,Masters,2017,New Delhi,High,26,Male,No,4,Leave
4,Bachelors,2015,Bangalore,High,25,Male,No,3,Not Leave
5,Masters,2017,New Delhi,Median,31,Male,No,4,Not Leave
6,Bachelors,2013,Bangalore,High,26,Male,No,4,Not Leave


Unnamed: 0_level_0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<ord>,<dbl>,<chr>,<chr>,<dbl>,<ord>
1,Bachelors,2014,New Delhi,High,38,Female,No,2,Not Leave
2,Bachelors,2016,Bangalore,High,22,Male,No,0,Not Leave
3,Bachelors,2016,Bangalore,High,34,Female,No,2,Leave
4,Masters,2017,New Delhi,Median,37,Male,No,2,Not Leave
5,Bachelors,2016,Bangalore,High,39,Male,No,2,Not Leave
6,Bachelors,2012,Bangalore,High,37,Male,No,4,Not Leave


In [19]:
###using forward selection to find the variables to best fit predict modeling
employee_forward_sel <- regsubsets(x = LeaveOrNot ~ ., nvmax = NULL,
                                  data = ly_employee_train,
                                  method = "forward")

employee_forward_summary <- summary(employee_forward_sel)
employee_forward_summary

Subset selection object
Call: regsubsets.formula(x = LeaveOrNot ~ ., nvmax = NULL, data = ly_employee_train, 
    method = "forward")
11 Variables  (and intercept)
                          Forced in Forced out
EducationMasters              FALSE      FALSE
EducationPHD                  FALSE      FALSE
JoiningYear                   FALSE      FALSE
CityNew Delhi                 FALSE      FALSE
CityPune                      FALSE      FALSE
PaymentTier.L                 FALSE      FALSE
PaymentTier.Q                 FALSE      FALSE
Age                           FALSE      FALSE
GenderMale                    FALSE      FALSE
EverBenchedYes                FALSE      FALSE
ExperienceInCurrentDomain     FALSE      FALSE
1 subsets of each size up to 11
Selection Algorithm: forward
          EducationMasters EducationPHD JoiningYear CityNew Delhi CityPune
1  ( 1 )  " "              " "          " "         " "           " "     
2  ( 1 )  " "              " "          " "         " "      

In [20]:
##store and examine different evaluation metrics to determine the best one in terms of its goodness of fit.
employee_forward_summary_df <- tibble(
    n_input_variables = 1:11,
    RSQ = employee_forward_summary$rsq,
    RSS = employee_forward_summary$rss,
    ADJ_R2 = employee_forward_summary$adjr2,
    Cp = employee_forward_summary$cp,
    BIC = employee_forward_summary$bic,
)
employee_forward_summary_df

n_input_variables,RSQ,RSS,ADJ_R2,Cp,BIC
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0.06544388,684.1186,0.06515677,313.01063,-204.2684
2,0.08796898,667.6297,0.08740842,229.06103,-275.643
3,0.10757788,653.2755,0.10675486,156.23883,-338.3445
4,0.12504012,640.4927,0.12396391,91.6077,-394.6183
5,0.1309682,636.1532,0.12963164,70.98776,-408.6718
6,0.13700853,631.7315,0.13541531,49.93951,-423.3005
7,0.14322972,627.1774,0.1413838,28.20113,-438.7763
8,0.14711021,624.3368,0.1450095,15.39425,-445.4729
9,0.14899282,622.9587,0.14663401,10.21071,-444.5816
10,0.1494145,622.65,0.14679409,10.60171,-438.1073


In [21]:
#select the model that minimizes the Cp and present its predictor variables
cp_min = which.min(employee_forward_summary$cp) 

selected_var <- names(coef(employee_forward_sel, cp_min))[-1]
selected_var

In [22]:
##rearrange the training dataset to ensure it contain the best fiting variables
rearrange_training <- ly_employee_train |>
mutate(EducationMasters = ifelse(Education == "Masters", "Yes", "No"))|>
mutate(CityNewDelhi = ifelse(City == "New Delhi", "Yes", "No"))|>
mutate(CityPune = ifelse(City == "Pune", "Yes", "No"))|>
mutate(PaymentTier.Q = ifelse(PaymentTier == "Median", "Yes", "No"))
head(rearrange_training)

Unnamed: 0_level_0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot,EducationMasters,CityNewDelhi,CityPune,PaymentTier.Q
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<ord>,<dbl>,<chr>,<chr>,<dbl>,<ord>,<chr>,<chr>,<chr>,<chr>
1,Bachelors,2015,Pune,High,28,Male,No,1,Not Leave,No,No,Yes,No
2,Bachelors,2014,Bangalore,High,29,Male,No,1,Not Leave,No,No,No,No
3,Masters,2017,New Delhi,High,26,Male,No,4,Leave,Yes,Yes,No,No
4,Bachelors,2015,Bangalore,High,25,Male,No,3,Not Leave,No,No,No,No
5,Masters,2017,New Delhi,Median,31,Male,No,4,Not Leave,Yes,Yes,No,Yes
6,Bachelors,2013,Bangalore,High,26,Male,No,4,Not Leave,No,No,No,No


In [23]:
##employing logistic regression, using binomial distribution
rearrange_training_log <- 
    glm(formula = LeaveOrNot ~ EducationMasters+JoiningYear+CityNewDelhi+
        CityPune+PaymentTier.Q+Age+Gender+EverBenched+ExperienceInCurrentDomain,
        data = rearrange_training,
        family = binomial)

summary(rearrange_training_log)


Call:
glm(formula = LeaveOrNot ~ EducationMasters + JoiningYear + CityNewDelhi + 
    CityPune + PaymentTier.Q + Age + Gender + EverBenched + ExperienceInCurrentDomain, 
    family = binomial, data = rearrange_training)

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)               -3.871e+02  4.613e+01  -8.392  < 2e-16 ***
EducationMastersYes        6.539e-01  1.112e-01   5.879 4.13e-09 ***
JoiningYear                1.925e-01  2.290e-02   8.404  < 2e-16 ***
CityNewDelhiYes           -5.292e-01  1.154e-01  -4.585 4.53e-06 ***
CityPuneYes                5.235e-01  1.019e-01   5.137 2.79e-07 ***
PaymentTier.QYes           7.146e-01  1.097e-01   6.516 7.23e-11 ***
Age                       -3.566e-02  8.533e-03  -4.180 2.92e-05 ***
GenderMale                -8.732e-01  8.438e-02 -10.349  < 2e-16 ***
EverBenchedYes             5.765e-01  1.255e-01   4.595 4.32e-06 ***
ExperienceInCurrentDomain -7.030e-02  2.616e-02  -2.687  0.00721 ** 
---
S

In [24]:
##rearrange the testing dataset to ensure it contain the best fiting variables
rearrange_testing<-ly_employee_test|>
mutate(EducationMasters = ifelse(Education == "Masters", "Yes", "No"))|>
mutate(CityNewDelhi = ifelse(City == "New Delhi", "Yes", "No"))|>
mutate(CityPune = ifelse(City == "Pune", "Yes", "No"))|>
mutate(PaymentTier.Q = ifelse(PaymentTier == "Median", "Yes", "No"))

                              

In [25]:
##Use the resulting predictive values to compute the error and the RMSE of the predictive values based on training set
predicted_probabilities_training <- predict(rearrange_training_log, 
                                   newdata=rearrange_training,
                                   type = "response")


rearrange_training<-mutate(rearrange_training,
                          LeaveOrNot_P = ifelse(LeaveOrNot == "Leave", 1, 0))

#true probability of leave
p_true_training <- rearrange_training$LeaveOrNot_P

#calculate residuals
residuals_training <- p_true_training - predicted_probabilities_training

rmse_red_glm_training<-sqrt(mean(residuals_training^2))

rmse_red_glm_training

In [26]:
prediction = round(predict(rearrange_training_log, newdata = rearrange_testing, type = "response"), 0)
head(prediction)

In [27]:
conf_matrix <- 
    confusionMatrix(
    data = as.factor(prediction),
    reference = as.factor(employee_test$LeaveOrNot),
    positive = "1"
)
conf_matrix

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 825 289
         1  82 200
                                          
               Accuracy : 0.7342          
                 95% CI : (0.7102, 0.7573)
    No Information Rate : 0.6497          
    P-Value [Acc > NIR] : 8.142e-12       
                                          
                  Kappa : 0.353           
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.4090          
            Specificity : 0.9096          
         Pos Pred Value : 0.7092          
         Neg Pred Value : 0.7406          
             Prevalence : 0.3503          
         Detection Rate : 0.1433          
   Detection Prevalence : 0.2020          
      Balanced Accuracy : 0.6593          
                                          
       'Positive' Class : 1               
                              

In [28]:
ols_forward_acc <- conf_matrix$overall['Accuracy']
ols_forward_acc

In [29]:
results <- results %>% add_row(model = "OLS w/Forward Selection", accuracy = ols_forward_acc)
results

Unnamed: 0_level_0,model,accuracy
Unnamed: 0_level_1,<chr>,<dbl>
Accuracy,LASSO Selection OLS,0.730659
...2,LASSO Direct,0.715616
...3,OLS w/Forward Selection,0.7342407


## Ridge Model

In [30]:
employee_cv_lambda_ridge <- 
  cv.glmnet(
       x = model_matrix_X_train, 
       y = matrix_Y_train,
       alpha = 0,
       family = "binomial",
       type.measure = "auc",
       nfolds = 10)

employee_cv_lambda_ridge


Call:  cv.glmnet(x = model_matrix_X_train, y = matrix_Y_train, type.measure = "auc",      nfolds = 10, alpha = 0, family = "binomial") 

Measure: AUC 

    Lambda Index Measure       SE Nonzero
min 0.0121   100  0.7310 0.006852      11
1se 0.5011    60  0.7242 0.008972      11

In [31]:
employee_ridge_max_AUC <- 
  glmnet(
  x = model_matrix_X_train, y = matrix_Y_train,
  alpha = 0,
  family = "binomial",
  lambda = employee_cv_lambda_ridge$lambda.min
)

coef(employee_ridge_max_AUC)

12 x 1 sparse Matrix of class "dgCMatrix"
                                     s0
(Intercept)               -361.80222684
EducationMasters             0.59466898
EducationPHD                 0.08283041
JoiningYear                  0.17993174
CityNew Delhi               -0.46870155
CityPune                     0.50088821
PaymentTier2                 0.49153196
PaymentTier3                -0.24690636
Age                         -0.03249241
GenderMale                  -0.79555833
EverBenchedYes               0.54868707
ExperienceInCurrentDomain   -0.06389274

In [32]:
employee_class <- 
  predict(employee_ridge_max_AUC, model_matrix_X_train, type = "response") %>% round(0)
head(employee_class, 5)

Unnamed: 0,s0
1,0
2,0
3,0
4,0
5,0


In [33]:
employee_confusion_matrix <- 
    confusionMatrix(
    data = as.factor(employee_class),
    reference = as.factor(employee_train$LeaveOrNot),
    positive = "1"
)

employee_confusion_matrix

Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0 1934  691
         1  212  420
                                         
               Accuracy : 0.7228         
                 95% CI : (0.707, 0.7381)
    No Information Rate : 0.6589         
    P-Value [Acc > NIR] : 3.106e-15      
                                         
                  Kappa : 0.3117         
                                         
 Mcnemar's Test P-Value : < 2.2e-16      
                                         
            Sensitivity : 0.3780         
            Specificity : 0.9012         
         Pos Pred Value : 0.6646         
         Neg Pred Value : 0.7368         
             Prevalence : 0.3411         
         Detection Rate : 0.1290         
   Detection Prevalence : 0.1940         
      Balanced Accuracy : 0.6396         
                                         
       'Positive' Class : 1              
                                         

In [34]:
options(repr.plot.width = 8, repr.plot.height = 8)

ROC_ridge <- roc(
  response = employee_train$LeaveOrNot,
  predictor = predict(employee_ridge_max_AUC, newx = model_matrix_X_train)[,"s0"])

best_threshold <- coords(ROC_ridge, "best")["threshold"] %>% pull()
best_threshold

Setting levels: control = 0, case = 1

Setting direction: controls < cases



In [35]:
train_pred <- as.integer(predict(employee_ridge_max_AUC, newx = model_matrix_X_train, newy = matrix_Y_train) > best_threshold)

employee_confusion_matrix <- 
    confusionMatrix(
    data = as.factor(train_pred),
    reference = as.factor(employee_train$LeaveOrNot),
    positive = "1"
)
employee_confusion_matrix

Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0 1761  495
         1  385  616
                                         
               Accuracy : 0.7298         
                 95% CI : (0.7142, 0.745)
    No Information Rate : 0.6589         
    P-Value [Acc > NIR] : < 2.2e-16      
                                         
                  Kappa : 0.3842         
                                         
 Mcnemar's Test P-Value : 0.0002384      
                                         
            Sensitivity : 0.5545         
            Specificity : 0.8206         
         Pos Pred Value : 0.6154         
         Neg Pred Value : 0.7806         
             Prevalence : 0.3411         
         Detection Rate : 0.1891         
   Detection Prevalence : 0.3073         
      Balanced Accuracy : 0.6875         
                                         
       'Positive' Class : 1              
                                         

In [36]:
test_pred <- round(predict(employee_ridge_max_AUC, newx = model_matrix_X_test, newy = matrix_Y_test, type = "response"), 0)

employee_confusion_matrix_test <- 
    confusionMatrix(
    data = as.factor(test_pred),
    reference = as.factor(employee_test$LeaveOrNot),
    positive = "1"
)
employee_confusion_matrix_test

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 829 296
         1  78 193
                                         
               Accuracy : 0.7321         
                 95% CI : (0.708, 0.7552)
    No Information Rate : 0.6497         
    P-Value [Acc > NIR] : 2.698e-11      
                                         
                  Kappa : 0.344          
                                         
 Mcnemar's Test P-Value : < 2.2e-16      
                                         
            Sensitivity : 0.3947         
            Specificity : 0.9140         
         Pos Pred Value : 0.7122         
         Neg Pred Value : 0.7369         
             Prevalence : 0.3503         
         Detection Rate : 0.1383         
   Detection Prevalence : 0.1941         
      Balanced Accuracy : 0.6543         
                                         
       'Positive' Class : 1              
                                         

In [37]:
ridge_acc <- employee_confusion_matrix_test$overall['Accuracy']
ridge_acc

In [38]:
results <- results %>% add_row(model = "Ridge w/default threshold", accuracy = ridge_acc)
results

Unnamed: 0_level_0,model,accuracy
Unnamed: 0_level_1,<chr>,<dbl>
Accuracy,LASSO Selection OLS,0.730659
...2,LASSO Direct,0.715616
...3,OLS w/Forward Selection,0.7342407
...4,Ridge w/default threshold,0.7320917


In [39]:
test_pred <- as.integer(predict(employee_ridge_max_AUC, newx = model_matrix_X_test, newy = matrix_Y_test) > best_threshold)

employee_confusion_matrix_adj <- 
    confusionMatrix(
    data = as.factor(test_pred),
    reference = as.factor(employee_test$LeaveOrNot),
    positive = "1"
)
employee_confusion_matrix_adj

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 760 217
         1 147 272
                                          
               Accuracy : 0.7393          
                 95% CI : (0.7154, 0.7621)
    No Information Rate : 0.6497          
    P-Value [Acc > NIR] : 4.381e-13       
                                          
                  Kappa : 0.4076          
                                          
 Mcnemar's Test P-Value : 0.0002985       
                                          
            Sensitivity : 0.5562          
            Specificity : 0.8379          
         Pos Pred Value : 0.6492          
         Neg Pred Value : 0.7779          
             Prevalence : 0.3503          
         Detection Rate : 0.1948          
   Detection Prevalence : 0.3001          
      Balanced Accuracy : 0.6971          
                                          
       'Positive' Class : 1               
                              

In [40]:
ridge_acc_adj <- employee_confusion_matrix_adj$overall['Accuracy']
ridge_acc_adj

In [41]:
results <- results %>% add_row(model = "Ridge w/best threshold", accuracy = ridge_acc_adj)
results

Unnamed: 0_level_0,model,accuracy
Unnamed: 0_level_1,<chr>,<dbl>
Accuracy,LASSO Selection OLS,0.730659
...2,LASSO Direct,0.715616
...3,OLS w/Forward Selection,0.7342407
...4,Ridge w/default threshold,0.7320917
...5,Ridge w/best threshold,0.739255


# Stepwise Selection

In [42]:
# Initial (null) model with no predictors
null_model <- glm(LeaveOrNot ~ 1, data = employee_train, family = binomial)

# Full model with all predictors
full_model <- glm(LeaveOrNot ~ ., data = employee_train, family = binomial)

# Forward selection using AIC
employee_forward_sel <- step(null_model, 
                             scope = formula(full_model), 
                             direction = "both")
# summary
employee_forward_sel_summary <- summary(employee_forward_sel)

Start:  AIC=4182.49
LeaveOrNot ~ 1

                            Df Deviance    AIC
+ PaymentTier                2   3969.5 3975.5
+ Gender                     1   4036.9 4040.9
+ City                       2   4057.1 4063.1
+ JoiningYear                1   4073.0 4077.0
+ Education                  2   4112.9 4118.9
+ EverBenched                1   4160.6 4164.6
+ Age                        1   4167.9 4171.9
+ ExperienceInCurrentDomain  1   4174.4 4178.4
<none>                           4180.5 4182.5

Step:  AIC=3975.45
LeaveOrNot ~ PaymentTier

                            Df Deviance    AIC
+ Gender                     1   3897.0 3905.0
+ JoiningYear                1   3906.0 3914.0
+ City                       2   3923.6 3933.6
+ EverBenched                1   3946.2 3954.2
+ Education                  2   3947.4 3957.4
+ Age                        1   3958.2 3966.2
+ ExperienceInCurrentDomain  1   3962.8 3970.8
<none>                           3969.5 3975.5
- PaymentTier            

In [43]:
AIC_min <- which.min(employee_forward_sel_summary$deviance)
selected_var <- names(coef(employee_forward_sel, AIC_min))[-1]
selected_var

In [44]:
# generate model
employee_log_model <- glm(LeaveOrNot ~., 
                         data = employee_train,
                         family = binomial
                         )
summary(employee_log_model)


Call:
glm(formula = LeaveOrNot ~ ., family = binomial, data = employee_train)

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)               -3.885e+02  4.618e+01  -8.412  < 2e-16 ***
EducationMasters           6.557e-01  1.131e-01   5.797 6.77e-09 ***
EducationPHD               1.274e-01  2.216e-01   0.575  0.56528    
JoiningYear                1.932e-01  2.293e-02   8.427  < 2e-16 ***
CityNew Delhi             -5.418e-01  1.173e-01  -4.620 3.84e-06 ***
CityPune                   5.115e-01  1.024e-01   4.995 5.89e-07 ***
PaymentTier2               5.363e-01  1.890e-01   2.838  0.00454 ** 
PaymentTier3              -2.025e-01  1.730e-01  -1.171  0.24180    
Age                       -3.554e-02  8.534e-03  -4.165 3.12e-05 ***
GenderMale                -8.632e-01  8.502e-02 -10.153  < 2e-16 ***
EverBenchedYes             5.847e-01  1.258e-01   4.648 3.35e-06 ***
ExperienceInCurrentDomain -6.937e-02  2.618e-02  -2.650  0.00806 ** 
---
Signi

In [45]:
# generate model
employee_log_model <- glm(LeaveOrNot ~., 
                         data = employee_train,
                         family = binomial
                         )
summary(employee_log_model)


Call:
glm(formula = LeaveOrNot ~ ., family = binomial, data = employee_train)

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)               -3.885e+02  4.618e+01  -8.412  < 2e-16 ***
EducationMasters           6.557e-01  1.131e-01   5.797 6.77e-09 ***
EducationPHD               1.274e-01  2.216e-01   0.575  0.56528    
JoiningYear                1.932e-01  2.293e-02   8.427  < 2e-16 ***
CityNew Delhi             -5.418e-01  1.173e-01  -4.620 3.84e-06 ***
CityPune                   5.115e-01  1.024e-01   4.995 5.89e-07 ***
PaymentTier2               5.363e-01  1.890e-01   2.838  0.00454 ** 
PaymentTier3              -2.025e-01  1.730e-01  -1.171  0.24180    
Age                       -3.554e-02  8.534e-03  -4.165 3.12e-05 ***
GenderMale                -8.632e-01  8.502e-02 -10.153  < 2e-16 ***
EverBenchedYes             5.847e-01  1.258e-01   4.648 3.35e-06 ***
ExperienceInCurrentDomain -6.937e-02  2.618e-02  -2.650  0.00806 ** 
---
Signi

In [46]:
# Using model to predict 
predict_model <- predict(employee_log_model, newdata = employee_test, type = "response")
predicted_classes <- ifelse(predict_model > 0.5, 1, 0)

In [47]:
employee_confusion_matrix_test <- 
    confusionMatrix(
    data = as.factor(predicted_classes),
    reference = as.factor(employee_test$LeaveOrNot),
    positive = "1"
)
employee_confusion_matrix_test

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 823 289
         1  84 200
                                          
               Accuracy : 0.7328          
                 95% CI : (0.7088, 0.7559)
    No Information Rate : 0.6497          
    P-Value [Acc > NIR] : 1.816e-11       
                                          
                  Kappa : 0.3502          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.4090          
            Specificity : 0.9074          
         Pos Pred Value : 0.7042          
         Neg Pred Value : 0.7401          
             Prevalence : 0.3503          
         Detection Rate : 0.1433          
   Detection Prevalence : 0.2034          
      Balanced Accuracy : 0.6582          
                                          
       'Positive' Class : 1               
                              

In [48]:
step_acc <- employee_confusion_matrix_test$overall['Accuracy']
step_acc

In [49]:
results <- results %>% add_row(model = "OLS w/Stepwise Selection", accuracy = step_acc)
results

Unnamed: 0_level_0,model,accuracy
Unnamed: 0_level_1,<chr>,<dbl>
Accuracy,LASSO Selection OLS,0.730659
...2,LASSO Direct,0.715616
...3,OLS w/Forward Selection,0.7342407
...4,Ridge w/default threshold,0.7320917
...5,Ridge w/best threshold,0.739255
...6,OLS w/Stepwise Selection,0.732808


# Comparison

The performance of the model is fairly consistent with all approaches, however, the Ridge model seems to perform the best.