In [1]:
## Importing the required Packages
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from datetime import datetime, date, time
import joblib
# to display all columns of the dataframe in the notebook
pd.pandas.set_option('display.max_columns', None)

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from matplotlib.pyplot import figure
from pylab import rcParams

In [3]:
import copy
import statistics
import scipy.stats as stats
import scipy
from sklearn.linear_model import LogisticRegression

#### loading pickled dataset

In [21]:
#### loading pickled dataset ####
df_train_fs = joblib.load("df_train_fs.pkl")
df_train_fs.shape

(54808, 46)

In [22]:
#### column names
df_train_fs.columns

Index(['index', 'employee_id', 'service_yrs', 'kpi', 'awarded',
       'avg_training_score', 'promoted', 'previous_rating_nan', 'f_age_bins',
       'ohe_dept_Analytics', 'ohe_dept_Finance', 'ohe_dept_HR',
       'ohe_dept_Legal', 'ohe_dept_Operations', 'ohe_dept_Procurement',
       'ohe_dept_R&D', 'ohe_dept_Sales & Marketing', 'ohe_dept_Technology',
       'f_dept_train_score_mean', 'f_dept_train_score_median',
       'f_dept_train_score_min', 'f_dept_train_score_max', 'f_dept_age_median',
       'f_dept_service_max', 'f_region_train_score_mean',
       'f_region_train_score_median', 'f_region_train_score_min',
       'f_region_train_score_max', 'f_region_train_score_std',
       'f_region_age_median', 'f_region_age_min', 'f_region_age_max',
       'f_region_service_mean', 'f_region_service_max', 'ohe_edu_Bachelor's',
       'ohe_edu_Below Secondary', 'ohe_edu_Master's & above', 'f_edu_age_std',
       'f_trainings_oneplus', 'f_rating_edu_cnt', 'f_rating_dept_cnt',
       'f_dept_edu

#### Handling Class Imbalance

In [23]:
# Target value count
df_train_fs['promoted'].value_counts()

0    50140
1     4668
Name: promoted, dtype: int64

#### 1. Under Sampling

In [24]:
# undersampling technique
np.random.seed(1234)
df_undersample = df_train_fs[df_train_fs['promoted'] == 0].sample(20000)
df_undersample = df_undersample.append(df_train_fs[df_train_fs['promoted'] == 1])
df_undersample.shape 

(24668, 46)

In [25]:
# drop index and source_data column
df_undersample = df_undersample.drop(labels = ['index', 'employee_id'], axis = 1)
# removing duplicate records
df_undersample = df_undersample.drop_duplicates(keep = 'first')
df_undersample.shape

(23812, 44)

#### Logistic Regression Model parameter optimizing

In [9]:
%%time
#### grid search for logistic regression ####
#### identifying hyperparameters of the logistic regression Model #####
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression

# define model and parameters
logit_r = LogisticRegression(random_state= 0)
solvers = ['liblinear', 'lbfgs', 'newton-cg']
penalty = ['l1', 'l2']
c_values = [0.01, 1.0, 5, 10]
max_iter = [200, 400]

# define grid search
grid = dict(solver=solvers,max_iter=max_iter, penalty=penalty,C=c_values)  

# StratifiedKfold
from sklearn.model_selection import StratifiedKFold 
strkfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

# Grid search
grid_search = GridSearchCV(estimator=logit_r, param_grid=grid, n_jobs=-1, cv=strkfold, scoring='f1',error_score=0)
grid_result = grid_search.fit(df_undersample.drop(labels = ['promoted'], axis = 1), df_undersample['promoted'])

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.514887 using {'C': 10, 'max_iter': 200, 'penalty': 'l2', 'solver': 'newton-cg'}
0.490492 (0.009399) with: {'C': 0.01, 'max_iter': 200, 'penalty': 'l1', 'solver': 'liblinear'}
0.000000 (0.000000) with: {'C': 0.01, 'max_iter': 200, 'penalty': 'l1', 'solver': 'lbfgs'}
0.000000 (0.000000) with: {'C': 0.01, 'max_iter': 200, 'penalty': 'l1', 'solver': 'newton-cg'}
0.496868 (0.009168) with: {'C': 0.01, 'max_iter': 200, 'penalty': 'l2', 'solver': 'liblinear'}
0.498306 (0.019066) with: {'C': 0.01, 'max_iter': 200, 'penalty': 'l2', 'solver': 'lbfgs'}
0.503133 (0.009422) with: {'C': 0.01, 'max_iter': 200, 'penalty': 'l2', 'solver': 'newton-cg'}
0.490492 (0.009399) with: {'C': 0.01, 'max_iter': 400, 'penalty': 'l1', 'solver': 'liblinear'}
0.000000 (0.000000) with: {'C': 0.01, 'max_iter': 400, 'penalty': 'l1', 'solver': 'lbfgs'}
0.000000 (0.000000) with: {'C': 0.01, 'max_iter': 400, 'penalty': 'l1', 'solver': 'newton-cg'}
0.496868 (0.009168) with: {'C': 0.01, 'max_iter': 400, 'penalty': 'l2

#### Logistic Regression Model with Optimized Parameters

In [26]:
%%time
#### Logistic Regression Model ####
logit_r = LogisticRegression(C= 10, max_iter = 200, penalty= 'l2', solver= 'newton-cg', random_state= 0 ) 

Wall time: 0 ns


#### XGBoost Classifier Model Parameter Optimizing 

In [11]:
# XGBoost parameter optimizing
from sklearn.model_selection import GridSearchCV

# parameter grid
param_grid = {
              "learning_rate": [0.05, 0.1, 0.2],
              "min_child_weight": [5, 8],
              "gamma" : [0],
              "reg_alpha": [0.01, 0.1, 1],
              "subsample": [0.5, 0.8],
              'n_estimators': [400, 600],
              'max_depth' : [5, 8],
              "colsample_bytree" : [0.8],
              "eval_metric" : ['error']
              }

# StratifiedKfold
from sklearn.model_selection import StratifiedKFold 
strkfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

### Creation of Base model ###
from xgboost import XGBClassifier
xgbc = XGBClassifier(booster = 'gbtree', random_state = 0 )

# Grid Search 
grid_search = GridSearchCV(estimator = xgbc, param_grid = param_grid, cv = strkfold, scoring='f1', error_score=0, n_jobs = -1, verbose = 2)

In [12]:
%%time
# Fitting the grid_search to the model ####
grid_search.fit(df_undersample.drop(labels = ['promoted'], axis = 1), df_undersample['promoted'])

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 16.0min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 92.6min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 188.4min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 321.9min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 366.0min finished


Wall time: 6h 7min 3s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             error_score=0,
             estimator=XGBClassifier(base_score=None, booster='gbtree',
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_w...
                                     validate_parameters=None, verbosity=None),
             iid='deprecated', n_jobs=-1,
             param_grid={'colsample_bytree': [0.8], 'eval_metric': ['error'],
                         'gamma': [0], 'learning_rate': [0.05, 0.1, 0.2],
                         'max_depth': [5, 8], 'min_child_weight': [5, 8

In [13]:
# Grid Search Best Params
grid_search.best_params_

{'colsample_bytree': 0.8,
 'eval_metric': 'error',
 'gamma': 0,
 'learning_rate': 0.05,
 'max_depth': 8,
 'min_child_weight': 8,
 'n_estimators': 400,
 'reg_alpha': 0.1,
 'subsample': 0.8}

#### XGBoost Model with Optimized Parameters

In [27]:
%%time
# fit xgboost model with optimized parameters 
from xgboost import XGBClassifier
xgbc = XGBClassifier(random_state = 0, colsample_bytree = 0.8, eval_metric = 'error', gamma = 0,  learning_rate = 0.05, max_depth = 8,
                      min_child_weight = 8, n_estimators = 400, reg_alpha = 0.1,  subsample = 0.8 )

Wall time: 0 ns


#### LightGBM Classifier Model Parameter Optimizing 

In [15]:
#### identifying hyperparameters for light gbm #####
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
# parameter grid
param_grid = {
    'num_leaves': [100, 200] ,
    'n_estimators': [400, 600, 800, 1000] ,
    'max_depth':[5, 10],
    "importance_type":['split', 'gain'],
    "learning_rate": [0.05, 0.1, 0.2],
    'subsample_for_bin' : [8000, 16000],
    'min_child_samples' : [100, 200],
    'colsample_bytree' : [0.6, 0.8],
    'reg_alpha' : [0.001, 0.01, 0.1]
        }
# StratifiedKfold
from sklearn.model_selection import StratifiedKFold 
strkfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

### Creation of Base model ###
import lightgbm as lgbm
lightgbm = lgbm.LGBMClassifier(boosting_type= 'goss', objective= 'binary', random_state= 0 )

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = lightgbm, param_grid = param_grid, cv = strkfold,  scoring='f1', n_jobs = -1, verbose = 2)

In [16]:
%%time
#### Fitting the grid_search to the model ####
grid_search.fit(df_undersample.drop(labels = ['promoted'], axis = 1), df_undersample['promoted'])

Fitting 5 folds for each of 2304 candidates, totalling 11520 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   40.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 20.9min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 36.4min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 51.3min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed: 72.9min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 95.6min
[Parallel(n_jobs=-1)]: Done 3273 tasks      | elapsed: 122.0min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 155.1min
[Parallel(n_jobs=-1)]: Done 4893 tasks      | elapsed: 189.8min
[Parallel(n_jobs=-1)]: Done 5824 tasks      | elapsed: 227.8min
[Parallel(n_jobs=-1)]: Done 6837 tasks      | elapsed: 272.9min
[Parallel(n_jobs=-1)]: Done 7930 tasks      | elapsed: 321.5min
[Parallel(n_jobs=-1)]: Done 9105 tasks  

Wall time: 8h 6min 1s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             error_score=nan,
             estimator=LGBMClassifier(boosting_type='goss', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=100,
                                      n_jobs=-1, num_leaves=31,
                                      objective='binary'...
             param_grid={'colsample_bytree': [0.6, 0.8],
                         'importance_type': ['split', 'gain'],
                         'learning_rate': [0.05, 0.1, 0.2],
                         'max_depth': [5, 10], 'min_child_samples': [100, 200],
                         'n_estimators':

In [17]:
### Getting the Best Estimator #####
grid_search.best_params_
#{'colsample_bytree': 0.8,  'importance_type': 'split', 'learning_rate': 0.05, 'max_depth': 10, 'min_child_samples': 200,
 # n_estimators': 1000,  'num_leaves': 100, 'reg_alpha': 0.1, 'subsample_for_bin': 16000}

{'colsample_bytree': 0.8,
 'importance_type': 'split',
 'learning_rate': 0.05,
 'max_depth': 10,
 'min_child_samples': 200,
 'n_estimators': 1000,
 'num_leaves': 100,
 'reg_alpha': 0.1,
 'subsample_for_bin': 16000}

#### LightGBM Model with Optimized Parameters

In [28]:
%%time
#### implementation of light gbm #####
import lightgbm as lgbm
lightgbm = lgbm.LGBMClassifier(boosting_type= 'goss', objective= 'binary', random_state= 0, importance_type = 'split', colsample_bytree = 0.05, max_depth= 10, min_child_samples = 200, n_estimators= 1000, num_leaves= 100, reg_alpha= 0.1, subsample_for_bin= 16000 )

Wall time: 0 ns


### Voting Classification

In [29]:
# ensemble voting classification
from sklearn.ensemble import VotingClassifier
models_list = []
models_list.append(('Logistic Regression', logit_r))
models_list.append(('XGBoost', xgbc))
models_list.append(('LightGBM', lightgbm))

In [30]:
%%time
# voting classification fit
ensemble_m = VotingClassifier(estimators= models_list, voting= 'soft')
ensemble_m.fit(df_undersample.drop(labels = ['promoted'], axis = 1), df_undersample['promoted'])

Wall time: 44.7 s


VotingClassifier(estimators=[('Logistic Regression',
                              LogisticRegression(C=10, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=200,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=0,
                                                 solver='newton-cg', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('XGBoost',
                              XGBClassifier(base_score=None, booster=None,
                                            colsampl...
                                             importance_type='split',
                               

#### pickling the models

In [None]:
# Logistic Regression Model 
joblib.dump(logit_r, "hr_logistic_model.pkl")

In [None]:
# XGBoost Classifier model ####
joblib.dump(xgbc, "xgboost_model.pkl")

In [None]:
# Light GBM Classifier model ####
joblib.dump(lightgbm, "lightgbm_model.pkl")

In [31]:
# Voting Classification Ensemble 
joblib.dump(ensemble_m, "Voting_class_ensemlbe_model.pkl")

['Voting_class_ensemlbe_model.pkl']

## Kindly comment on the approach for improvement of the model & learning!!!