In [1]:
## Importing the required Packages
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from datetime import datetime, date, time
import joblib
# Show every column when a DataFrame is rendered in the notebook (None means no limit).
# Uses the public `pd.set_option` entry point instead of the incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from matplotlib.pyplot import figure
from pylab import rcParams

In [3]:
import copy
import statistics
import scipy.stats as stats
from scipy.stats import chi2_contingency, chisquare, lognorm, kstest, shapiro, normaltest
from category_encoders import TargetEncoder
import scipy
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

#### loading pickled train dataset

In [4]:
#### loading pickled train dataset ####
# Restores the training frame from the joblib pickle "df_train_eda.pkl" (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code - only load this file if it comes from a trusted source.
df_train_eda = joblib.load("df_train_eda.pkl")
# bare last expression so the notebook displays (rows, columns)
df_train_eda.shape

(54808, 18)

In [5]:
# list the column names of the loaded frame, before any feature-engineering cell has run
df_train_eda.columns

Index(['index', 'employee_id', 'department', 'region', 'education', 'gender',
       'trainings_attended', 'age', 'previous_rating', 'service_yrs', 'kpi',
       'awarded', 'avg_training_score', 'promoted', 'region_rare',
       'education_nan', 'gender_le', 'previous_rating_nan'],
      dtype='object')

#### feature engineering 

#### 1. f_service_bins

In [6]:
#### service-year buckets; the cut points are described as quantile based ####
# pd.cut builds right-closed intervals from the edges: (0.999, 3], (3, 5], (5, 7], (7, 37]
# and labels them 1..4.
# NOTE(review): a service_yrs value outside (0.999, 37] becomes NaN in pd.cut and the integer
# cast below then raises - confirm the edges cover every row these bins are applied to.
service_bins = [0.999, 3.0, 5.0, 7.0, 37.0]
service_labels = [1, 2, 3, 4]
df_train_eda = df_train_eda.assign(
    f_service_bins=pd.cut(
        x=df_train_eda['service_yrs'],
        bins=service_bins,
        labels=service_labels,
    ).astype(int)
)
print(df_train_eda.shape)

(54808, 19)


#### 2. f_age_bins

In [7]:
#### employee age buckets; the cut points are described as quantile based ####
# pd.cut builds right-closed intervals from the edges: (19.999, 29], (29, 33], (33, 39], (39, 60]
# and labels them 1..4.
# NOTE(review): an age outside (19.999, 60] becomes NaN in pd.cut and the integer cast below
# then raises - confirm the edges cover every row these bins are applied to.
age_bins = [19.999, 29.0, 33.0, 39.0, 60.0]
age_labels = [1, 2, 3, 4]
df_train_eda = df_train_eda.assign(
    f_age_bins=pd.cut(
        x=df_train_eda['age'],
        bins=age_bins,
        labels=age_labels,
    ).astype(int)
)
print(df_train_eda.shape)

(54808, 20)


#### 3. department_ohe

In [8]:
# One-hot encode `department` into indicator columns named `ohe_dept_<department>`.
ohe_department = pd.get_dummies(df_train_eda['department'], prefix='ohe_dept')
# Attach the indicators row by row through the shared index. Indicator columns left over from
# an earlier run of this cell are dropped first, so re-running replaces them instead of
# producing `_x` / `_y` suffixed duplicates.
df_train_eda = (
    df_train_eda
    .drop(columns=ohe_department.columns, errors='ignore')
    .join(ohe_department)
)
df_train_eda.shape

(54808, 29)

#### 4. department overview

In [9]:
# Per-department lookup table: head-count plus mean / median / min / max / std of
# avg_training_score, age and service_yrs. One groupby object is reused for every statistic
# instead of re-grouping the frame for each output column.
dept_groups = df_train_eda.groupby('department')

# department count
df_department = dept_groups.size().to_frame('f_dept_cnt')

# source column -> feature-name prefix; insertion order fixes the output column order
dept_stat_prefixes = {
    'avg_training_score': 'f_dept_train_score_',
    'age': 'f_dept_age_',
    'service_yrs': 'f_dept_service_',
}
for dept_source_col, dept_prefix in dept_stat_prefixes.items():
    dept_stats = dept_groups[dept_source_col].agg(['mean', 'median', 'min', 'max', 'std'])
    df_department = df_department.join(dept_stats.add_prefix(dept_prefix))

df_department = df_department.reset_index()
print(df_department.shape)

# adding new columns
# Feature columns left over from an earlier run of this cell are dropped before merging, so
# re-running replaces them instead of producing `_x` / `_y` suffixed duplicates.
dept_feature_cols = [col for col in df_department.columns if col != 'department']
df_train_eda = (
    df_train_eda
    .drop(columns=dept_feature_cols, errors='ignore')
    .merge(df_department, how='left', on=['department'])
)
df_train_eda.shape

(9, 17)


(54808, 45)

#### 5. region overview

In [10]:
# Per-region lookup table (keyed on `region_rare`): head-count plus mean / median / min / max /
# std of avg_training_score, age and service_yrs. One groupby object is reused for every
# statistic instead of re-grouping the frame for each output column.
region_groups = df_train_eda.groupby('region_rare')

# region count
df_region = region_groups.size().to_frame('f_region_cnt')

# source column -> feature-name prefix; insertion order fixes the output column order
region_stat_prefixes = {
    'avg_training_score': 'f_region_train_score_',
    'age': 'f_region_age_',
    'service_yrs': 'f_region_service_',
}
for region_source_col, region_prefix in region_stat_prefixes.items():
    region_stats = region_groups[region_source_col].agg(['mean', 'median', 'min', 'max', 'std'])
    df_region = df_region.join(region_stats.add_prefix(region_prefix))

df_region = df_region.reset_index()
print(df_region.shape)

# adding new columns
# Feature columns left over from an earlier run of this cell are dropped before merging, so
# re-running replaces them instead of producing `_x` / `_y` suffixed duplicates.
region_feature_cols = [col for col in df_region.columns if col != 'region_rare']
df_train_eda = (
    df_train_eda
    .drop(columns=region_feature_cols, errors='ignore')
    .merge(df_region, how='left', on=['region_rare'])
)
df_train_eda.shape

(27, 17)


(54808, 61)

#### 6. education_ohe

In [11]:
# One-hot encode `education_nan` into indicator columns named `ohe_edu_<value>`.
ohe_education = pd.get_dummies(df_train_eda['education_nan'], prefix='ohe_edu')
# Attach the indicators row by row through the shared index. Indicator columns left over from
# an earlier run of this cell are dropped first, so re-running replaces them instead of
# producing `_x` / `_y` suffixed duplicates.
df_train_eda = (
    df_train_eda
    .drop(columns=ohe_education.columns, errors='ignore')
    .join(ohe_education)
)
df_train_eda.shape

(54808, 64)

#### 7. education overview

In [12]:
# Per-education lookup table (keyed on `education_nan`): head-count plus mean / median / min /
# max / std of avg_training_score, age and service_yrs. One groupby object is reused for every
# statistic instead of re-grouping the frame for each output column.
edu_groups = df_train_eda.groupby('education_nan')

# education count
df_education = edu_groups.size().to_frame('f_edu_cnt')

# source column -> feature-name prefix; insertion order fixes the output column order
edu_stat_prefixes = {
    'avg_training_score': 'f_edu_train_score_',
    'age': 'f_edu_age_',
    'service_yrs': 'f_edu_service_',
}
for edu_source_col, edu_prefix in edu_stat_prefixes.items():
    edu_stats = edu_groups[edu_source_col].agg(['mean', 'median', 'min', 'max', 'std'])
    df_education = df_education.join(edu_stats.add_prefix(edu_prefix))

df_education = df_education.reset_index()
print(df_education.shape)

# adding new columns
# Feature columns left over from an earlier run of this cell are dropped before merging, so
# re-running replaces them instead of producing `_x` / `_y` suffixed duplicates.
edu_feature_cols = [col for col in df_education.columns if col != 'education_nan']
df_train_eda = (
    df_train_eda
    .drop(columns=edu_feature_cols, errors='ignore')
    .merge(df_education, how='left', on=['education_nan'])
)
df_train_eda.shape

(3, 17)


(54808, 80)

#### 8. f_service_left

In [13]:
#### number of years left until retirement ####
# retirement age used by this feature (previously an inline literal 60)
RETIREMENT_AGE = 60
# vectorised subtraction on the `age` Series; no round-trip through a Python list is needed
service_left = RETIREMENT_AGE - df_train_eda['age']
df_train_eda['f_service_left'] = service_left
df_train_eda.shape

(54808, 81)

#### 9. f_trainings_oneplus

In [14]:
# more than one training attended: 1 when trainings_attended is anything other than 1, else 0
# NOTE(review): a value of 0 would also be flagged 1 - confirm trainings_attended is never below 1
df_train_eda['f_trainings_oneplus'] = np.where(
    df_train_eda['trainings_attended'].ne(1), 1, 0
)
df_train_eda.shape

(54808, 82)

#### 10. f_rating_edu_cnt

In [15]:
# rating & education combination count:
# number of rows for every (previous_rating_nan, education_nan) pair
df_rating_edu_cnt = (
    df_train_eda
    .groupby(['previous_rating_nan', 'education_nan'])
    .size()
    .reset_index(name='f_rating_edu_cnt')
)
print(df_rating_edu_cnt.shape)

# adding new column
# A copy of the column left by an earlier run of this cell is dropped before merging, so
# re-running replaces it instead of producing `_x` / `_y` suffixed duplicates.
df_train_eda = (
    df_train_eda
    .drop(columns='f_rating_edu_cnt', errors='ignore')
    .merge(df_rating_edu_cnt, how='left', on=['previous_rating_nan', 'education_nan'])
)
df_train_eda.shape

(18, 3)


(54808, 83)

#### 11. f_rating_dept_cnt

In [16]:
# rating & department combination count:
# number of rows for every (previous_rating_nan, department) pair
df_rating_dept_cnt = (
    df_train_eda
    .groupby(['previous_rating_nan', 'department'])
    .size()
    .reset_index(name='f_rating_dept_cnt')
)
print(df_rating_dept_cnt.shape)

# adding new column
# A copy of the column left by an earlier run of this cell is dropped before merging, so
# re-running replaces it instead of producing `_x` / `_y` suffixed duplicates.
df_train_eda = (
    df_train_eda
    .drop(columns='f_rating_dept_cnt', errors='ignore')
    .merge(df_rating_dept_cnt, how='left', on=['previous_rating_nan', 'department'])
)
df_train_eda.shape

(54, 3)


(54808, 84)

#### 12. f_dept_edu_cnt

In [17]:
# department & education combination count:
# number of rows for every (department, education_nan) pair
df_dept_edu_cnt = (
    df_train_eda
    .groupby(['department', 'education_nan'])
    .size()
    .reset_index(name='f_dept_edu_cnt')
)
print(df_dept_edu_cnt.shape)

# adding new column
# A copy of the column left by an earlier run of this cell is dropped before merging, so
# re-running replaces it instead of producing `_x` / `_y` suffixed duplicates.
df_train_eda = (
    df_train_eda
    .drop(columns='f_dept_edu_cnt', errors='ignore')
    .merge(df_dept_edu_cnt, how='left', on=['department', 'education_nan'])
)
df_train_eda.shape

(24, 3)


(54808, 85)

#### 13. f_dept_region_cnt

In [18]:
# department & region combination count:
# number of rows for every (department, region_rare) pair
df_dept_region_cnt = (
    df_train_eda
    .groupby(['department', 'region_rare'])
    .size()
    .reset_index(name='f_dept_region_cnt')
)
print(df_dept_region_cnt.shape)

# adding new column
# A copy of the column left by an earlier run of this cell is dropped before merging, so
# re-running replaces it instead of producing `_x` / `_y` suffixed duplicates.
df_train_eda = (
    df_train_eda
    .drop(columns='f_dept_region_cnt', errors='ignore')
    .merge(df_dept_region_cnt, how='left', on=['department', 'region_rare'])
)
df_train_eda.shape

(240, 3)


(54808, 86)

#### 14. f_region_edu_cnt

In [19]:
# region & education combination count:
# number of rows for every (region_rare, education_nan) pair
df_region_edu_cnt = (
    df_train_eda
    .groupby(['region_rare', 'education_nan'])
    .size()
    .reset_index(name='f_region_edu_cnt')
)
print(df_region_edu_cnt.shape)

# adding new column
# A copy of the column left by an earlier run of this cell is dropped before merging, so
# re-running replaces it instead of producing `_x` / `_y` suffixed duplicates.
df_train_eda = (
    df_train_eda
    .drop(columns='f_region_edu_cnt', errors='ignore')
    .merge(df_region_edu_cnt, how='left', on=['region_rare', 'education_nan'])
)
df_train_eda.shape

(72, 3)


(54808, 87)

#### 15. f_region_rating_cnt

In [20]:
# region & previous rating combination count:
# number of rows for every (region_rare, previous_rating_nan) pair
df_region_rate_cnt = (
    df_train_eda
    .groupby(['region_rare', 'previous_rating_nan'])
    .size()
    .reset_index(name='f_region_rating_cnt')
)
print(df_region_rate_cnt.shape)

# adding new column
# A copy of the column left by an earlier run of this cell is dropped before merging, so
# re-running replaces it instead of producing `_x` / `_y` suffixed duplicates.
df_train_eda = (
    df_train_eda
    .drop(columns='f_region_rating_cnt', errors='ignore')
    .merge(df_region_rate_cnt, how='left', on=['region_rare', 'previous_rating_nan'])
)
df_train_eda.shape

(162, 3)


(54808, 88)

#### 16. f_region_target_encode

In [21]:
# region target encoder
# `cols` names the single column to encode: 'region_rare'
region_target_encode = TargetEncoder(cols = 'region_rare')
# fit the encoder on the region column against the `promoted` target
region_target_encode.fit(df_train_eda['region_rare'], df_train_eda['promoted'])
# NOTE(review): the encoder is fitted and applied on the same rows, so each row's own target
# takes part in its encoding - confirm this is intended (possible target leakage).
# adding new column
# NOTE(review): presumably transform returns a one-column frame that pandas stores as the
# new column - verify against the installed category_encoders / pandas versions.
df_train_eda['f_region_target_encode'] = region_target_encode.transform(df_train_eda['region_rare']) 
df_train_eda.shape

(54808, 89)

#### 17. f_performance

In [22]:
# add new column: element-wise sum of `kpi` and `previous_rating_nan`
df_train_eda = df_train_eda.assign(
    f_performance=df_train_eda['kpi'].add(df_train_eda['previous_rating_nan'])
)
df_train_eda.shape

(54808, 90)

In [23]:
# features names
# display the column index of the frame after the feature-engineering cells above
df_train_eda.columns

Index(['index', 'employee_id', 'department', 'region', 'education', 'gender',
       'trainings_attended', 'age', 'previous_rating', 'service_yrs', 'kpi',
       'awarded', 'avg_training_score', 'promoted', 'region_rare',
       'education_nan', 'gender_le', 'previous_rating_nan', 'f_service_bins',
       'f_age_bins', 'ohe_dept_Analytics', 'ohe_dept_Finance', 'ohe_dept_HR',
       'ohe_dept_Legal', 'ohe_dept_Operations', 'ohe_dept_Procurement',
       'ohe_dept_R&D', 'ohe_dept_Sales & Marketing', 'ohe_dept_Technology',
       'f_dept_cnt', 'f_dept_train_score_mean', 'f_dept_train_score_median',
       'f_dept_train_score_min', 'f_dept_train_score_max',
       'f_dept_train_score_std', 'f_dept_age_mean', 'f_dept_age_median',
       'f_dept_age_min', 'f_dept_age_max', 'f_dept_age_std',
       'f_dept_service_mean', 'f_dept_service_median', 'f_dept_service_min',
       'f_dept_service_max', 'f_dept_service_std', 'f_region_cnt',
       'f_region_train_score_mean', 'f_region_train_score_me

#### pickling

In [24]:
#### pickling the df_train_eda dataset ####
# print the final shape as a record of what is being written
print(df_train_eda.shape)
# write the feature-engineered training frame to "df_train_fe.pkl"; the bare joblib.dump call is
# the cell's last expression, so the notebook displays its return value
joblib.dump(df_train_eda, "df_train_fe.pkl")

(54808, 90)


['df_train_fe.pkl']

In [25]:
# feature engineering artefacts, one pickle per group:
#   fe_1.pkl - service-year bin edges and labels
#   fe_2.pkl - age bin edges and labels
#   fe_3.pkl - the group lookup tables as a tuple, in the fixed order listed below
#              (code that loads fe_3.pkl has to unpack the tuple in this same order)
fe_artifacts = {
    "fe_1.pkl": (service_bins, service_labels),
    "fe_2.pkl": (age_bins, age_labels),
    "fe_3.pkl": (
        df_department,
        df_region,
        df_education,
        df_rating_edu_cnt,
        df_rating_dept_cnt,
        df_dept_edu_cnt,
        df_dept_region_cnt,
        df_region_edu_cnt,
        df_region_rate_cnt,
    ),
}
# dump in insertion order and echo what joblib.dump returns for each file
for fe_pickle_name, fe_artifact in fe_artifacts.items():
    print(joblib.dump(fe_artifact, fe_pickle_name))

['fe_1.pkl']
['fe_2.pkl']
['fe_3.pkl']


In [26]:
# region target encoding
# write the fitted encoder object to "region_target_encode.pkl" (presumably so the same
# encoding can be re-applied to other data later); print echoes what joblib.dump returns
print(joblib.dump(region_target_encode, "region_target_encode.pkl"))

['region_target_encode.pkl']
