# Imports

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

# Wrangle

Title: IBM HR Analytics Employee Attrition & Performance

Acquired from Kaggle

Data is Synthetic and was Constructed by Data Scientists at IBM

In [2]:
# Load the raw IBM HR attrition CSV (expected in the working directory)
# and preview the first rows.
raw_csv_path = 'WA_Fn-UseC_-HR-Employee-Attrition.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


# Problem Statement

* ACME company has hired you as a consultant 
* ACME is currently experiencing an above average attrition rate
* In an effort to address this, you have been tasked with developing a model to predict the top 20% of employees who are at risk for attrition.
* Once identified, ACME can take steps to prevent those employees from attriting

# Goal

* Build a model, using DataRobot, to predict attrition in company employees

* Models will be evaluated using the max return on investment (ROI) that results from their predictions 

* A model will be considered successful if its predictions result in a positive maximum ROI for the company on holdout data


# Prepare
* Dropped non-informative columns
* Changed ‘survey-like’ features from numeric to categorical variables
* Removed outliers


In [3]:
# Dimensions of the raw frame as (rows, columns), before any preparation.
df.shape

(1470, 35)

In [4]:
# Class balance of the target column.
df['Attrition'].value_counts()

No     1233
Yes     237
Name: Attrition, dtype: int64

In [5]:
# for column in df.columns:
    
#     print(column)
#     print(df[f'{column}'].value_counts())
#     print('')

## Dropped non-informative columns

**Contains no useful information**<br/>
EmployeeNumber

**Value is the same for all employees**<br/>
EmployeeCount<br/>
Over18<br/>
StandardHours<br/>

In [6]:
# Remove the identifier column and the three constant columns in one call.
# errors='ignore' skips any of them that are already absent, which is what
# the membership test in a per-column loop would do.
non_informative = ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']
df = df.drop(columns=non_informative, errors='ignore')

In [7]:
# List the remaining columns to confirm the non-informative ones are gone.
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [8]:
# Dimensions after dropping the four non-informative columns.
df.shape

(1470, 31)

## Changed the following ‘survey-like’ features from numeric to categorical variables

* Education

* EnvironmentSatisfaction

* JobInvolvement

* JobSatisfaction

* PerformanceRating

* RelationshipSatisfaction

* WorkLifeBalance

* JobLevel

* StockOptionLevel

In [9]:
# Recode the numeric education codes as labels with one exact-match lookup.
# The codes are stringified first so the cell stays re-runnable: values that
# are already labels (or not in the mapping) pass through unchanged.
df['Education'] = df['Education'].astype(str).replace({
    '1': 'no_college',
    '2': 'some_college',
    '3': 'bachelors_degree',
    '4': 'masters_degree',
    '5': 'Doctorate',
})

In [10]:
# Check that every education code was turned into a label.
df['Education'].value_counts()

bachelors_degree    572
masters_degree      398
some_college        282
no_college          170
Doctorate            48
Name: Education, dtype: int64

In [11]:
# Recode the 1-4 survey scores as labels with one exact-match lookup.
# Stringifying first keeps the cell re-runnable: existing labels and any
# unmapped value pass through unchanged.
# NOTE(review): 'meduim' is misspelled, but later cells match on this exact
# string, so it is kept as is here.
df['EnvironmentSatisfaction'] = df['EnvironmentSatisfaction'].astype(str).replace({
    '1': 'low',
    '2': 'meduim',
    '3': 'high',
    '4': 'very_high',
})

In [12]:
# Check the label distribution after recoding.
df['EnvironmentSatisfaction'].value_counts()

high         453
very_high    446
meduim       287
low          284
Name: EnvironmentSatisfaction, dtype: int64

In [13]:
# Recode the 1-4 survey scores as labels with one exact-match lookup.
# Stringifying first keeps the cell re-runnable: existing labels and any
# unmapped value pass through unchanged.
# NOTE(review): 'meduim' is misspelled, but later cells match on this exact
# string, so it is kept as is here.
df['JobInvolvement'] = df['JobInvolvement'].astype(str).replace({
    '1': 'low',
    '2': 'meduim',
    '3': 'high',
    '4': 'very_high',
})

In [14]:
# Check the label distribution after recoding.
df['JobInvolvement'].value_counts()

high         868
meduim       375
very_high    144
low           83
Name: JobInvolvement, dtype: int64

In [15]:
# Recode the 1-4 survey scores as labels with one exact-match lookup.
# Stringifying first keeps the cell re-runnable: existing labels and any
# unmapped value pass through unchanged.
# NOTE(review): 'meduim' is misspelled, but later cells match on this exact
# string, so it is kept as is here.
df['JobSatisfaction'] = df['JobSatisfaction'].astype(str).replace({
    '1': 'low',
    '2': 'meduim',
    '3': 'high',
    '4': 'very_high',
})

In [16]:
# Check the label distribution after recoding.
df['JobSatisfaction'].value_counts()

very_high    459
high         442
low          289
meduim       280
Name: JobSatisfaction, dtype: int64

In [17]:
# Recode the 1-4 performance ratings as labels with one exact-match lookup.
# Stringifying first keeps the cell re-runnable: existing labels and any
# unmapped value pass through unchanged.
df['PerformanceRating'] = df['PerformanceRating'].astype(str).replace({
    '1': 'low',
    '2': 'good',
    '3': 'Excellent',
    '4': 'Outstanding',
})

In [18]:
# Check which rating labels actually occur in the data.
df['PerformanceRating'].value_counts()

Excellent      1244
Outstanding     226
Name: PerformanceRating, dtype: int64

In [19]:
# Recode the 1-4 survey scores as labels with one exact-match lookup.
# Stringifying first keeps the cell re-runnable: existing labels and any
# unmapped value pass through unchanged.
# NOTE(review): 'meduim' is misspelled, but later cells match on this exact
# string, so it is kept as is here.
df['RelationshipSatisfaction'] = df['RelationshipSatisfaction'].astype(str).replace({
    '1': 'low',
    '2': 'meduim',
    '3': 'high',
    '4': 'very_high',
})

In [20]:
# Check the label distribution after recoding.
df['RelationshipSatisfaction'].value_counts()

high         459
very_high    432
meduim       303
low          276
Name: RelationshipSatisfaction, dtype: int64

In [21]:
# Recode the 1-4 work-life-balance scores as labels with one exact-match
# lookup. Stringifying first keeps the cell re-runnable: existing labels and
# any unmapped value pass through unchanged.
df['WorkLifeBalance'] = df['WorkLifeBalance'].astype(str).replace({
    '1': 'bad',
    '2': 'good',
    '3': 'better',
    '4': 'best',
})

In [22]:
# Check the label distribution after recoding.
df['WorkLifeBalance'].value_counts()

better    893
good      344
best      153
bad        80
Name: WorkLifeBalance, dtype: int64

In [23]:
# Spell out the 1-5 job levels as words with one exact-match lookup so the
# column is treated as categorical. Stringifying first keeps the cell
# re-runnable: existing labels and any unmapped value pass through unchanged.
df['JobLevel'] = df['JobLevel'].astype(str).replace({
    '1': 'one',
    '2': 'two',
    '3': 'three',
    '4': 'four',
    '5': 'five',
})

In [24]:
# Check the label distribution after recoding.
df['JobLevel'].value_counts()

one      543
two      534
three    218
four     106
five      69
Name: JobLevel, dtype: int64

In [25]:
# Spell out the 0-3 stock option levels as words with one exact-match lookup
# so the column is treated as categorical. Stringifying first keeps the cell
# re-runnable: existing labels and any unmapped value pass through unchanged.
df['StockOptionLevel'] = df['StockOptionLevel'].astype(str).replace({
    '0': 'zero',
    '1': 'one',
    '2': 'two',
    '3': 'three',
})

In [26]:
# Check the label distribution after recoding.
df['StockOptionLevel'].value_counts()

zero     631
one      596
two      158
three     85
Name: StockOptionLevel, dtype: int64

## Removing Outliers 
* After putting the data with the above transformations into DataRobot, it identified the 'YearsAtCompany' column as containing outliers
* After reviewing the data, I decided to drop outliers using the interquartile range (IQR) rule, with fences set 3 × IQR beyond the 25th and 75th percentiles
* This removed employees with a 'YearsAtCompany' value greater than 27 from the dataset
* Dropped lines totaled 19

In [27]:
def get_outliers_iqr(df, col, multiplier=3):
    """Return the (upper, lower) IQR fences for ``df[col]``.

    The fences lie ``multiplier`` interquartile ranges above the 75th
    percentile and below the 25th percentile.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    col : label or list of labels
        Column(s) to measure. ``np.percentile`` is called without an axis,
        so when several columns are given their values are pooled.
    multiplier : float, default 3
        Number of IQRs each fence sits beyond its quartile.

    Returns
    -------
    tuple
        ``(upper_bound, lower_bound)`` -- note the upper bound comes first.
    """
    q75, q25 = np.percentile(df[col], [75, 25])
    # The IQR is simply q75 - q25, so reuse the quartiles computed above
    # instead of scanning the data again.
    fence = multiplier * (q75 - q25)

    return q75 + fence, q25 - fence

In [28]:
# Inspect the fences before filtering; the result is (upper_bound, lower_bound).
get_outliers_iqr(df, ['YearsAtCompany'])

(27.0, -15.0)

In [29]:
def remove_outliers_iqr(df, columns, multiplier=3):
    """Drop rows that fall outside the IQR fences of the given columns.

    Columns are processed in order, and each column's fences are computed
    on the rows that survived the previous columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    columns : iterable of labels
        Columns to filter on.
    multiplier : float, default 3
        Number of IQRs each fence sits beyond its quartile.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame containing only the rows whose values lie
        within ``[q25 - multiplier*IQR, q75 + multiplier*IQR]`` for every
        listed column (fence values themselves are kept).
    """
    for col in columns:
        q75, q25 = np.percentile(df[col], [75, 25])
        # The IQR is q75 - q25; reuse the quartiles rather than rescanning.
        fence = multiplier * (q75 - q25)
        # between() is inclusive at both ends, so rows sitting exactly on a
        # fence are retained.
        df = df[df[col].between(q25 - fence, q75 + fence)]
    # Hand back a copy so callers can add or modify columns without
    # triggering SettingWithCopyWarning on a filtered slice.
    return df.copy()

In [30]:
# Rebind df to the outlier-filtered frame. The fences are computed from the
# frame passed in, so re-running this cell recomputes them on already-filtered
# data; run it once per kernel session.
df = remove_outliers_iqr(df, ['YearsAtCompany'])

In [31]:
# Snapshot the outlier-filtered frame. A copy is taken rather than an alias
# so that columns added to `df` in later cells do not also show up in this
# snapshot.
employee_churn_outliers_removed = df.copy()

# Largest tenure remaining after the filter.
employee_churn_outliers_removed.YearsAtCompany.max()

27

In [32]:
# Dimensions of the snapshot after outlier removal.
employee_churn_outliers_removed.shape

(1451, 31)

# Create CSV with Prepared Data
* The CSV was then used to explore the data and create models in DataRobot

In [33]:
#employee_churn_outliers_removed.to_csv('employee_churn_outliers_removed.csv',index=False)

# Explore Data, Build and Evaluate Models in DataRobot
* Entered CSV into DataRobot
* Explored how each feature is related to attrition
* Ran initial models
* Created additional feature lists to model on based on feature impact and feature collinearity
* Ran additional models using new feature lists
* Trained models on train data
* Evaluated Models on train and validate data 
* Used Max ROI as the key evaluation metric
* The sample size of the training and validate data was 1161 
* Considered feature engineering options

# Feature Engineering
* After evaluating the models created using the fully prepared data, I returned to pandas for some feature engineering
* Values with similar rates of Attrition within a given categorical feature were merged together in ‘value clusters’
* When DataRobot creates models, it encodes each value in a categorical feature as a separate feature
* Creating value clusters would reduce the number of post-encoding features that would go into each model while preserving most of the information contained in each of the pre-encoding features
* This will reduce noise and likely result in more accurate model predictions

**The following features were created using 'value clusters’ of a parent feature. The parent feature was then dropped from the dataset.**
* JobRoleCluster
* StockOptionLevelCluster
* JobLevelCluster
* JobSatisfactionCluster
* EnvironmentSatisfactionCluster
* EducationFieldCluster
* RelationshipSatisfactionCluster
* MaritalStatusCluster
* WorkLifeBalanceCluster
* JobInvolvementCluster



## Creating ‘Value Cluster’ Features

In [34]:
# Group job roles with similar attrition rates into value clusters.
# Whole values are matched exactly (no substring replacement), so the result
# does not depend on the order of the entries; roles missing from the mapping
# are left unchanged.
df['JobRoleCluster'] = df['JobRole'].replace({
    'Sales Executive': 'Cluster 1',
    'Research Scientist': 'Cluster 1',
    'Laboratory Technician': 'Cluster 2',
    'Human Resources': 'Cluster 2',
    'Manufacturing Director': 'Cluster 3',
    'Healthcare Representative': 'Cluster 3',
    'Manager': 'Cluster 3',
    'Research Director': 'Cluster 3',
    'Sales Representative': 'Cluster 4',
})

df['JobRoleCluster'].value_counts()

Cluster 1    617
Cluster 3    440
Cluster 2    311
Cluster 4     83
Name: JobRoleCluster, dtype: int64

In [36]:
# Group stock option levels into value clusters. Whole values are matched
# exactly (no substring replacement); levels missing from the mapping are
# left unchanged.
df['StockOptionLevelCluster'] = df['StockOptionLevel'].replace({
    'zero': 'Cluster 1',
    'one': 'Cluster 2',
    'two': 'Cluster 2',
    'three': 'Cluster 3',
})

df['StockOptionLevelCluster'].value_counts()

Cluster 2    742
Cluster 1    624
Cluster 3     85
Name: StockOptionLevelCluster, dtype: int64

In [38]:
# Group job levels into value clusters. The grouping is deliberately not in
# level order: 'three' forms its own cluster while 'two', 'four' and 'five'
# share one. Whole values are matched exactly (no substring replacement);
# levels missing from the mapping are left unchanged.
df['JobLevelCluster'] = df['JobLevel'].replace({
    'one': 'Cluster 1',
    'three': 'Cluster 2',
    'two': 'Cluster 3',
    'four': 'Cluster 3',
    'five': 'Cluster 3',
})

df['JobLevelCluster'].value_counts()

Cluster 3    692
Cluster 1    543
Cluster 2    216
Name: JobLevelCluster, dtype: int64

In [40]:
# Group job satisfaction labels into value clusters. Whole values are matched
# exactly, so 'high' can never match inside 'very_high' regardless of entry
# order; labels missing from the mapping are left unchanged.
df['JobSatisfactionCluster'] = df['JobSatisfaction'].replace({
    'very_high': 'Cluster 1',
    'high': 'Cluster 2',
    'meduim': 'Cluster 2',
    'low': 'Cluster 3',
})

df['JobSatisfactionCluster'].value_counts()

Cluster 2    712
Cluster 1    454
Cluster 3    285
Name: JobSatisfactionCluster, dtype: int64

In [42]:
# Group environment satisfaction labels into value clusters. Whole values are
# matched exactly, so 'high' can never match inside 'very_high' regardless of
# entry order; labels missing from the mapping are left unchanged.
df['EnvironmentSatisfactionCluster'] = df['EnvironmentSatisfaction'].replace({
    'very_high': 'Cluster 1',
    'high': 'Cluster 2',
    'meduim': 'Cluster 2',
    'low': 'Cluster 2',
})

df['EnvironmentSatisfactionCluster'].value_counts()

Cluster 2    1012
Cluster 1     439
Name: EnvironmentSatisfactionCluster, dtype: int64

In [44]:
# Group education fields into value clusters. Whole values are matched
# exactly (no substring replacement); fields missing from the mapping are
# left unchanged.
df['EducationFieldCluster'] = df['EducationField'].replace({
    'Life Sciences': 'Cluster 1',
    'Medical': 'Cluster 1',
    'Other': 'Cluster 1',
    'Marketing': 'Cluster 2',
    'Technical Degree': 'Cluster 2',
    'Human Resources': 'Cluster 3',
})

df['EducationFieldCluster'].value_counts()

Cluster 1    1137
Cluster 2     287
Cluster 3      27
Name: EducationFieldCluster, dtype: int64

In [46]:
# Group relationship satisfaction labels into value clusters. Whole values
# are matched exactly, so 'high' can never match inside 'very_high'
# regardless of entry order; labels missing from the mapping are left
# unchanged.
df['RelationshipSatisfactionCluster'] = df['RelationshipSatisfaction'].replace({
    'very_high': 'Cluster 1',
    'high': 'Cluster 1',
    'meduim': 'Cluster 1',
    'low': 'Cluster 2',
})

df['RelationshipSatisfactionCluster'].value_counts()

Cluster 1    1178
Cluster 2     273
Name: RelationshipSatisfactionCluster, dtype: int64

In [48]:
# Group marital statuses into value clusters. Whole values are matched
# exactly (no substring replacement); statuses missing from the mapping are
# left unchanged.
df['MaritalStatusCluster'] = df['MaritalStatus'].replace({
    'Married': 'Cluster 1',
    'Divorced': 'Cluster 1',
    'Single': 'Cluster 2',
})

df['MaritalStatusCluster'].value_counts()

Cluster 1    986
Cluster 2    465
Name: MaritalStatusCluster, dtype: int64

In [50]:
# Group work-life-balance labels into value clusters. Whole values are
# matched exactly (no substring replacement); labels missing from the mapping
# are left unchanged.
df['WorkLifeBalanceCluster'] = df['WorkLifeBalance'].replace({
    'best': 'Cluster 1',
    'better': 'Cluster 1',
    'good': 'Cluster 1',
    'bad': 'Cluster 2',
})

df['WorkLifeBalanceCluster'].value_counts()

Cluster 1    1372
Cluster 2      79
Name: WorkLifeBalanceCluster, dtype: int64

In [52]:
# Group job involvement labels into value clusters. Whole values are matched
# exactly, so 'high' can never match inside 'very_high' regardless of entry
# order; labels missing from the mapping are left unchanged.
df['JobInvolvementCluster'] = df['JobInvolvement'].replace({
    'very_high': 'Cluster 1',
    'high': 'Cluster 1',
    'meduim': 'Cluster 2',
    'low': 'Cluster 3',
})

df['JobInvolvementCluster'].value_counts()

Cluster 1    1002
Cluster 2     368
Cluster 3      81
Name: JobInvolvementCluster, dtype: int64

## Dropping Parent Columns of ‘Value Cluster’ Features

In [53]:
# Drop each parent column now that its '...Cluster' counterpart exists.
# Every parent is listed exactly once.
df = df.drop(columns=[
    'EducationField', 'EnvironmentSatisfaction', 'JobInvolvement',
    'JobLevel', 'JobRole', 'JobSatisfaction',
    'MaritalStatus', 'RelationshipSatisfaction', 'StockOptionLevel',
    'WorkLifeBalance',
])

In [54]:
# Dimensions after swapping the parent columns for their cluster columns.
df.shape

(1451, 31)

In [55]:
# Final column list: the parents should be gone and the '...Cluster' columns present.
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'Gender', 'HourlyRate',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime',
       'PercentSalaryHike', 'PerformanceRating', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'JobRoleCluster',
       'StockOptionLevelCluster', 'JobLevelCluster', 'JobSatisfactionCluster',
       'EnvironmentSatisfactionCluster', 'EducationFieldCluster',
       'RelationshipSatisfactionCluster', 'MaritalStatusCluster',
       'WorkLifeBalanceCluster', 'JobInvolvementCluster'],
      dtype='object')

# Create CSV with Prepared And Feature Engineered Data

In [56]:
#df.to_csv('employee_churn_catagorical_clusters.csv',index=False)

# Explore Data, Build and Evaluate Models on Feature Engineered Data in DataRobot
* Entered CSV into DataRobot
* Explored how each feature is related to attrition
* Ran initial models
* Created additional feature lists to model on based on feature impact and feature collinearity
* Ran additional models using new feature lists
* Trained models on train data
* Evaluated Models on train and validate data 
* Used Max ROI as the key evaluation metric
* The sample size of the training and validate data was 1161  

# Choosing a Final Model
* Compared the top-performing model containing ‘value clustered’ features with the top-performing model containing no ‘value clustered’ features
* Comparison was on training data using Max ROI as the key evaluation metric

## Top Model

* Light Gradient Boosting on ElasticNet Predictions
* Trained with ‘value clustered’ features
* Top three impactful features include: OverTime, JobLevelCluster, and NumCompaniesWorked
* Max ROI of 428K on the training data
* Sample size 1161

# Evaluating Final Model

* Unlocked holdout data
