# Predictive modelling of attrition using regular ML models from Python's SKLearn

In [1]:
# data manipulation libraries
import numpy as np
import pandas as pd

# data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# machine learning tools
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
import time

# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
# Read the attrition CSV into a dataframe, report its dimensions,
# then display a random handful of rows as a sanity check.
attrition = pd.read_csv('attrition_df.csv')
n_rows, n_cols = attrition.shape
print(f"The data has {n_rows} rows and {n_cols} columns.")
attrition.sample(5)

The data has 2000001 rows and 12 columns.


Unnamed: 0,ee#,age,department,province,level,gender,ethnicity,hire_source,promotion,tenure,previous_xp,total_xp
1830692,11830692,61,Marketing,British Columbia,Individual contributor,female,african descent,direct,not_promoted,26.4,9.6,36.0
1374975,11374975,33,Sales,Yukon Territory,Individual contributor,female,asian,direct,promoted,1.2,6.8,8.0
555824,10555824,37,Sales,Nunavut,Individual contributor,male,hispanic,agency,not_promoted,0.8,11.2,12.0
1932134,11932134,47,Customer Service,Northwest Territories,Individual contributor,female,caucasian,referral,promoted,10.6,11.4,22.0
1564589,11564589,60,Customer Service,British Columbia,Individual contributor,male,caucasian,direct,not_promoted,1.3,28.8,30.1


In [3]:
# Per-column overview of the data: column name, dtype, the value found
# in the first row, and the number of distinct values.
summary = pd.DataFrame([
    {
        'name': attrition[col].name,
        'data type': attrition[col].dtypes,
        'example': attrition[col].iloc[0],
        'unique#': attrition[col].nunique(),
    }
    for col in attrition.columns
])
summary

Unnamed: 0,name,data type,example,unique#
0,ee#,int64,10000000,2000001
1,age,int64,60,46
2,department,object,HR,8
3,province,object,Nova Scotia,13
4,level,object,Individual contributor,3
5,gender,object,female,3
6,ethnicity,object,caucasian,8
7,hire_source,object,agency,3
8,promotion,object,not_promoted,2
9,tenure,float64,16.7,1226


## Dataframe preparation

In [None]:
# Creating the value to be predicted (synthetic target).
# A seeded generator keeps the target identical across kernel restarts,
# so every downstream score can be reproduced.
SEED = 42
LEFT_OVERRIDE_PROB = 0.30  # chance that a matching row is forced to 'left'
rng = np.random.default_rng(SEED)

# initializing target variable with ratio 60/40
num_samples = len(attrition)
status = rng.choice(a=['active', 'left'],
                    size=num_samples,
                    p=[0.6, 0.4])

# Introducing subtle patterns in the data at 30%.
# Each condition is applied independently: every row matching a condition
# gets its own random draw and is overwritten with 'left' when the draw
# falls below LEFT_OVERRIDE_PROB. Boolean masks replace the row-by-row
# Python loop, which is far too slow for millions of rows.
pattern_masks = [
    attrition['department'] == 'Project Management',
    attrition['age'] > 50,
    attrition['hire_source'] == 'direct',
    attrition['province'] == 'Alberta',
]
for mask in pattern_masks:
    override = mask.to_numpy() & (rng.random(num_samples) < LEFT_OVERRIDE_PROB)
    status[override] = 'left'

attrition['status'] = status
attrition.sample(5)

The target is initialized with a 60/40 active/left ratio so that the model sees enough cases of both classes. 
Patterns are then introduced as follows: for each of the conditions below that an employee meets, their status is overwritten with 'left' with a 30% probability (on top of the 40% base rate of 'left'):
- Employees in 'Project Management'
- Employees above 50
- Employees hired directly
- Employees in Alberta

In [None]:
# One-hot encode the categorical columns as float64 indicator columns.
# For the two binary variables only one indicator is kept: 'status_active'
# becomes 'status' and 'promotion_promoted' becomes 'promotion', while their
# complementary indicators are dropped as redundant.
redundant_columns = ['status_left', 'promotion_not_promoted']
renamed_columns = {'promotion_promoted': 'promotion',
                   'status_active': 'status'}

attr_dummy_df = (
    pd.get_dummies(data=attrition, dtype='float64')
    .drop(columns=redundant_columns)
    .rename(columns=renamed_columns)
)
attr_dummy_df.sample(5)

In [None]:
# Pairwise correlations between all encoded columns, drawn as an
# annotated heatmap with two-decimal cell labels.
heatmap_options = {'annot': True, 'linewidths': 0.5, 'fmt': ".2f"}

attr_corr_matrix = attr_dummy_df.corr()
plt.figure(figsize=(15, 10))
sns.heatmap(attr_corr_matrix, **heatmap_options)
plt.show()

The heatmap is difficult to read, so let's keep only a few columns to improve readability.

In [None]:
# Reduced view for a readable heatmap: the employee id, age and experience
# columns, plus two indicator columns for each categorical variable that
# carries an injected pattern, and the target.
simplified_columns = [
    'ee#', 'age', 'previous_xp', 'total_xp',
    'department_Project Management', 'department_HR',
    'hire_source_direct', 'hire_source_referral',
    'province_Alberta', 'province_Ontario',
    'status',
]
simplified_dummy_df = attr_dummy_df[simplified_columns]
simplified_dummy_df.sample(5)

In [None]:
# Correlation heatmap restricted to the reduced set of columns.
simplified_corr_matrix = simplified_dummy_df.corr()

plt.figure(figsize=(15, 10))
sns.heatmap(data=simplified_corr_matrix, annot=True, fmt=".2f", linewidths=0.5)
plt.show()

## Data splitting and model initializations

In [None]:
# One single-column feature frame per injected pattern; double brackets
# keep each selection two-dimensional (a DataFrame rather than a Series).
X_dept = attr_dummy_df[['department_Project Management']]
X_age = attr_dummy_df[['age']]
X_source = attr_dummy_df[['hire_source_direct']]
X_province = attr_dummy_df[['province_Alberta']]

# Target shared by all four feature frames.
y = attr_dummy_df['status']

In [None]:
# Splitting each single-feature predictor into train vs test data (70/30).
# A fixed random_state makes the splits reproducible across kernel restarts.
# All four inputs have the same number of rows, so reusing the same seed
# also puts the same rows in train/test for every feature, which keeps the
# per-feature scores comparable.
SPLIT_SEED = 42

# splitting department predictor into train vs test data
X_train_dept, X_test_dept, y_train_dept, y_test_dept = train_test_split(
    X_dept, y,
    train_size=0.7,
    test_size=0.3,
    random_state=SPLIT_SEED)

# splitting age predictor into train vs test data
X_train_age, X_test_age, y_train_age, y_test_age = train_test_split(
    X_age, y,
    train_size=0.7,
    test_size=0.3,
    random_state=SPLIT_SEED)

# splitting source predictor into train vs test data
X_train_source, X_test_source, y_train_source, y_test_source = train_test_split(
    X_source, y,
    train_size=0.7,
    test_size=0.3,
    random_state=SPLIT_SEED)

# splitting province predictor into train vs test data
X_train_province, X_test_province, y_train_province, y_test_province = train_test_split(
    X_province, y,
    train_size=0.7,
    test_size=0.3,
    random_state=SPLIT_SEED)

In [None]:
# Display name -> estimator, each created with its default settings.
regular_models = dict([
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Naive Bayes', GaussianNB()),
])

In [None]:
# train, test and evaluate
def train_eval_models(models, X_train, X_test, y_train, y_test):
    """Fit every model and report its accuracy on the train and test data.

    Parameters
    ----------
    models : dict
        Maps a display name to an object exposing ``fit(X, y)`` and
        ``predict(X)``. Each object is fitted in place.
    X_train, X_test
        Feature data used for fitting and for prediction respectively.
    y_train, y_test
        True labels matching ``X_train`` and ``X_test``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``name``, ``train_accuracy`` and
        ``test_accuracy``. Accuracies are strings formatted as a percentage
        with two decimals (e.g. ``'61.23%'``), not numbers.
    """
    def _as_percent(fraction):
        # Same formatting for both accuracy columns.
        return f'{fraction *100:.2f}%'

    rows = []
    for label, estimator in models.items():
        estimator.fit(X_train, y_train)
        train_predictions = estimator.predict(X_train)
        test_predictions = estimator.predict(X_test)

        train_score = accuracy_score(y_true=y_train, y_pred=train_predictions)
        test_score = accuracy_score(y_true=y_test, y_pred=test_predictions)

        rows.append({
            'name': label,
            'train_accuracy': _as_percent(train_score),
            'test_accuracy': _as_percent(test_score),
        })

    return pd.DataFrame(rows)

In [None]:
# Score every model using only the 'Project Management' department indicator.
reg_model_scores_dept = train_eval_models(
    models=regular_models,
    X_train=X_train_dept,
    X_test=X_test_dept,
    y_train=y_train_dept,
    y_test=y_test_dept,
)
reg_model_scores_dept

In [None]:
# Score every model using only the age feature.
reg_model_age = train_eval_models(
    models=regular_models,
    X_train=X_train_age,
    X_test=X_test_age,
    y_train=y_train_age,
    y_test=y_test_age,
)
reg_model_age

In [None]:
# Score every model using only the Alberta province indicator.
reg_model_province = train_eval_models(
    models=regular_models,
    X_train=X_train_province,
    X_test=X_test_province,
    y_train=y_train_province,
    y_test=y_test_province,
)
reg_model_province

In [None]:
# Score every model using only the 'direct' hire-source indicator.
reg_model_source = train_eval_models(
    models=regular_models,
    X_train=X_train_source,
    X_test=X_test_source,
    y_train=y_train_source,
    y_test=y_test_source,
)
reg_model_source