In [1]:
# Python library packages used
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.ensemble import RandomForestRegressor as RF


"""import warnings filter"""
from warnings import simplefilter
"""ignore all future warnings"""
simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Load the existing-employee CSV that the prediction models are built from
Existing_Employ = pd.read_csv(filepath_or_buffer="/content/dataset_existing_employees.csv")

# Preview the first five rows
Existing_Employ.head(n=5)

Unnamed: 0,Emp ID,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept,salary
0,2001,0.58,0.74,4,215,3,0,0,sales,low
1,2002,0.82,0.67,2,202,3,0,0,sales,low
2,2003,0.45,0.69,5,193,3,0,0,sales,low
3,2004,0.78,0.82,5,247,3,0,0,sales,low
4,2005,0.49,0.6,3,214,2,0,0,sales,low


In [3]:
# ExE.tail(5)


In [4]:
# Encode the salary band of company X's existing employees as an integer:
# low salary = 1, medium salary = 2, high salary = 3
salary_level = {'low': 1, 'medium': 2, 'high': 3}

# Build a NEW frame instead of aliasing Existing_Employ. A plain assignment shares the
# same object, so writing the encoded column would overwrite the raw labels in
# Existing_Employ, and running this cell a second time would raise KeyError
# (1/2/3 are not keys of salary_level). With assign() the raw frame stays untouched
# and the cell can be re-run safely.
# The strict dict lookup is kept so an unexpected salary label still raises KeyError.
Existing_Employ2 = Existing_Employ.assign(
    salary=Existing_Employ['salary'].map(lambda level: salary_level[level])
)

Existing_Employ2.head(5)

Unnamed: 0,Emp ID,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept,salary
0,2001,0.58,0.74,4,215,3,0,0,sales,1
1,2002,0.82,0.67,2,202,3,0,0,sales,1
2,2003,0.45,0.69,5,193,3,0,0,sales,1
3,2004,0.78,0.82,5,247,3,0,0,sales,1
4,2005,0.49,0.6,3,214,2,0,0,sales,1


In [5]:
# Step 1: Specify Prediction Target

In [6]:
# Show every column label of the loaded dataset so the exact name of the
# prediction target can be looked up
Existing_Employ.columns

Index(['Emp ID', 'satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'dept', 'salary'],
      dtype='object')

In [7]:
# Prediction target: the satisfaction_level column of the salary-encoded frame
y = Existing_Employ2['satisfaction_level']
y  # display the target series

0        0.58
1        0.82
2        0.45
3        0.78
4        0.49
         ... 
11423    0.90
11424    0.74
11425    0.85
11426    0.33
11427    0.50
Name: satisfaction_level, Length: 11428, dtype: float64

In [8]:
# Feature selection: the predictor columns that are fed to the models
feature_names = [
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'Work_accident',
    'promotion_last_5years',
    'salary',
]

# Feature matrix: only the columns named in feature_names
X = Existing_Employ2.loc[:, feature_names]



In [9]:
# Step2: Review Data
# Before building a model, take a quick look at X to verify it looks sensible

In [10]:
# Review data
# show the top few rows of X
X.head()

Unnamed: 0,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary
0,0.74,4,215,3,0,0,1
1,0.67,2,202,3,0,0,1
2,0.69,5,193,3,0,0,1
3,0.82,5,247,3,0,0,1
4,0.6,3,214,2,0,0,1


In [11]:
# show summary statistics (count, mean, std, min, quartiles, max) for each column of X
X.describe() 

Unnamed: 0,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary
count,11428.0,11428.0,11428.0,11428.0,11428.0,11428.0,11428.0
mean,0.715473,3.786664,199.060203,3.380032,0.175009,0.026251,1.650945
std,0.162005,0.979884,45.682731,1.562348,0.379991,0.159889,0.655277
min,0.36,2.0,96.0,2.0,0.0,0.0,1.0
25%,0.58,3.0,162.0,2.0,0.0,0.0,1.0
50%,0.71,4.0,198.0,3.0,0.0,0.0,2.0
75%,0.85,4.0,238.0,4.0,0.0,0.0,2.0
max,1.0,6.0,287.0,10.0,1.0,1.0,3.0


In [12]:
# Step 3: Specify and Fit Model
# Create a DecisionTreeRegressor and save it. Ensure you've done the relevant import from sklearn to run this command.
# Then fit the model you just created using the data in X and y that you saved above.

In [13]:
# Decision-tree regressor; random_state is pinned to a fixed number for reproducibility
exisiting_leaving_model = DTR(random_state=1)

# Train on the full feature matrix X against the target y
exisiting_leaving_model.fit(X=X, y=y)


DecisionTreeRegressor(random_state=1)

In [14]:
# Step 4: Make Predictions
# Make predictions with the model's predict command using X as the data. Save the results to a variable called predictions.

In [15]:
# In-sample predictions: the model is scored on the same X it was fitted on.
# The target is satisfaction_level, so each entry is a predicted satisfaction score.
predictions = exisiting_leaving_model.predict(X)
print(predictions)
# len(predictions) is the number of rows in X (one prediction per employee),
# not a count of employees who will leave
print("Number of employees with a predicted satisfaction_level:", len(predictions))

[0.63 0.82 0.45 ... 0.85 0.33 0.5 ]
Number of employee leaving in the future 11428


In [16]:
#print(y.tail(20).tolist())
#NumOfEmpLeaving = predictions.tolist()
#len(NumOfEmpLeaving)


In [17]:
# In-sample mean absolute error (the model is scored on its own training data)
from sklearn.metrics import mean_absolute_error

predicted_current_employee_leaving = exisiting_leaving_model.predict(X=X)
mae = mean_absolute_error(y_true=y, y_pred=predicted_current_employee_leaving)

print("In-Sample/Non Validation Train Data MAE:", mae)


In-Sample/Non Validation Train Data MAE: 0.002541127056352818


In [18]:
# STEP 5: Model Validation
# Step 1: hold out part of the data with sklearn's train_test_split

from sklearn.model_selection import train_test_split

# A fixed random_state keeps the train/validation partition the same on every run
(train_X, val_X, train_y, val_y) = train_test_split(X, y, random_state=1)



In [19]:
# Step 2: Specify and Fit the Model
# A second DecisionTreeRegressor (random_state is 1 again), trained on the training split only.

# Specify the model
ELCompX_model = DTR(random_state=1)

# Fit ELCompX_model with the training data
ELCompX_model.fit(X=train_X, y=train_y)


DecisionTreeRegressor(random_state=1)

In [20]:
# Step 3: Make Predictions with Validation data
# Predict the target for every row of the validation features val_X
# (the split that ELCompX_model was not fitted on)
val_predictions = ELCompX_model.predict(val_X)


In [21]:
# Inspect predictions next to the actual values from the validation data.
# The target is satisfaction_level, so both lists hold satisfaction scores.
# first few validation predictions
print("Validation prediction of satisfaction_level for current employees of company X:", ELCompX_model.predict(val_X.head()))
# first few actual target values from the validation data
print("Actual satisfaction_level for current employees of company X from validation data:", val_y.head().tolist())
# Both lengths are the size of the validation split, not a count of employees leaving
print("Number of employees in the validation features (val_X):", len(val_X))
print("Number of actual target values in the validation data (val_y):", len(val_y))

Validation prediction of current employee leaving company X: [0.44 0.81 0.56 0.64 0.26]
Actual target current employee leaving company X from validation data: [0.44, 0.81, 0.5, 0.64, 0.59]
Number of employee leaving in the future based on validation prediction: 2857
Number of actual target exixting employees leaving in the future based on validation data: 2857


In [22]:
# Step 4: Mean absolute error on the validation split, printed beside the in-sample value

from sklearn.metrics import mean_absolute_error

val_mae = mean_absolute_error(y_true=val_y, y_pred=val_predictions)
print("In-Sample/Non Validation Train Data MAE:", mae)
print("Out-of-Sample/Validation Train Data MAE:", val_mae)

In-Sample/Non Validation Train Data MAE: 0.002541127056352818
Out-of-Sample/Validation Train Data MAE: 0.18425737953564347


In [23]:
# Finally, MAE: the lower the value the better and 0 means the model is perfect. 

In [24]:
"""Save into csv file"""
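# NOTE(review): the export below is disabled, and `good_employees_still_working_df` is not
# defined in any visible cell — TODO confirm where it comes from before re-enabling.
#good_employees_still_working_df.to_csv("employees_leaving_prob.csv")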



'Save into csv file'

In [25]:
# Random Forests
from sklearn.ensemble import RandomForestRegressor

# Define the model. random_state is fixed at 1 so results are reproducible
rf_model = RandomForestRegressor(random_state=1)

# Fit on the same training split that the decision tree used
rf_model.fit(train_X, train_y)

# Mean absolute error of the Random Forest model on the validation data.
# Arguments follow the documented (y_true, y_pred) order, matching the other MAE calls.
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(val_y, rf_val_predictions)

# One prediction per validation row, so this length is the validation-set size
print("Number of validation rows scored by the Random Forest model:", len(rf_val_predictions))

print("Out-of-Sample/Validation Train Data MAE for Decision Tree:", val_mae)
print("Validation MAE for Random Forest Model:", rf_val_mae)

# Lower MAE is better; 0 would mean a perfect fit. Neither model reaches 0:
# in the recorded run the Random Forest has the lower validation MAE
# (about 0.154, versus about 0.184 for the Decision Tree).

Actual target current employees leaving company X from validation data is 2857
Out-of-Sample/Validation Train Data MAE for Decision Tree: 0.18425737953564347
Validation MAE for Random Forest Model: 0.1544837593546344
