# Employee Churn Prediction model

In [16]:
#Importing relevant libraries
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

In [17]:
# Reading the data into the environment
df = pd.read_csv(r'C:\Users\mosuj\Downloads\ChurnPrediction.csv')
df

Unnamed: 0,Age,PastEmployee,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,...,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,46,No,Travel_Rarely,Human Resources,5,2,Medical,2,Male,3,...,22,4,1,16,2,3,4,2,0,2
1,37,Yes,Travel_Rarely,Human Resources,6,4,Human Resources,3,Male,3,...,22,4,0,7,3,3,3,2,0,2
2,59,No,Non-Travel,Human Resources,2,4,Human Resources,3,Female,2,...,21,4,1,30,3,3,3,2,2,2
3,54,No,Non-Travel,Human Resources,26,3,Human Resources,4,Female,4,...,12,3,0,23,3,3,5,3,4,4
4,26,No,Travel_Rarely,Human Resources,25,1,Life Sciences,3,Female,3,...,23,4,1,8,3,3,8,7,5,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,45,No,Travel_Rarely,Sales,20,3,Life Sciences,4,Female,3,...,15,3,0,8,3,3,5,3,0,1
1466,50,Yes,Travel_Rarely,Sales,28,3,Marketing,4,Male,2,...,13,3,1,20,3,3,3,2,2,0
1467,39,No,Travel_Rarely,Sales,24,1,Marketing,2,Female,2,...,11,3,1,21,2,2,20,9,9,6
1468,26,No,Travel_Rarely,Sales,5,3,Other,4,Female,2,...,18,3,0,5,2,3,4,2,0,0


## Data Exploration

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Age                      1470 non-null   int64 
 1   PastEmployee             1470 non-null   object
 2   BusinessTravel           1470 non-null   object
 3   Department               1470 non-null   object
 4   DistanceFromHome         1470 non-null   int64 
 5   Education                1470 non-null   int64 
 6   EducationField           1470 non-null   object
 7   EnvironmentSatisfaction  1470 non-null   int64 
 8   Gender                   1470 non-null   object
 9   JobInvolvement           1470 non-null   int64 
 10  JobLevel                 1470 non-null   int64 
 11  JobRole                  1470 non-null   object
 12  JobSatisfaction          1470 non-null   int64 
 13  MaritalStatus            1470 non-null   object
 14  MonthlyIncome            1470 non-null  

In [19]:
# Transforming the variables making them have similar data types
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

ls = ['PastEmployee', 'Department', 'BusinessTravel', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']
for col in ls:
    df[col]=le.fit_transform(df[col])

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   Age                      1470 non-null   int64
 1   PastEmployee             1470 non-null   int32
 2   BusinessTravel           1470 non-null   int32
 3   Department               1470 non-null   int32
 4   DistanceFromHome         1470 non-null   int64
 5   Education                1470 non-null   int64
 6   EducationField           1470 non-null   int32
 7   EnvironmentSatisfaction  1470 non-null   int64
 8   Gender                   1470 non-null   int32
 9   JobInvolvement           1470 non-null   int64
 10  JobLevel                 1470 non-null   int64
 11  JobRole                  1470 non-null   int32
 12  JobSatisfaction          1470 non-null   int64
 13  MaritalStatus            1470 non-null   int32
 14  MonthlyIncome            1470 non-null   int64
 15  NumC

In [21]:
# Checking for duplicates
df.duplicated().sum()

0

In [22]:
# Checking for missing values
df.isnull().sum()

Age                        0
PastEmployee               0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
EnvironmentSatisfaction    0
Gender                     0
JobInvolvement             0
JobLevel                   0
JobRole                    0
JobSatisfaction            0
MaritalStatus              0
MonthlyIncome              0
NumCompaniesWorked         0
OverTime                   0
PercentSalaryHike          0
PerformanceRating          0
StockOptionLevel           0
TotalWorkingYears          0
TrainingTimesLastYear      0
WorkLifeBalance            0
YearsAtCompany             0
YearsInCurrentRole         0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
dtype: int64

In [23]:
# Separating dependent variable from Independent variables
X = df.drop(columns=['PastEmployee'])
y = df['PastEmployee']

## Model Development

In [24]:
# Spliting the data into the training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [27]:
# Applying standardization to the models
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [37]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [38]:
import pickle
pickle.dump(scaler, open('scaling.pkl', 'wb'))

In [39]:
X_train

array([[-0.30624319,  0.5888015 ,  1.42215805, ...,  1.31368478,
         2.38815978,  1.61029956],
       [-1.17575686,  0.5888015 ,  1.42215805, ...,  0.75912411,
         1.46408953,  0.77886893],
       [-1.39313528,  0.5888015 ,  1.42215805, ..., -0.62727757,
        -0.69207439, -0.6068488 ],
       ...,
       [ 1.32409495,  0.5888015 , -0.47750034, ...,  0.75912411,
        -0.69207439, -1.16113589],
       [-0.30624319,  0.5888015 ,  1.42215805, ...,  0.75912411,
        -0.38405098,  0.77886893],
       [-0.4149324 ,  0.5888015 ,  1.42215805, ..., -1.18183824,
        -0.69207439, -1.16113589]])

### Logistic Regression

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [41]:
logReg = LogisticRegression()
logReg.fit(X_train, y_train)
y_pred = logReg.predict(X_test)
        
print('confusion matrix is:\n', confusion_matrix(y_test, y_pred))
print('classification report is:\n', classification_report(y_test, y_pred))

confusion matrix is:
 [[297   9]
 [ 44  18]]
classification report is:
               precision    recall  f1-score   support

           0       0.87      0.97      0.92       306
           1       0.67      0.29      0.40        62

    accuracy                           0.86       368
   macro avg       0.77      0.63      0.66       368
weighted avg       0.84      0.86      0.83       368



In [32]:
# Displaying the coefficient of the model
print(logReg.coef_)

[[-0.331563   -0.02165199  0.39474913  0.1827825   0.01439015  0.18948152
  -0.50154222  0.12864883 -0.35541602 -0.297152   -0.21357824 -0.41623368
   0.36343982 -0.23825512  0.32546044  0.80446427 -0.09327604  0.04192564
  -0.16208335 -0.27649411 -0.27953636 -0.20395877  0.44237395 -0.48351996
   0.51865138 -0.50541467]]


In [33]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,

In [34]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
print(np.sqrt(mean_squared_error(y_test, y_pred)))

0.14402173913043478
0.14402173913043478
0.3795019619586107


## Pickling the model for Deployment

In [35]:
import pickle

In [36]:
pickle.dump(logReg, open('logisticmodel.pkl', 'wb'))