In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix



In [3]:
data = pd.read_csv('general_data.csv')
data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [14]:
data['NumCompaniesWorked'].fillna(method= 'bfill', inplace = True)
data['TotalWorkingYears'].fillna(method= 'ffill', inplace = True)
data.isna().sum()

Age                        0
Attrition                  0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
EmployeeCount              0
EmployeeID                 0
Gender                     0
JobLevel                   0
JobRole                    0
MaritalStatus              0
MonthlyIncome              0
NumCompaniesWorked         0
Over18                     0
PercentSalaryHike          0
StandardHours              0
StockOptionLevel           0
TotalWorkingYears          0
TrainingTimesLastYear      0
YearsAtCompany             0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
dtype: int64

In [16]:
data.drop(columns = ['EmployeeID', 'EmployeeCount', 'Over18', 'StandardHours'], inplace = True)

In [17]:
from sklearn.preprocessing import LabelEncoder
def encode(cols):
    enc = LabelEncoder()
    for col in cols:
        data[col] = enc.fit_transform(data[col])


In [18]:
encode_cols = ['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus']
encode(encode_cols)

In [19]:
data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,0,2,2,6,2,1,0,1,0,1,131160,1.0,11,0,1.0,6,1,0,0
1,31,1,1,1,10,1,1,0,1,6,2,41890,0.0,23,1,6.0,3,5,1,4
2,32,0,1,1,17,4,4,1,4,7,1,193280,1.0,15,3,5.0,2,5,0,3
3,38,0,0,1,2,5,1,1,3,1,1,83210,3.0,11,3,13.0,5,8,7,5
4,32,0,2,1,10,1,3,1,1,7,2,23420,4.0,12,2,9.0,2,6,0,4


In [24]:
x = data.drop(columns = ['Attrition'])
y = data.iloc[:,1:2]
print(x.head())
print(y.head())

   Age  BusinessTravel  Department  DistanceFromHome  Education  \
0   51               2           2                 6          2   
1   31               1           1                10          1   
2   32               1           1                17          4   
3   38               0           1                 2          5   
4   32               2           1                10          1   

   EducationField  Gender  JobLevel  JobRole  MaritalStatus  MonthlyIncome  \
0               1       0         1        0              1         131160   
1               1       0         1        6              2          41890   
2               4       1         4        7              1         193280   
3               1       1         3        1              1          83210   
4               3       1         1        7              2          23420   

   NumCompaniesWorked  PercentSalaryHike  StockOptionLevel  TotalWorkingYears  \
0                 1.0                 11       

In [25]:
x1 = sm.add_constant(x)
result = sm.Logit(y, x1).fit()
result.summary()

  return ptp(axis=axis, out=out, **kwargs)


Optimization terminated successfully.
         Current function value: 0.392997
         Iterations 7


0,1,2,3
Dep. Variable:,Attrition,No. Observations:,4410.0
Model:,Logit,Df Residuals:,4390.0
Method:,MLE,Df Model:,19.0
Date:,"Mon, 22 Mar 2021",Pseudo R-squ.:,0.1103
Time:,20:30:49,Log-Likelihood:,-1733.1
converged:,True,LL-Null:,-1947.9
,,LLR p-value:,3.117e-79

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0681,0.414,0.165,0.869,-0.743,0.879
Age,-0.0305,0.007,-4.477,0.000,-0.044,-0.017
BusinessTravel,-0.0173,0.065,-0.264,0.792,-0.146,0.111
Department,-0.2425,0.081,-2.985,0.003,-0.402,-0.083
DistanceFromHome,-0.0013,0.005,-0.248,0.804,-0.012,0.009
Education,-0.0625,0.043,-1.466,0.143,-0.146,0.021
EducationField,-0.0966,0.033,-2.895,0.004,-0.162,-0.031
Gender,0.0851,0.090,0.950,0.342,-0.090,0.261
JobLevel,-0.0238,0.040,-0.600,0.548,-0.101,0.054


The most significant features affecting attrition are:
Age,
MaritalStatus,
NumCompaniesWorked,
TotalWorkingYears,
TrainingTimesLastYear,
YearsSinceLastPromotion,
YearsWithCurrManager	

In [27]:
from sklearn.model_selection import train_test_split
X = data[['Age', 'MaritalStatus', 'NumCompaniesWorked', 'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsSinceLastPromotion', 'YearsWithCurrManager']]

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(3528, 7) (882, 7) (3528, 1) (882, 1)


In [30]:
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)
y_pred = log_reg.predict(x_test)
y_pred

  y = column_or_1d(y, warn=True)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [31]:
print(confusion_matrix(y_test, y_pred))

[[736   3]
 [139   4]]


In [32]:
print(log_reg.score(x_test, y_test))

0.8390022675736961
