In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [19]:
# Load the raw HR attrition export. `data` is kept as-is because a later cell
# reads the MaritalStatus column back out of it.
data = pd.read_csv('HR_Employee_Attrition.csv')

# Columns left out of the modelling frame.
unused_columns = [
    'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'Gender',
    'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'MaritalStatus',
    'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime',
    'PercentSalaryHike', 'RelationshipSatisfaction', 'StandardHours',
    'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
    'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
]
df = data.drop(columns=unused_columns)

# Columns that remain: Age, Attrition, BusinessTravel, DailyRate, Department,
# DistanceFromHome, EnvironmentSatisfaction, JobSatisfaction, PerformanceRating,
# WorkLifeBalance, YearsAtCompany
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,EnvironmentSatisfaction,JobSatisfaction,PerformanceRating,WorkLifeBalance,YearsAtCompany
0,41,Yes,Travel_Rarely,1102,Sales,1,2,4,3,1,6
1,49,No,Travel_Frequently,279,Research & Development,8,3,2,4,3,10
2,37,Yes,Travel_Rarely,1373,Research & Development,2,4,3,3,3,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,3,3,3,8
4,27,No,Travel_Rarely,591,Research & Development,2,1,2,3,3,2


In [20]:
# Count missing values per column before any modelling.
missing_counts = df.isna().sum()
print(missing_counts)

Age                        0
Attrition                  0
BusinessTravel             0
DailyRate                  0
Department                 0
DistanceFromHome           0
EnvironmentSatisfaction    0
JobSatisfaction            0
PerformanceRating          0
WorkLifeBalance            0
YearsAtCompany             0
dtype: int64


In [21]:
# Encode the target as 0/1 and assign the result back to the column.
#
# The previous `df['Attrition'].replace(..., inplace=True)` ran an in-place
# method on a column selection. pandas 2.x warns about that pattern and, under
# Copy-on-Write, the selection behaves as a copy, so `df` would never be
# updated and the target would stay as 'Yes'/'No' strings.
#
# The mapping also accepts already-encoded 1/0 values so that re-running this
# cell leaves the column unchanged instead of turning it into NaN.
attrition_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
df['Attrition'] = df['Attrition'].map(attrition_codes).astype('int64')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,EnvironmentSatisfaction,JobSatisfaction,PerformanceRating,WorkLifeBalance,YearsAtCompany
0,41,1,Travel_Rarely,1102,Sales,1,2,4,3,1,6
1,49,0,Travel_Frequently,279,Research & Development,8,3,2,4,3,10
2,37,1,Travel_Rarely,1373,Research & Development,2,4,3,3,3,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,3,3,3,8
4,27,0,Travel_Rarely,591,Research & Development,2,1,2,3,3,2


In [22]:
# Show the levels of each categorical feature, then one-hot encode those
# columns; the numeric columns pass through unchanged.
categorical_columns = ['BusinessTravel', 'Department']
for column in categorical_columns:
    print(df[column].unique())

ohe = pd.get_dummies(df, columns=categorical_columns)
ohe.head()


['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
['Sales' 'Research & Development' 'Human Resources']


Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,EnvironmentSatisfaction,JobSatisfaction,PerformanceRating,WorkLifeBalance,YearsAtCompany,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales
0,41,1,1102,1,2,4,3,1,6,0,0,1,0,0,1
1,49,0,279,8,3,2,4,3,10,0,1,0,0,1,0
2,37,1,1373,2,4,3,3,3,0,0,0,1,0,1,0
3,33,0,1392,3,4,3,3,3,8,0,1,0,0,1,0
4,27,0,591,2,1,2,3,3,2,0,0,1,0,1,0


In [23]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# target and uses a fixed random_state so it is repeatable.
attrition_features = ohe.drop(columns=['Attrition'])
attrition_target = ohe['Attrition']
X_train, X_test, y_train, y_test = train_test_split(
    attrition_features,
    attrition_target,
    test_size=0.2,
    stratify=attrition_target,
    random_state=10302000,
)

# Only the final expression of a cell is rendered, so the bare `X_train`,
# `X_test` and `y_train` statements that used to sit here displayed nothing.
y_test

554     0
1446    0
621     0
1200    0
1132    0
       ..
223     0
734     0
327     1
722     0
166     0
Name: Attrition, Length: 294, dtype: int64

In [24]:
# Binary logistic regression for Attrition with an L2 penalty (C=3).
# `multi_class` is no longer passed: 'auto' is what scikit-learn (>= 0.22)
# already does by default, and the parameter has been deprecated since
# scikit-learn 1.5, so passing it only triggers a warning (or an error once
# it is removed).
model = LogisticRegression(fit_intercept=True, solver='lbfgs', penalty='l2', C=3, max_iter=1000)
# If lbfgs reports a convergence problem, raise max_iter, try another solver,
# or scale the features first.
# C is the inverse of the regularization strength (smaller C = stronger
# penalty); other values are worth trying.

model.fit(X_train, y_train)

# Mean accuracy on the training data and labels
print(model.score(X_train, y_train))

# Coefficients Beta_1, ..., Beta_p: one per feature column of X_train
print(model.coef_)

# Intercept Beta_0
model.intercept_

0.8511904761904762
[[-4.44368645e-02 -4.72257794e-04  3.21494872e-02 -2.26418519e-01
  -2.80044836e-01 -1.24264270e-01 -3.38204798e-01 -7.29688050e-02
  -1.58730831e-01  1.29746361e+00  6.28321418e-01  8.02998087e-01
   1.76171928e-01  7.87884182e-01]]


array([1.91092442])

In [25]:
# Pair every held-out prediction with its true label, keyed by the original
# row index of the test split.
predicted_attrition = model.predict(X_test)
test_output = pd.DataFrame({'pred_Attrition': predicted_attrition}, index=X_test.index)
test_output = test_output.merge(y_test, left_index=True, right_index=True)

# model.score reports mean accuracy on the test split as a fraction in [0, 1].
print('Percentage of correct predictions is ')
test_accuracy = model.score(X_test, y_test)
print(test_accuracy)


Percentage of correct predictions is 
0.8367346938775511


In [26]:
# Preview the first rows of predicted vs. actual attrition.
test_output.head(n=5)

Unnamed: 0,pred_Attrition,Attrition
554,0,0
1446,0,0
621,0,0
1200,0,0
1132,0,0


In [27]:
# Bring MaritalStatus back from the raw frame; it is the target of the
# multiclass model fitted in the following cells.
# NOTE: this mutates `ohe`. Re-running the earlier Attrition split after this
# cell would pull the MaritalStatus string column into its feature matrix,
# because that split only drops 'Attrition'.
ohe['MaritalStatus'] = data['MaritalStatus']
print(pd.unique(ohe['MaritalStatus']))
ohe.head()

['Single' 'Married' 'Divorced']


Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,EnvironmentSatisfaction,JobSatisfaction,PerformanceRating,WorkLifeBalance,YearsAtCompany,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,MaritalStatus
0,41,1,1102,1,2,4,3,1,6,0,0,1,0,0,1,Single
1,49,0,279,8,3,2,4,3,10,0,1,0,0,1,0,Married
2,37,1,1373,2,4,3,3,3,0,0,0,1,0,1,0,Single
3,33,0,1392,3,4,3,3,3,8,0,1,0,0,1,0,Married
4,27,0,591,2,1,2,3,3,2,0,0,1,0,1,0,Married


In [28]:
# Hold out 20% of the rows for evaluation, stratified on MaritalStatus and
# using a fixed random_state so the split is repeatable. Every other column of
# `ohe` (including the encoded Attrition flag) is used as a feature.
# This rebinds X_train / X_test / y_train / y_test from the Attrition split.
marital_features = ohe.drop(columns=['MaritalStatus'])
marital_target = ohe['MaritalStatus']
X_train, X_test, y_train, y_test = train_test_split(
    marital_features,
    marital_target,
    test_size=0.2,
    stratify=marital_target,
    random_state=10302000,
)

# Only the final expression of a cell is rendered, so the bare `X_train`,
# `X_test` and `y_train` statements that used to sit here displayed nothing.
y_test

525      Single
219     Married
33      Married
792      Single
1312    Married
         ...   
1469    Married
430      Single
365     Married
889     Married
578      Single
Name: MaritalStatus, Length: 294, dtype: object

In [29]:
# Multiclass logistic regression for MaritalStatus with an L2 penalty (C=0.1).
# This rebinds `model`, replacing the Attrition classifier fitted earlier.
# `multi_class` is no longer passed: 'auto' is what scikit-learn (>= 0.22)
# already does by default, and the parameter has been deprecated since
# scikit-learn 1.5, so passing it only triggers a warning (or an error once
# it is removed).
model = LogisticRegression(fit_intercept=True, solver='newton-cg', penalty='l2', C=0.1, max_iter=10000)
# Alternative: solver='saga' with otherwise identical settings.
# If the solver reports a convergence problem, raise max_iter, try another
# solver, or scale the features first.
# C is the inverse of the regularization strength (smaller C = stronger
# penalty); other values are worth trying.

model.fit(X_train, y_train)

# Mean accuracy on the training data and labels
print(model.score(X_train, y_train))

# Coefficient matrix: one row per class (in model.classes_ order),
# one column per feature of X_train
print(model.coef_)

# One intercept per class
model.intercept_

0.4965986394557823
[[ 3.42105947e-03 -3.32392900e-01  1.69221542e-04  1.97237075e-03
   6.04323968e-03 -4.14536706e-02 -3.62112642e-02 -2.56143290e-02
   3.27980955e-03  1.17082812e-01 -1.84121893e-02 -9.86719572e-02
   1.52040412e-01 -2.83466275e-02 -1.23695119e-01]
 [ 1.19192811e-02 -2.80793165e-01  5.81140492e-05  3.91743466e-03
  -6.42608849e-02 -2.91113083e-02  3.81366203e-03 -1.11248394e-02
   1.88471458e-03 -9.85176262e-02 -9.63894378e-03  1.08157349e-01
   1.06123814e-01 -6.35520460e-02 -4.25709897e-02]
 [-1.53403406e-02  6.13186065e-01 -2.27335591e-04 -5.88980541e-03
   5.82176452e-02  7.05649789e-02  3.23976021e-02  3.67391684e-02
  -5.16452413e-03 -1.85651860e-02  2.80511331e-02 -9.48539134e-03
  -2.58164227e-01  9.18986735e-02  1.66266109e-01]]


array([-0.20556351,  0.13992114,  0.06564237])

In [30]:
# Pair every held-out prediction with its true label, keyed by the original
# row index of the test split.
predicted_status = model.predict(X_test)
test_output = pd.DataFrame({'pred_MarriageStatus': predicted_status}, index=X_test.index)
test_output = test_output.merge(y_test, left_index=True, right_index=True)

# model.score reports mean accuracy on the test split as a fraction in [0, 1].
print('Percentage of correct predictions is ')
test_accuracy = model.score(X_test, y_test)
print(test_accuracy)

Percentage of correct predictions is 
0.46938775510204084


In [31]:
# Preview the first rows of predicted vs. actual marital status.
test_output.head(n=5)

Unnamed: 0,pred_MarriageStatus,MaritalStatus
525,Single,Single
219,Married,Married
33,Single,Married
792,Single,Single
1312,Single,Married
