In [1]:
# 데이터 분석
import pandas as pd
import numpy as np

# 통계
import pingouin as pg
from scipy.stats import chisquare

# 시각화
import seaborn as sns
import matplotlib.pyplot as plt

# 컬럼 출력 생략 해제
pd.set_option('display.max_columns', None)

# 시각화 한글 표시
import koreanize_matplotlib

# 그래프에 retina display 적용
%config InlineBackend.figure_format = 'retina'

# 머신러닝
from sklearn import preprocessing

In [2]:
# 데이터 로드
df_raw = pd.read_csv("https://raw.githubusercontent.com/JounKK/AIS8_Final_HRA/main/dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv", index_col="EmployeeNumber")
df_raw.shape

(1470, 34)

In [3]:
# 불필요한 컬럼 제거
df_raw = df_raw.drop(columns=['Over18', 'EmployeeCount', 'StandardHours','DailyRate', 'HourlyRate', 'MonthlyRate'])
df_raw.shape

(1470, 28)

In [4]:
# 수치형 변수 모음
number_cols = ['Age', 'DistanceFromHome', 'MonthlyIncome', 'NumCompaniesWorked',
               'PercentSalaryHike', 'TotalWorkingYears', 'TrainingTimesLastYear',
               'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
               'YearsWithCurrManager']
number_cols

['Age',
 'DistanceFromHome',
 'MonthlyIncome',
 'NumCompaniesWorked',
 'PercentSalaryHike',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']

In [7]:
# 순서형 변수 모음
ordinal_cols = ['Education', 'EnvironmentSatisfaction', 'JobInvolvement',
                'JobLevel', 'JobSatisfaction', 'PerformanceRating',
                'RelationshipSatisfaction', 'StockOptionLevel', 'WorkLifeBalance']
ordinal_cols

['Education',
 'EnvironmentSatisfaction',
 'JobInvolvement',
 'JobLevel',
 'JobSatisfaction',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StockOptionLevel',
 'WorkLifeBalance']

In [6]:
# 범주형 변수 모음
cat_cols = ['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']
cat_cols

['Attrition',
 'BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'OverTime']

In [8]:
# 수치형 + 범주형 변수 모음
cat_ord_list = ordinal_cols + cat_cols
cat_ord_list

['Education',
 'EnvironmentSatisfaction',
 'JobInvolvement',
 'JobLevel',
 'JobSatisfaction',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StockOptionLevel',
 'WorkLifeBalance',
 'Attrition',
 'BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'OverTime']

In [14]:
# Attrition에 따른 수치형 변수간 t-test
# Attrition은 각각의 수치형 변수에 대해 유의미한 차이가 있는가?


try:
    for col in number_cols:
        Attrition_yes = df_raw[df_raw['Attrition'] == 'Yes'][col]
        Attrition_no = df_raw[df_raw['Attrition'] == 'No'][col]
        display(col, pg.ttest(Attrition_yes, Attrition_no))
except:
    pass

'Age'

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-5.828012,316.931112,two-sided,1.37976e-08,"[-5.29, -2.62]",0.438225,1181000.0,1.0


'DistanceFromHome'

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.888183,322.724279,two-sided,0.004137,"[0.55, 2.89]",0.212401,4.71,0.849148


'MonthlyIncome'

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-7.482622,412.740748,two-sided,4.433589e-13,"[-2583.05, -1508.24]",0.440018,44710000000.0,1.0


'NumCompaniesWorked'

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.574651,317.138829,two-sided,0.116334,"[-0.07, 0.66]",0.118305,0.267,0.384891


'PercentSalaryHike'

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.504245,326.107117,two-sided,0.61443,"[-0.66, 0.39]",0.03663,0.09,0.081033


'TotalWorkingYears'

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-7.019179,350.876864,two-sided,1.159817e-11,"[-4.63, -2.6]",0.471815,1817000000.0,1.0


'TrainingTimesLastYear'

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.330522,339.557098,two-sided,0.020364,"[-0.38, -0.03]",0.161916,1.135,0.626075


'YearsAtCompany'

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-5.282596,338.213101,two-sided,2.285905e-07,"[-3.07, -1.4]",0.368551,63410.0,0.999387


'YearsInCurrentRole'

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-6.847079,366.568825,two-sided,3.18739e-11,"[-2.04, -1.13]",0.442011,581100000.0,1.0


'YearsSinceLastPromotion'

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.287927,338.491595,two-sided,0.198651,"[-0.73, 0.15]",0.089777,0.179,0.244156


'YearsWithCurrManager'

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-6.633399,365.098101,two-sided,1.185022e-10,"[-1.96, -1.07]",0.429743,146400000.0,1.0


* 주목해야할 유의미한 컬럼:

In [11]:
# Attrition에 따른 범주형(순서형) 변수간 카이제곱 검정
# Attrition은 각각의 범주형 변수에 대해 유의미한 차이가 있는가?

try:
    for col in cat_ord_list:
        expected, observed, stats = pg.chi2_independence(x='Attrition', y=col, data=df_raw)
        display(df_raw.pivot_table(index=col, columns='Attrition', aggfunc='size'))
        display(stats)
except:
    pass

Attrition,No,Yes
Education,Unnamed: 1_level_1,Unnamed: 2_level_1
1,139,31
2,238,44
3,473,99
4,340,58
5,43,5


Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,3.073961,4.0,0.545525,0.045729,0.24971
1,cressie-read,0.666667,3.11246,4.0,0.539185,0.046014,0.252603
2,log-likelihood,0.0,3.199999,4.0,0.524931,0.046657,0.259195
3,freeman-tukey,-0.5,3.276024,4.0,0.51274,0.047208,0.264935
4,mod-log-likelihood,-1.0,3.362199,4.0,0.499138,0.047825,0.271459
5,neyman,-2.0,3.570535,4.0,0.467235,0.049284,0.287287


Attrition,No,Yes
EnvironmentSatisfaction,Unnamed: 1_level_1,Unnamed: 2_level_1
1,212,72
2,244,43
3,391,62
4,386,60


Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,22.503881,3.0,5.1e-05,0.123729,0.986916
1,cressie-read,0.666667,21.798382,3.0,7.2e-05,0.121774,0.98427
2,log-likelihood,0.0,20.575349,3.0,0.000129,0.118308,0.978454
3,freeman-tukey,-0.5,19.802549,3.0,0.000187,0.116065,0.973799
4,mod-log-likelihood,-1.0,19.137295,3.0,0.000256,0.114099,0.969058
5,neyman,-2.0,18.081705,3.0,0.000423,0.110908,0.959879


Attrition,No,Yes
JobInvolvement,Unnamed: 1_level_1,Unnamed: 2_level_1
1,55,28
2,304,71
3,743,125
4,131,13


Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,28.492021,3.0,3e-06,0.13922,0.997445
1,cressie-read,0.666667,27.348486,3.0,5e-06,0.136398,0.996479
2,log-likelihood,0.0,25.67222,3.0,1.1e-05,0.132152,0.994405
3,freeman-tukey,-0.5,24.869287,3.0,1.6e-05,0.130069,0.993038
4,mod-log-likelihood,-1.0,24.401459,3.0,2.1e-05,0.12884,0.992101
5,neyman,-2.0,24.360116,3.0,2.1e-05,0.12873,0.992012


Attrition,No,Yes
JobLevel,Unnamed: 1_level_1,Unnamed: 2_level_1
1,400,143
2,482,52
3,186,32
4,101,5
5,64,5


Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,72.529013,4.0,6.634685e-15,0.222125,1.0
1,cressie-read,0.666667,72.380296,4.0,7.132576e-15,0.221897,1.0
2,log-likelihood,0.0,73.441263,4.0,4.256096e-15,0.223517,1.0
3,freeman-tukey,-0.5,75.620927,4.0,1.472561e-15,0.22681,1.0
4,mod-log-likelihood,-1.0,79.289566,4.0,2.463169e-16,0.232247,1.0
5,neyman,-2.0,93.089319,4.0,2.9041089999999996e-19,0.251647,1.0


Attrition,No,Yes
JobSatisfaction,Unnamed: 1_level_1,Unnamed: 2_level_1
1,223,66
2,234,46
3,369,73
4,407,52


Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,17.505077,3.0,0.000556,0.109125,0.953865
1,cressie-read,0.666667,17.422055,3.0,0.000579,0.108866,0.952934
2,log-likelihood,0.0,17.356477,3.0,0.000597,0.108661,0.952186
3,freeman-tukey,-0.5,17.394341,3.0,0.000586,0.108779,0.952619
4,mod-log-likelihood,-1.0,17.50701,3.0,0.000556,0.109131,0.953886
5,neyman,-2.0,17.962352,3.0,0.000448,0.110541,0.958697


Attrition,No,Yes
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1
3,1044,200
4,189,37


Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,0.000155,1.0,0.990075,0.000324,0.050018
1,cressie-read,0.666667,0.000155,1.0,0.990075,0.000324,0.050018
2,log-likelihood,0.0,0.000155,1.0,0.990076,0.000324,0.050018
3,freeman-tukey,-0.5,0.000155,1.0,0.990077,0.000324,0.050018
4,mod-log-likelihood,-1.0,0.000155,1.0,0.990078,0.000324,0.050018
5,neyman,-2.0,0.000155,1.0,0.99008,0.000324,0.050018


Attrition,No,Yes
RelationshipSatisfaction,Unnamed: 1_level_1,Unnamed: 2_level_1
1,219,57
2,258,45
3,388,71
4,368,64


Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,5.241068,3.0,0.154972,0.059711,0.459687
1,cressie-read,0.666667,5.151907,3.0,0.161006,0.0592,0.452626
2,log-likelihood,0.0,4.986337,3.0,0.1728,0.058241,0.439414
3,freeman-tukey,-0.5,4.872546,3.0,0.181371,0.057573,0.430263
4,mod-log-likelihood,-1.0,4.766982,3.0,0.189677,0.056946,0.421724
5,neyman,-2.0,4.578356,3.0,0.205407,0.055808,0.406358


Attrition,No,Yes
StockOptionLevel,Unnamed: 1_level_1,Unnamed: 2_level_1
0,477,154
1,540,56
2,146,12
3,70,15


Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,60.598301,3.0,4.37939e-13,0.203035,1.0
1,cressie-read,0.666667,60.722933,3.0,4.118909e-13,0.203244,1.0
2,log-likelihood,0.0,61.69301,3.0,2.555448e-13,0.204861,1.0
3,freeman-tukey,-0.5,63.102773,3.0,1.276724e-13,0.207188,1.0
4,mod-log-likelihood,-1.0,65.167963,3.0,4.617756e-14,0.210552,1.0
5,neyman,-2.0,71.64999,3.0,1.891863e-15,0.220775,1.0


Attrition,No,Yes
WorkLifeBalance,Unnamed: 1_level_1,Unnamed: 2_level_1
1,55,25
2,286,58
3,766,127
4,126,27


Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,16.325097,3.0,0.000973,0.105383,0.938916
1,cressie-read,0.666667,15.47368,3.0,0.001454,0.102598,0.925542
2,log-likelihood,0.0,14.073324,3.0,0.002807,0.097845,0.897793
3,freeman-tukey,-0.5,13.240219,3.0,0.004145,0.094905,0.877285
4,mod-log-likelihood,-1.0,12.557144,3.0,0.005699,0.092424,0.857907
5,neyman,-2.0,11.544252,3.0,0.009119,0.088618,0.824424




Attrition,No,Yes
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1
No,1233.0,
Yes,,237.0


Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,1462.614554,1.0,0.0,0.997485,1.0
1,cressie-read,0.666667,1282.801721,1.0,6.175441e-281,0.93416,1.0
2,log-likelihood,0.0,1282.612399,1.0,6.789077e-281,0.934091,1.0
3,freeman-tukey,-0.5,1807.910027,1.0,0.0,1.108995,1.0
4,mod-log-likelihood,-1.0,4257.224764,1.0,0.0,1.701785,1.0
5,neyman,-2.0,157473.527396,1.0,0.0,10.350113,1.0


Attrition,No,Yes
BusinessTravel,Unnamed: 1_level_1,Unnamed: 2_level_1
Non-Travel,138,12
Travel_Frequently,208,69
Travel_Rarely,887,156


Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,24.182414,2.0,6e-06,0.12826,0.995516
1,cressie-read,0.666667,23.942197,2.0,6e-06,0.127621,0.99519
2,log-likelihood,0.0,23.760237,2.0,7e-06,0.127135,0.994928
3,freeman-tukey,-0.5,23.89847,2.0,6e-06,0.127505,0.995128
4,mod-log-likelihood,-1.0,24.29439,2.0,5e-06,0.128557,0.99566
5,neyman,-2.0,25.994898,2.0,2e-06,0.13298,0.997375


Attrition,No,Yes
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Human Resources,51,12
Research & Development,828,133
Sales,354,92


Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,10.796007,2.0,0.004526,0.085698,0.845592
1,cressie-read,0.666667,10.686054,2.0,0.004781,0.085261,0.841692
2,log-likelihood,0.0,10.490345,2.0,0.005273,0.084477,0.834544
3,freeman-tukey,-0.5,10.363646,2.0,0.005618,0.083965,0.829771
4,mod-log-likelihood,-1.0,10.253226,2.0,0.005937,0.083516,0.825517
5,neyman,-2.0,10.078297,2.0,0.006479,0.082801,0.818594




Attrition,No,Yes
EducationField,Unnamed: 1_level_1,Unnamed: 2_level_1
Human Resources,20,7
Life Sciences,517,89
Marketing,124,35
Medical,401,63
Other,71,11
Technical Degree,100,32


Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,16.024674,5.0,0.006774,0.104409,0.890712
1,cressie-read,0.666667,15.617228,5.0,0.008026,0.103073,0.881563
2,log-likelihood,0.0,14.900147,5.0,0.010798,0.100678,0.863876
3,freeman-tukey,-0.5,14.438741,5.0,0.01305,0.099107,0.85136
4,mod-log-likelihood,-1.0,14.035241,5.0,0.015387,0.097713,0.839645
5,neyman,-2.0,13.379343,5.0,0.020072,0.095402,0.818991


Attrition,No,Yes
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,501,87
Male,732,150


Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,1.116967,1.0,0.290572,0.027565,0.184514
1,cressie-read,0.666667,1.119705,1.0,0.289982,0.027599,0.184853
2,log-likelihood,0.0,1.125434,1.0,0.288751,0.02767,0.18556
3,freeman-tukey,-0.5,1.129956,1.0,0.287785,0.027725,0.186119
4,mod-log-likelihood,-1.0,1.134673,1.0,0.286781,0.027783,0.186702
5,neyman,-2.0,1.144708,1.0,0.28466,0.027905,0.187942




Attrition,No,Yes
JobRole,Unnamed: 1_level_1,Unnamed: 2_level_1
Healthcare Representative,122,9
Human Resources,40,12
Laboratory Technician,197,62
Manager,97,5
Manufacturing Director,135,10
Research Director,78,2
Research Scientist,245,47
Sales Executive,269,57
Sales Representative,50,33


Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,86.190254,8.0,2.752482e-15,0.242142,1.0
1,cressie-read,0.666667,85.838001,8.0,3.243442e-15,0.241647,1.0
2,log-likelihood,0.0,88.908721,8.0,7.743392e-16,0.245931,1.0
3,freeman-tukey,-0.5,95.201061,8.0,4.0708290000000006e-17,0.254485,1.0
4,mod-log-likelihood,-1.0,106.301702,8.0,2.187725e-19,0.268913,1.0
5,neyman,-2.0,155.087408,8.0,1.700565e-29,0.32481,1.0


Attrition,No,Yes
MaritalStatus,Unnamed: 1_level_1,Unnamed: 2_level_1
Divorced,294,33
Married,589,84
Single,350,120


Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,46.163677,2.0,9.455511e-11,0.177211,0.999996
1,cressie-read,0.666667,45.30014,2.0,1.456128e-10,0.175546,0.999995
2,log-likelihood,0.0,43.999698,2.0,2.78989e-10,0.173008,0.999992
3,freeman-tukey,-0.5,43.373275,2.0,3.816041e-10,0.171772,0.99999
4,mod-log-likelihood,-1.0,43.029524,2.0,4.531663e-10,0.17109,0.999988
5,neyman,-2.0,43.161501,2.0,4.242278e-10,0.171352,0.999989


Attrition,No,Yes
OverTime,Unnamed: 1_level_1,Unnamed: 2_level_1
No,944,110
Yes,289,127


Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,87.564294,1.0,8.158424e-21,0.244065,1.0
1,cressie-read,0.666667,84.576205,1.0,3.6968769999999997e-20,0.239864,1.0
2,log-likelihood,0.0,80.079543,1.0,3.5963659999999997e-19,0.233401,1.0
3,freeman-tukey,-0.5,77.830595,1.0,1.122677e-18,0.2301,1.0
4,mod-log-likelihood,-1.0,76.421211,1.0,2.291765e-18,0.228007,1.0
5,neyman,-2.0,75.828553,1.0,3.093947e-18,0.227121,1.0


* 주목해야할 유의미한 컬럼: