In [416]:
%matplotlib inline
import math
import numpy as np
import pandas as pd
from numpy import std
from numpy import mean
import scipy.stats as stats
import matplotlib.pyplot as plt

from collections import defaultdict
from scipy.stats.stats import pearsonr

In [397]:
data_directory = "../data/"
TR_file = data_directory + "Train_HR_Employee_Attrition.csv"
TS_file = data_directory + "Test_HR_Employee_Attrition.csv"

In [398]:
df = pd.read_csv(TR_file) 

In [399]:
df.head()  # first rows sneak peek

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,52.0,No,Travel_Rarely,593,Research & Development,9,4,Medical,2,Male,...,3,80.0,0,20,2.0,2,8.0,7,2,13
1,36.0,No,Travel_Rarely,1218,Research & Development,1,1,Life Sciences,2,Male,...,3,80.0,1,21,3.0,3,1.0,8,1,6
2,,No,Travel_Frequently,530,Sales,16,3,Life Sciences,3,,...,3,,1,7,4.0,3,1.0,5,1,2
3,33.0,No,Travel_Rarely,953,Research & Development,5,4,Technical Degree,2,Male,...,2,,0,12,1.0,3,3.0,7,0,7
4,,No,Travel_Rarely,1380,Research & Development,9,2,Life Sciences,3,Female,...,3,80.0,0,2,,3,10.0,2,2,1


In [400]:
df.info()  # columns non-null elements and type

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1176 entries, 0 to 1175
Data columns (total 33 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       1000 non-null   float64
 1   Attrition                 1176 non-null   object 
 2   BusinessTravel            1069 non-null   object 
 3   DailyRate                 1176 non-null   int64  
 4   Department                1176 non-null   object 
 5   DistanceFromHome          1176 non-null   int64  
 6   Education                 1176 non-null   int64  
 7   EducationField            1176 non-null   object 
 8   EnvironmentSatisfaction   1176 non-null   int64  
 9   Gender                    1117 non-null   object 
 10  HourlyRate                1176 non-null   int64  
 11  JobInvolvement            1176 non-null   int64  
 12  JobLevel                  1176 non-null   int64  
 13  JobRole                   1176 non-null   object 
 14  JobSatis

<h6> Identify Rows That Contain Duplicate Data </h6>

In [401]:
# calculate duplicated rows
dups = df.duplicated()
# report if there are any duplicates
dups.any()

# report if there are any duplicates
# print(dups.any())

False

<h6> Idenotfy zero-variance predictors (columns which contains a single value) => no variation displayed by the predictor, so I'll remove them </h6>

In [402]:
# summarize the number of unique values in each column
counts = df.nunique()
zero_variance_predictors = [df.columns.values.tolist()[int(i)] for i, c in enumerate(counts) if c == 1]  
for column_name in zero_variance_predictors:
    print("zero_variance_predictors removed:", column_name, sep="\t")
    del df[column_name]  # remove zero_variance_predictors
df.shape

zero_variance_predictors removed:	Over18
zero_variance_predictors removed:	StandardHours


(1176, 31)

### N.B.
Over18 derived by column "Age" <br>
StandardHours derived by column "OverTime" (is specified StandardHours is equal to 80)

<h6> Indetify near-zero variance predictors (columns which contains dummies binary variables) => this predictors may
or may not contribute to the skill of a model</h6>
 Perhaps the unique values can be encoded as ordinal values? <br>
 Perhaps the unique values can be encoded as categorical values? <br>
 Perhaps compare model skill with each variable removed from the dataset?

In [403]:
# summarize the number of low variance values in each column (idealy binary of tridic dummy variables),
# by counting each column's unique values, (the low variance values will show as < 1%,
# when divinding the county by the rows number)
near_zero_variance_predictors = [i for i, c in enumerate(counts) if (float(c)/df.shape[0]*100) < 1]  
print("near_zero_variance_predictors = ", len(near_zero_variance_predictors))
print("near_zero_variance_predictors' unique values:")
for column_index in near_zero_variance_predictors:
    column_name = df.columns.values.tolist()[int(column_index)]
    print(column_name + ":", df.iloc[:, column_index].unique(), sep="\t")

near_zero_variance_predictors =  21
near_zero_variance_predictors' unique values:
Attrition:	['No' 'Yes']
BusinessTravel:	['Travel_Rarely' 'Travel_Frequently' nan 'Non-Travel']
Department:	['Research & Development' 'Sales' 'Human Resources']
Education:	[4 1 3 2 5]
EducationField:	['Medical' 'Life Sciences' 'Technical Degree' 'Other' 'Human Resources'
 'Marketing']
EnvironmentSatisfaction:	[2 3 4 1]
Gender:	['Male' nan 'Female']
JobInvolvement:	[3 4 1 2]
JobLevel:	[3 5 2 1 4]
JobRole:	['Research Director' 'Manager' 'Sales Executive' 'Research Scientist'
 'Laboratory Technician' 'Sales Representative' 'Manufacturing Director'
 'Healthcare Representative' 'Human Resources']
JobSatisfaction:	[3 4 2 1]
MaritalStatus:	['Single' 'Divorced' 'Married']
NumCompaniesWorked:	[9 1 2 0 3 8 7 4 5 6]
OverTime:	['No' 'Yes']
PercentSalaryHike:	[17 14 25 12 11 18 20 13 21 15 19 22 16 24 23]
RelationshipSatisfaction:	[3 2 4 1]
StockOptionLevel:	[0 1 3 2]
TotalWorkingYears:	[20 21  7 12  2  4  6 13 19  1 1

<h6> Encode object values as categorical values (they are actually all near-zero variance predictors in our case) </h6>

In [404]:
data_types= df.dtypes
print("Object values encodings:")
for column_name, column_type in dataTypeDict.items():
    if column_type == np.object:
        column_index = df.columns.get_loc(column_name)
        unique_values = df.iloc[:, column_index].unique()
        replacement_dict = {}
        
        if column_name == "BusinessTravel":
            replacement_dict = {"Non-Travel": 0, "Travel_Rarely": 1, "Travel_Frequently": 2}
        elif column_name == "MaritalStatus":
            replacement_dict = {"Single": 0, "Married": 1, "Divorced": 2}
        else:
            i = 0
            other_flag = False
            for v in unique_values:
                if pd.isnull(v) is False:
                    if v == "Other":
                        other_flag = True
                    else:
                        replacement_dict[str(v)] = int(i)
                        i += 1
            if other_flag:
                replacement_dict["Other"] = i
        print(replacement_dict)
        df[column_name].replace(replacement_dict, inplace=True)

Object values encodings:
{'No': 0, 'Yes': 1}
{'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2}
{'Research & Development': 0, 'Sales': 1, 'Human Resources': 2}
{'Medical': 0, 'Life Sciences': 1, 'Technical Degree': 2, 'Human Resources': 3, 'Marketing': 4, 'Other': 5}
{'Male': 0, 'Female': 1}
{'Research Director': 0, 'Manager': 1, 'Sales Executive': 2, 'Research Scientist': 3, 'Laboratory Technician': 4, 'Sales Representative': 5, 'Manufacturing Director': 6, 'Healthcare Representative': 7, 'Human Resources': 8}
{'Single': 0, 'Married': 1, 'Divorced': 2}
{'No': 0, 'Yes': 1}


<h6> Analyzing variables distributions </h6>

In [None]:
def draw_hist(x, x_label, y_label, title, pngfile):
    

In [None]:
column_names = list(df.columns)
print("column_name" , "mean", "std", "outliers", sep="\t")
for column_name in column_names:
    data = df[column_name].dropna()  # get each column's values, omitting NaN values 
    draw_hist(data, x_label, y_label, title, pngfile)

<h6> Outliers removing </h6>

<h6> Standard Deviation Method (for Guassian distributed values) </h6>

In [433]:
# calculate summary statistics
column_names = list(df.columns)
print("column_name" , "mean", "std", "outliers", sep="\t")
for column_name in column_names:
    data = df[column_name].dropna()  # get each column's values, omitting NaN values 
    data_mean, data_std = mean(data), std(data)
    # define outliers as values, three standard deviations from the mean (99.7%)
    cut_off = data_std * 3
    lower, upper = data_mean - cut_off, data_mean + cut_off
    # identify outliers
    outliers = [x for x in data if x < lower or x > upper]
    if  len(outliers) > 0:
        print(column_name , data_mean, data_std, outliers, sep="\t")

column_name	mean	std	outliers
TotalWorkingYears	11.019557823129253	7.6915753768177355	[37, 40, 35, 37, 36, 36, 36, 36, 37, 37, 35, 36, 36]
YearsAtCompany	6.926523297491039	6.0604759053458945	[34.0, 31.0, 31.0, 31.0, 29.0, 32.0, 40.0, 33.0, 26.0, 32.0, 37.0, 36.0, 26.0, 30.0, 33.0, 33.0, 33.0]
YearsInCurrentRole	4.188775510204081	3.635858337114303	[17, 16, 18, 16, 17, 16, 16, 16, 16, 16]
YearsSinceLastPromotion	2.171768707482993	3.1884280989889096	[14, 14, 12, 13, 15, 15, 15, 12, 13, 14, 15, 13, 13, 13, 14, 15, 15, 12, 14, 13, 14, 12, 15, 13, 12, 15, 14, 13, 12, 14, 13]
YearsWithCurrManager	4.107993197278912	3.599565929984835	[15, 15, 17, 15, 15, 16, 17, 17, 17, 17, 16]


<h6> Interquartile Range Method (non-Gaussian distributed values) </h6>

<h6> Replacing missing values </h6>

In [405]:
null_columns = df.isnull().sum()
nan_columns = []
print("num_null_columns = ", len(null_columns), sep="\t")
print("null_columns:\n")
for column_name, missing_values in null_columns.items():
    if missing_values > 0:
        print(column_name, missing_values, sep="\t")
        nan_columns.append(column_name)

num_null_columns = 	31
null_columns:

Age	176
BusinessTravel	107
Gender	59
MonthlyIncome	213
PerformanceRating	138
TrainingTimesLastYear	233
YearsAtCompany	60


@TODO: continue

In [383]:
df.describe()  # columns statistics

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1000.0,1176.0,1069.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1117.0,...,1038.0,1176.0,1176.0,1176.0,943.0,1176.0,1116.0,1176.0,1176.0,1176.0
mean,37.199,0.163265,1.073901,803.65051,0.385204,9.210034,2.884354,1.348639,2.715986,0.405551,...,3.152216,2.702381,0.783163,11.019558,2.827147,2.755952,6.926523,4.188776,2.171769,4.107993
std,9.015802,0.369765,0.529258,406.683045,0.561535,8.097024,1.016574,1.478083,1.088876,0.491218,...,0.359403,1.092268,0.851385,7.694848,1.27312,0.707984,6.063193,3.637405,3.189785,3.601097
min,18.0,0.0,0.0,102.0,0.0,1.0,1.0,0.0,1.0,0.0,...,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,0.0,1.0,460.5,0.0,2.0,2.0,0.0,2.0,0.0,...,3.0,2.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,0.0,1.0,804.0,0.0,7.0,3.0,1.0,3.0,0.0,...,3.0,3.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,0.0,1.0,1169.0,1.0,14.0,4.0,2.0,4.0,1.0,...,3.0,4.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1.0,2.0,1499.0,2.0,29.0,5.0,5.0,4.0,1.0,...,4.0,4.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


<h2> Correlation </h2>

<h6> Person correlation coefficient </h6> <br>
$\rho_{xy} = \dfrac{Cov(x, y)}{\sigma_{x}, \sigma_{y}}$ <br>
where: <br>
- $Cov(x, y)$ is the covariance between variable x and y; <br>
- $\sigma_{x}$ and $\sigma_{y}$ are variables' standard deviations. <br>

Values always range between -1 (strong negative relationship) and +1 (strong positive relationship). Values at or close to zero imply weak or no linear relationship.

In [37]:
df.corr("pearson") # pearson correlation

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
Age,1.0,-0.027961,0.008081,0.005951,0.026259,0.045007,0.005358,0.040556,-0.056664,0.496275,...,0.028624,,0.033101,0.023891,-0.017973,0.011215,0.330892,0.039994,0.006215,0.012462
DailyRate,-0.027961,1.0,0.014447,-0.014759,0.047369,0.007709,0.049508,-0.009872,0.044478,-0.030414,...,0.002869,,0.040397,0.015554,-0.018077,-0.046024,-0.016744,0.004506,-0.032954,-0.015054
DistanceFromHome,0.008081,0.014447,1.0,0.021564,0.00098,0.037306,-0.009932,0.020217,0.004533,0.00069,...,0.013618,,0.072908,0.00869,-0.00652,-0.02683,0.044862,0.014401,0.004995,0.006839
Education,0.005951,-0.014759,0.021564,1.0,-0.054301,0.007589,0.039782,0.086789,-0.036841,0.008964,...,0.004234,,0.001485,0.140639,-0.006407,0.02579,0.022576,0.053092,0.032377,0.066882
EnvironmentSatisfaction,0.026259,0.047369,0.00098,-0.054301,1.0,-0.04389,-0.007997,0.036607,-0.015897,-0.053153,...,0.016884,,0.020726,0.030425,-0.008389,0.034762,-0.016126,0.030739,0.036111,0.014774
HourlyRate,0.045007,0.007709,0.037306,0.007589,-0.04389,1.0,0.03284,-0.043203,-0.088872,-0.01453,...,0.004297,,0.026602,-0.017562,-0.013365,-0.015012,0.007165,-0.027806,-0.038817,-0.025352
JobInvolvement,0.005358,0.049508,-0.009932,0.039782,-0.007997,0.03284,1.0,-0.022649,-0.025413,0.011389,...,0.054873,,0.042657,-0.022687,-0.023995,-0.026684,-0.02896,0.002518,-0.032253,0.022631
JobLevel,0.040556,-0.009872,0.020217,0.086789,0.036607,-0.043203,-0.022649,1.0,-0.007627,0.014883,...,-0.009148,,0.003197,0.774492,-0.001041,0.03158,0.014837,0.402092,0.348876,0.39179
JobSatisfaction,-0.056664,0.044478,0.004533,-0.036841,-0.015897,-0.088872,-0.025413,-0.007627,1.0,0.019431,...,-0.024167,,-0.0117,-0.025719,0.04176,-0.032101,-0.012542,0.002989,-0.034122,-0.032654
MonthlyIncome,0.496275,-0.030414,0.00069,0.008964,-0.053153,-0.01453,0.011389,0.014883,0.019431,1.0,...,-0.019264,,-0.032897,-0.005489,0.013194,0.027635,0.513209,0.006859,-0.021624,-0.023804


<h6> Sperarman correlation coefficient </h6> <br>
$r = \rho_{rg_{x}rg_{y}} = \dfrac{Cov(rg_{x}, rg_{y})}{\sigma_{rg_{x}}, \sigma_{rg_{y}}}$ <br>
where: <br>
- $rg_{x}$ and $rg_{y}$ are variable's x and y rankings; <br>
- $Cov(rg_{x}, rg_{y})$ is rankings' covariance; <br>
- $\sigma_{rg_{x}}$ and $\sigma_{rg_{y}}$ are rankings' standard deviations. <br>

Values always range between -1 (strong negative relationship) and +1 (strong positive relationship). Values at or close to zero imply weak or no relationship.

In [38]:
df.corr("spearman")  # spearman correlation

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
Age,1.0,-0.036965,-0.000851,0.004444,0.026231,0.035865,-0.0005,0.049527,-0.058197,0.490325,...,0.022757,,0.015047,0.042117,0.005793,0.0149,0.262058,0.052693,-0.00619,0.025985
DailyRate,-0.036965,1.0,0.015002,-0.011829,0.047641,0.007962,0.047062,0.00194,0.042839,-0.002856,...,0.005834,,0.031334,0.027126,-0.020247,-0.051521,-0.017722,0.002314,-0.039053,0.003623
DistanceFromHome,-0.000851,0.015002,1.0,0.012412,0.007325,0.033866,0.010069,0.028471,-0.005883,-0.051726,...,0.013857,,0.059409,-0.002116,-0.028747,-0.012463,-0.00706,0.005639,-0.016885,-0.00752
Education,0.004444,-0.011829,0.012412,1.0,-0.056361,0.006465,0.033177,0.091429,-0.030685,-0.000722,...,-0.003219,,-0.006269,0.15273,-0.003441,0.032127,0.009163,0.05508,0.016847,0.054961
EnvironmentSatisfaction,0.026231,0.047641,0.007325,-0.056361,1.0,-0.048333,-0.015574,0.022812,-0.010098,-0.046849,...,0.014275,,0.026173,0.010198,-0.002576,0.037566,-0.013881,0.023721,0.032848,0.015152
HourlyRate,0.035865,0.007962,0.033866,0.006465,-0.048333,1.0,0.034801,-0.046816,-0.08511,-0.020359,...,0.003246,,0.020598,-0.021129,-0.030178,-0.023704,0.024465,-0.041148,-0.066429,-0.024488
JobInvolvement,-0.0005,0.047062,0.010069,0.033177,-0.015574,0.034801,1.0,-0.02536,-0.019122,-0.008752,...,0.054147,,0.053401,-0.004387,-0.023579,-0.033537,-0.03136,0.015284,-0.007876,0.034067
JobLevel,0.049527,0.00194,0.028471,0.091429,0.022812,-0.046816,-0.02536,1.0,-0.005741,0.006889,...,-0.018697,,0.04155,0.728532,0.007078,0.033761,0.010556,0.405445,0.269422,0.382773
JobSatisfaction,-0.058197,0.042839,-0.005883,-0.030685,-0.010098,-0.08511,-0.019122,-0.005741,1.0,0.007257,...,-0.026042,,-0.009334,-0.01958,0.036754,-0.040886,-0.027073,0.011332,0.013585,-0.014596
MonthlyIncome,0.490325,-0.002856,-0.051726,-0.000722,-0.046849,-0.020359,-0.008752,0.006889,0.007257,1.0,...,-0.00596,,-0.057848,0.007956,-0.005433,0.027256,0.466998,-0.003083,-0.01796,-0.019043


<h6> Kendall correlation coefficient </h6> <br>
$\tau_{xy} = \dfrac{n_{c} - n_{d}}{n(n - 1) / 2}$ <br>
where: <br>
- $n_{c}$ and $n_{d}$ are the number of concordan or discordant pairs $(x_{i}, y_{i})$, i=1,...,n; <br>
- $n(n - 1) / 2$ is the total number of pairings. <br>

Values always range between -1 (strong negative relationship) and +1 (strong positive relationship). Values at or close to zero imply weak or no relationship.

In [39]:
df.corr("kendall")  # kendall correlation

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
Age,1.0,-0.024706,-0.00071,0.00351,0.019627,0.024958,-0.000446,0.037787,-0.043486,0.346631,...,0.016985,,0.011376,0.029634,0.005047,0.01111,0.192403,0.038305,-0.004435,0.018457
DailyRate,-0.024706,1.0,0.010136,-0.00869,0.035774,0.005659,0.036792,0.001469,0.031775,-0.000978,...,0.004106,,0.024293,0.018779,-0.01587,-0.040005,-0.011894,0.001373,-0.028271,0.002252
DistanceFromHome,-0.00071,0.010136,1.0,0.009847,0.005731,0.024087,0.008202,0.022821,-0.004455,-0.036369,...,0.010615,,0.047007,-0.000817,-0.022331,-0.009914,-0.004695,0.004236,-0.01263,-0.005538
Education,0.00351,-0.00869,0.009847,1.0,-0.047294,0.004969,0.028904,0.077564,-0.025655,-0.000562,...,-0.002615,,-0.005555,0.117324,-0.003054,0.027906,0.007023,0.043763,0.01368,0.043425
EnvironmentSatisfaction,0.019627,0.035774,0.005731,-0.047294,1.0,-0.036222,-0.013634,0.019585,-0.008692,-0.035201,...,0.012021,,0.022721,0.008412,-0.002165,0.032852,-0.010448,0.018718,0.026963,0.011901
HourlyRate,0.024958,0.005659,0.024087,0.004969,-0.036222,1.0,0.027,-0.03563,-0.064327,-0.013624,...,0.00274,,0.016399,-0.014251,-0.022608,-0.018703,0.016906,-0.028387,-0.049362,-0.017392
JobInvolvement,-0.000446,0.036792,0.008202,0.028904,-0.013634,0.027,1.0,-0.022341,-0.016605,-0.00653,...,0.047279,,0.048398,-0.003555,-0.020289,-0.030123,-0.02551,0.012539,-0.006771,0.028365
JobLevel,0.037787,0.001469,0.022821,0.077564,0.019585,-0.03563,-0.022341,1.0,-0.004797,0.00516,...,-0.015713,,0.035435,0.610382,0.005904,0.029556,0.008112,0.334771,0.225035,0.315104
JobSatisfaction,-0.043486,0.031775,-0.004455,-0.025655,-0.008692,-0.064327,-0.016605,-0.004797,1.0,0.005291,...,-0.021624,,-0.00808,-0.014959,0.030681,-0.035664,-0.021249,0.00903,0.011371,-0.011525
MonthlyIncome,0.346631,-0.000978,-0.036369,-0.000562,-0.035201,-0.013624,-0.00653,0.00516,0.005291,1.0,...,-0.004098,,-0.045054,0.006028,-0.003703,0.021279,0.341888,-0.001825,-0.013216,-0.012057
