In [78]:
# import the libraries

import pandas as pd

In [79]:
# read the file to a dataframe

df = pd.read_csv('IBM_HR_Employee_Attrition_removed_data.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41.0,Yes,Travel_Rarely,1102.0,Sales,1.0,2.0,Life Sciences,,1,...,1.0,80.0,0.0,8.0,0.0,1.0,6.0,4.0,0.0,5.0
1,49.0,No,Travel_Frequently,279.0,Research & Development,8.0,1.0,Life Sciences,,2,...,4.0,80.0,1.0,10.0,3.0,3.0,10.0,7.0,1.0,7.0
2,37.0,Yes,Travel_Rarely,1373.0,Research & Development,2.0,2.0,Other,,4,...,2.0,80.0,0.0,7.0,3.0,3.0,0.0,0.0,0.0,0.0
3,33.0,No,Travel_Frequently,1392.0,Research & Development,3.0,4.0,Life Sciences,,5,...,3.0,80.0,0.0,8.0,3.0,3.0,8.0,7.0,3.0,0.0
4,27.0,No,Travel_Rarely,591.0,Research & Development,2.0,1.0,Medical,,7,...,4.0,80.0,1.0,6.0,3.0,3.0,2.0,2.0,2.0,2.0


In [80]:
# Find empty columns for evaluation

complete_null_columns = df.columns[df.isna().all()].tolist()
complete_null_columns


['EmployeeCount']

##### Given the entire column is empty there is no value in keeping the column. By implication it means we can safely remove the entire column. 

In [81]:
df.drop(['EmployeeCount'], axis='columns', inplace=True)

In [82]:
df.shape[1]

34

In [89]:
null_values = df.isnull().sum()
null_values

Age                          0
Attrition                    0
BusinessTravel               0
DailyRate                    0
Department                   0
DistanceFromHome             0
Education                    0
EducationField               0
EmployeeNumber               0
EnvironmentSatisfaction      0
Gender                       0
HourlyRate                   0
JobInvolvement               0
JobLevel                     0
JobRole                      0
JobSatisfaction              0
MaritalStatus               15
MonthlyIncome                0
MonthlyRate                  0
NumCompaniesWorked           0
Over18                       0
OverTime                     0
PercentSalaryHike           19
PerformanceRating            0
RelationshipSatisfaction     0
StandardHours                0
StockOptionLevel             0
TotalWorkingYears            0
TrainingTimesLastYear        0
WorkLifeBalance              0
YearsAtCompany               0
YearsInCurrentRole           0
YearsSin

##### From this we can see that 'MaritalStatus' is missing a few values, there are also some missing values for the 'PercentSalaryHike' column. 
##### The other columns seem to be consistently missing 2 values. Lets see if we can identify the rows responsible. 

In [84]:
nan_rows_ = df.isnull().sum(axis=1) 
nan_rows

0       0
1       0
2       0
3       0
4       0
       ..
1465    0
1466    0
1467    0
1468    0
1469    0
Length: 1470, dtype: int64

In [85]:
grtr_2_index = df.loc[df.isnull().sum(1)>2].index

In [86]:
rows_missing_grtr_2 = df.loc[df.isnull().sum(1)>2]
rows_missing_grtr_2

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
14,,,,,,,,,19,,...,,,,,,,,,,
185,,,,,,,,,252,,...,,,,,,,,,,


##### From this we can see that no data was recorded for these employee numbers so it's safe to remove the rows

In [87]:
df.drop(df.index[grtr_2_index], inplace=True)

In [91]:
null_values = df.isnull().sum()
null_values

Age                          0
Attrition                    0
BusinessTravel               0
DailyRate                    0
Department                   0
DistanceFromHome             0
Education                    0
EducationField               0
EmployeeNumber               0
EnvironmentSatisfaction      0
Gender                       0
HourlyRate                   0
JobInvolvement               0
JobLevel                     0
JobRole                      0
JobSatisfaction              0
MaritalStatus               15
MonthlyIncome                0
MonthlyRate                  0
NumCompaniesWorked           0
Over18                       0
OverTime                     0
PercentSalaryHike           19
PerformanceRating            0
RelationshipSatisfaction     0
StandardHours                0
StockOptionLevel             0
TotalWorkingYears            0
TrainingTimesLastYear        0
WorkLifeBalance              0
YearsAtCompany               0
YearsInCurrentRole           0
YearsSin

##### Lets fill the missing 'MaritalStatus' values with 'Unknown'

In [92]:
df.MaritalStatus.fillna(value="Complicated", inplace=True)

In [93]:
null_values = df.isnull().sum()
null_values

Age                          0
Attrition                    0
BusinessTravel               0
DailyRate                    0
Department                   0
DistanceFromHome             0
Education                    0
EducationField               0
EmployeeNumber               0
EnvironmentSatisfaction      0
Gender                       0
HourlyRate                   0
JobInvolvement               0
JobLevel                     0
JobRole                      0
JobSatisfaction              0
MaritalStatus                0
MonthlyIncome                0
MonthlyRate                  0
NumCompaniesWorked           0
Over18                       0
OverTime                     0
PercentSalaryHike           19
PerformanceRating            0
RelationshipSatisfaction     0
StandardHours                0
StockOptionLevel             0
TotalWorkingYears            0
TrainingTimesLastYear        0
WorkLifeBalance              0
YearsAtCompany               0
YearsInCurrentRole           0
YearsSin

In [109]:
# what is the percentage of missing data on the 'PercentageSalaryHike' column

missing_sal_hike = df.PercentSalaryHike.isna().sum()
total_sal_hike = df.PercentSalaryHike.sum()
missing_val_ratio = (missing_sal_hike / total_sal_hike) * 100


print("For the 'PercentageSalaryHike' column we have {} total records with a total of {} missing values. This provides us with a completenes ratio of {}%" 
      .format(int(total_sal_hike), missing_sal_hike, round(missing_val_ratio, 2)))

For the 'PercentageSalaryHike' column we have 22047 total records with a total of 19 missing values. This provides us with a completenes ratio of 0.09%


##### This does not seem significant and we can probably get away with imputig an average value

In [123]:
df['PercentSalaryHike'].fillna(df['PercentSalaryHike'].mode()[0], inplace=True)

In [124]:
null_values = df.isnull().sum()
null_values

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithC

### The dataset should now be clear of any missing values and ready for the model to be used. 