In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

Load the data

In [3]:
candidate_processed = pd.read_csv('../data/processed/candidate_labeled.csv')
candidate_processed.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,success,early_exit
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,0,8,0,1,6,4,0,5,0,0
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,1,10,3,3,10,7,1,7,0,0
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,0,7,3,3,0,0,0,0,0,1
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,0,8,3,3,8,7,3,0,1,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,1,6,3,3,2,2,2,2,0,0


Handle Missing Values

In [4]:
candidate_processed.isnull().sum().sort_values(ascending=False)

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

Stability Index

In [18]:
candidate_processed['stability_index'] = candidate_processed['YearsAtCompany'] / (candidate_processed['TotalWorkingYears']+1)
candidate_processed['stability_index']

0       0.666667
1       0.909091
2       0.000000
3       0.888889
4       0.285714
          ...   
1465    0.277778
1466    0.700000
1467    0.857143
1468    0.500000
1469    0.571429
Name: stability_index, Length: 1470, dtype: float64

Promotion Velocity

In [7]:
candidate_processed['promotion_velocity'] = candidate_processed['YearsAtCompany'] / (candidate_processed['YearsSinceLastPromotion'] + 1)
candidate_processed['promotion_velocity']


0       6.000000
1       5.000000
2       0.000000
3       2.000000
4       0.666667
          ...   
1465    5.000000
1466    3.500000
1467    6.000000
1468    9.000000
1469    2.000000
Name: promotion_velocity, Length: 1470, dtype: float64

Role Stagnation Flag

In [8]:
candidate_processed['role_stagnation'] = (candidate_processed['YearsInCurrentRole'] > 5).astype(int)
candidate_processed['role_stagnation']

0       0
1       1
2       0
3       1
4       0
       ..
1465    0
1466    1
1467    0
1468    1
1469    0
Name: role_stagnation, Length: 1470, dtype: int64

Work-Life Burnout Risk

In [9]:
candidate_processed['burnout_risk'] = ((candidate_processed['OverTime'] == 1) & (candidate_processed['WorkLifeBalance'] <= 2)).astype(int)
candidate_processed['burnout_risk']

0       0
1       0
2       0
3       0
4       0
       ..
1465    0
1466    0
1467    0
1468    0
1469    0
Name: burnout_risk, Length: 1470, dtype: int64

Career Growth Score

In [11]:
candidate_processed['career_growth_score'] = (candidate_processed['TrainingTimesLastYear'] + candidate_processed['NumCompaniesWorked'] + candidate_processed['TotalWorkingYears'])
candidate_processed['career_growth_score'] 

0       16
1       14
2       16
3       12
4       18
        ..
1465    24
1466    18
1467     7
1468    22
1469    11
Name: career_growth_score, Length: 1470, dtype: int64

Encode Categorical Features

In [13]:
le = LabelEncoder()

cat_cols = candidate_processed.select_dtypes(include=['object']).columns
for col in cat_cols:
    candidate_processed[col] = le.fit_transform(candidate_processed[col])

Feature Scaling

In [20]:
scaler = StandardScaler()

scale_cols = [
    'Age','MonthlyIncome','TotalWorkingYears','YearsAtCompany',
    'YearsInCurrentRole','YearsSinceLastPromotion',
    'stability_index','promotion_velocity','career_growth_score'
]

candidate_processed[scale_cols] = scaler.fit_transform(candidate_processed[scale_cols])

Final Feature Selection

In [24]:
x = candidate_processed.drop(['success','EmployeeNumber'], axis=1)
y = candidate_processed['success']

Save Clean Dataset

In [25]:
x.to_csv("../data/processed/candidate_features_ready.csv", index=False) 
y.to_csv("../data/processed/candidate_labels.csv", index=False)