In [1]:
# importing libraries for data processing
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
# sklearn modules for preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# sklearn modules for ML model selection
from sklearn.model_selection import train_test_split  # import 'train_test_split'

In [3]:
# Load the raw employee-performance CSV into a DataFrame and preview its first rows.
# NOTE(review): the path below is an absolute, machine-specific Windows path.
df_hr_data = pd.read_csv(
    filepath_or_buffer='C:/Users/Admin/IABAC_Project/data/raw/INX_Future_Inc_Employee_Performance.csv',
)
df_hr_data.head(n=5)

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,E1001000,32,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,10,3,...,4,10,2,2,10,7,0,8,No,3
1,E1001006,47,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,14,4,...,4,20,2,3,7,7,1,7,No,3
2,E1001007,40,Male,Life Sciences,Married,Sales,Sales Executive,Travel_Frequently,5,4,...,3,20,2,3,18,13,1,12,No,4
3,E1001009,41,Male,Human Resources,Divorced,Human Resources,Manager,Travel_Rarely,10,4,...,2,23,2,2,21,6,12,6,No,3
4,E1001010,60,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,16,4,...,4,10,1,3,2,2,2,2,No,3


In [4]:
# List the column labels (attributes) of the raw dataset.
df_hr_data.columns

Index(['EmpNumber', 'Age', 'Gender', 'EducationBackground', 'MaritalStatus',
       'EmpDepartment', 'EmpJobRole', 'BusinessTravelFrequency',
       'DistanceFromHome', 'EmpEducationLevel', 'EmpEnvironmentSatisfaction',
       'EmpHourlyRate', 'EmpJobInvolvement', 'EmpJobLevel',
       'EmpJobSatisfaction', 'NumCompaniesWorked', 'OverTime',
       'EmpLastSalaryHikePercent', 'EmpRelationshipSatisfaction',
       'TotalWorkExperienceInYears', 'TrainingTimesLastYear',
       'EmpWorkLifeBalance', 'ExperienceYearsAtThisCompany',
       'ExperienceYearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'Attrition', 'PerformanceRating'],
      dtype='object')

In [5]:
# Report the dataset's dimensions as (number of rows, number of columns).
df_hr_data.shape

(1200, 28)

In [6]:
# Group the column names by their dtype (e.g. int64, float64, object)
# to see which attributes are stored as numbers and which as objects.
column_labels = df_hr_data.columns.to_series()
column_labels.groupby(df_hr_data.dtypes).groups

{dtype('int64'): Index(['Age', 'DistanceFromHome', 'EmpEducationLevel',
        'EmpEnvironmentSatisfaction', 'EmpHourlyRate', 'EmpJobInvolvement',
        'EmpJobLevel', 'EmpJobSatisfaction', 'NumCompaniesWorked',
        'EmpLastSalaryHikePercent', 'EmpRelationshipSatisfaction',
        'TotalWorkExperienceInYears', 'TrainingTimesLastYear',
        'EmpWorkLifeBalance', 'ExperienceYearsAtThisCompany',
        'ExperienceYearsInCurrentRole', 'YearsSinceLastPromotion',
        'YearsWithCurrManager', 'PerformanceRating'],
       dtype='object'),
 dtype('O'): Index(['EmpNumber', 'Gender', 'EducationBackground', 'MaritalStatus',
        'EmpDepartment', 'EmpJobRole', 'BusinessTravelFrequency', 'OverTime',
        'Attrition'],
       dtype='object')}

In [7]:
# Summarize each column's dtype and non-null count to spot missing values.
df_hr_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 28 columns):
EmpNumber                       1200 non-null object
Age                             1200 non-null int64
Gender                          1200 non-null object
EducationBackground             1200 non-null object
MaritalStatus                   1200 non-null object
EmpDepartment                   1200 non-null object
EmpJobRole                      1200 non-null object
BusinessTravelFrequency         1200 non-null object
DistanceFromHome                1200 non-null int64
EmpEducationLevel               1200 non-null int64
EmpEnvironmentSatisfaction      1200 non-null int64
EmpHourlyRate                   1200 non-null int64
EmpJobInvolvement               1200 non-null int64
EmpJobLevel                     1200 non-null int64
EmpJobSatisfaction              1200 non-null int64
NumCompaniesWorked              1200 non-null int64
OverTime                        1200 non-null object
E

In [8]:
# Remove the attributes that are not carried into the cleaned dataset.
# This step only drops columns; the remaining categorical columns are
# still in their original (string) form in df_temp.
dropped_columns = [
    'NumCompaniesWorked',
    'Attrition',
    'EmpJobRole',
    'YearsWithCurrManager',
    'TotalWorkExperienceInYears',
    'ExperienceYearsAtThisCompany',
    'Age',
    'EmpHourlyRate',
    'TrainingTimesLastYear',
    'BusinessTravelFrequency',
]
df_temp = df_hr_data.drop(labels=dropped_columns, axis=1)
df_temp.head(n=5)

Unnamed: 0,EmpNumber,Gender,EducationBackground,MaritalStatus,EmpDepartment,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,EmpJobInvolvement,EmpJobLevel,EmpJobSatisfaction,OverTime,EmpLastSalaryHikePercent,EmpRelationshipSatisfaction,EmpWorkLifeBalance,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,PerformanceRating
0,E1001000,Male,Marketing,Single,Sales,10,3,4,3,2,4,No,12,4,2,7,0,3
1,E1001006,Male,Marketing,Single,Sales,14,4,4,3,2,1,No,12,4,3,7,1,3
2,E1001007,Male,Life Sciences,Married,Sales,5,4,4,2,3,1,Yes,21,3,3,13,1,4
3,E1001009,Male,Human Resources,Divorced,Human Resources,10,4,2,2,5,4,No,15,2,2,6,12,3
4,E1001010,Male,Marketing,Single,Sales,16,4,1,3,2,1,No,14,4,3,2,2,3


In [9]:
# One-hot encode the categorical columns so they are represented numerically.
# dtype is pinned to uint8 because pandas >= 2.0 defaults to bool indicator
# columns, which would yield True/False instead of 1/0 in the encoded frame.
categorical_columns = ["Gender", "MaritalStatus", "OverTime", "EducationBackground", "EmpDepartment"]
df_clean_data = pd.get_dummies(df_temp, columns=categorical_columns, dtype="uint8")
# Display (rows, columns) to confirm how many indicator columns were added.
df_clean_data.shape

(1200, 32)

In [10]:
# Persist the encoded dataset as CSV (row index omitted), then preview its first rows.
# NOTE(review): the output path is an absolute, machine-specific Windows path.
df_clean_data.to_csv(
    path_or_buf=r'C:/Users/Admin/IABAC_Project/data/processed/INX_Future_Inc_Employee_Performance_cleaned_data.csv',
    index=False,
)
df_clean_data.head(n=5)

Unnamed: 0,EmpNumber,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,EmpJobInvolvement,EmpJobLevel,EmpJobSatisfaction,EmpLastSalaryHikePercent,EmpRelationshipSatisfaction,EmpWorkLifeBalance,...,EducationBackground_Marketing,EducationBackground_Medical,EducationBackground_Other,EducationBackground_Technical Degree,EmpDepartment_Data Science,EmpDepartment_Development,EmpDepartment_Finance,EmpDepartment_Human Resources,EmpDepartment_Research & Development,EmpDepartment_Sales
0,E1001000,10,3,4,3,2,4,12,4,2,...,1,0,0,0,0,0,0,0,0,1
1,E1001006,14,4,4,3,2,1,12,4,3,...,1,0,0,0,0,0,0,0,0,1
2,E1001007,5,4,4,2,3,1,21,3,3,...,0,0,0,0,0,0,0,0,0,1
3,E1001009,10,4,2,2,5,4,15,2,2,...,0,0,0,0,0,0,0,1,0,0
4,E1001010,16,4,1,3,2,1,14,4,3,...,1,0,0,0,0,0,0,0,0,1
