# Data Preparation

In [1]:
#load libraries and datasets 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

# Load the employee dataset that an earlier step saved as a pickle.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('pickle_files/employee_df.pkl', 'rb') as employee_file:
    employee_df = pickle.load(employee_file)

In [2]:
# Split the raw frame into the two candidate feature groups that are scored
# separately in the feature-selection section.
categorical_columns = [
    'BusinessTravel', 'Department', 'EducationField',
    'Gender', 'JobRole', 'MaritalStatus',
]
# NOTE(review): OverTime is grouped with the numerical columns here --
# presumably it was already encoded as a number upstream; confirm.
numerical_columns = [
    'Age', 'DailyRate', 'DistanceFromHome', 'Education',
    'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement',
    'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate',
    'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
    'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',
    'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
    'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]

X_cat = employee_df[categorical_columns]
X_num = employee_df[numerical_columns]

In [3]:
# Target variable, kept as a one-column DataFrame (list selector) rather than a Series.
target_columns = ['Attrition']
y = employee_df[target_columns]

## Feature Selection

### Selecting the numerical features 

In [4]:
from sklearn.feature_selection import SelectKBest, chi2

# Fit a chi-squared selector on the numerical candidates. Only the fitted
# selector is kept (nothing is transformed); its scores_ are read afterwards.
X_new = (
    SelectKBest(score_func=chi2, k=10)
    .fit(X_num, y)
)

In [5]:
# Pair every numerical column name with its chi-squared score in a
# two-column table (Feature_Name, Score).
df_columns = pd.DataFrame({'Feature_Name': list(X_num.columns)})
df_scores = pd.DataFrame({'Score': X_new.scores_})
feature_scores = pd.concat([df_columns, df_scores], axis=1)



In [6]:
# Show the 10 highest-scoring numerical features, best first.
print(feature_scores.nlargest(10,'Score'))

            Feature_Name          Score
9          MonthlyIncome  127922.293694
10           MonthlyRate    1196.633553
1              DailyRate     956.580494
17     TotalWorkingYears     230.721618
20        YearsAtCompany     142.100054
21    YearsInCurrentRole     117.522596
23  YearsWithCurrManager     110.671534
0                    Age      84.155277
12              OverTime      63.845067
2       DistanceFromHome      63.772142


### Selecting the categorical features

In [7]:
# Expand each categorical column into indicator (dummy) columns so the
# categorical candidates can be scored numerically.
X_cat = pd.get_dummies(data=X_cat)

In [8]:
# Score the dummy-encoded categorical columns with the chi-squared statistic.
X_new = SelectKBest(score_func=chi2, k=20).fit(X_cat, y)

# Build the (Feature_Name, Score) table for the categorical candidates.
df_columns = pd.DataFrame({'Feature_Name': list(X_cat.columns)})
df_scores = pd.DataFrame({'Score': X_new.scores_})
feature_scores = pd.concat([df_columns, df_scores], axis=1)

# Show the 20 highest-scoring dummy columns, best first.
print(feature_scores.nlargest(n=20, columns='Score'))


                         Feature_Name      Score
22       JobRole_Sales Representative  34.290268
25               MaritalStatus_Single  30.771669
1    BusinessTravel_Travel_Frequently  15.816623
16      JobRole_Laboratory Technician  11.699495
19          JobRole_Research Director  10.978010
17                    JobRole_Manager   9.496136
18     JobRole_Manufacturing Director   9.126589
23             MaritalStatus_Divorced   8.794422
14  JobRole_Healthcare Representative   8.292518
0           BusinessTravel_Non-Travel   7.317934
5                    Department_Sales   6.694465
24              MaritalStatus_Married   6.597586
11    EducationField_Technical Degree   6.435860
8            EducationField_Marketing   4.079154
4   Department_Research & Development   3.702916
9              EducationField_Medical   2.222133
6      EducationField_Human Resources   1.918878
15            JobRole_Human Resources   1.859753
2        BusinessTravel_Travel_Rarely   1.047857
7        EducationFi

### Creating the new dataset from selected variables

In [9]:
# Reduced dataset: five categorical columns followed by five numerical ones,
# taken unencoded from the original frame.
selected_columns = [
    'JobRole', 'MaritalStatus', 'BusinessTravel', 'Department', 'EducationField',
    'MonthlyIncome', 'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole', 'Age',
]
X = employee_df[selected_columns]

In [10]:
# Inspect the last 60 rows of the reduced dataset.
X.tail(n=60)

Unnamed: 0,JobRole,MaritalStatus,BusinessTravel,Department,EducationField,MonthlyIncome,TotalWorkingYears,YearsAtCompany,YearsInCurrentRole,Age
1410,Sales Executive,Married,Travel_Rarely,Sales,Marketing,5677,15,11,8,40
1411,Human Resources,Married,Travel_Rarely,Human Resources,Human Resources,2187,6,2,0,25
1412,Laboratory Technician,Married,Travel_Rarely,Research & Development,Medical,3748,12,12,8,30
1413,Laboratory Technician,Divorced,Travel_Rarely,Research & Development,Other,3977,7,2,2,25
1414,Healthcare Representative,Single,Travel_Rarely,Research & Development,Medical,8633,25,17,14,47
1415,Laboratory Technician,Divorced,Non-Travel,Research & Development,Medical,2008,1,1,1,33
1416,Sales Executive,Married,Travel_Rarely,Sales,Life Sciences,4440,16,15,13,38
1417,Sales Representative,Married,Travel_Rarely,Sales,Life Sciences,3067,3,2,2,31
1418,Manufacturing Director,Married,Travel_Frequently,Research & Development,Life Sciences,5321,10,8,3,38
1419,Research Scientist,Divorced,Travel_Rarely,Research & Development,Life Sciences,5410,9,4,3,42


In [11]:
# Re-select the chosen features from the original frame, grouped by type.
X_cat = employee_df[['JobRole','Department','BusinessTravel','EducationField','MaritalStatus']]
X_num = employee_df[['MonthlyIncome','TotalWorkingYears','YearsAtCompany','YearsInCurrentRole','Age']]

# One hand-written sample record: the five categorical values (in X_cat column
# order) followed by the five numerical values (in X_num column order).
features = ['Research Scientist','Sales','Travel_Frequently','Other','Single',4500,1,2,3,25]
cat_features = features[0:5]
print(cat_features)
# Take everything after the categorical part. The previous slice [5:9]
# stopped one short and silently dropped the last numerical value (Age).
num_features = features[5:]

# Build the one-row frame with exactly X_cat's columns, in X_cat's order, so
# that an encoder fitted on X_cat sees each value in the matching column.
X_test = pd.DataFrame([cat_features], columns=X_cat.columns)

X_test.head()

In [12]:
from sklearn.preprocessing import OrdinalEncoder

# Fit the ordinal encoder on the categorical columns and replace X_cat with
# its encoded version.
encoder = OrdinalEncoder()
# Reuse X_cat's own column labels and row index for the encoded frame.
# Keeping the index matters: a later column-wise concat with X_num aligns on
# row labels, and a fresh 0..n-1 index would misalign if employee_df is not
# indexed that way.
X_cat = pd.DataFrame(
    encoder.fit_transform(X_cat),
    columns=X_cat.columns,
    index=X_cat.index,
)

# Encode the single sample record with the same fitted categories.
X_test = encoder.transform(X_test)

In [13]:
# Assemble the model matrix: encoded categorical columns, then numerical ones.
# pd.concat already returns a DataFrame, so no extra wrapping is needed.
X = pd.concat([X_cat, X_num], axis = 1)

# concat(axis=1) aligns on the row index; if the two indexes differ it adds
# NaN-filled rows instead of failing, so check that no rows were added.
assert len(X) == len(X_num), "X_cat and X_num row indexes do not line up"


In [14]:
# Target for the split: Attrition as a one-column DataFrame.
target_columns = ['Attrition']
y = employee_df[target_columns]

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. The seed is fixed so that a
# Restart & Run All reproduces the same split (and therefore the same saved
# artifacts); without it every run shuffles differently.
# Note: this rebinds X_test, replacing any earlier value of that name.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.25, random_state = 42
)

In [16]:
# Persist the prepared datasets. Each file is opened in a `with` block so the
# handle is flushed and closed as soon as the dump finishes.
# NOTE(review): X_cat and X_num are saved as-is under 'X_train_*' file names;
# nothing in this cell restricts them to the training rows -- confirm intent.
artifacts_to_save = {
    'pickle_files/X_train_cat.pkl': X_cat,
    'pickle_files/X_train_num.pkl': X_num,
    'pickle_files/X_train.pkl': X_train,
    'pickle_files/y_train.pkl': y_train,
    'pickle_files/X_test.pkl': X_test,
    'pickle_files/y_test.pkl': y_test,
}
for artifact_path, artifact in artifacts_to_save.items():
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

In [17]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column min/max from the training rows, then rescale those rows.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

In [18]:
# Persist the fitted scaler and encoder; `with` guarantees each file is
# flushed and closed once written.
with open('pickle_files/scx.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
with open('pickle_files/onec.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)