# EDA + Preprocessing + Pipeline + Submission 🔥

In [1]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
%matplotlib inline
pd.options.display.max_columns = 40

import warnings
warnings.simplefilter("ignore")


# EDA

In [2]:
# Reading the given input files
train = pd.read_csv("/kaggle/input/playground-series-s3e3/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s3e3/test.csv")
sub = pd.read_csv("/kaggle/input/playground-series-s3e3/sample_submission.csv")

In [3]:
train.head(3)

Unnamed: 0,id,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,0,36,Travel_Frequently,599,Research & Development,24,3,Medical,1,4,Male,42,3,1,Laboratory Technician,4,Married,2596,5099,1,Y,Yes,13,3,2,80,1,10,2,3,10,0,7,8,0
1,1,35,Travel_Rarely,921,Sales,8,3,Other,1,1,Male,46,3,1,Sales Representative,1,Married,2899,10778,1,Y,No,17,3,4,80,1,4,3,3,4,2,0,3,0
2,2,32,Travel_Rarely,718,Sales,26,3,Marketing,1,3,Male,80,3,2,Sales Executive,4,Divorced,4627,16495,0,Y,No,17,3,4,80,2,4,3,3,3,2,1,2,0


In [4]:
test.head(3)

Unnamed: 0,id,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,1677,19,Non-Travel,992,Research & Development,1,1,Medical,1,4,Male,43,3,1,Laboratory Technician,3,Single,2318,17778,1,Y,No,12,3,4,80,0,1,2,2,1,0,0,0
1,1678,45,Travel_Rarely,1136,Sales,4,4,Marketing,1,3,Male,67,3,2,Sales Executive,1,Divorced,5486,12421,6,Y,Yes,12,3,3,80,1,7,3,3,2,2,2,2
2,1679,37,Travel_Rarely,155,Research & Development,13,3,Life Sciences,1,4,Male,41,3,1,Research Scientist,4,Divorced,2741,23577,4,Y,Yes,13,3,2,80,2,13,2,2,7,7,1,7


In [5]:
sub.head(3)

Unnamed: 0,id,Attrition
0,1677,0.119261
1,1678,0.119261
2,1679,0.119261


In [6]:
#Id and Employee number in the original dataset are like cardinal value so we are removing them
train_extra = train.drop("id", axis = 1)

We are going to add original dataset beacause it really improving our model.

In [7]:
#Adding the original dataset and making the dataset like our competion dataset adding it
original = pd.read_csv("/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv")
original = original.drop('EmployeeNumber', axis=1)
original["Attrition"] = original["Attrition"].apply(lambda x : 1 if x == "Yes" else 0)

In [8]:
new_train = pd.concat([original, train_extra], axis =0, ignore_index = True)

In [9]:
new_train

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3142,30,0,Travel_Rarely,945,Sales,1,3,Life Sciences,1,4,Female,73,3,3,Sales Executive,3,Single,8722,14255,1,Y,No,19,3,2,80,0,10,2,4,10,0,0,8
3143,32,0,Travel_Rarely,1303,Research & Development,2,3,Life Sciences,1,1,Male,48,3,1,Research Scientist,2,Married,3544,15972,4,Y,No,19,3,4,80,1,10,3,4,4,2,1,3
3144,29,1,Travel_Frequently,1184,Human Resources,24,3,Human Resources,1,2,Male,36,2,1,Human Resources,1,Married,2804,15322,1,Y,Yes,11,3,3,80,0,1,2,3,1,0,0,0
3145,36,0,Travel_Rarely,441,Sales,9,2,Marketing,1,2,Male,48,4,2,Sales Executive,3,Divorced,5406,4051,1,Y,No,21,4,3,80,2,10,3,2,10,3,0,8


# EDA

In [10]:
#Finding Missing values
new_train.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithC

In [11]:
#Finding Missing values
test.isna().sum()

id                          0
Age                         0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithC

In [12]:
new_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3147 entries, 0 to 3146
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       3147 non-null   int64 
 1   Attrition                 3147 non-null   int64 
 2   BusinessTravel            3147 non-null   object
 3   DailyRate                 3147 non-null   int64 
 4   Department                3147 non-null   object
 5   DistanceFromHome          3147 non-null   int64 
 6   Education                 3147 non-null   int64 
 7   EducationField            3147 non-null   object
 8   EmployeeCount             3147 non-null   int64 
 9   EnvironmentSatisfaction   3147 non-null   int64 
 10  Gender                    3147 non-null   object
 11  HourlyRate                3147 non-null   int64 
 12  JobInvolvement            3147 non-null   int64 
 13  JobLevel                  3147 non-null   int64 
 14  JobRole                 

In [13]:
new_train.describe()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EmployeeCount,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0
mean,36.451223,0.138862,850.586273,8.921513,2.926279,1.0,2.741023,66.907531,2.757865,2.029234,2.762313,6339.397521,14556.604067,2.630124,15.046393,3.139816,2.733079,80.0,0.755323,10.97585,2.763902,2.765809,6.900858,4.183985,2.076263,4.156975
std,8.815861,0.345858,390.859919,7.961278,1.032051,0.0,1.089733,19.877918,0.680183,1.09459,1.100224,4610.584686,7117.334257,2.488848,3.537252,0.346851,1.079454,0.0,0.810688,7.509438,1.214918,0.675707,5.998015,3.601691,3.130744,3.572003
min,18.0,0.0,102.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,1.0,1009.0,636.0,0.0,11.0,3.0,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,0.0,543.0,2.0,2.0,1.0,2.0,50.0,2.0,1.0,2.0,2888.0,8306.0,1.0,12.0,3.0,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,35.0,0.0,852.0,7.0,3.0,1.0,3.0,67.0,3.0,2.0,3.0,4877.0,14908.0,1.0,14.0,3.0,3.0,80.0,1.0,9.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,42.0,0.0,1198.5,13.0,4.0,1.0,4.0,84.0,3.0,3.0,4.0,7725.0,20744.5,4.0,17.0,3.0,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,2.0,7.0
max,60.0,1.0,3921.0,29.0,15.0,1.0,4.0,100.0,4.0,7.0,4.0,19999.0,26999.0,9.0,25.0,4.0,4.0,80.0,3.0,41.0,6.0,4.0,41.0,18.0,15.0,17.0


In [14]:
#Understanding the co-relation
corr = new_train.corr()
corr.style.background_gradient(cmap='Greens')

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EmployeeCount,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
Age,1.0,-0.1565,0.019089,-0.024006,0.214986,,0.019248,0.02501,0.014198,0.494513,-0.008569,0.484906,0.017596,0.30042,-0.025849,-0.007644,0.053845,,0.053035,0.664877,-0.001773,0.005632,0.309317,0.216682,0.211709,0.200993
Attrition,-0.1565,1.0,-0.046732,0.053484,-0.058582,,-0.09853,-0.024578,-0.136723,-0.150946,-0.083643,-0.141174,0.002858,0.041593,-0.019558,0.010335,-0.067564,,-0.161068,-0.152672,-0.038448,-0.055301,-0.129427,-0.145807,-0.033562,-0.14038
DailyRate,0.019089,-0.046732,1.0,0.006007,-0.010259,,0.014082,0.016439,0.014647,0.018918,-0.00099,0.013665,-0.018623,0.006906,-0.003529,-0.026302,0.006541,,0.026551,0.03151,-0.013026,-0.006037,0.014375,0.032413,-0.001939,0.009264
DistanceFromHome,-0.024006,0.053484,0.006007,1.0,0.003512,,-0.005604,0.016782,0.008171,-0.022935,0.011079,-0.03853,0.022792,-0.029539,0.039895,0.034365,-0.003659,,0.043318,-0.013499,-0.017888,-0.022031,-0.006953,0.005913,0.003824,0.013768
Education,0.214986,-0.058582,-0.010259,0.003512,1.0,,-0.018394,0.016324,0.047918,0.092793,-0.021315,0.08715,-0.007876,0.108056,-0.019263,-0.006715,-0.006827,,0.029723,0.150169,-0.018703,-0.000152,0.093917,0.078132,0.051814,0.090916
EmployeeCount,,,,,,,,,,,,,,,,,,,,,,,,,,
EnvironmentSatisfaction,0.019248,-0.09853,0.014082,-0.005604,-0.018394,,1.0,-0.01779,0.004142,0.011412,-0.028822,0.007635,0.01103,-0.016695,-0.038525,-0.04125,-0.015008,,0.002011,0.00949,-0.02459,0.036751,0.005262,0.017246,0.022096,0.009957
HourlyRate,0.02501,-0.024578,0.016439,0.016782,0.016324,,-0.01779,1.0,0.022323,-0.024696,-0.045422,-0.017244,-0.015377,0.040775,0.000875,0.006624,-0.016927,,0.052208,-0.015526,0.013942,0.006339,-0.042453,-0.045275,-0.046918,-0.048041
JobInvolvement,0.014198,-0.136723,0.014647,0.008171,0.047918,,0.004142,0.022323,1.0,-0.005432,0.008445,-0.008157,-0.011353,0.003409,0.017618,-0.008704,0.024939,,0.023378,-0.003323,-0.018043,-0.00308,-0.012976,0.007421,-0.005655,0.017873
JobLevel,0.494513,-0.150946,0.018918,-0.022935,0.092793,,0.011412,-0.024696,-0.005432,1.0,-0.013496,0.929454,0.0371,0.129283,-0.068162,-0.036724,0.020326,,0.018093,0.771992,-0.023013,0.025591,0.531508,0.395806,0.36926,0.388242


In [15]:
new_train["EmployeeCount"].value_counts()

1    3147
Name: EmployeeCount, dtype: int64

In [16]:
new_train.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [17]:
new_train.Attrition.value_counts()

0    2710
1     437
Name: Attrition, dtype: int64

In [18]:
cat_col = [col for col in new_train.columns if new_train[col].dtypes == 'O']

In [19]:
cont_col = [col for col in new_train.columns if new_train[col].dtypes != 'O']

In [20]:
cat_col

['BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'Over18',
 'OverTime']

In [21]:
cont_col

['Age',
 'Attrition',
 'DailyRate',
 'DistanceFromHome',
 'Education',
 'EmployeeCount',
 'EnvironmentSatisfaction',
 'HourlyRate',
 'JobInvolvement',
 'JobLevel',
 'JobSatisfaction',
 'MonthlyIncome',
 'MonthlyRate',
 'NumCompaniesWorked',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StandardHours',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']

# Preprocessing

Basic structure
1.Not a time series(Random split of train test split can be done)
2. Doesn't contain missing values
3. Contain categorical data
4. Need a normalization

In [22]:
train_full = new_train.copy()

In [23]:
train_full[cat_col]

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
0,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Y,Yes
1,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,Y,No
2,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Y,Yes
3,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,Y,Yes
4,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,Y,No
...,...,...,...,...,...,...,...,...
3142,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Y,No
3143,Travel_Rarely,Research & Development,Life Sciences,Male,Research Scientist,Married,Y,No
3144,Travel_Frequently,Human Resources,Human Resources,Male,Human Resources,Married,Y,Yes
3145,Travel_Rarely,Sales,Marketing,Male,Sales Executive,Divorced,Y,No


In [24]:
train_full[cont_col]

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EmployeeCount,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,1102,1,2,1,2,94,3,2,4,5993,19479,8,11,3,1,80,0,8,0,1,6,4,0,5
1,49,0,279,8,1,1,3,61,2,2,2,5130,24907,1,23,4,4,80,1,10,3,3,10,7,1,7
2,37,1,1373,2,2,1,4,92,2,1,3,2090,2396,6,15,3,2,80,0,7,3,3,0,0,0,0
3,33,0,1392,3,4,1,4,56,3,1,3,2909,23159,1,11,3,3,80,0,8,3,3,8,7,3,0
4,27,0,591,2,1,1,1,40,3,1,2,3468,16632,9,12,3,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3142,30,0,945,1,3,1,4,73,3,3,3,8722,14255,1,19,3,2,80,0,10,2,4,10,0,0,8
3143,32,0,1303,2,3,1,1,48,3,1,2,3544,15972,4,19,3,4,80,1,10,3,4,4,2,1,3
3144,29,1,1184,24,3,1,2,36,2,1,1,2804,15322,1,11,3,3,80,0,1,2,3,1,0,0,0
3145,36,0,441,9,2,1,2,48,4,2,3,5406,4051,1,21,4,3,80,2,10,3,2,10,3,0,8


In [25]:
train_model = train_full.drop("Attrition", axis = 1)
train_model

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,Travel_Rarely,591,Research & Development,2,1,Medical,1,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3142,30,Travel_Rarely,945,Sales,1,3,Life Sciences,1,4,Female,73,3,3,Sales Executive,3,Single,8722,14255,1,Y,No,19,3,2,80,0,10,2,4,10,0,0,8
3143,32,Travel_Rarely,1303,Research & Development,2,3,Life Sciences,1,1,Male,48,3,1,Research Scientist,2,Married,3544,15972,4,Y,No,19,3,4,80,1,10,3,4,4,2,1,3
3144,29,Travel_Frequently,1184,Human Resources,24,3,Human Resources,1,2,Male,36,2,1,Human Resources,1,Married,2804,15322,1,Y,Yes,11,3,3,80,0,1,2,3,1,0,0,0
3145,36,Travel_Rarely,441,Sales,9,2,Marketing,1,2,Male,48,4,2,Sales Executive,3,Divorced,5406,4051,1,Y,No,21,4,3,80,2,10,3,2,10,3,0,8


In [26]:
X = train_model

In [27]:
X

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,Travel_Rarely,591,Research & Development,2,1,Medical,1,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3142,30,Travel_Rarely,945,Sales,1,3,Life Sciences,1,4,Female,73,3,3,Sales Executive,3,Single,8722,14255,1,Y,No,19,3,2,80,0,10,2,4,10,0,0,8
3143,32,Travel_Rarely,1303,Research & Development,2,3,Life Sciences,1,1,Male,48,3,1,Research Scientist,2,Married,3544,15972,4,Y,No,19,3,4,80,1,10,3,4,4,2,1,3
3144,29,Travel_Frequently,1184,Human Resources,24,3,Human Resources,1,2,Male,36,2,1,Human Resources,1,Married,2804,15322,1,Y,Yes,11,3,3,80,0,1,2,3,1,0,0,0
3145,36,Travel_Rarely,441,Sales,9,2,Marketing,1,2,Male,48,4,2,Sales Executive,3,Divorced,5406,4051,1,Y,No,21,4,3,80,2,10,3,2,10,3,0,8


In [28]:
y = train_full["Attrition"]

In [29]:
y

0       1
1       0
2       1
3       0
4       0
       ..
3142    0
3143    0
3144    1
3145    0
3146    0
Name: Attrition, Length: 3147, dtype: int64

In [30]:
#Taking small amount of data for Validation
X_train, X_valid,y_train, y_valid = train_test_split(X, y,test_size = 0.15 ,random_state = 42)

In [31]:
X_train.shape, y_train.shape

((2674, 33), (2674,))

In [32]:
X_valid.shape, y_valid.shape

((473, 33), (473,))

# Bundle up the preprocessing by making pipeline

In [33]:
numerical_col = make_pipeline(StandardScaler(with_mean = False))
categorical_col = make_pipeline(OneHotEncoder(),
                               StandardScaler(with_mean = False))

In [34]:
#Removing the target column
cont_col.remove("Attrition")

In [35]:
preprocessor = make_column_transformer(
    (numerical_col, cont_col),
    (categorical_col, cat_col))

In [36]:
preprocessor

ColumnTransformer(transformers=[('pipeline-1',
                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler(with_mean=False))]),
                                 ['Age', 'DailyRate', 'DistanceFromHome',
                                  'Education', 'EmployeeCount',
                                  'EnvironmentSatisfaction', 'HourlyRate',
                                  'JobInvolvement', 'JobLevel',
                                  'JobSatisfaction', 'MonthlyIncome',
                                  'MonthlyRate', 'NumCompaniesWorked',
                                  'PercentSalaryHike', 'PerformanceR...
                                  'TrainingTimesLastYear', 'WorkLifeBalance',
                                  'YearsAtCompany', 'YearsInCurrentRole',
                                  'YearsSinceLastPromotion',
                                  'YearsWithCurrManager']),
                                ('pi

# Random Forest

In [37]:
model = RandomForestClassifier(n_estimators=500, random_state=42)
my_pipeline = make_pipeline(preprocessor,
                           model)

In [38]:
my_pipeline.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('standardscaler',
                                                                   StandardScaler(with_mean=False))]),
                                                  ['Age', 'DailyRate',
                                                   'DistanceFromHome',
                                                   'Education', 'EmployeeCount',
                                                   'EnvironmentSatisfaction',
                                                   'HourlyRate',
                                                   'JobInvolvement', 'JobLevel',
                                                   'JobSatisfaction',
                                                   'MonthlyIncome',
                                                   'MonthlyRate',
                                                   'NumComp

In [39]:
preds = my_pipeline.score(X_valid, y_valid)

In [40]:
preds

0.879492600422833

# Gradient Boosting

In [41]:
model1 = HistGradientBoostingClassifier(learning_rate=0.001)

In [42]:
my_pipeline1 = make_pipeline(preprocessor,
                           model1)

In [43]:
my_pipeline1.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('standardscaler',
                                                                   StandardScaler(with_mean=False))]),
                                                  ['Age', 'DailyRate',
                                                   'DistanceFromHome',
                                                   'Education', 'EmployeeCount',
                                                   'EnvironmentSatisfaction',
                                                   'HourlyRate',
                                                   'JobInvolvement', 'JobLevel',
                                                   'JobSatisfaction',
                                                   'MonthlyIncome',
                                                   'MonthlyRate',
                                                   'NumComp

In [44]:
my_pipeline1.score(X_valid, y_valid)

0.8668076109936576

In [45]:
test.columns

Index(['id', 'Age', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [46]:
test = test.drop(["id"], axis=1)

# Submitting the prediction with pipeline which is made of random forest

In [47]:
sub1 = my_pipeline.predict(test)

In [48]:
sub1

array([0, 0, 0, ..., 0, 0, 0])

In [49]:
# Submission
rf_sub = pd.DataFrame({"id": sub.id, "Attrition": sub1})

In [50]:
rf_sub.head()

Unnamed: 0,id,Attrition
0,1677,0
1,1678,0
2,1679,0
3,1680,0
4,1681,1


In [51]:
rf_sub.to_csv("submission.csv", index = False)

# Note
1. There are lot more work todo like feature importance, hyper parameter tuning trying various model which will increase the accuracy.
2. One lesson i learned today is start a competition early which gives more time to spend on.
3. I am a big fan of fastai this notebookis more of tradition way of solving problems.