In [167]:
#Imports 

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, MinMaxScaler 
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,classification_report,f1_score

from sklearn.model_selection import GridSearchCV

import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

import pickle

In [194]:
#Dataset used - https://www.kaggle.com/datasets/pavansubhasht/ibm-hr-analytics-attrition-dataset/data
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [169]:
df.drop(columns=['EmployeeCount',
       'EmployeeNumber', 'MonthlyRate', 'HourlyRate',
       'Over18', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'WorkLifeBalance'], inplace=True)

In [170]:
#Splitting the data

X = df.drop('Attrition',axis=1)
y = df.Attrition

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [200]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1176 entries, 1097 to 1126
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Age                      1176 non-null   int64 
 1   BusinessTravel           1176 non-null   object
 2   DailyRate                1176 non-null   int64 
 3   Department               1176 non-null   object
 4   DistanceFromHome         1176 non-null   int64 
 5   Education                1176 non-null   int64 
 6   EducationField           1176 non-null   object
 7   EnvironmentSatisfaction  1176 non-null   int64 
 8   Gender                   1176 non-null   object
 9   JobInvolvement           1176 non-null   int64 
 10  JobLevel                 1176 non-null   int64 
 11  JobRole                  1176 non-null   object
 12  JobSatisfaction          1176 non-null   int64 
 13  MaritalStatus            1176 non-null   object
 14  MonthlyIncome            1176 non-nul

In [172]:
y_train

1097     No
727      No
254      No
1175     No
1341     No
       ... 
1130     No
1294     No
860     Yes
1459     No
1126     No
Name: Attrition, Length: 1176, dtype: object

In [173]:
#Encoding the target column "Attrition"
y_train = y_train.map({'Yes': 1, 'No': 0})
y_test = y_test.map({'Yes': 1, 'No': 0})

In [218]:
#Column transformer for encoding categorical columns

encoder = ColumnTransformer(
    transformers=[
        ('ordinal', OrdinalEncoder(), ['BusinessTravel', 'Department', 'EducationField', 'JobRole', 'OverTime','MaritalStatus']),  # Ordinal encoding
        ('ohe', OneHotEncoder(drop='first', sparse=False), ['Gender']),  # One-hot encoding with drop_first=True for 'Gender'
    ],
    remainder='passthrough'  # Keep the other columns unchanged
)

#Setting to get a pandas df
encoder.set_output(transform='pandas')

In [219]:
#Define the pipeline
pipe = Pipeline([
    ('preprocessing', encoder),  
    ('scaling', MinMaxScaler()), 
#     ('feature_selection', SelectKBest(score_func=chi2, k=15)),  
])

#Fit the pipeline on the training data
pipe.fit(X_train, y_train)



In [220]:
# Transform both the training and testing data
X_train_transformed = pd.DataFrame(pipe.transform(X_train))
X_test_transformed = pd.DataFrame(pipe.transform(X_test))

In [221]:
X_train_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1176 entries, 0 to 1175
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       1176 non-null   float64
 1   1       1176 non-null   float64
 2   2       1176 non-null   float64
 3   3       1176 non-null   float64
 4   4       1176 non-null   float64
 5   5       1176 non-null   float64
 6   6       1176 non-null   float64
 7   7       1176 non-null   float64
 8   8       1176 non-null   float64
 9   9       1176 non-null   float64
 10  10      1176 non-null   float64
 11  11      1176 non-null   float64
 12  12      1176 non-null   float64
 13  13      1176 non-null   float64
 14  14      1176 non-null   float64
 15  15      1176 non-null   float64
 16  16      1176 non-null   float64
 17  17      1176 non-null   float64
 18  18      1176 non-null   float64
 19  19      1176 non-null   float64
 20  20      1176 non-null   float64
 21  21      1176 non-null   float64
 22  

# Decision Tree

In [222]:
model = DecisionTreeClassifier()
model.fit(X_train_transformed, y_train)

# prediction with x_test
y_pred=model.predict(X_test_transformed)
#prediction with x_train
y_train_predict=model.predict(X_train_transformed)

In [223]:
#Decision Tree Accuracy
accuracy_score(y_test,y_pred)

0.7857142857142857

# Hyper Parameter Tuning for Decision tree

In [224]:
params = {
    "criterion":("gini", "entropy"),
    "splitter":("best", "random"),
    "max_depth":(list(range(1, 20))),
    "min_samples_split":[2, 3, 4],
    "min_samples_leaf":list(range(1, 20)),
}

In [225]:
tree_clf = DecisionTreeClassifier(random_state=3)
tree_cv = GridSearchCV(tree_clf, params, scoring="accuracy", n_jobs=-1, verbose=1, cv=3)
tree_cv.fit(X_train_transformed,y_train)
best_params = tree_cv.best_params_
print(f"Best paramters: {best_params})")

Fitting 3 folds for each of 4332 candidates, totalling 12996 fits
Best paramters: {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 15, 'min_samples_split': 2, 'splitter': 'best'})


In [226]:
tree_cv.best_score_

0.8503401360544217

In [227]:
y_test_predict=model.predict(X_test_transformed)#predicting training data to check training performance
y_test_predict

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0], dtype=int64)

In [228]:
# Tuned Decision Tree Accuracy
accuracy_score(y_test,y_test_predict)

0.7857142857142857

# XGBoost

In [229]:
# Instantiate XGBoost classifier
xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

# Train the model
xgb_model.fit(X_train_transformed, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test_transformed)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8809523809523809
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.98      0.93       255
           1       0.62      0.26      0.36        39

    accuracy                           0.88       294
   macro avg       0.76      0.62      0.65       294
weighted avg       0.86      0.88      0.86       294



# Hyper Parameter Tuning XGBoost

In [230]:
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_transformed, y_train)

# Print the best parameters found by grid search
print("Best Parameters:", grid_search.best_params_)

# Get the best model from grid search
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test_transformed)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Best Parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.6}
Accuracy: 0.891156462585034
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.98      0.94       255
           1       0.73      0.28      0.41        39

    accuracy                           0.89       294
   macro avg       0.82      0.63      0.67       294
weighted avg       0.88      0.89      0.87       294



In [232]:
#Save the trained model to a file
with open('Attrition_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)