In [101]:
#Imports 

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, MinMaxScaler 
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,classification_report,f1_score

from sklearn.model_selection import GridSearchCV

import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

import pickle

In [113]:
# Load the HR attrition dataset from the working directory
df = pd.read_csv(filepath_or_buffer='WA_Fn-UseC_-HR-Employee-Attrition.csv')

# Preview the first five rows
df.head(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [114]:
# Remove columns that are not used as model features.
# Rebinding `df` (instead of `inplace=True`) avoids mutating the loaded frame
# in place and keeps the step in the chain-friendly pandas style.
df = df.drop(
    columns=[
        'EmployeeCount',
        'EmployeeNumber',
        'MonthlyRate',
        'HourlyRate',
        'Over18',
        'PerformanceRating',
        'RelationshipSatisfaction',
        'StandardHours',
        'WorkLifeBalance',
    ]
)

In [115]:
# Separate the features from the target, then hold out 20% of the rows for testing

y = df['Attrition']
X = df.drop(columns='Attrition')

# random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [116]:
# Display the training feature frame produced by the split
X_train

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,...,NumCompaniesWorked,OverTime,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1097,24,Travel_Rarely,350,Research & Development,21,2,Technical Degree,3,Male,2,...,0,No,14,3,2,3,1,1,0,0
727,18,Non-Travel,287,Research & Development,5,2,Life Sciences,2,Male,3,...,1,No,15,0,0,2,0,0,0,0
254,29,Travel_Rarely,1247,Sales,20,2,Marketing,4,Male,3,...,2,No,14,1,10,2,3,2,0,2
1175,39,Travel_Rarely,492,Research & Development,12,3,Medical,4,Male,3,...,4,No,21,0,7,3,5,4,1,0
1341,31,Travel_Rarely,311,Research & Development,20,3,Life Sciences,2,Male,3,...,1,No,11,1,10,2,10,8,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,35,Travel_Rarely,750,Research & Development,28,3,Life Sciences,2,Male,4,...,1,No,17,2,10,3,10,9,6,8
1294,41,Travel_Rarely,447,Research & Development,5,3,Life Sciences,2,Male,4,...,3,No,12,0,11,3,3,2,1,2
860,22,Travel_Frequently,1256,Research & Development,3,4,Life Sciences,3,Male,2,...,0,Yes,11,1,1,5,0,0,0,0
1459,29,Travel_Rarely,1378,Research & Development,13,2,Other,4,Male,2,...,4,Yes,13,1,10,2,4,3,0,3


In [117]:
# Display the training target (still the raw 'Yes'/'No' strings at this point)
y_train

1097     No
727      No
254      No
1175     No
1341     No
       ... 
1130     No
1294     No
860     Yes
1459     No
1126     No
Name: Attrition, Length: 1176, dtype: object

In [118]:
# Encode the target column "Attrition" as integers: 'Yes' -> 1, 'No' -> 0.
# NOTE: this cell is not idempotent - mapping already-encoded labels again
# produces NaN, because 0/1 are not keys of the mapping.
attrition_codes = {'Yes': 1, 'No': 0}
y_train = y_train.map(attrition_codes)
y_test = y_test.map(attrition_codes)

In [119]:
# Column transformer for encoding the categorical columns:
#   'ordinal' - integer-encodes the listed string columns
#   'ohe'     - one-hot encodes Gender and MaritalStatus, dropping the first
#               level of each column
# All remaining columns are passed through unchanged.

encoder = ColumnTransformer(
    transformers=[
        ('ordinal', OrdinalEncoder(), ['BusinessTravel', 'Department', 'EducationField', 'JobRole', 'OverTime']),
        # `sparse_output` replaces the `sparse` argument, which was deprecated in
        # scikit-learn 1.2 and removed in 1.4. Dense output is needed for the
        # pandas output configured below.
        ('ohe', OneHotEncoder(drop='first', sparse_output=False), ['Gender', 'MaritalStatus']),
    ],
    remainder='passthrough'  # Keep the other columns unchanged
)

# Make transform() return a pandas DataFrame instead of a NumPy array
encoder.set_output(transform='pandas')

In [120]:
# Preprocessing pipeline: encode categoricals -> min-max scale -> keep the
# 15 features with the highest chi-squared score against the target
preprocessing_steps = [
    ('preprocessing', encoder),
    ('scaling', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=chi2, k=15)),
]
pipe = Pipeline(steps=preprocessing_steps)

# Fit every step on the training split only
pipe.fit(X_train, y_train)



In [121]:
# Run both splits through the fitted pipeline and wrap each result in a DataFrame
X_train_transformed, X_test_transformed = (
    pd.DataFrame(pipe.transform(split)) for split in (X_train, X_test)
)

In [151]:
# Baseline decision tree with default hyper-parameters.
# random_state is fixed so the fitted tree, and the accuracy figures derived
# from it, are reproducible across kernel restarts.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train_transformed, y_train)

# Predictions on the held-out test split
y_pred = model.predict(X_test_transformed)
# Predictions on the training split (to compare against the test score)
y_train_predict = model.predict(X_train_transformed)

In [152]:
# Accuracy of the baseline tree on the training split
accuracy_score(y_true=y_train, y_pred=y_train_predict)

1.0

In [153]:
# Accuracy of the baseline tree on the held-out test split
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7721088435374149

In [154]:
# Hyper-parameter grid for the decision tree search
params = dict(
    criterion=("gini", "entropy"),
    splitter=("best", "random"),
    max_depth=list(range(1, 20)),          # depths 1..19
    min_samples_split=[2, 3, 4],
    min_samples_leaf=list(range(1, 20)),   # leaf sizes 1..19
)

In [155]:
# Exhaustive search over `params` with 3-fold cross-validation, scored by accuracy
tree_clf = DecisionTreeClassifier(random_state=3)
tree_cv = GridSearchCV(
    estimator=tree_clf,
    param_grid=params,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,   # use all available cores
    verbose=1,
).fit(X_train_transformed, y_train)

best_params = tree_cv.best_params_
# NOTE(review): the message below contains a typo ("paramters") and a stray ")"
print(f"Best paramters: {best_params})")

Fitting 3 folds for each of 4332 candidates, totalling 12996 fits
Best paramters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'})


In [156]:
# Mean cross-validated score (accuracy, 3 folds) of the best parameter combination
tree_cv.best_score_

0.8469387755102041

In [160]:
# Predict the held-out test split with the tuned tree found by the grid search.
# GridSearchCV refits the best parameter combination on the full training data
# by default, so `best_estimator_` is ready to use. The previous code called the
# untuned baseline `model`, which ignored the search result.
y_test_predict = tree_cv.best_estimator_.predict(X_test_transformed)
y_test_predict

array([1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0], dtype=int64)

In [166]:
# Accuracy on the held-out test split (y_test vs. y_test_predict)
accuracy_score(y_test,y_test_predict)

0.8061224489795918

In [163]:
# Instantiate XGBoost classifier for the binary attrition target
xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

# Train the model on the training features and the matching training labels.
# (The labels must be y_train: y_test belongs to the held-out rows and does not
# correspond to X_train_transformed.)
xgb_model.fit(X_train_transformed, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test_transformed)

# Evaluate model performance on the held-out test split
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8571428571428571
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.95      0.92       255
           1       0.43      0.26      0.32        39

    accuracy                           0.86       294
   macro avg       0.66      0.60      0.62       294
weighted avg       0.83      0.86      0.84       294



In [164]:
# Hyper-parameter grid for the XGBoost search (3 values per parameter)
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[100, 200, 300],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

# Perform grid search with 3-fold cross-validation, scored by accuracy.
# The search is fitted on the training features with the matching training
# labels (y_train); y_test is reserved for the final evaluation below.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_transformed, y_train)

# Print the best parameters found by grid search
print("Best Parameters:", grid_search.best_params_)

# Get the best model from grid search (refit on the full training data)
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test_transformed)

# Evaluate model performance on the held-out test split
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1
print("Classification Report:")
print(classification_report(y_test, y_pred))

Best Parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.6}
Accuracy: 0.8639455782312925
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.97      0.93       255
           1       0.47      0.18      0.26        39

    accuracy                           0.86       294
   macro avg       0.68      0.57      0.59       294
weighted avg       0.83      0.86      0.84       294



In [165]:
# Persist the tuned model to disk.
# NOTE(review): only `best_model` is saved here; it was trained on the output of
# `pipe`, so the fitted preprocessing pipeline is needed alongside it at inference time.
with open('Attrition_model.pkl', 'wb') as model_file:
    pickle.Pickler(model_file).dump(best_model)
