# **Week 5 project: Employee Attrition Predcition**


In [194]:
# importing basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [195]:
# importing scikit learn libraries
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [196]:
# loading dataset
emp=pd.read_csv('/content/HR Employee Attrition.csv')
emp.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [197]:
# counting values in the Attrition column
print(emp['Attrition'].value_counts())

Attrition
No     1233
Yes     237
Name: count, dtype: int64


In [198]:
no=emp[emp['Attrition']=="No"]
yes=emp[emp['Attrition']=="Yes"]

In [199]:
# resampling data because of unbalance
from sklearn.utils import resample

In [200]:
print(emp['Attrition'].unique())

['Yes' 'No']


In [203]:
print("Length Yes, No: ",len(yes),len(no))

Length Yes, No:  237 1233


## **Resampling the unbalanced Data**

In [208]:
# performing Resampling
resample_yes = resample(
    yes,
    replace=True,
    n_samples=len(no),
    random_state=42
)

In [209]:
# merging the exiting data with resampled data
emp=pd.concat([no,resample_yes])

In [210]:
print(emp['Attrition'].value_counts())

Attrition
No     1233
Yes    1233
Name: count, dtype: int64


### **Label Encoding**

In [211]:
le=LabelEncoder()

In [212]:
# encodign the columns that are not numeric
for col in emp.select_dtypes(include=['object']).columns:
  emp[col]=le.fit_transform(emp[col])

In [213]:
x=emp.drop('Attrition', axis=1)
y=emp['Attrition']

### **Standard Scaling**

In [214]:
scale=StandardScaler()

In [215]:
sx_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

In [216]:
# scaling the input
x_train=scale.fit_transform(x_train)
x_test=scale.transform(x_test)

## **Decision Tree**

In [217]:
dt=DecisionTreeClassifier(criterion='gini', max_depth=3,random_state=42)

In [218]:
dt.fit(x_train,y_train)

In [219]:
y_pred=dt.predict(x_test)

In [220]:
print("Accuracy of Decision Tree:",accuracy_score(y_test,y_pred))

Accuracy of Decision Tree: 0.7148648648648649


In [221]:
print("Classification Report Decision Tree:\n",classification_report(y_test,y_pred))

Classification Report Decision Tree:
               precision    recall  f1-score   support

           0       0.67      0.85      0.75       367
           1       0.80      0.58      0.67       373

    accuracy                           0.71       740
   macro avg       0.73      0.72      0.71       740
weighted avg       0.73      0.71      0.71       740



## **Bagging**

In [222]:
bag_model=BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=50,
    random_state=42
)

In [223]:
bag_model.fit(x_train,y_train)

In [224]:
y_pred_bag=bag_model.predict(x_test)

In [225]:
print("Classification Report Bagging:\n",classification_report(y_test,y_pred_bag))

Classification Report Bagging:
               precision    recall  f1-score   support

           0       0.98      0.93      0.95       367
           1       0.93      0.98      0.96       373

    accuracy                           0.95       740
   macro avg       0.96      0.95      0.95       740
weighted avg       0.96      0.95      0.95       740



## **Random Forest**

In [226]:
rf=RandomForestClassifier(n_estimators=50,random_state=42)

In [227]:
rf.fit(x_train,y_train)

In [228]:
y_pred_rf=rf.predict(x_test)

In [229]:
print("Classification Report Random Forest:\n",classification_report(y_test,y_pred_rf))

Classification Report Random Forest:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97       367
           1       0.96      0.98      0.97       373

    accuracy                           0.97       740
   macro avg       0.97      0.97      0.97       740
weighted avg       0.97      0.97      0.97       740



## **K Nearest Neighbor**

In [230]:
knn=KNeighborsClassifier(n_neighbors=9,metric='euclidean')

In [231]:
knn.fit(x_train,y_train)

In [232]:
y_pred_knn=knn.predict(x_test)

In [233]:
print("Classification Report KNN:\n",classification_report(y_test,y_pred_knn))

Classification Report KNN:
               precision    recall  f1-score   support

           0       0.74      0.74      0.74       367
           1       0.75      0.75      0.75       373

    accuracy                           0.74       740
   macro avg       0.74      0.74      0.74       740
weighted avg       0.74      0.74      0.74       740



## **Performing Grid Search**

In [234]:
# min samples= This controls the minimum number of samples required to split an internal node.
params_grid={
    "n_estimators": [50,100,150],
    "max_depth": [None,5,10],
    "min_samples_split": [2,5,10]
}

In [235]:
# n_jobs=It controls how many CPU cores to use when running tasks in parallel, -1 means utalize all
grid=GridSearchCV(RandomForestClassifier(random_state=42),params_grid,cv=5,scoring='f1',n_jobs=5)

In [236]:
grid.fit(x_train,y_train)

In [237]:
print("Best Random Forest Parameters:",grid.best_params_)

Best Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 150}


In [238]:
print("Best F1 Score:",grid.best_score_)

Best F1 Score: 0.9544476999787881
