In [1]:
# Import libraries
import pandas as pd
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Read in CSV
file_path = "Resources/Employee.csv"
employee_df = pd.read_csv(file_path)
employee_df.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [3]:
# Define features
X = employee_df.drop(columns=['LeaveOrNot'])
X.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
0,Bachelors,2017,Bangalore,3,34,Male,No,0
1,Bachelors,2013,Pune,1,28,Female,No,3
2,Bachelors,2014,New Delhi,3,38,Female,No,2
3,Masters,2016,Bangalore,3,27,Male,No,5
4,Masters,2017,Pune,3,24,Male,Yes,2


In [4]:
# Define target
y = employee_df['LeaveOrNot']
y.head()

0    0
1    1
2    0
3    1
4    1
Name: LeaveOrNot, dtype: int64

In [5]:
# Label Encoding for Education, Gender, and EverBenched
label_encoder = LabelEncoder()
X['Education'] = label_encoder.fit_transform(X['Education'])
X['Gender'] = label_encoder.fit_transform(X['Gender'])
X['EverBenched'] = label_encoder.fit_transform(X['EverBenched'])
X.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
0,0,2017,Bangalore,3,34,1,0,0
1,0,2013,Pune,1,28,0,0,3
2,0,2014,New Delhi,3,38,0,0,2
3,1,2016,Bangalore,3,27,1,0,5
4,1,2017,Pune,3,24,1,1,2


In [6]:
# One-Hot Encoding for City
X = pd.get_dummies(X, columns=['City']).astype(int)
X.head()

Unnamed: 0,Education,JoiningYear,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,City_Bangalore,City_New Delhi,City_Pune
0,0,2017,3,34,1,0,0,1,0,0
1,0,2013,1,28,0,0,3,0,0,1
2,0,2014,3,38,0,0,2,0,1,0
3,1,2016,3,27,1,0,5,1,0,0
4,1,2017,3,24,1,1,2,0,0,1


In [7]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [8]:
# Create StandardScaler instance
scaler = StandardScaler()

In [9]:
# Fit and scale the training data
X_train_scaled = scaler.fit_transform(X_train)

In [10]:
# Fit and scale the test data
X_test_scaled = scaler.fit_transform(X_test)

In [11]:
# Initialize Random Forest model
KNN_model = KNeighborsClassifier(n_neighbors=3)

In [12]:
# Fit the model
KNN_model.fit(X_train_scaled, y_train)

In [13]:
# Make predictions using the scaled test data
y_pred = KNN_model.predict(X_test_scaled)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [14]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8325


In [15]:
# Print classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.92      0.88       775
           1       0.81      0.66      0.72       389

    accuracy                           0.83      1164
   macro avg       0.82      0.79      0.80      1164
weighted avg       0.83      0.83      0.83      1164



In [16]:
# Print confusion matrix
print(confusion_matrix(y_test, y_pred))

[[714  61]
 [134 255]]


## Optimization Techniques

In [17]:
employee_copy = employee_df.copy()

In [18]:
employee_copy.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [19]:
drop_2018 = employee_copy.drop(employee_copy[employee_copy['JoiningYear'] == 2018].index)
drop_2018.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [20]:
X_1 = drop_2018.drop(columns=['LeaveOrNot'])
X_1.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
0,Bachelors,2017,Bangalore,3,34,Male,No,0
1,Bachelors,2013,Pune,1,28,Female,No,3
2,Bachelors,2014,New Delhi,3,38,Female,No,2
3,Masters,2016,Bangalore,3,27,Male,No,5
4,Masters,2017,Pune,3,24,Male,Yes,2


In [21]:
y_1 = drop_2018['LeaveOrNot']
y_1.head()

0    0
1    1
2    0
3    1
4    1
Name: LeaveOrNot, dtype: int64

In [23]:
label_encoder = LabelEncoder()
X_1['Education'] = label_encoder.fit_transform(X_1['Education'])
X_1['Gender'] = label_encoder.fit_transform(X_1['Gender'])
X_1['EverBenched'] = label_encoder.fit_transform(X_1['EverBenched'])
X_1.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
0,0,2017,Bangalore,3,34,1,0,0
1,0,2013,Pune,1,28,0,0,3
2,0,2014,New Delhi,3,38,0,0,2
3,1,2016,Bangalore,3,27,1,0,5
4,1,2017,Pune,3,24,1,1,2


In [24]:
X_1 = pd.get_dummies(X_1, columns=['City']).astype(int)
X_1.head()

Unnamed: 0,Education,JoiningYear,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,City_Bangalore,City_New Delhi,City_Pune
0,0,2017,3,34,1,0,0,1,0,0
1,0,2013,1,28,0,0,3,0,0,1
2,0,2014,3,38,0,0,2,0,1,0
3,1,2016,3,27,1,0,5,1,0,0
4,1,2017,3,24,1,1,2,0,0,1


In [28]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, random_state=42)

In [29]:
# Create StandardScaler instance
scaler_1 = StandardScaler()

In [30]:
X_train_scaled = scaler_1.fit_transform(X_train)

In [32]:
X_test_scaled = scaler.fit_transform(X_test)

In [35]:
KNN_model_1 = KNeighborsClassifier(n_neighbors=3)

In [36]:
KNN_model_1.fit(X_train_scaled, y_train)

In [37]:
y_pred_1 = KNN_model_1.predict(X_test_scaled)
y_pred_1

array([0, 0, 1, ..., 0, 1, 0], dtype=int64)

In [38]:
accuracy = accuracy_score(y_test, y_pred_1)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8004


In [39]:
print(classification_report(y_test, y_pred_1))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86       748
           1       0.73      0.55      0.62       324

    accuracy                           0.80      1072
   macro avg       0.77      0.73      0.74      1072
weighted avg       0.79      0.80      0.79      1072

