In [264]:
# Import libraries
import pandas as pd
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE

In [265]:
# Read in CSV
file_path = "Resources/Employee.csv"
employee_df = pd.read_csv(file_path)
employee_df.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [266]:
# Define features
X = employee_df.drop(columns=['LeaveOrNot'])
X.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
0,Bachelors,2017,Bangalore,3,34,Male,No,0
1,Bachelors,2013,Pune,1,28,Female,No,3
2,Bachelors,2014,New Delhi,3,38,Female,No,2
3,Masters,2016,Bangalore,3,27,Male,No,5
4,Masters,2017,Pune,3,24,Male,Yes,2


In [267]:
# Define target
y = employee_df['LeaveOrNot']
y.head()

0    0
1    1
2    0
3    1
4    1
Name: LeaveOrNot, dtype: int64

In [268]:
# Label Encoding for Education, Gender, and EverBenched
label_encoder = LabelEncoder()
X['Education'] = label_encoder.fit_transform(X['Education'])
X['Gender'] = label_encoder.fit_transform(X['Gender'])
X['EverBenched'] = label_encoder.fit_transform(X['EverBenched'])
X.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
0,0,2017,Bangalore,3,34,1,0,0
1,0,2013,Pune,1,28,0,0,3
2,0,2014,New Delhi,3,38,0,0,2
3,1,2016,Bangalore,3,27,1,0,5
4,1,2017,Pune,3,24,1,1,2


In [269]:
# One-Hot Encoding for City
X = pd.get_dummies(X, columns=['City']).astype(int)
X.head()

Unnamed: 0,Education,JoiningYear,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,City_Bangalore,City_New Delhi,City_Pune
0,0,2017,3,34,1,0,0,1,0,0
1,0,2013,1,28,0,0,3,0,0,1
2,0,2014,3,38,0,0,2,0,1,0
3,1,2016,3,27,1,0,5,1,0,0
4,1,2017,3,24,1,1,2,0,0,1


In [270]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [271]:
# Create StandardScaler instance
scaler = StandardScaler()

In [272]:
# Fit and scale the training data
X_train_scaled = scaler.fit_transform(X_train)

In [273]:
# Fit and scale the test data
X_test_scaled = scaler.fit_transform(X_test)

In [274]:
# Initialize Random Forest model
KNN_model = KNeighborsClassifier(n_neighbors=3)

In [275]:
# Fit the model
KNN_model.fit(X_train_scaled, y_train)

In [276]:
# Make predictions using the scaled test data
y_pred = KNN_model.predict(X_test_scaled)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [277]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8325


In [278]:
# Print classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.92      0.88       775
           1       0.81      0.66      0.72       389

    accuracy                           0.83      1164
   macro avg       0.82      0.79      0.80      1164
weighted avg       0.83      0.83      0.83      1164



In [279]:
# Print confusion matrix
print(confusion_matrix(y_test, y_pred))

[[714  61]
 [134 255]]


## Optimization Techniques

In [280]:
employee_copy = employee_df.copy()

In [281]:
employee_copy.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [282]:
drop_2018 = employee_copy.drop(employee_copy[employee_copy['JoiningYear'] == 2018].index)
drop_2018.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [283]:
X_1 = drop_2018.drop(columns=['LeaveOrNot'])
X_1.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
0,Bachelors,2017,Bangalore,3,34,Male,No,0
1,Bachelors,2013,Pune,1,28,Female,No,3
2,Bachelors,2014,New Delhi,3,38,Female,No,2
3,Masters,2016,Bangalore,3,27,Male,No,5
4,Masters,2017,Pune,3,24,Male,Yes,2


In [284]:
y_1 = drop_2018['LeaveOrNot']
y_1.head()

0    0
1    1
2    0
3    1
4    1
Name: LeaveOrNot, dtype: int64

In [285]:
label_encoder = LabelEncoder()
X_1['Education'] = label_encoder.fit_transform(X_1['Education'])
X_1['Gender'] = label_encoder.fit_transform(X_1['Gender'])
X_1['EverBenched'] = label_encoder.fit_transform(X_1['EverBenched'])
X_1.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
0,0,2017,Bangalore,3,34,1,0,0
1,0,2013,Pune,1,28,0,0,3
2,0,2014,New Delhi,3,38,0,0,2
3,1,2016,Bangalore,3,27,1,0,5
4,1,2017,Pune,3,24,1,1,2


In [286]:
X_1 = pd.get_dummies(X_1, columns=['City']).astype(int)
X_1.head()

Unnamed: 0,Education,JoiningYear,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,City_Bangalore,City_New Delhi,City_Pune
0,0,2017,3,34,1,0,0,1,0,0
1,0,2013,1,28,0,0,3,0,0,1
2,0,2014,3,38,0,0,2,0,1,0
3,1,2016,3,27,1,0,5,1,0,0
4,1,2017,3,24,1,1,2,0,0,1


In [321]:
# Split the data into training and test sets
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, random_state=42)

In [322]:
# Create StandardScaler instance
scaler_1 = StandardScaler()

In [323]:
X_train_scaled_1 = scaler_1.fit_transform(X_train_1)

In [324]:
X_test_scaled_1 = scaler.fit_transform(X_test_1)

In [325]:
KNN_model_1 = KNeighborsClassifier(n_neighbors=3)

In [326]:
KNN_model_1.fit(X_train_scaled_1, y_train_1)

In [327]:
y_pred_1 = KNN_model_1.predict(X_test_scaled_1)
y_pred_1

array([0, 0, 1, ..., 0, 1, 0], dtype=int64)

In [328]:
accuracy = accuracy_score(y_test_1, y_pred_1)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8004


In [329]:
print(classification_report(y_test_1, y_pred_1))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86       748
           1       0.73      0.55      0.62       324

    accuracy                           0.80      1072
   macro avg       0.77      0.73      0.74      1072
weighted avg       0.79      0.80      0.79      1072



## SMOTE

In [330]:
smote_sampler = SMOTE(random_state=1, sampling_strategy='auto')

In [331]:
X_resampled, y_resampled = smote_sampler.fit_resample(X_train_scaled_1, y_train_1)

In [332]:
y_resampled.value_counts()

LeaveOrNot
0    2300
1    2300
Name: count, dtype: int64

In [338]:
KNN_model_smote = KNeighborsClassifier(n_neighbors=5)

In [339]:
KNN_model_smote.fit(X_train_scaled_1, y_train_1)

In [350]:
smote_y_pred = KNN_model_smote.predict(X_test_scaled_1)

In [351]:
print(f"Classifiction Report - Original Data")
print(classification_report(y_test_1, y_pred_1))
print("---------")
print(f"Classifiction Report - Redsampled Data - SMOTE")
print(classification_report(y_test_1, smote_y_pred))

Classifiction Report - Original Data
              precision    recall  f1-score   support

           0       0.82      0.91      0.86       748
           1       0.73      0.55      0.62       324

    accuracy                           0.80      1072
   macro avg       0.77      0.73      0.74      1072
weighted avg       0.79      0.80      0.79      1072

---------
Classifiction Report - Redsampled Data - SMOTE
              precision    recall  f1-score   support

           0       0.82      0.93      0.87       748
           1       0.77      0.53      0.63       324

    accuracy                           0.81      1072
   macro avg       0.79      0.73      0.75      1072
weighted avg       0.80      0.81      0.80      1072



## SMOTEENN

In [342]:
smote_enn = SMOTEENN(random_state=1)

In [343]:
X_resampled_1, y_resampled_1 = smote_enn.fit_resample(X_train_scaled_1, y_train_1)

In [344]:
y_resampled_1.value_counts()

LeaveOrNot
0    1472
1    1439
Name: count, dtype: int64

In [345]:
KNN_model_smoteenn = KNeighborsClassifier(n_neighbors=3)

In [346]:
KNN_model_smoteenn.fit(X_resampled_1, y_resampled_1)

In [352]:
smoteenn_y_pred = KNN_model_smoteenn.predict(X_test_scaled_1)

In [353]:
# Print classification reports
print(f"Classifiction Report - Original Data")
print(classification_report(y_test_1, y_pred_1))
print("---------")
print(f"Classifiction Report - Redsampled Data - SMOTEENN")
print(classification_report(y_test_1, smoteenn_y_pred))

Classifiction Report - Original Data
              precision    recall  f1-score   support

           0       0.82      0.91      0.86       748
           1       0.73      0.55      0.62       324

    accuracy                           0.80      1072
   macro avg       0.77      0.73      0.74      1072
weighted avg       0.79      0.80      0.79      1072

---------
Classifiction Report - Redsampled Data - SMOTEENN
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       748
           1       0.61      0.64      0.63       324

    accuracy                           0.77      1072
   macro avg       0.73      0.73      0.73      1072
weighted avg       0.77      0.77      0.77      1072

