In [1]:
# Import libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

In [2]:
# Load the employee dataset from the CSV file under Resources/
file_path = "Resources/Employee.csv"
employee_df = pd.read_csv(filepath_or_buffer=file_path)

# Show the first five rows as a quick sanity check
employee_df.head(n=5)

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [3]:
# Keep only the rows whose JoiningYear is not 2018, then renumber the rows
# from 0 so the index has no gaps left by the dropped records.
# NOTE(review): the reason for excluding 2018 is not stated in this notebook.
employee_df = (
    employee_df
    .loc[employee_df['JoiningYear'].ne(2018)]
    .reset_index(drop=True)
)
employee_df.head(n=5)

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [4]:
# Feature matrix: every column except the target label
X = employee_df.drop(labels='LeaveOrNot', axis='columns')
X.head(n=5)

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
0,Bachelors,2017,Bangalore,3,34,Male,No,0
1,Bachelors,2013,Pune,1,28,Female,No,3
2,Bachelors,2014,New Delhi,3,38,Female,No,2
3,Masters,2016,Bangalore,3,27,Male,No,5
4,Masters,2017,Pune,3,24,Male,Yes,2


In [5]:
# Target vector: the LeaveOrNot column
y = employee_df.loc[:, 'LeaveOrNot']
y.head(n=5)

0    0
1    1
2    0
3    1
4    1
Name: LeaveOrNot, dtype: int64

In [6]:
# Integer-encode the Education, Gender and EverBenched columns.
# The same LabelEncoder object is refit for each column in turn, so after
# this cell it only remembers the classes of the last column (EverBenched).
label_encoder = LabelEncoder()
for categorical_column in ('Education', 'Gender', 'EverBenched'):
    X[categorical_column] = label_encoder.fit_transform(X[categorical_column])
X.head(n=5)

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
0,0,2017,Bangalore,3,34,1,0,0
1,0,2013,Pune,1,28,0,0,3
2,0,2014,New Delhi,3,38,0,0,2
3,1,2016,Bangalore,3,27,1,0,5
4,1,2017,Pune,3,24,1,1,2


In [7]:
# Expand City into one indicator column per city, then cast the whole
# frame to integers so the indicators are stored as 0/1
X = pd.get_dummies(data=X, columns=['City'])
X = X.astype(int)
X.head(n=5)

Unnamed: 0,Education,JoiningYear,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,City_Bangalore,City_New Delhi,City_Pune
0,0,2017,3,34,1,0,0,1,0,0
1,0,2013,1,28,0,0,3,0,0,1
2,0,2014,3,38,0,0,2,0,1,0
3,1,2016,3,27,1,0,5,1,0,0
4,1,2017,3,24,1,1,2,0,0,1


In [8]:
# Hold out a test set with a fixed seed; no test_size is passed, so
# train_test_split falls back to its default split proportion.
# NOTE(review): the split is not stratified on y even though the classes
# are imbalanced — consider stratify=y and re-running the cells below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [9]:
# Rebalance the training split with SMOTETomek: SMOTE over-sampling
# followed by Tomek-link cleaning. Only the training data is resampled;
# the test split is left untouched.
# NOTE(review): resampling runs before scaling, so the neighbour distances
# SMOTE relies on are computed on raw feature scales — confirm this is intended.
smote_tomek = SMOTETomek(random_state=42)
resampled_split = smote_tomek.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_split

In [10]:
# Create the StandardScaler (default settings) that the next cell fits on
# the resampled training features and reuses for the test features
scaler = StandardScaler()

In [11]:
# Learn the scaling statistics from the resampled training data only,
# then apply that same transformation to both the training and test features
scaler.fit(X_train_resampled)
X_train_scaled = scaler.transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Random forest with a fixed seed; class_weight="balanced" weights the
# classes inversely to their frequency in the labels passed to fit
rf_model = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
)

In [13]:
# Fit the forest on the scaled, resampled training data
rf_model.fit(X=X_train_scaled, y=y_train_resampled)

In [14]:
# Predicted class probabilities for the scaled test set; column 1 is the
# probability of the second entry in rf_model.classes_ (label 1 here)
class_probabilities = rf_model.predict_proba(X_test_scaled)
y_prob = class_probabilities[:, 1]

In [15]:
# Predict class 1 whenever its probability is strictly above the cutoff.
# A cutoff below 0.5 labels more rows as 1, which favours recall.
# NOTE(review): how 0.4 was chosen is not shown in this notebook.
threshold = 0.4
exceeds_threshold = y_prob > threshold
predictions = exceeds_threshold.astype(int)
predictions

array([0, 0, 0, ..., 0, 1, 0])

In [16]:
# Evaluate the thresholded predictions on the held-out test set.
# All three metrics come from their dedicated scorer functions; precision
# uses precision_score (positive label 1 by default) rather than indexing a
# classification_report dict by the string key "1", which would raise a
# KeyError if label 1 were absent and builds a full report for one number.
recall = recall_score(y_test, predictions)
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)

print(f"Test Set Recall: {recall:.4f}")
print(f"Test Set Accuracy: {accuracy:.4f}")
print(f"Test Set Precision: {precision:.4f}")

Test Set Recall: 0.6451
Test Set Accuracy: 0.7388
Test Set Precision: 0.5588


In [17]:
# Per-class precision, recall, F1 and support for the test predictions
report_text = classification_report(y_test, predictions)
print(report_text)

              precision    recall  f1-score   support

           0       0.84      0.78      0.81       748
           1       0.56      0.65      0.60       324

    accuracy                           0.74      1072
   macro avg       0.70      0.71      0.70      1072
weighted avg       0.75      0.74      0.74      1072



In [18]:
# Confusion matrix for the test set (rows: true labels, columns: predicted labels)
test_confusion = confusion_matrix(y_test, predictions)
print(test_confusion)

[[583 165]
 [115 209]]


In [20]:
# Feature importance: one score per input column of the fitted forest,
# in the same order as the columns the model was trained on
importances = rf_model.feature_importances_
importances

array([0.10206373, 0.13403112, 0.16221027, 0.210498  , 0.12484626,
       0.0181703 , 0.10348361, 0.03964558, 0.0459009 , 0.05915023])

In [21]:
# Pair each importance score with its column name and list the pairs
# from the most to the least important feature
importance_by_feature = list(zip(importances, X.columns))
importance_by_feature.sort(reverse=True)
importance_by_feature

[(0.21049799903862806, 'Age'),
 (0.16221027176955208, 'PaymentTier'),
 (0.1340311229083998, 'JoiningYear'),
 (0.12484625614214079, 'Gender'),
 (0.1034836064821263, 'ExperienceInCurrentDomain'),
 (0.1020637258909569, 'Education'),
 (0.05915022972949393, 'City_Pune'),
 (0.04590090430910557, 'City_New Delhi'),
 (0.03964558333474277, 'City_Bangalore'),
 (0.018170300394853865, 'EverBenched')]