In [1]:
# Import libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Load the employee dataset from the relative Resources folder and preview the first rows
file_path = "Resources/Employee.csv"
employee_df = pd.read_csv(filepath_or_buffer=file_path)
employee_df.head(n=5)

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [3]:
# Keep only the rows whose JoiningYear differs from 2018, then renumber the index from 0
not_2018_mask = employee_df['JoiningYear'].ne(2018)
employee_df = employee_df.loc[not_2018_mask].reset_index(drop=True)
employee_df.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [4]:
# Build the feature matrix: every column except the target ('LeaveOrNot')
# and the two excluded predictors ('City' and 'EverBenched')
excluded_columns = ['LeaveOrNot', 'City', 'EverBenched']
X = employee_df.drop(columns=excluded_columns)
X.head()

Unnamed: 0,Education,JoiningYear,PaymentTier,Age,Gender,ExperienceInCurrentDomain
0,Bachelors,2017,3,34,Male,0
1,Bachelors,2013,1,28,Female,3
2,Bachelors,2014,3,38,Female,2
3,Masters,2016,3,27,Male,5
4,Masters,2017,3,24,Male,2


In [5]:
# The target is the 'LeaveOrNot' column (0/1 in the preview above)
y = employee_df.loc[:, 'LeaveOrNot']
y.head(n=5)

0    0
1    1
2    0
3    1
4    1
Name: LeaveOrNot, dtype: int64

In [6]:
# Label-encode the two remaining text columns, Education and Gender.
# ('EverBenched' is not encoded: it was dropped when X was defined.)
# Each column gets its own fitted encoder, stored in `label_encoders`, so the
# integer -> original label mapping of every column stays recoverable
# (e.g. via label_encoders['Education'].inverse_transform(...)).
label_encoders = {}
for column in ('Education', 'Gender'):
    label_encoder = LabelEncoder()
    X[column] = label_encoder.fit_transform(X[column])
    label_encoders[column] = label_encoder
# After the loop `label_encoder` is the encoder fitted on 'Gender',
# matching what this name held before.
X.head()

Unnamed: 0,Education,JoiningYear,PaymentTier,Age,Gender,ExperienceInCurrentDomain
0,0,2017,3,34,1,0
1,0,2013,1,28,0,3
2,0,2014,3,38,0,2
3,1,2016,3,27,1,5
4,1,2017,3,24,1,2


In [7]:
# Hold out part of the data for evaluation. The split proportions are
# scikit-learn's defaults; random_state=42 makes the shuffle reproducible.
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(X, y, random_state=42)

In [8]:
# Create the StandardScaler instance (default settings) that the following
# cells use to scale the feature matrices
scaler = StandardScaler()

In [9]:
# Learn the scaling statistics from the training features, then apply them
# to those same features
X_train_scaled = scaler.fit(X_train).transform(X_train)

In [10]:
# Scale the test data with the scaler that was already fitted on X_train.
# Only transform here: refitting on X_test would standardize the test set with
# its own mean/std, so the model would be evaluated on features scaled
# differently from the ones it was trained on (and test-set statistics would
# leak into preprocessing).
X_test_scaled = scaler.transform(X_test)

In [11]:
# Initialize the Random Forest classifier; random_state=42 fixes the model's
# internal randomness so repeated runs produce the same forest
rf_model = RandomForestClassifier(random_state=42)

In [12]:
# Train the forest on the scaled training features and their labels
rf_model.fit(X=X_train_scaled, y=y_train)

In [13]:
# Predict a label for every row of the scaled test set and show the result
predictions = rf_model.predict(X=X_test_scaled)
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [14]:
# Score the model: fraction of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7677


In [15]:
# Per-class precision, recall, F1 and support for the test predictions
report_text = classification_report(y_true=y_test, y_pred=predictions)
print(report_text)

              precision    recall  f1-score   support

           0       0.79      0.92      0.85       748
           1       0.69      0.43      0.53       324

    accuracy                           0.77      1072
   macro avg       0.74      0.67      0.69      1072
weighted avg       0.76      0.77      0.75      1072



In [16]:
# Confusion matrix of the test predictions (rows: true class, columns: predicted class)
test_confusion = confusion_matrix(y_true=y_test, y_pred=predictions)
print(test_confusion)

[[685  63]
 [186 138]]


In [17]:
# Feature importances of the fitted forest: one value per column of X,
# in the same order as X.columns
importances = rf_model.feature_importances_
importances

array([0.11272951, 0.17650123, 0.22019621, 0.25742889, 0.10055498,
       0.13258919])

In [18]:
# Pair each importance with its feature name and rank from most to least important
ranked_features = list(zip(importances, X.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.25742888697154087, 'Age'),
 (0.22019620789264777, 'PaymentTier'),
 (0.1765012271409142, 'JoiningYear'),
 (0.1325891867552712, 'ExperienceInCurrentDomain'),
 (0.11272951068729953, 'Education'),
 (0.10055498055232649, 'Gender')]