In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.utils import resample
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the dataset
data = pd.read_csv('HR_comma_sep.csv')
print("Dataset Loaded Successfully")

Dataset Loaded Successfully


In [3]:
# Check for missing values and print columns with missing data
missing_values = data.isnull().sum()
print("\nMissing Values per column:\n", missing_values[missing_values > 0])

# Handle missing values by filling with median for numeric columns only
# Select only numeric columns
numeric_data = data.select_dtypes(include=np.number)

# Calculate median for numeric columns
numeric_medians = numeric_data.median()

# Fill missing values in numeric columns with their respective medians
data[numeric_data.columns] = data[numeric_data.columns].fillna(numeric_medians)

print("\nMissing values in numeric columns handled with median replacement.")

# For categorical columns, you might want to fill with the most frequent value (mode)
# or a separate strategy
categorical_columns = data.select_dtypes(exclude=np.number).columns

for col in categorical_columns:
    mode_value = data[col].mode()[0]  # Get the most frequent value
    data[col].fillna(mode_value, inplace=True)

print("\nMissing values in categorical columns handled with mode replacement.")


Missing Values per column:
 Series([], dtype: int64)

Missing values in numeric columns handled with median replacement.

Missing values in categorical columns handled with mode replacement.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(mode_value, inplace=True)


In [4]:
# This part cover the visualization parts of the dataset
# data description
print(data.head())
print(data.info())
print(data.describe(include ='all'))

   satisfaction_level  last_evaluation  number_project  average_montly_hours  \
0                0.38             0.53               2                   157   
1                0.80             0.86               5                   262   
2                0.11             0.88               7                   272   
3                0.72             0.87               5                   223   
4                0.37             0.52               2                   159   

   time_spend_company  Work_accident  left  promotion_last_5years Departments  \
0                   3              0     1                      0       sales   
1                   6              0     1                      0       sales   
2                   4              0     1                      0       sales   
3                   5              0     1                      0       sales   
4                   3              0     1                      0       sales   

   salary  
0     low  
1  mediu

In [5]:
#  Import LabelEncoder & creating labelEncoder
le = preprocessing.LabelEncoder()

# Converting string labels into numbers (ordinal scale).
# - original values in string
data['salary']=le.fit_transform(data['salary'])
data['Departments']=le.fit_transform(data['Departments'])

In [6]:
from sklearn.metrics import accuracy_score, classification_report
import warnings
# Spliting data into Feature (X) and target (T)
# X represent features or variables
X=data[['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company',
     'Work_accident','promotion_last_5years', 'Departments', 'salary']]
#Y represents target
y=data['left']
# Split data into Features (X) and Target (y)
X = data[['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours',
          'time_spend_company', 'Work_accident', 'promotion_last_5years']]
y = data['left']

In [7]:
# Splitting the data (80:20 ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Train and Test Sets Split:")
print("Train set:", X_train.shape)
print("Test set:", X_test.shape)

Train and Test Sets Split:
Train set: (11999, 7)
Test set: (3000, 7)


In [None]:

# Define hyperparameter grids for all classifiers
# K-Nearest Neighbors
knn_param_grid = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance']
}
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_param_grid, cv=3, scoring='accuracy')
knn_grid.fit(X_train, y_train)
knn_best = knn_grid.best_estimator_
print(f"KNN Best Parameters: {knn_grid.best_params_}")

# Random Forest
rfc_param_grid = {
    'n_estimators': [10, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20]
}
rfc_grid = GridSearchCV(RandomForestClassifier(random_state=7), rfc_param_grid, cv=3, scoring='accuracy')
rfc_grid.fit(X_train, y_train)
rfc_best = rfc_grid.best_estimator_
print(f"Random Forest Best Parameters: {rfc_grid.best_params_}")

# Support Vector Machine
svc_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}
svc_grid = GridSearchCV(SVC(probability=True), svc_param_grid, cv=3, scoring='accuracy')
svc_grid.fit(X_train, y_train)
svc_best = svc_grid.best_estimator_
print(f"SVM Best Parameters: {svc_grid.best_params_}")

# Logistic Regression
lr_param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['lbfgs', 'liblinear']
}
lr_grid = GridSearchCV(LogisticRegression(max_iter=1000), lr_param_grid, cv=3, scoring='accuracy')
lr_grid.fit(X_train, y_train)
lr_best = lr_grid.best_estimator_
print(f"Logistic Regression Best Parameters: {lr_grid.best_params_}")

# Decision Tree
dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
dt_grid = GridSearchCV(DecisionTreeClassifier(random_state=7), dt_param_grid, cv=3, scoring='accuracy')
dt_grid.fit(X_train, y_train)
dt_best = dt_grid.best_estimator_
print(f"Decision Tree Best Parameters: {dt_grid.best_params_}")

# Naive Bayes
nb_param_grid = {
    'alpha': [0.1, 1.0, 10.0]
}
nb_grid = GridSearchCV(MultinomialNB(), nb_param_grid, cv=3, scoring='accuracy')
nb_grid.fit(X_train, y_train)
nb_best = nb_grid.best_estimator_
print(f"Naive Bayes Best Parameters: {nb_grid.best_params_}")

# MLP Neural Network
mlp_param_grid = {
    'hidden_layer_sizes': [(6,), (50,), (100,), (100, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],
    'alpha': [1e-05, 1e-04],
    'max_iter': [1000]
}
mlp_grid = GridSearchCV(MLPClassifier(random_state=1), mlp_param_grid, cv=3, scoring='accuracy')
mlp_grid.fit(X_train, y_train)
mlp_best = mlp_grid.best_estimator_
print(f"MLP Best Parameters: {mlp_grid.best_params_}")

# Combine All Best Models
classifiers = {
    "K-Nearest Neighbors": knn_best,
    "Random Forest": rfc_best,
    "Support Vector Machine": svc_best,
    "Logistic Regression": lr_best,
    "Decision Tree": dt_best,
    "Naive Bayes": nb_best,
    "MLP Neural Network": mlp_best
}

KNN Best Parameters: {'n_neighbors': 7, 'weights': 'distance'}
Random Forest Best Parameters: {'criterion': 'gini', 'max_depth': None, 'n_estimators': 100}


In [None]:
# Apply Bootstrapping to Each Model
bootstrapped_classifiers = {}
for name, clf in classifiers.items():
    bootstrapped_accuracies = []
    bootstrapped_classifiers_list = []
    for _ in range(10):
        X_resampled, y_resampled = resample(X_train, y_train, random_state=_)
        clf.fit(X_resampled, y_resampled)
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        bootstrapped_classifiers_list.append(clf)
        bootstrapped_accuracies.append(acc)
    best_index = bootstrapped_accuracies.index(max(bootstrapped_accuracies))
    bootstrapped_classifiers[name] = bootstrapped_classifiers_list[best_index]
    print(f"{name} Bootstrapping Best Accuracy: {max(bootstrapped_accuracies) * 100:.2f}%")


In [None]:
# Voting Classifier Using Bootstrapped Models
voting_clf = VotingClassifier(
    estimators=[(name, clf) for name, clf in bootstrapped_classifiers.items()], voting='hard'
)
voting_clf.fit(X_train, y_train)


In [None]:
# Evaluate Voting Classifier
y_pred_voting = voting_clf.predict(X_test)
voting_acc = accuracy_score(y_test, y_pred_voting)
print(f"\nVoting Classifier Results:")
print(f"Accuracy: {voting_acc * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred_voting))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_voting))

# Save the Final Voting Model
from joblib import dump
dump(voting_clf, "employee_churn_voting_model.joblib")
print("Voting Classifier Model Saved Successfully")