# Using KNN Model to Predict Employee Performance

Importing libraries

In [None]:
import numpy as np
import optuna
import pandas as pd
from scipy.stats import randint
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
pd.set_option("display.max_columns", None)

Read Data

In [None]:
# Load the processed HR dataset; the first CSV column is used as the row index.
data_path = '../data/processed/HRDataset_p_v4.csv'
df = pd.read_csv(data_path, index_col=0)

# Preview the first rows to sanity-check the load.
df.head()

In [3]:

# Target column the classifier is trained to predict.
label = 'PerfScoreID'

# Columns routed through the numeric branch of the preprocessor
# (mean imputation followed by standard scaling).
numeric_features = [
    'Salary', 'EmpSatisfaction', 'SpecialProjectsCount', 'DaysLateLast30',
    'Absences', 'Age', 'NumberOfColleagues',
]

# Columns routed through the categorical branch of the preprocessor
# (most-frequent imputation followed by one-hot encoding). Note that the
# hire year and hire month are treated as categories, not as numbers.
categorical_features = [
    'GenderID', 'FromDiversityJobFairID', 'State', 'CitizenDesc',
    'HispanicLatino', 'RaceDesc', 'Department', 'ManagerName',
    'RecruitmentSource', 'HireYear', 'HireMonth',
]

In [None]:
# Display the frame's dimensions as a (rows, columns) tuple.
df.shape

Split the Data

In [5]:

# Separate the feature columns from the target column.
X = df.drop(columns=[label])
y = df[label]

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# stratify=y keeps the class proportions of the target the same in the train
# and test splits, so rare performance scores are not accidentally missing
# from (or over-represented in) the small test set.
# NOTE: stratification requires every class to have at least two rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Encode and Scale

In [6]:

# Preprocessing shared by every model in this notebook.
#   * numeric columns:     mean imputation -> standard scaling
#   * categorical columns: most-frequent imputation -> one-hot encoding
# Columns of X that appear in neither feature list are dropped
# (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            # Safeguard only: the notebook author notes there are no missing numerical values.
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # handle_unknown='ignore' encodes categories that were not seen during
            # fit as all-zero rows instead of raising. Without it, a level that
            # only occurs in a test row or a CV validation fold breaks
            # predict/scoring.
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

In [None]:
# Baseline model: shared preprocessing followed by a 5-nearest-neighbours classifier.
baseline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=5)),
]
knn_model = Pipeline(steps=baseline_steps)

# Train on the training split, then predict the held-out test rows.
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)

# Accuracy = fraction of test rows whose predicted label equals the true label.
accuracy = (y_pred == y_test).mean()

# Report overall accuracy and the per-class precision/recall/F1 table.
print(f"KNN Accuracy: {accuracy}")
print("KNN Classification Report:")
print(classification_report(y_test, y_pred))


# Model optimization

Using Random Search

In [None]:

# Hyperparameter distributions sampled by RandomizedSearchCV.
# scipy's randint excludes its upper bound, so randint(1, 21) draws k from
# 1..20 inclusive: the same range the grid-search and Optuna cells explore.
param_dist = {
    'classifier__n_neighbors': randint(1, 21),
    'classifier__weights': ['uniform', 'distance'],
    'classifier__metric': ['euclidean', 'manhattan']
}

# Sample 10 configurations and score each with 5-fold CV on the training split.
random_search = RandomizedSearchCV(knn_model, param_dist, n_iter=10, cv=5, n_jobs=-1, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

# Best configuration and its mean cross-validated accuracy.
print("Best Parameters:", random_search.best_params_)
print("Best Accuracy:", random_search.best_score_)

# best_estimator_ has already been refit on the whole training split
# (refit=True is the default), so no extra fit call is needed here.
best_knn_model = random_search.best_estimator_

# Evaluate the tuned model once on the held-out test split.
y_pred = best_knn_model.predict(X_test)

accuracy_random = accuracy_score(y_test, y_pred)
print(f"KNN Test Accuracy: {accuracy_random:.2f}")
print("KNN Classification Report:")
print(classification_report(y_test, y_pred))

Using Grid Search

In [None]:

# Exhaustive grid: k = 1..20, both neighbour weightings, two distance metrics.
param_grid = dict(
    classifier__n_neighbors=np.arange(1, 21),
    classifier__weights=['uniform', 'distance'],
    classifier__metric=['euclidean', 'manhattan'],
)

# Score every combination with 5-fold CV accuracy on the training split.
grid_search = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Winning combination and its mean cross-validated accuracy.
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# The search object exposes the pipeline fitted with the winning combination.
best_knn_model = grid_search.best_estimator_

# Score that pipeline on the held-out test split.
y_pred = best_knn_model.predict(X_test)
accuracy_grid = accuracy_score(y_test, y_pred)

print(f"KNN Test Accuracy: {accuracy_grid:.2f}")
print("KNN Classification Report:")
print(classification_report(y_test, y_pred))

Using Optuna

In [None]:
def objective(trial):
    """Score one Optuna trial by cross-validated accuracy on the training split.

    The trial proposes ``n_neighbors`` (1..20), ``weights`` and ``metric`` for a
    KNN classifier placed behind the shared ``preprocessor``.

    The test split is deliberately NOT used here: selecting hyperparameters on
    the test rows would leak them into model selection and make the final test
    accuracy optimistically biased.

    Args:
        trial: The Optuna trial object used to suggest hyperparameter values.

    Returns:
        Mean accuracy over 5 cross-validation folds of ``X_train``/``y_train``.
    """
    # Suggest hyperparameters
    n_neighbors = trial.suggest_int('n_neighbors', 1, 20)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan'])

    # Candidate pipeline for this trial (cross_val_score clones it per fold,
    # so the shared preprocessor object is not fitted by the search itself).
    candidate_model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, metric=metric))
    ])

    # Same protocol as the random/grid search cells: 5-fold CV accuracy on the training data.
    cv_scores = cross_val_score(candidate_model, X_train, y_train, cv=5, scoring='accuracy')

    return float(cv_scores.mean())

# Create a study and optimize the objective function.
# The sampler is seeded so a fresh "Restart & Run All" reproduces the same trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Best configuration and its mean cross-validated accuracy
print("Best Parameters:", study.best_params)
print("Best Accuracy:", study.best_value)

# Optuna does not refit anything, so rebuild the pipeline with the best parameters
best_params = study.best_params
best_knn_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(
        n_neighbors=best_params['n_neighbors'],
        weights=best_params['weights'],
        metric=best_params['metric']
    ))
])

# Fit on the full training split and evaluate once on the untouched test split
best_knn_model.fit(X_train, y_train)
y_pred = best_knn_model.predict(X_test)


accuracy_optuna = accuracy_score(y_test, y_pred)
print(f"KNN Test Accuracy: {accuracy_optuna:.2f}")
# Log classification report
print("KNN Classification Report:")
print(classification_report(y_test, y_pred))