In [86]:
# Importing Necessary Libraries
import pandas as pd
import numpy as np
import itertools
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None

In [87]:
# Load data and preprocess
# Forward slashes resolve on Windows as well as POSIX systems; the previous
# escaped-backslash path only worked on Windows.
df = pd.read_csv("../employee_survey.csv")
# EmpID is dropped before modelling; reassign instead of mutating in place.
df = df.drop(columns=['EmpID'])

ordinal_features = ['EduLevel', 'JobLevel']
nominal_features = ['Gender', 'MaritalStatus', 'Dept', 'EmpType', 'CommuteMode']

# The position of each label in these lists becomes its integer code (0, 1, 2, ...).
edu_levels = ['High School', 'Bachelor', 'Master', 'PhD']
job_levels = ['Intern/Fresher', 'Junior', 'Mid', 'Senior', 'Lead']
ordinal_mappings = [edu_levels, job_levels]

ordinal_encoder = OrdinalEncoder(categories=ordinal_mappings)
df[ordinal_features] = ordinal_encoder.fit_transform(df[ordinal_features])

# One-hot encode the nominal columns; drop_first removes one level per feature.
df = pd.get_dummies(df, columns=nominal_features, drop_first=True)

# Cast any boolean columns (e.g. dummies on recent pandas) to integers.
bool_cols = df.select_dtypes(include='bool').columns.tolist()
df[bool_cols] = df[bool_cols].astype(int)

In [88]:
# Separate the target column from the predictors
y = df['JobSatisfaction']
X = df.drop(columns=['JobSatisfaction'])

# Split data: 80/20 hold-out, stratified on the target, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [89]:
def lasso_feature_selection(X_train_fs, y_train_fs, alpha=0.2):
    """Select features whose Lasso coefficient is non-zero.

    Fits ``Lasso(alpha=alpha)`` on the given data, prints a table of all
    features sorted by absolute coefficient (largest first), and returns the
    names of the features with a non-zero coefficient in that same order.

    Parameters
    ----------
    X_train_fs : DataFrame or array-like
        Training features. Column names are taken from this object; if it has
        no ``columns`` attribute, positional indices are used as names.
    y_train_fs : array-like
        Training target.
    alpha : float, default 0.2
        L1 regularisation strength passed to ``Lasso``.

    Returns
    -------
    list
        Names of the features with a non-zero coefficient.

    Notes
    -----
    The L1 penalty acts on coefficients in the units of the inputs, so the
    selection depends on how the features passed in are scaled.
    """
    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train_fs, y_train_fs)
    coefficients = lasso.coef_

    # Label coefficients with the columns of the data that was actually fitted,
    # not with a notebook-global frame that may have different columns.
    if hasattr(X_train_fs, 'columns'):
        feature_names = list(X_train_fs.columns)
    else:
        feature_names = list(range(len(coefficients)))

    feature_importance = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
    feature_importance['Absolute Coefficient'] = feature_importance['Coefficient'].abs()
    feature_importance = feature_importance.sort_values(by='Absolute Coefficient', ascending=False)

    selected = feature_importance['Absolute Coefficient'] > 0
    best_features = feature_importance.loc[selected, 'Feature'].tolist()
    print(feature_importance)
    return best_features

In [90]:
best_features = lasso_feature_selection(X_train, y_train)
X_lasso = df[best_features]

# Reuse the existing train/test split rather than splitting a second time.
# The previous re-split only matched because it repeated the same y, test_size
# and random_state; slicing the existing frames makes the row alignment explicit.
X_train_lasso, X_test_lasso = X_train[best_features], X_test[best_features]
y_train_lasso, y_test_lasso = y_train, y_test

                         Feature  Coefficient  Absolute Coefficient
3                            WLB     0.136219              0.136219
6                       Workload    -0.126330              0.126330
4                        WorkEnv     0.124710              0.124710
7                         Stress    -0.086818              0.086818
8                     SleepHours     0.017716              0.017716
0                            Age    -0.001341              0.001341
27                    Dept_Sales     0.000000              0.000000
22                       Dept_HR     0.000000              0.000000
23                       Dept_IT    -0.000000              0.000000
24                    Dept_Legal    -0.000000              0.000000
25                Dept_Marketing    -0.000000              0.000000
26               Dept_Operations     0.000000              0.000000
30               CommuteMode_Car     0.000000              0.000000
28             EmpType_Full-Time     0.000000   

In [91]:
# Use one scaler per feature set so that fitting on the Lasso subset does not
# overwrite the statistics learned from the full feature set.
scaler_original = StandardScaler()
scaler_lasso = StandardScaler()

# Scale Original Training and Test data (fit on train only, then apply to test)
X_train_scaled = scaler_original.fit_transform(X_train)
X_test_scaled = scaler_original.transform(X_test)

# Scale Lasso Training and Test Data (fit on train only, then apply to test)
X_train_lasso_scaled = scaler_lasso.fit_transform(X_train_lasso)
X_test_lasso_scaled = scaler_lasso.transform(X_test_lasso)

# Backward-compatible alias: `scaler` previously ended this cell fitted on the
# Lasso-selected features, so keep it pointing at that scaler.
scaler = scaler_lasso

In [92]:
# Hyperparameter tuning function
def tune_and_evaluate(X_tr, y_tr, X_te, y_te, title, param_grid=None, cv=5):
    """Grid-search a KNN classifier and print its hold-out performance.

    Runs ``GridSearchCV`` over ``param_grid`` with accuracy scoring, prints the
    best parameters, then evaluates the refitted best estimator on the test
    data and prints the accuracy and the classification report.

    Parameters
    ----------
    X_tr, y_tr : array-like
        Training features and labels used for the cross-validated search.
    X_te, y_te : array-like
        Hold-out features and labels used for the final evaluation.
    title : str
        Label inserted into the printed messages.
    param_grid : dict, optional
        Grid passed to ``GridSearchCV``. Defaults to the KNN grid below.
    cv : int or CV splitter, default 5
        Cross-validation setting passed to ``GridSearchCV``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    if param_grid is None:
        param_grid = {
            'n_neighbors': [3, 5, 24, 30],
            'weights': ['uniform', 'distance'],
            # 'minkowski' is left out: with the default p=2 it is the same
            # distance as 'euclidean' and only duplicated those candidates.
            'metric': ['euclidean', 'manhattan'],
        }

    print(f"\nTuning Hyperparameters for {title}...")
    grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=cv, scoring='accuracy')
    grid_search.fit(X_tr, y_tr)
    print(f"Best Parameters: {grid_search.best_params_}")

    # GridSearchCV refits the best configuration on all of X_tr by default,
    # so the fitted estimator is reused instead of training a second copy.
    model = grid_search.best_estimator_
    y_pr = model.predict(X_te)
    acc = accuracy_score(y_te, y_pr)
    print(f"\nAccuracy ({title}): {acc:.4f}")
    print(classification_report(y_te, y_pr, zero_division=0))

In [93]:
# Evaluate models: full feature set first, then the Lasso-selected subset
evaluation_runs = [
    (X_train_scaled, y_train, X_test_scaled, y_test, "Original"),
    (X_train_lasso_scaled, y_train_lasso, X_test_lasso_scaled, y_test_lasso, "Lasso"),
]
for run_args in evaluation_runs:
    tune_and_evaluate(*run_args)

# Show which features the Lasso step kept
print(best_features)


Tuning Hyperparameters for Original...
Best Parameters: {'metric': 'manhattan', 'n_neighbors': 24, 'weights': 'uniform'}

Accuracy (Original): 0.4529
              precision    recall  f1-score   support

           1       0.41      0.14      0.21        84
           2       0.33      0.02      0.03        55
           3       0.40      0.16      0.23       113
           4       0.46      0.95      0.62       250
           5       0.55      0.06      0.11       103

    accuracy                           0.45       605
   macro avg       0.43      0.27      0.24       605
weighted avg       0.44      0.45      0.35       605


Tuning Hyperparameters for Lasso...
Best Parameters: {'metric': 'euclidean', 'n_neighbors': 30, 'weights': 'uniform'}

Accuracy (Lasso): 0.5174
              precision    recall  f1-score   support

           1       0.52      0.54      0.53        84
           2       0.00      0.00      0.00        55
           3       0.37      0.16      0.22       11