In [1]:
# Importing Necessary Libraries
import pandas as pd
from sklearn.linear_model import Lasso, LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, r2_score

# Lift pandas' column-display limit so wide DataFrames are shown without truncated columns.
pd.set_option('display.max_columns', None)

In [2]:
# Load data and preprocess
# A forward slash is used as the separator because it resolves on Windows and
# POSIX systems alike; a backslash separator only works on Windows.
df = pd.read_csv("../employee_survey.csv")
# Drop the identifier column so it is not used as a model feature.
df = df.drop(columns=['EmpID'])

ordinal_features = ['EduLevel', 'JobLevel']
nominal_features = ['Gender', 'MaritalStatus', 'Dept', 'EmpType', 'CommuteMode']

# Category orderings passed to OrdinalEncoder: position in each list becomes the encoded value.
edu_levels = ['High School', 'Bachelor', 'Master', 'PhD']
job_levels = ['Intern/Fresher', 'Junior', 'Mid', 'Senior', 'Lead']
ordinal_mappings = [edu_levels, job_levels]

ordinal_encoder = OrdinalEncoder(categories=ordinal_mappings)
df[ordinal_features] = ordinal_encoder.fit_transform(df[ordinal_features])

# One-hot encode the nominal columns; drop_first removes one level per feature.
df = pd.get_dummies(df, columns=nominal_features, drop_first=True)

# Cast every boolean column (dummy indicators and any boolean columns already in the data) to 0/1 integers.
bool_cols = [col for col in df.columns if df[col].dtype == 'bool']
df[bool_cols] = df[bool_cols].astype(int)

In [3]:
# Target is the satisfaction column; every remaining column is a feature.
y = df['JobSatisfaction']
X = df.drop(columns=['JobSatisfaction'])

# Split data: hold out 20% of the rows for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
def lasso_feature_selection(X_train_fs, y_train_fs):
    """Rank features by Lasso coefficient and build splits restricted to the kept features.

    The regularisation strength is chosen by 5-fold ``LassoCV`` on the supplied
    training data, then a ``Lasso`` with that alpha is fit on the same data.
    Features whose coefficient is non-zero are kept.

    Parameters
    ----------
    X_train_fs : training feature matrix used for alpha selection and the Lasso fit.
    y_train_fs : training target aligned with ``X_train_fs``.

    Returns
    -------
    tuple
        ``(feature_importance, X_train_lasso, X_test_lasso, y_train_lasso, y_test_lasso)``
        where ``feature_importance`` is a DataFrame with columns ``Feature``,
        ``Coefficient`` and ``Absolute Coefficient`` sorted by descending
        absolute coefficient, and the remaining items are a train/test split of
        the selected columns.

    Notes
    -----
    The returned splits are produced by re-splitting the notebook-level ``df``
    and ``y`` with ``test_size=0.2, random_state=42``. The main split in this
    notebook uses the same arguments.
    """
    # Tune alpha on the data passed in (not on notebook globals), so the
    # selection reflects the caller's training set.
    lasso_cv = LassoCV(cv=5, random_state=42)
    lasso_cv.fit(X_train_fs, y_train_fs)

    # Get the best alpha
    best_alpha = lasso_cv.alpha_

    lasso = Lasso(alpha=best_alpha)
    lasso.fit(X_train_fs, y_train_fs)

    # Prefer the column names of the fitted data; fall back to the global
    # feature frame when the input carries no column labels (e.g. an ndarray).
    feature_names = getattr(X_train_fs, 'columns', X.columns)

    feature_importance = pd.DataFrame({'Feature': feature_names, 'Coefficient': lasso.coef_})
    feature_importance['Absolute Coefficient'] = feature_importance['Coefficient'].abs()
    feature_importance = feature_importance.sort_values(by='Absolute Coefficient', ascending=False)

    # Lasso drives uninformative coefficients to exactly zero; keep the rest.
    kept = feature_importance['Absolute Coefficient'] > 0
    best_features = feature_importance.loc[kept, 'Feature'].tolist()

    X_lasso = df[best_features]
    X_train_lasso, X_test_lasso, y_train_lasso, y_test_lasso = train_test_split(
        X_lasso, y, test_size=0.2, random_state=42
    )
    return feature_importance, X_train_lasso, X_test_lasso, y_train_lasso, y_test_lasso

# Run the Lasso-based selection on the training split and unpack the coefficient table plus the reduced train/test sets.
feature_importance, X_train_lasso, X_test_lasso, y_train_lasso, y_test_lasso = lasso_feature_selection(X_train, y_train)

In [5]:
scaler = StandardScaler()

# Scale Original Training and Test data:
# statistics are learned from the training rows only and then applied to both splits.
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Scale Lasso Training and Test Data:
# the same scaler object is refit here, so after this cell `scaler` holds the
# statistics of the Lasso-selected training features.
X_train_lasso_scaled = scaler.fit(X_train_lasso).transform(X_train_lasso)
X_test_lasso_scaled = scaler.transform(X_test_lasso)

In [6]:
# Hyperparameter tuning function
def tune_and_evaluate(X_tr, y_tr, X_te, y_te, title):
    """Grid-search an RBF-kernel SVC and print its test-set metrics.

    Parameters
    ----------
    X_tr, y_tr : training features and labels used for the 5-fold grid search.
    X_te, y_te : held-out features and labels used for the final evaluation.
    title : label included in the printed messages to identify the run.

    Prints the best hyperparameters, test accuracy, R2 between the predicted
    and true labels (treated as numbers), and a per-class classification
    report. Returns nothing.
    """
    param_grid = {
        'C': [0.001, 0.1, 1, 10, 100],
        'gamma': ['scale', 100, 10, 1, 0.1, 0.01, 0.001],
        'kernel': ['rbf']
    }
    print(f"\nTuning Hyperparameters for {title}...")
    grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_tr, y_tr)
    print(f"Best Parameters: {grid_search.best_params_}")

    # GridSearchCV refits the winning configuration on the full training data
    # by default (refit=True), so that estimator is reused instead of fitting
    # an identical SVC a second time.
    model = grid_search.best_estimator_
    y_pr = model.predict(X_te)

    acc = accuracy_score(y_te, y_pr)
    r2 = r2_score(y_te, y_pr)
    print(f"\nAccuracy ({title}): {acc:.4f}")
    print(f"\nR2 ({title}): {r2:.4f}")
    # Classes that are never predicted have undefined precision; report them
    # as 0 explicitly rather than emitting an undefined-metric warning.
    print(classification_report(y_te, y_pr, zero_division=0))

In [7]:
# Evaluate models: first on the full scaled feature set, then on the Lasso-selected subset.
evaluation_runs = [
    (X_train_scaled, y_train, X_test_scaled, y_test, "Original"),
    (X_train_lasso_scaled, y_train_lasso, X_test_lasso_scaled, y_test_lasso, "Lasso"),
]
for run_args in evaluation_runs:
    tune_and_evaluate(*run_args)

# Show the coefficient table produced by the Lasso feature selection.
print(feature_importance)


Tuning Hyperparameters for Original...
Best Parameters: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}

Accuracy (Original): 0.5223

R2 (Original): 0.0836
              precision    recall  f1-score   support

           1       0.51      0.51      0.51        94
           2       0.00      0.00      0.00        51
           3       0.28      0.27      0.28       102
           4       0.58      0.94      0.72       254
           5       0.00      0.00      0.00       104

    accuracy                           0.52       605
   macro avg       0.27      0.35      0.30       605
weighted avg       0.37      0.52      0.43       605


Tuning Hyperparameters for Lasso...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Parameters: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}

Accuracy (Lasso): 0.5636

R2 (Lasso): 0.1112
              precision    recall  f1-score   support

           1       0.53      0.68      0.60        94
           2       0.00      0.00      0.00        51
           3       0.39      0.22      0.28       102
           4       0.59      0.96      0.73       254
           5       0.80      0.12      0.20       104

    accuracy                           0.56       605
   macro avg       0.46      0.39      0.36       605
weighted avg       0.53      0.56      0.48       605

                         Feature  Coefficient  Absolute Coefficient
14                        haveOT    -0.348948              0.348948
7                         Stress    -0.248787              0.248787
4                        WorkEnv     0.232809              0.232809
3                            WLB     0.220164              0.220164
6                       Workload    -0.219188              0.21918

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
