In [None]:
# Importing Necessary Libraries
import pandas as pd
from sklearn.linear_model import Lasso, LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Remove the column cap so every column is shown when a DataFrame is displayed
pd.set_option('display.max_columns', None)

In [2]:
# Load data and preprocess
# A forward-slash relative path resolves on Windows, Linux and macOS alike
# (the previous "..\\" form only worked on Windows).
df = pd.read_csv("../employee_survey.csv")
# The employee identifier is dropped before modelling
df = df.drop(columns=['EmpID'])

ordinal_features = ['EduLevel', 'JobLevel']
nominal_features = ['Gender', 'MaritalStatus', 'Dept', 'EmpType', 'CommuteMode']

# Category order defines the integer code assigned by the OrdinalEncoder
edu_levels = ['High School', 'Bachelor', 'Master', 'PhD']
job_levels = ['Intern/Fresher', 'Junior', 'Mid', 'Senior', 'Lead']
ordinal_mappings = [edu_levels, job_levels]

ordinal_encoder = OrdinalEncoder(categories=ordinal_mappings)
df[ordinal_features] = ordinal_encoder.fit_transform(df[ordinal_features])

# One-hot encode nominal features; drop_first removes one level per feature
df = pd.get_dummies(df, columns=nominal_features, drop_first=True)

# Convert every boolean column (e.g. the dummies on recent pandas) to 0/1 integers
bool_cols = df.select_dtypes(include='bool').columns.tolist()
df[bool_cols] = df[bool_cols].astype(int)

In [3]:
# Separate the target column from the predictors
y = df['JobSatisfaction']
X = df.drop(columns=['JobSatisfaction'])

# Split data: hold out 20% for testing, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
def lasso_feature_selection(X_train_fs, y_train_fs):
    """Select the features that receive a non-zero Lasso coefficient.

    The regularisation strength (alpha) is chosen with 5-fold ``LassoCV``;
    a ``Lasso`` model with that alpha is then fitted and its coefficients are
    ranked by absolute value. Both fits use only the data passed in, never
    notebook-level globals.

    Parameters
    ----------
    X_train_fs : pandas.DataFrame
        Training features. The column names are used as feature labels.
    y_train_fs : array-like
        Training target aligned with ``X_train_fs``.

    Returns
    -------
    list
        Names of the features whose coefficient is non-zero, ordered by
        decreasing absolute coefficient. The full ranking table is printed
        as a side effect.
    """
    # Cross-validate alpha on the data handed to this function
    lasso_cv = LassoCV(cv=5, random_state=42)
    lasso_cv.fit(X_train_fs, y_train_fs)

    # Get the best alpha
    best_alpha = lasso_cv.alpha_

    lasso = Lasso(alpha=best_alpha)
    lasso.fit(X_train_fs, y_train_fs)

    # Label coefficients with the columns of the fitted frame so names and
    # coefficients always line up
    feature_importance = pd.DataFrame(
        {'Feature': X_train_fs.columns, 'Coefficient': lasso.coef_}
    )
    feature_importance['Absolute Coefficient'] = feature_importance['Coefficient'].abs()
    feature_importance = feature_importance.sort_values(by='Absolute Coefficient', ascending=False)

    # Lasso drives irrelevant coefficients to exactly zero; keep the rest
    non_zero = feature_importance['Absolute Coefficient'] > 0
    best_features = feature_importance.loc[non_zero, 'Feature'].tolist()
    print(feature_importance)
    return best_features

In [5]:
# Select features using the training split only
best_features = lasso_feature_selection(X_train, y_train)
X_lasso = df[best_features]

# Reuse the existing train/test partition instead of re-splitting, so the
# Lasso subsets cover exactly the same rows as X_train / X_test by construction
# rather than by repeating the same seed.
X_train_lasso = X_train[best_features]
X_test_lasso = X_test[best_features]
# Copies keep the Lasso targets independent objects, as a fresh split would
y_train_lasso = y_train.copy()
y_test_lasso = y_test.copy()

                         Feature  Coefficient  Absolute Coefficient
3                            WLB     0.160960              0.160960
6                       Workload    -0.153053              0.153053
4                        WorkEnv     0.150285              0.150285
7                         Stress    -0.131996              0.131996
8                     SleepHours     0.071275              0.071275
0                            Age    -0.001900              0.001900
1                       JobLevel     0.000000              0.000000
2                     Experience    -0.000000              0.000000
5          PhysicalActivityHours    -0.000000              0.000000
9                CommuteDistance     0.000000              0.000000
10                  NumCompanies    -0.000000              0.000000
11                      TeamSize     0.000000              0.000000
12                    NumReports    -0.000000              0.000000
13                      EduLevel     0.000000   

In [6]:
# One scaler per feature set: refitting a single shared scaler would discard
# the statistics learned on the original features.
scaler_original = StandardScaler()
scaler_lasso = StandardScaler()

# Scale Original Training and Test data (fit on train only, then apply to test)
X_train_scaled = scaler_original.fit_transform(X_train)
X_test_scaled = scaler_original.transform(X_test)

# Scale Lasso Training and Test Data (fit on train only, then apply to test)
X_train_lasso_scaled = scaler_lasso.fit_transform(X_train_lasso)
X_test_lasso_scaled = scaler_lasso.transform(X_test_lasso)

# Keep the legacy name bound to the Lasso-fitted scaler, which is the state
# `scaler` ended up in when a single instance was fitted twice.
scaler = scaler_lasso

In [7]:
# Hyperparameter tuning function
def tune_and_evaluate(X_tr, y_tr, X_te, y_te, title):
    """Grid-search an RBF-kernel SVC and report its performance on held-out data.

    Parameters
    ----------
    X_tr, y_tr : array-like
        Training features and labels used for the cross-validated grid search.
    X_te, y_te : array-like
        Held-out features and labels used only for the final evaluation.
    title : str
        Label inserted into the printed progress and result messages.

    Prints the best hyper-parameters, the test accuracy and the full
    classification report. Returns nothing.
    """
    param_grid = {
        'C': [0.001, 0.1, 1, 10, 100],
        'gamma': ['scale', 100, 10, 1, 0.1, 0.01, 0.001],
        'kernel': ['rbf']
    }
    print(f"\nTuning Hyperparameters for {title}...")
    grid_search = GridSearchCV(SVC(), param_grid, cv=StratifiedKFold(5), scoring='accuracy')
    grid_search.fit(X_tr, y_tr)
    print(f"Best Parameters: {grid_search.best_params_}")

    # GridSearchCV refits the best configuration on all of X_tr by default
    # (refit=True), so the tuned model is already available; training a second
    # identical SVC would only repeat that work.
    model = grid_search.best_estimator_
    y_pr = model.predict(X_te)
    acc = accuracy_score(y_te, y_pr)
    print(f"\nAccuracy ({title}): {acc:.4f}")
    # zero_division=0 reports 0 (without warnings) for classes never predicted
    print(classification_report(y_te, y_pr, zero_division=0))

In [8]:
# Evaluate models
# Run the same tuning/evaluation routine on the full and the Lasso-reduced feature sets
evaluation_runs = [
    (X_train_scaled, y_train, X_test_scaled, y_test, "Original"),
    (X_train_lasso_scaled, y_train_lasso, X_test_lasso_scaled, y_test_lasso, "Lasso"),
]
for run_args in evaluation_runs:
    tune_and_evaluate(*run_args)

# Show which features the Lasso step kept
print(best_features)


Tuning Hyperparameters for Original...
Best Parameters: {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}

Accuracy (Original): 0.5107
              precision    recall  f1-score   support

           1       0.52      0.68      0.59        84
           2       0.00      0.00      0.00        55
           3       0.27      0.15      0.19       113
           4       0.55      0.94      0.69       250
           5       0.00      0.00      0.00       103

    accuracy                           0.51       605
   macro avg       0.27      0.35      0.29       605
weighted avg       0.35      0.51      0.40       605


Tuning Hyperparameters for Lasso...
Best Parameters: {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}

Accuracy (Lasso): 0.5455
              precision    recall  f1-score   support

           1       0.50      0.73      0.60        84
           2       0.00      0.00      0.00        55
           3       0.35      0.07      0.12       113
           4       0.55      0.94      0.