In [5]:
# Importing Necessary Libraries
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Show every column when a DataFrame is printed
pd.set_option('display.max_columns', None)

# Read the survey data and discard the employee identifier column
df = pd.read_csv("../employee_survey.csv").drop(columns=['EmpID'])

# Categorical columns, grouped by the encoding applied to them below
ordinal_features = ['EduLevel', 'JobLevel']
nominal_features = ['Gender', 'MaritalStatus', 'Dept', 'EmpType', 'CommuteMode']

# Category order per ordinal column, listed in the same order as
# ordinal_features; the encoder assigns codes by position in each list
edu_levels = ['High School', 'Bachelor', 'Master', 'PhD']
job_levels = ['Intern/Fresher', 'Junior', 'Mid', 'Senior', 'Lead']
ordinal_mappings = [edu_levels, job_levels]

# Replace the ordinal columns with their numeric codes
ordinal_encoder = OrdinalEncoder(categories=ordinal_mappings)
encoded_ordinals = ordinal_encoder.fit_transform(df[ordinal_features])
df[ordinal_features] = encoded_ordinals

# Expand the nominal columns into indicator columns (first level dropped)
df = pd.get_dummies(df, columns=nominal_features, drop_first=True)

# Turn every boolean column into 0/1 integers
bool_cols = [name for name, dtype in df.dtypes.items() if dtype == 'bool']
df[bool_cols] = df[bool_cols].astype(int)

# Target vector and feature matrix
y = df['JobSatisfaction']
X = df.drop(columns=['JobSatisfaction'])

# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Lasso feature selection function
def lasso_feature_selection(X_train_fs, y_train_fs, alpha=0.05):
    """Return the features that a Lasso fit leaves with non-zero coefficients.

    Fits ``Lasso(alpha=alpha)`` on the given data, prints a table of every
    feature with its coefficient (sorted by absolute coefficient, largest
    first) and returns the names of the features whose coefficient is not
    zero, in that same order.

    Feature names are read from ``X_train_fs.columns`` so the table always
    matches the matrix that was actually fitted, rather than depending on a
    module-level ``X``. If the input has no ``columns`` attribute (e.g. a
    NumPy array), positional names ``x0``, ``x1``, ... are used.

    Args:
        X_train_fs: Training feature matrix (DataFrame or array-like).
        y_train_fs: Training target values.
        alpha: Regularization strength passed to ``Lasso``.

    Returns:
        list: Names of the features with a non-zero coefficient.
    """
    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train_fs, y_train_fs)
    coefficients = lasso.coef_

    feature_names = getattr(X_train_fs, 'columns', None)
    if feature_names is None:
        feature_names = [f"x{i}" for i in range(len(coefficients))]

    feature_importance = pd.DataFrame({
        'Feature': list(feature_names),
        'Coefficient': coefficients,
    })
    feature_importance['Absolute Coefficient'] = feature_importance['Coefficient'].abs()
    feature_importance = feature_importance.sort_values(by='Absolute Coefficient', ascending=False)

    # A coefficient of exactly zero means Lasso dropped the feature.
    selected = feature_importance['Absolute Coefficient'] > 0
    best_features = feature_importance.loc[selected, 'Feature'].tolist()
    print("Lasso Feature Importance:\n", feature_importance)
    return best_features

# Select features using Lasso
# NOTE(review): the Lasso is fit on X_train before standardization; an L1
# penalty is sensitive to feature scale, so confirm that selecting on raw
# units is intended.
best_features = lasso_feature_selection(X_train, y_train)
if not best_features:
    # Fail here with a clear message instead of deep inside the scaler/model.
    raise ValueError("Lasso selected no features; try a smaller alpha.")

# Create feature-reduced dataset.
# The reduced train/test sets are taken from the existing split by column
# selection, so they contain exactly the same rows as X_train / X_test
# without relying on a second split reproducing the first one.
X_lasso = df[best_features]
X_train_lasso = X_train[best_features]
X_test_lasso = X_test[best_features]
y_train_lasso, y_test_lasso = y_train, y_test

# Standardize features.
# Each feature set gets its own scaler, fitted on training rows only, so
# neither fit overwrites the other.
scaler_original = StandardScaler()
X_train_scaled = scaler_original.fit_transform(X_train)
X_test_scaled = scaler_original.transform(X_test)

scaler_lasso = StandardScaler()
X_train_lasso_scaled = scaler_lasso.fit_transform(X_train_lasso)
X_test_lasso_scaled = scaler_lasso.transform(X_test_lasso)

# Keep the historical name bound to the scaler fitted on the reduced set,
# which is what `scaler` held at this point before.
scaler = scaler_lasso

# Plot and save confusion matrix
def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues, filename="confusion_matrix.png"):
    """Print a confusion matrix and save it as an annotated heat-map image.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        classes: Tick labels for both axes, one per matrix row/column.
        normalize: If True, each row is divided by its sum so cells show the
            fraction of that true class. Rows that sum to zero are left as
            zeros instead of producing NaN.
        title: Title drawn above the plot.
        cmap: Matplotlib colormap used for the cells.
        filename: Path the figure is written to.

    The plot is drawn on a figure created by this function, which is closed
    before returning (also when drawing or saving fails), so any figure the
    caller already has open is left untouched.
    """
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Only divide rows that have samples; the rest stay at 0.0.
        cm = np.divide(
            cm,
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    fig, ax = plt.subplots()
    try:
        image = ax.imshow(cm, interpolation='nearest', cmap=cmap)
        ax.set_title(title)
        fig.colorbar(image, ax=ax)
        tick_marks = np.arange(len(classes))
        ax.set_xticks(tick_marks)
        ax.set_xticklabels(classes, rotation=45)
        ax.set_yticks(tick_marks)
        ax.set_yticklabels(classes)

        fmt = '.2f' if normalize else 'd'
        # Cells darker than half the maximum get white text for contrast.
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            ax.text(j, i, format(cm[i, j], fmt),
                    horizontalalignment="center",
                    color="white" if cm[i, j] > thresh else "black")

        ax.set_ylabel('True label')
        ax.set_xlabel('Predicted label')
        fig.tight_layout()
        fig.savefig(filename)
    finally:
        plt.close(fig)

# Train and evaluate multinomial logistic regression
def tune_and_evaluate_logreg(X_tr, y_tr, X_te, y_te, title):
    """Grid-search a logistic regression, then report test-set performance.

    Runs a 5-fold stratified grid search over ``C`` (accuracy scoring) on the
    training data, prints the best parameters and their cross-validation
    statistics, evaluates the best model on the test data (accuracy and
    classification report) and saves a confusion-matrix image named
    ``confusion_matrix_logreg_{title}.png``.

    Args:
        X_tr: Training feature matrix.
        y_tr: Training labels.
        X_te: Test feature matrix.
        y_te: Test labels.
        title: Label used in printed messages, the plot title and the
            image file name.

    Returns:
        tuple: ``(best_params, best_score)`` - the best parameter dict and
        its mean cross-validated accuracy.
    """
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs'],
        'max_iter': [500]
    }

    print(f"\nTuning Hyperparameters for {title} (Multinomial Logistic Regression)...")

    # Training scores are never read here, so they are not requested.
    grid_search = GridSearchCV(
        LogisticRegression(),
        param_grid,
        cv=StratifiedKFold(5),
        scoring='accuracy'
    )
    grid_search.fit(X_tr, y_tr)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    best_score_std = grid_search.cv_results_['std_test_score'][grid_search.best_index_]

    print(f"\nBest Parameters for {title}: {best_params}")
    print(f"Mean CV Accuracy: {best_score:.4f}")
    print(f"CV Standard Deviation: {best_score_std:.4f}")
    print(f"Cross-Validation Error (1 - Accuracy): {1 - best_score:.4f}")

    # With the default refit=True the search has already refitted the best
    # configuration on all of (X_tr, y_tr); reuse it instead of fitting again.
    model = grid_search.best_estimator_
    y_pr = model.predict(X_te)

    acc = accuracy_score(y_te, y_pr)
    print(f"\nTest Set Accuracy ({title}): {acc:.4f}")
    print(classification_report(y_te, y_pr, zero_division=0))

    # Take the class labels from the fitted model rather than from a
    # module-level `y`, so the function depends only on its arguments.
    plot_confusion_matrix(
        y_te, y_pr,
        classes=model.classes_.tolist(),
        title=f'Confusion Matrix - {title}',
        filename=f'confusion_matrix_logreg_{title}.png'
    )

    return best_params, best_score

# Tune and evaluate on the full feature set, then on the Lasso-selected set
best_params_original, best_score_original = tune_and_evaluate_logreg(
    X_train_scaled, y_train, X_test_scaled, y_test, "Original"
)
best_params_lasso, best_score_lasso = tune_and_evaluate_logreg(
    X_train_lasso_scaled, y_train_lasso, X_test_lasso_scaled, y_test_lasso, "Lasso"
)

# Print one summary section per model, in the same layout for both
print("\nSummary of Results:")
summary_rows = (
    ("Original Model:", best_params_original, best_score_original),
    ("\nLasso-Selected Model:", best_params_lasso, best_score_lasso),
)
for heading, params, cv_accuracy in summary_rows:
    print(heading)
    print("  Best Parameters:", params)
    print("  CV Accuracy:", round(cv_accuracy, 4))
    print("  CV Error:", round(1 - cv_accuracy, 4))

print("\nSelected Features by Lasso:\n", best_features)

# Final summary of both models.
# best_params_* and best_score_* are reused from the evaluation run earlier
# in this script. Calling tune_and_evaluate_logreg again with the identical
# arguments would only repeat both grid searches and overwrite the same
# confusion-matrix image files.
print("\n" + "="*60)
print("Final Model Summary")
print("="*60)

print("Lasso-Selected Features:")
print(best_features)

print("Multinomial Logistic Regression — Original Features:")
print(f"Best Hyperparameters: {best_params_original}")
print(f"CV Accuracy: {best_score_original:.4f}")
print(f"CV Error: {1 - best_score_original:.4f}")

print("Multinomial Logistic Regression — Lasso-Selected Features:")
print(f"Best Hyperparameters: {best_params_lasso}")
print(f"CV Accuracy: {best_score_lasso:.4f}")
print(f"CV Error: {1 - best_score_lasso:.4f}")




Lasso Feature Importance:
                          Feature  Coefficient  Absolute Coefficient
7                         Stress    -0.222587              0.222587
3                            WLB     0.209356              0.209356
6                       Workload    -0.205787              0.205787
4                        WorkEnv     0.201802              0.201802
8                     SleepHours     0.178861              0.178861
14                        haveOT    -0.136476              0.136476
0                            Age    -0.004344              0.004344
11                      TeamSize     0.003115              0.003115
15          TrainingHoursPerYear     0.000223              0.000223
23                       Dept_IT    -0.000000              0.000000
24                    Dept_Legal    -0.000000              0.000000
25                Dept_Marketing    -0.000000              0.000000
26               Dept_Operations     0.000000              0.000000
29             EmpTyp