### Import the necessary libraries and set the working directory

In [32]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import graphviz

# Set working directory.
# The folder defaults to the original project location but can be overridden with
# the DAI5_RESOURCES_DIR environment variable, so the notebook also runs on
# machines where this absolute path does not exist.
os.chdir(os.environ.get('DAI5_RESOURCES_DIR',
                        r'D:\KDG\2024-2025\Semester 1\DAI5\GroupProject\Resources'))

# Additional tokens that should be parsed as missing values when reading the CSV
missing_values = ['n/a', 'na', 'nan', 'N/A', 'NA', 'NaN', 'NAN', '--', 'Missing']
df = pd.read_csv('cleaned_employees.csv', na_values=missing_values, sep=',', decimal='.')

pd.set_option('display.max_columns', None)  # to see all the columns
print(df.head())
# DataFrame.info() prints its summary itself and returns None; wrapping it in
# print() would add a stray "None" line to the output.
df.info()

      EmpID  Gender       Age MaritalStatus        JobLevel  Experience  \
0 -1.727136    Male -0.363508       Married             Mid   -0.294224   
1 -1.721410  Female -0.167125       Married             Mid    0.411979   
2 -1.696216  Female -1.247231        Single  Intern/Fresher   -1.141666   
3 -1.711104  Female -0.658082       Married          Junior   -0.435464   
4 -1.701942   Other -1.247231        Single          Junior   -1.141666   

        Dept    EmpType  PhysicalActivityHours  Workload  Stress  SleepHours  \
0         IT  Full-Time               0.494266         2       1    0.602432   
1    Finance  Full-Time              -0.234700         2       2    0.907144   
2  Marketing  Full-Time               0.077714         5       4   -0.514846   
3         IT   Contract              -0.130562         3       1    0.500862   
4      Sales  Part-Time               1.743922         2       1   -2.139977   

        CommuteMode  CommuteDistance  TeamSize  NumReports  haveOT  

### Convert the object variables to categories

In [33]:
# Nominal (unordered) columns: the categories are the distinct values of each
# column, in order of first appearance.
nominal_columns = ['Gender', 'MaritalStatus', 'Dept', 'EmpType', 'CommuteMode', 'haveOT']
for column in nominal_columns:
    df[column] = df[column].astype(pd.CategoricalDtype(categories=df[column].unique()))

# Ordinal column with an explicit ranking from lowest to highest job level
df['JobLevel'] = df['JobLevel'].astype(
    pd.CategoricalDtype(categories=['Intern/Fresher', 'Junior', 'Mid', 'Senior', 'Lead'], ordered=True))

# Ordinal columns that share the same ordered 1-5 scale
scale_1_to_5 = pd.CategoricalDtype(categories=[1, 2, 3, 4, 5], ordered=True)
for column in ['Workload', 'Stress', 'JobSatisfaction']:
    df[column] = df[column].astype(scale_1_to_5)

# DataFrame.info() prints its summary itself and returns None; wrapping it in
# print() would add a stray "None" line to the output.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3007 entries, 0 to 3006
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   EmpID                  3007 non-null   float64 
 1   Gender                 3007 non-null   category
 2   Age                    3007 non-null   float64 
 3   MaritalStatus          3007 non-null   category
 4   JobLevel               3007 non-null   category
 5   Experience             3007 non-null   float64 
 6   Dept                   3007 non-null   category
 7   EmpType                3007 non-null   category
 8   PhysicalActivityHours  3007 non-null   float64 
 9   Workload               3007 non-null   category
 10  Stress                 3007 non-null   category
 11  SleepHours             3007 non-null   float64 
 12  CommuteMode            3007 non-null   category
 13  CommuteDistance        3007 non-null   float64 
 14  TeamSize               3007 non-null   f

### Select the relevant features and target variable

In [34]:
# Predictors (workload, sleep hours, stress) and the target (job satisfaction)
X = df.loc[:, ['Workload', 'SleepHours', 'Stress']]
y = df.loc[:, 'JobSatisfaction']

### Split the dataset into train, test and validation

random_state is a parameter that controls the randomness of the data splitting process. It ensures that the train-test split or any other randomized operation will yield reproducible results.

  Why it's used: Without random_state, every time you run the code, the dataset will be split differently. By setting random_state to a specific integer (like 42), the split will always be the same each time the code is executed, which is useful for debugging, sharing experiments, or comparing results.
    Why 42?: It's just a common convention in data science (a reference to The Hitchhiker's Guide to the Galaxy), but any fixed number can be used.

In [35]:
# Two-stage split: hold out 20% of the data for testing, then take 25% of the
# remaining 80% for validation -> 60% train / 20% validation / 20% test overall.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=0.25,
    random_state=42,
)

### Apply the Decision Tree model

In [36]:
# Base decision tree classifier; the fixed seed keeps the fitted tree reproducible
dt_clf = DecisionTreeClassifier(
    random_state=42,
)

max_depth: Limits how deep the tree can grow. A deeper tree can fit more complex patterns, but it may also fit noise in the training data, leading to overfitting.

min_samples_split: Determines the minimum number of samples required to split an internal node. A higher value can reduce overfitting by preventing splits that create nodes with very few samples.

min_samples_leaf: Sets the minimum number of samples that must be present in a leaf node. Increasing this number helps smooth the model and reduce overfitting.

In [37]:
# Candidate hyperparameter values for tuning the DecisionTreeClassifier with GridSearchCV
param_grid_dt = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

GridSearchCV: This method performs an exhaustive search over the specified hyperparameter values in param_grid_dt. It trains a new model for every combination of hyperparameters.

 cv=5: This means that 5-fold cross-validation is used, where the training set is divided into 5 subsets. The model is trained on 4 of the subsets and tested on the remaining subset. This process is repeated 5 times, with each subset serving as the test set once. This helps in assessing the model's performance more reliably.

n_jobs=-1: This parameter tells GridSearchCV to use all available processors to perform the computations in parallel, speeding up the tuning process.

verbose=1: This controls the verbosity of the output during the fitting process. A value of 1 will provide basic updates on the progress.


### What Happens Behind the Scenes?

   Cross-Validation: For each combination of hyperparameters, the training data is split into different folds (e.g., 5 folds). The model is trained on a subset of these folds and validated on the remaining fold. This process is repeated for each fold, and the results are averaged.

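   Performance Metrics: After training and validation, the mean cross-validated score is calculated for each hyperparameter set. Because no `scoring` argument is passed and the estimator is a classifier, this score is the accuracy (not MSE, which applies to regression).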

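   Final Model: After identifying the best combination of hyperparameters, GridSearchCV refits a model with those parameters on the entire training set (refit=True by default) and exposes it as best_estimator_. This model is then used for predictions on the validation and test sets.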

In [38]:
# 5-fold cross-validated grid search over param_grid_dt for the DecisionTreeClassifier,
# run in parallel on all available cores
grid_search_dt = GridSearchCV(
    estimator=dt_clf,
    param_grid=param_grid_dt,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_dt.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


### Evaluate on validation set

In [39]:
# --- Validation set ---
y_val_pred_dt = grid_search_dt.predict(X_val)
val_acc_dt = accuracy_score(y_val, y_val_pred_dt)
print(f"Validation Accuracy (Decision Tree): {val_acc_dt}")
# zero_division=0 reports undefined metrics as 0 instead of issuing a warning.
print(
    classification_report(y_val, y_val_pred_dt, zero_division=0),
    confusion_matrix(y_val, y_val_pred_dt),
    sep="\n",
)

# Best Decision Tree model found by the grid search
best_dt = grid_search_dt.best_estimator_

# --- Test set (final model) ---
y_test_pred_dt = best_dt.predict(X_test)
test_acc_dt = accuracy_score(y_test, y_test_pred_dt)
print(f"Test Accuracy (Decision Tree): {test_acc_dt}")
# zero_division=0 reports undefined metrics as 0 instead of issuing a warning.
print(
    classification_report(y_test, y_test_pred_dt, zero_division=0),
    confusion_matrix(y_test, y_test_pred_dt),
    sep="\n",
)

Validation Accuracy (Decision Tree): 0.4152823920265781
              precision    recall  f1-score   support

           1       0.31      0.15      0.20        82
           2       0.00      0.00      0.00        50
           3       0.25      0.09      0.13       120
           4       0.44      0.93      0.60       244
           5       0.00      0.00      0.00       106

    accuracy                           0.42       602
   macro avg       0.20      0.23      0.19       602
weighted avg       0.27      0.42      0.29       602

[[ 12   0  19  51   0]
 [  1   0   2  47   0]
 [ 12   0  11  97   0]
 [ 11   0   6 227   0]
 [  3   0   6  97   0]]
Test Accuracy (Decision Tree): 0.4269102990033223
              precision    recall  f1-score   support

           1       0.26      0.11      0.16        80
           2       0.00      0.00      0.00        60
           3       0.24      0.10      0.14       100
           4       0.45      0.93      0.61       256
           5      

### Visualize the decision tree

In [40]:
# Export the tree to dot format.
# feature_names / class_names are passed as plain lists of strings: some
# scikit-learn releases reject a pandas Index here, and class names make each
# node show the predicted JobSatisfaction level instead of only sample counts.
dot_data = export_graphviz(
    best_dt,
    out_file=None,
    feature_names=list(X.columns),
    class_names=[str(label) for label in best_dt.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)

# Create a graph from the dot data
graph = graphviz.Source(dot_data)
# Render once to "decision_tree.pdf" and open it in the default viewer
# (a separate render() followed by view() would render the file twice).
graph.render("decision_tree", view=True)


'decision_tree.pdf'

### Apply Random Forest Classifier to improve performance

In [42]:
# Base random forest classifier; the fixed seed keeps results reproducible
rf_clf = RandomForestClassifier(random_state=42)

# Candidate hyperparameter values for tuning the RandomForestClassifier
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated grid search over param_grid_rf, run in parallel on all cores
grid_search_rf = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid_rf,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_rf.fit(X_train, y_train)

# --- Validation set ---
y_val_pred_rf = grid_search_rf.predict(X_val)
val_acc_rf = accuracy_score(y_val, y_val_pred_rf)
print(f"Validation Accuracy (Random Forest): {val_acc_rf}")
# zero_division=0 reports undefined metrics as 0 instead of issuing a warning.
print(
    classification_report(y_val, y_val_pred_rf, zero_division=0),
    confusion_matrix(y_val, y_val_pred_rf),
    sep="\n",
)

# Best RandomForest model found by the grid search
best_rf = grid_search_rf.best_estimator_

# --- Test set (final model) ---
y_test_pred_rf = best_rf.predict(X_test)
test_acc_rf = accuracy_score(y_test, y_test_pred_rf)
print(f"Test Accuracy (Random Forest): {test_acc_rf}")
# zero_division=0 reports undefined metrics as 0 instead of issuing a warning.
print(
    classification_report(y_test, y_test_pred_rf, zero_division=0),
    confusion_matrix(y_test, y_test_pred_rf),
    sep="\n",
)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Validation Accuracy (Random Forest): 0.4318936877076412
              precision    recall  f1-score   support

           1       0.54      0.23      0.32        82
           2       0.00      0.00      0.00        50
           3       0.20      0.01      0.02       120
           4       0.43      0.98      0.60       244
           5       0.00      0.00      0.00       106

    accuracy                           0.43       602
   macro avg       0.23      0.24      0.19       602
weighted avg       0.29      0.43      0.29       602

[[ 19   0   0  63   0]
 [  1   0   0  49   0]
 [ 12   0   1 107   0]
 [  2   0   2 240   0]
 [  1   0   2 103   0]]
Test Accuracy (Random Forest): 0.43521594684385384
              precision    recall  f1-score   support

           1       0.41      0.17      0.25        80
           2       0.00      0.00      0.00        60
           3       0.00      0.00      0.00       100
        