In [46]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [47]:
## Load the workplace mental-health survey CSV into `df`.
## The location is machine-specific, so it can be overridden by setting the
## BURNOUT_SURVEY_CSV environment variable; when that variable is unset the
## original hard-coded path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "BURNOUT_SURVEY_CSV",
    r"C:\Users\kandu\Downloads\mental_health_workplace_survey.csv",
)
df = pd.read_csv(DATA_PATH)

In [48]:
## Convert every text (object-dtype) column of `df` into integer codes.
## A fresh LabelEncoder is fitted per column, and the encoded values
## overwrite the original column in place.
text_columns = df.select_dtypes(include=['object']).columns
for text_col in text_columns:
    encoder = LabelEncoder()
    df[text_col] = encoder.fit_transform(df[text_col])

In [49]:
## Build the feature matrix and target, then create the train/test split.
## 'BurnoutRisk' is the target; it and 'EmployeeID' are excluded from the predictors.
X = df.drop(['BurnoutRisk', 'EmployeeID'], axis=1)
y = df['BurnoutRisk']
feature_names = X.columns
## Fix: split the X / y built above. The previous call passed `features` and
## `target`, which are not defined in this notebook, so a fresh kernel raised
## NameError (or silently used stale variables left over in the kernel).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
## Fit the scaler on the training rows only and reuse it for the test rows,
## so test-set statistics do not influence the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [50]:
## Fit the three baseline classifiers on the complete feature set.
## The decision tree and random forest use the unscaled training matrix;
## k-NN is fitted on the standardised copy.
dt_full = DecisionTreeClassifier(random_state=42)
dt_full.fit(X_train, y_train)

rf_full = RandomForestClassifier(random_state=42)
rf_full.fit(X_train, y_train)

knn_full = KNeighborsClassifier()
knn_full.fit(X_train_scaled, y_train)

In [51]:
## Pick the three features the full random forest considers most important.
## NOTE(review): the recorded output lists 'BurnoutLevel' first and both tree
## models score 1.0 in the summary table -- BurnoutRisk may be derived directly
## from BurnoutLevel (target leakage); confirm against the dataset definition.
importances = rf_full.feature_importances_
importance_by_feature = pd.Series(importances, index=feature_names)
top_ranked = importance_by_feature.nlargest(3)
top_3_features = top_ranked.index.tolist()
print(f"\nTop 3 features: {top_3_features}")


Top 3 features: ['BurnoutLevel', 'ProductivityScore', 'StressLevel']


In [52]:
## Repeat the split / scale / fit pipeline using only the selected columns.
## The split uses the same test_size and random_state as the all-features split.
X_top3 = X[top_3_features]
X_train_top3, X_test_top3, y_train_top3, y_test_top3 = train_test_split(
    X_top3,
    y,
    test_size=0.2,
    random_state=42,
)

## The scaler is fitted on the training rows only, then applied to both partitions.
scaler_top3 = StandardScaler().fit(X_train_top3)
X_train_top3_scaled = scaler_top3.transform(X_train_top3)
X_test_top3_scaled = scaler_top3.transform(X_test_top3)

## Tree-based models take the unscaled columns; k-NN takes the scaled ones.
dt_top3 = DecisionTreeClassifier(random_state=42)
dt_top3.fit(X_train_top3, y_train_top3)

rf_top3 = RandomForestClassifier(random_state=42)
rf_top3.fit(X_train_top3, y_train_top3)

knn_top3 = KNeighborsClassifier()
knn_top3.fit(X_train_top3_scaled, y_train_top3)

In [53]:
## Compare test accuracy of each model before and after feature selection.
## Each entry pairs a display name with the (model, test inputs) used for the
## all-features run and for the top-3 run; k-NN is scored on the scaled inputs.
evaluations = [
    ('Decision Tree', dt_full, X_test, dt_top3, X_test_top3),
    ('Random Forest', rf_full, X_test, rf_top3, X_test_top3),
    ('k-NN', knn_full, X_test_scaled, knn_top3, X_test_top3_scaled),
]

summary = pd.DataFrame([
    {
        'Model': label,
        'Accuracy (All Features)': accuracy_score(y_test, full_model.predict(full_inputs)),
        'Accuracy (Top 3 Features)': accuracy_score(y_test_top3, top3_model.predict(top3_inputs)),
    }
    for label, full_model, full_inputs, top3_model, top3_inputs in evaluations
])

print("Before vs. After Feature Selection")
print(summary.to_string(index=False))

Before vs. After Feature Selection
        Model  Accuracy (All Features)  Accuracy (Top 3 Features)
Decision Tree                 1.000000                   1.000000
Random Forest                 1.000000                   1.000000
         k-NN                 0.798333                   0.986667
