In [101]:
#importing required libraries
import pandas as pd
import numpy as np

In [102]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mean_squared_error, r2_score

In [103]:
#loading the cleaned dataset
df = pd.read_csv("/content/cleaned_mental_health_data.csv")

In [104]:
df.select_dtypes(include='object').columns

Index(['Gender', 'Country', 'JobRole', 'Department', 'RemoteWork',
       'HasMentalHealthSupport', 'HasTherapyAccess', 'SalaryRange'],
      dtype='object')

In [105]:
ordinal_cols = ['JobRole', 'Department', 'HasMentalHealthSupport', 'HasTherapyAccess']
df[ordinal_cols] = OrdinalEncoder().fit_transform(df[ordinal_cols])

In [106]:
categorical = df.select_dtypes(include='object').columns
nominal_cols = [col for col in categorical if col not in ordinal_cols]

In [107]:
df = pd.get_dummies(df, columns=nominal_cols, drop_first=True)

In [108]:
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.drop('StressLevel')
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [109]:
df['Stress_WorkHours'] = df['StressLevel'] * df['WorkHoursPerWeek']
df['Sleep_per_Stress'] = df['SleepHours'] / (df['StressLevel'] + 1e-5)

In [110]:
X = df.drop('StressLevel', axis=1)
y = df['StressLevel']

In [111]:
mi = mutual_info_regression(X, y)
mi_scores = pd.Series(mi, index=X.columns).sort_values(ascending=False)
print("Top Features by Mutual Info:")
print(mi_scores)

Top Features by Mutual Info:
Stress_WorkHours            0.636768
Sleep_per_Stress            0.469165
Country_India               0.011532
Department                  0.011072
SalaryRange_40K-60K         0.010636
CareerGrowthScore           0.009509
Country_Brazil              0.007968
SleepHours                  0.007603
ManagerSupportScore         0.007469
Country_Germany             0.006821
SalaryRange_<40K            0.005928
YearsAtCompany              0.003372
Country_UK                  0.002615
RemoteWork_Yes              0.001926
ProductivityScore           0.000196
EmployeeID                  0.000000
JobRole                     0.000000
Age                         0.000000
JobSatisfaction             0.000000
BurnoutLevel                0.000000
PhysicalActivityHrs         0.000000
Gender_Male                 0.000000
BurnoutRisk                 0.000000
TeamSize                    0.000000
WorkLifeBalanceScore        0.000000
HasTherapyAccess            0.000000
MentalHea

In [112]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [113]:
# linear regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
linear_preds = linear_model.predict(X_test)
print("Linear Regression")
print("MSE:", mean_squared_error(y_test, linear_preds))
print("R² Score:", r2_score(y_test, linear_preds))

Linear Regression
MSE: 7.025201529395068
R² Score: -0.0608217531473243


In [114]:
# ridge regression
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
ridge_preds = ridge_model.predict(X_test)
print("Ridge Regression")
print("MSE:", mean_squared_error(y_test, ridge_preds))
print("R² Score:", r2_score(y_test, ridge_preds))

Ridge Regression
MSE: 7.024004012958613
R² Score: -0.06064092538313459


In [115]:
# lasso regression
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)
lasso_preds = lasso_model.predict(X_test)
print("Lasso Regression")
print("MSE:", mean_squared_error(y_test, lasso_preds))
print("R² Score:", r2_score(y_test, lasso_preds))

Lasso Regression
MSE: 6.650848452146319
R² Score: -0.004293568718527041
