In [6]:
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit

data = pd.read_csv("./datasets./Tech_Use_Stress_Wellness.csv")

In [None]:
print(data)

      user_id  age  gender  daily_screen_time_hours  phone_usage_hours  \
0           1   53    Male                      6.8                2.9   
1           2   66  Female                      4.1                2.1   
2           3   43    Male                      4.7                3.6   
3           4   29  Female                      6.0                4.5   
4           5   57    Male                      6.7                3.4   
...       ...  ...     ...                      ...                ...   
4995     4996   17  Female                      7.0                2.2   
4996     4997   42  Female                      1.9                1.5   
4997     4998   20  Female                      6.3                1.5   
4998     4999   58    Male                      6.2                3.5   
4999     5000   58  Female                      4.8                1.7   

      laptop_usage_hours  tablet_usage_hours  tv_usage_hours  \
0                    2.1                 0.5   

In [48]:
# If stress is >7 classify as stressed
data['stressed'] = (data['stress_level'] > 7).astype(int)

# Drop all the useless and very relevant metrics
X = data.drop(['user_id', 'stress_level', 'mental_health_score', 'mood_rating', 'weekly_anxiety_score', 'weekly_depression_score', 'stressed'], axis=1)
X_encoded = pd.get_dummies(X, drop_first=True)
y = data['stressed']

# Create a 70/30 split for training and validating
rs = ShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_index, valid_index = next(rs.split(X_encoded))

dtrain = xgb.DMatrix(X_encoded.iloc[train_index], label=y.iloc[train_index])
dvalid = xgb.DMatrix(X_encoded.iloc[valid_index], label=y.iloc[valid_index])

# still some magic numbers right now 
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'learning_rate': 0.05,
    'max_depth': 8,
    'lambda': 0.01,
    'alpha': 0.02,
    'tree_method': 'hist'
}

# need to work on these too
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=500*2,
    evals=[(dtrain, 'train'), (dvalid, 'valid')],
    early_stopping_rounds=50,
    verbose_eval=False
)

y_pred_prob = bst.predict(dvalid)
y_pred = (y_pred_prob > 0.5).astype(int)  # Convert to binary labels

results = pd.DataFrame({
    'Actual': y.iloc[valid_index].values,
    'Predicted Probability': y_pred_prob,
    'Predicted Class': y_pred
})

print(bst.feature_names)
print()
print(results.head())
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("Accuracy:", accuracy_score(results['Actual'], results['Predicted Class']))
print("Confusion Matrix:\n", confusion_matrix(results['Actual'], results['Predicted Class']))
print("Classification Report:\n", classification_report(results['Actual'], results['Predicted Class']))


['age', 'daily_screen_time_hours', 'phone_usage_hours', 'laptop_usage_hours', 'tablet_usage_hours', 'tv_usage_hours', 'social_media_hours', 'work_related_hours', 'entertainment_hours', 'gaming_hours', 'sleep_duration_hours', 'sleep_quality', 'physical_activity_hours_per_week', 'uses_wellness_apps', 'eats_healthy', 'caffeine_intake_mg_per_day', 'mindfulness_minutes_per_day', 'gender_Male', 'gender_Other', 'location_type_Suburban', 'location_type_Urban']

   Actual  Predicted Probability  Predicted Class
0       0               0.000400                0
1       0               0.000440                0
2       0               0.004365                0
3       0               0.000385                0
4       0               0.000367                0
Accuracy: 0.9686666666666667
Confusion Matrix:
 [[1008   28]
 [  19  445]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98      1036
           1       0.94      0.96 