# StressSense — Early Stress Risk Screener
_End-to-End ML Notebook (EDA → Preprocessing → 2+ Models → Evaluation → Explainability → Demo)_

**Author:** Your Name  
**Cohort:** Week 3 — Classification with 2+ algorithms  
**Note:** This tool is educational and not a medical diagnosis.


## 1) Project Overview
**Goal:** Predict an individual's **stress level** (Low/Moderate/High) using lifestyle, study/work, and wellbeing indicators.

**You will deliver:**
- Clean EDA & preprocessing
- Train at least **two classifiers** (Logistic Regression + RandomForest; optional GradientBoosting)
- Model evaluation & comparison table
- Explainability (global feature importance; optional SHAP)
- Save artifacts & run a tiny **Gradio** demo


In [None]:
# 2) Setup
!python --version
import sys, os, json, warnings, math
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.inspection import permutation_importance
import joblib

# One seed for every randomised step, so reruns give the same results.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Locations (relative paths resolve against the current working directory)
DATA_PATH = '../data/sample_stress_data.csv'  # <-- Replace with your dataset path if needed
MODEL_DIR = '../models'
ARTIFACT_DIR = '../artifacts'

# Create both output folders up front; existing folders are left untouched.
for out_dir in (MODEL_DIR, ARTIFACT_DIR):
    os.makedirs(out_dir, exist_ok=True)

print('Using data at:', DATA_PATH)

## 3) Load Data

In [None]:
# Read the CSV at DATA_PATH into a DataFrame and report its (rows, columns) shape.
df = pd.read_csv(DATA_PATH)
print('Shape:', df.shape)
# Trailing expression: the notebook renders the first rows as the cell output.
df.head()

## 4) Quick Data Health Check

In [None]:
# Per-column summary statistics (all dtypes), transposed so each row is a column of df.
summary = df.describe(include='all').T
display(summary)

print('\nMissing values per column:')
missing_counts = df.isna().sum()
print(missing_counts)

# Remove fully identical rows and report how many were dropped.
before = len(df)
df = df.drop_duplicates()
after = len(df)
print(f"Dropped duplicates: {before - after}")

## 5) Define Target and Features

In [None]:
# Resolve the label column: the first well-known name present in the frame
# wins; when none is present, the last column is used instead.
possible_targets = ['stress_level', 'Stress Level', 'stress', 'label', 'target']
target_col = next((name for name in possible_targets if name in df.columns), None)
if target_col is None:
    target_col = df.columns[-1]
print('Target column assumed as:', target_col)

# Separate label from predictors; the label is stored as a pandas categorical.
y = df[target_col].astype('category')
X = df.drop(columns=[target_col])
print('Classes:', list(y.cat.categories))

# Columns with these dtypes are treated as numeric; every other column is
# treated as categorical.
numeric_dtypes = ['int64', 'float64']
num_cols = X.select_dtypes(include=numeric_dtypes).columns.tolist()
cat_cols = X.select_dtypes(exclude=numeric_dtypes).columns.tolist()
print('Numeric:', num_cols)
print('Categorical:', cat_cols)

## 6) Train/Test Split

In [None]:
# Hold out 20% of the rows for evaluation. `stratify=y` keeps the class
# proportions the same in both partitions, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)
# Trailing expression: the notebook displays both shapes as the cell output.
X_train.shape, X_test.shape

## 7) Preprocessing Pipeline

In [None]:
# Imported here so this cell is self-contained.
from sklearn.impute import SimpleImputer

# Numeric columns: fill missing values with the column median, then standardise.
# Without the imputer, a single NaN in a numeric column makes estimators such as
# LogisticRegression fail at fit time.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: one-hot encode into a dense array. Categories not seen
# during fit are encoded as all zeros instead of raising.
categorical_transformer = Pipeline(steps=[
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Route each column group to its transformer. Columns listed in neither group
# are dropped (ColumnTransformer's default `remainder`).
preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ]
)

## 8) Train Multiple Models (2+ classifiers)

In [None]:
# Imported once here rather than on every loop iteration.
from sklearn.preprocessing import label_binarize

# Candidate classifiers. `multi_class` is deprecated in recent scikit-learn
# releases and the default already handles multi-class targets, so it is no
# longer passed; `n_jobs` is left at its default.
models = {
    'LogReg': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE),
    'GradientBoosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
}

fitted = {}          # model name -> fitted Pipeline
metrics_table = []   # one [name, acc, prec, rec, f1, auc] row per model

for name, clf in models.items():
    pipe = Pipeline(steps=[('preprocess', preprocess), ('clf', clf)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    y_prob = None
    if hasattr(pipe.named_steps['clf'], 'predict_proba'):
        y_prob = pipe.predict_proba(X_test)

    acc = accuracy_score(y_test, y_pred)
    # Macro-average for multi-class
    prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
    rec = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    # ROC-AUC is best-effort: it stays NaN when probabilities are unavailable
    # or the score cannot be computed, but the reason is reported.
    auc = np.nan
    if y_prob is not None:
        try:
            # predict_proba columns follow the fitted estimator's class order.
            classes = pipe.classes_
            if len(classes) == 2:
                # Binary target: score the probability of the second class.
                # (label_binarize would return a single column here, which
                # does not match the two-column probability matrix.)
                auc = roc_auc_score(np.asarray(y_test) == classes[1], y_prob[:, 1])
            else:
                # Multi-class: one-vs-rest indicator matrix, macro-averaged.
                y_bin = label_binarize(y_test, classes=classes)
                auc = roc_auc_score(y_bin, y_prob, average='macro')
        except (ValueError, TypeError, IndexError) as err:
            print(f"ROC-AUC unavailable for {name}: {err}")

    metrics_table.append([name, acc, prec, rec, f1, auc])
    fitted[name] = pipe
    auc_text = 'NA' if math.isnan(auc) else f"{auc:.3f}"
    print(f"""\n=== {name} ===
Accuracy: {acc:.3f} | Precision(macro): {prec:.3f} | Recall(macro): {rec:.3f} | F1(macro): {f1:.3f} | ROC-AUC(macro): {auc_text}
""")

# Summary table, best macro-F1 first
mt = pd.DataFrame(metrics_table, columns=['Model','Accuracy','Precision(macro)','Recall(macro)','F1(macro)','ROC-AUC(macro)']).sort_values('F1(macro)', ascending=False)
mt.reset_index(drop=True, inplace=True)
mt

## 9) Pick Best Model

In [None]:
# `mt` is sorted by F1(macro) in descending order, so row 0 is the top scorer.
best_row = mt.iloc[0]
best_model_name = best_row['Model']
# Look up the fitted pipeline stored under that model name.
best_model = fitted[best_model_name]
print('Best model:', best_model_name)
# Trailing expression: the notebook renders the pipeline as the cell output.
best_model

## 10) Confusion Matrix (Best Model)

In [None]:
# Fix the label order once so matrix rows/columns and tick labels agree.
class_labels = list(y.cat.categories)

y_pred_best = best_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_best, labels=class_labels)

# Draw the matrix on a small dedicated figure, with rotated x tick labels.
fig, ax = plt.subplots(figsize=(4.5,4))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(ax=ax, xticks_rotation=45)
plt.title(f'Confusion Matrix — {best_model_name}')
plt.show()

## 11) Global Feature Importance

In [None]:
# Permutation importance (model-agnostic), computed on the full test set.
# The whole cell is best-effort: any failure is reported instead of raised.
try:
    # The fitted pipeline is scored on the raw X_test frame, so the result
    # holds one importance value per ORIGINAL input column, in X_test's column
    # order. It does not hold one value per one-hot encoded output column.
    result = permutation_importance(best_model, X_test, y_test, n_repeats=5, random_state=RANDOM_STATE, n_jobs=-1)
    importances = result.importances_mean
    feature_names = list(X_test.columns)

    # Keep the 15 most important columns, highest first.
    fi = pd.DataFrame({'feature': feature_names, 'importance': importances}).sort_values('importance', ascending=False).head(15)
    fi.reset_index(drop=True, inplace=True)
    display(fi)

    # Plot (reversed so the most important feature is drawn at the top)
    fig, ax = plt.subplots(figsize=(6,4))
    ax.barh(fi['feature'][::-1], fi['importance'][::-1])
    ax.set_xlabel('Permutation Importance')
    ax.set_title('Top Features')
    plt.tight_layout()
    plt.show()
except Exception as e:
    print('Permutation importance failed:', e)

## 12) Save Artifacts

In [None]:
# Save model pipeline
model_path = f"{MODEL_DIR}/stresssense_best_pipeline.pkl"
joblib.dump(best_model, model_path)
print('Saved:', model_path)

# Save feature types & UI metadata for the demo app
ui_meta = {
    'numeric_cols': num_cols,
    'categorical_cols': {},   # column name -> sorted list of option strings
    'ranges': {}              # column name -> [5th percentile, 95th percentile]
}
for c in num_cols:
    # num_cols only ever holds numeric columns, so no dtype check is needed.
    lo, hi = (float(v) for v in np.nanpercentile(df[c], [5, 95]))
    if math.isnan(lo) or math.isnan(hi):
        # A column with no usable values yields NaN, which json.dump would
        # write as the non-standard token `NaN`; fall back to a unit range.
        lo, hi = 0.0, 1.0
    ui_meta['ranges'][c] = [lo, hi]
for c in cat_cols:
    # Stringify the distinct non-missing values; the set removes duplicates
    # that only appear after conversion to str.
    ui_meta['categorical_cols'][c] = sorted({str(v) for v in df[c].dropna().unique()})

meta_path = f"{ARTIFACT_DIR}/feature_types.json"
with open(meta_path, 'w', encoding='utf-8') as f:
    json.dump(ui_meta, f, indent=2)
print('Saved:', meta_path)

## 13) Single Prediction Helper + Natural Language Summary

In [None]:
def predict_one(input_dict, model=best_model):
    """Predict the class for a single record.

    Args:
        input_dict: Mapping of original column names to values for one row.
        model: Fitted pipeline with a step named ``'clf'``; defaults to the
            ``best_model`` that exists when this function is defined.

    Returns:
        A ``(prediction, probabilities)`` tuple. ``probabilities`` maps each
        class label to its predicted probability, or is ``None`` when the
        classifier has no ``predict_proba``.
    """
    row = pd.DataFrame([input_dict])
    label = model.predict(row)[0]

    # Guard clause: classifiers without probability support return None.
    if not hasattr(model.named_steps['clf'], 'predict_proba'):
        return label, None

    class_probs = model.predict_proba(row)[0]
    return label, dict(zip(model.classes_, class_probs))

def summarize_explanation(prob_dict):
    """Turn class probabilities into a one-sentence, human-readable summary.

    Args:
        prob_dict: Mapping of class label to probability, or ``None``.

    Returns:
        A Markdown-formatted message naming the most probable class and its
        confidence as a percentage, followed by general wellbeing advice.
        When ``prob_dict`` is ``None`` or empty, a fallback message is
        returned instead.
    """
    # An empty mapping has no top class to report, so treat it like None
    # rather than failing on the lookup of the best entry.
    if not prob_dict:
        return 'Model does not provide class probabilities.'
    # max() keeps the first entry among ties, matching a stable descending sort.
    top_label, top_prob = max(prob_dict.items(), key=lambda item: item[1])
    return f"Predicted **{top_label}** with confidence {top_prob*100:.1f}%. Consider improving sleep hygiene, balancing work/screen time, and regular physical activity."

# Example using the first row of the test set (iloc[0]; not a random draw)
example = X_test.iloc[0].to_dict()
pred, prob = predict_one(example, best_model)
print('Example input:', example)
print('Prediction:', pred)
print('Probs:', prob)
# Print the natural-language summary built from the class probabilities.
print(summarize_explanation(prob))

## 14) (Optional) SHAP Explainability (Local Only)
Uncomment the cell below and install `shap` locally to view per-instance explanations. In some hosted notebooks, SHAP may be slow.

In [None]:
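# %pip install shap -q
# (Line magic on purpose: a `%%bash` cell magic would make the whole cell a shell script and break the Python below.)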

# import shap
# explainer = None
# try:
#     # TreeExplainer works well for tree models like RandomForest/GB
#     if isinstance(best_model.named_steps['clf'], (RandomForestClassifier, GradientBoostingClassifier)):
#         # Create a reduced background to speed up
#         X_trans = best_model.named_steps['preprocess'].transform(X_train.sample(min(200, len(X_train)), random_state=RANDOM_STATE))
#         explainer = shap.TreeExplainer(best_model.named_steps['clf'])
#         shap_values = explainer.shap_values(X_trans)
#         shap.summary_plot(shap_values, X_trans, show=False)
#         plt.title('SHAP Summary (subset)')
#         plt.show()
#     else:
#         print('SHAP demo: best model is not a tree-based model; skipping.')
# except Exception as e:
#     print('SHAP failed/slow:', e)

## 15) Next Steps
- Export PDF: *File → Print → Save as PDF*.
- Commit notebook + artifacts to GitHub.
- Launch the Gradio app (below) locally to demo the model.
