In [None]:
import pandas as pd

In [None]:
# NOTE(review): machine-specific absolute path — point this at your own copy of the data.
DATA_PATH = r"C:\Users\khksu\Downloads\Mental Health Screening System\data.csv"
# The file is tab-delimited; the separator is given as the regex r'\t', which is
# why the Python parsing engine is selected explicitly.
df = pd.read_csv(DATA_PATH, sep=r'\t', engine='python')

In [None]:
# (rows, columns) of the raw frame as loaded.
df.shape

In [None]:
# First two rows of the raw frame.
df.head(2)

In [None]:
# Last two rows of the raw frame.
df.tail(2)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Names of the 42 item-response columns: Q1A, Q2A, ..., Q42A.
feature_cols = ["Q%dA" % item_no for item_no in range(1, 43)]
# Independent copy holding only those columns; the raw frame is left untouched.
df_cleaned = df.loc[:, feature_cols].copy()
df_cleaned.head()

In [None]:
# Item columns that make up each subscale (14 per subscale), written as item numbers.
dep_items = [f"Q{n}A" for n in (3, 5, 10, 13, 16, 17, 21, 24, 26, 31, 34, 37, 38, 42)]
anx_items = [f"Q{n}A" for n in (2, 4, 7, 9, 15, 19, 20, 23, 25, 28, 30, 36, 40, 41)]
str_items = [f"Q{n}A" for n in (1, 6, 8, 11, 12, 14, 18, 22, 27, 29, 32, 33, 35, 39)]

In [None]:
# Each subscale score is the row-wise sum of that subscale's item columns.
# NOTE(review): the 14/28 cut-offs in categorize() only make sense for a particular
# response coding of the Q*A columns — TODO confirm the coding of the raw data.
for score_name, item_cols in (
    ("Depression_Score", dep_items),
    ("Anxiety_Score", anx_items),
    ("Stress_Score", str_items),
):
    df_cleaned[score_name] = df_cleaned[item_cols].sum(axis=1)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Spearman rank correlation of every item column with each of the three subscale totals.
target_names = ["Depression_Score", "Anxiety_Score", "Stress_Score"]
scores_df = df_cleaned[target_names]
corr_target = np.array(
    [
        [df_cleaned[item].corr(scores_df[target], method='spearman') for target in target_names]
        for item in feature_cols
    ],
    dtype=float,
).reshape(len(feature_cols), 3)  # rows = items, columns = targets

# Heatmap: one row per item, one column per target, colour scale fixed to [-1, 1].
fig, ax = plt.subplots(figsize=(6.5, 10))
im = ax.imshow(corr_target, aspect='auto', vmin=-1, vmax=1)
ax.set_yticks(range(len(feature_cols)))
ax.set_yticklabels(feature_cols, fontsize=7)
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(target_names, fontsize=9)
ax.set_title("Question vs Target Score (Spearman ρ)", pad=8)
cbar = fig.colorbar(im, ax=ax)
cbar.set_label("ρ")
fig.tight_layout()
plt.show()

In [None]:
def categorize(score, normal_max=14, moderate_max=28):
    """Map a subscale score to a severity code.

    Parameters
    ----------
    score : number
        Subscale total to classify.
    normal_max : number, optional
        Highest score still coded 0 (Normal). Defaults to 14.
    moderate_max : number, optional
        Highest score still coded 1 (Moderate). Defaults to 28.

    Returns
    -------
    int
        0 (Normal) if ``score <= normal_max``, 1 (Moderate) if
        ``score <= moderate_max``, otherwise 2 (Severe).

    Raises
    ------
    ValueError
        If ``score`` is NaN. Without this check a NaN fails both comparisons
        and would silently be labelled 2 (Severe).
    """
    # NaN is the only value that is not equal to itself.
    if score != score:
        raise ValueError("cannot categorize a missing (NaN) score")
    if score <= normal_max:
        return 0    # Normal
    if score <= moderate_max:
        return 1    # Moderate
    return 2        # Severe

# Turn every subscale score into its 0/1/2 severity code via categorize().
for condition_name in ("Depression", "Anxiety", "Stress"):
    df_cleaned[f"{condition_name}_Label"] = df_cleaned[f"{condition_name}_Score"].apply(categorize)

In [None]:
from IPython.display import display, HTML
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, classification_report, precision_recall_fscore_support, ConfusionMatrixDisplay)

In [None]:
# This section is only for model comparison.
def compare_models(label_name):
    """Train three classifiers on one label column and report how they compare.

    The columns in ``feature_cols`` are the predictors and
    ``df_cleaned[label_name]`` is the target. A stratified 80/20 split is made
    with ``random_state=42``. Logistic Regression is fitted on standardised
    features; Random Forest and XGBoost are fitted on the raw values. For each
    model the function prints the accuracy and a classification report and
    draws a confusion matrix. It then shows a summary table, one bar chart per
    metric, and a true-vs-predicted class-count chart for Logistic Regression.

    Parameters
    ----------
    label_name : str
        Name of the label column in ``df_cleaned``, e.g. ``"Depression_Label"``.

    Returns
    -------
    pandas.DataFrame
        One row per model (index ``"Model"``) with the columns ``Accuracy``,
        ``Precision``, ``Recall`` and ``F1``; the last three are macro averages.
    """
    # Human-readable condition name used in every heading and title below.
    condition = label_name.replace('_Label', '')

    X = df_cleaned[feature_cols]
    y = df_cleaned[label_name]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # The scaler is fitted on the training split only and then applied to the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled  = scaler.transform(X_test)

    # Each entry carries the estimator plus the train/test matrices it should see.
    models = {
        "Logistic Regression": {
            "est": LogisticRegression(max_iter=1000, class_weight='balanced', solver='lbfgs'),
            "Xtr": X_train_scaled, "Xte": X_test_scaled
        },
        "Random Forest": {
            "est": RandomForestClassifier(n_estimators=300, class_weight='balanced', random_state=42),
            "Xtr": X_train.values, "Xte": X_test.values
        },
        "XGBoost": {
            "est": XGBClassifier(
                objective='multi:softprob',
                num_class=len(np.unique(y_train)),
                eval_metric='mlogloss',
                random_state=42
            ),
            "Xtr": X_train.values, "Xte": X_test.values
        }
    }

    rows = []
    preds_cache = {}

    print(f"\n=== Model comparison for {condition} ===")
    for name, cfg in models.items():
        model = cfg["est"]
        model.fit(cfg["Xtr"], y_train)
        y_pred = model.predict(cfg["Xte"])
        preds_cache[name] = y_pred

        acc = accuracy_score(y_test, y_pred)
        p, r, f, _ = precision_recall_fscore_support(y_test, y_pred, average="macro", zero_division=0)
        rows.append((name, acc, p, r, f))

        print(f"\n--- {name} ---")
        print("Accuracy:", round(acc, 4))
        # Same zero_division policy as the macro metrics above, so a class that is
        # never predicted is reported as 0 here too.
        print(classification_report(y_test, y_pred, zero_division=0))

        fig, ax = plt.subplots(figsize=(6, 4))
        # Reuse the predictions already computed instead of predicting a second time.
        ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
        ax.set_title(f"{condition} - {name} Confusion Matrix", pad=8)
        fig.tight_layout(); plt.show()

    results_df = pd.DataFrame(rows, columns=["Model","Accuracy","Precision","Recall","F1"]).set_index("Model")
    print("\nSummary (macro averages):")
    display(results_df)

    # One bar chart per metric, with the value printed above each bar.
    for metric in ["Accuracy","Precision","Recall","F1"]:
        fig, ax = plt.subplots(figsize=(6, 4))
        vals = results_df[metric].values
        ax.bar(results_df.index, vals)
        ax.set_ylim(0, 1.1)
        ax.set_ylabel(metric)
        ax.set_title(f"{condition} – {metric} by Model", pad=12)
        for i, v in enumerate(vals):
            ax.text(i, v + 0.01, f"{v:.2f}", ha="center")
        fig.tight_layout(); plt.show()

    # Class-count comparison for Logistic Regression; 0/1/2 are the codes
    # produced by categorize().
    lr_pred = preds_cache["Logistic Regression"]
    classes = [0, 1, 2]
    true_counts = [np.sum(y_test.values == c) for c in classes]
    pred_counts = [np.sum(lr_pred == c) for c in classes]
    x = np.arange(len(classes)); w = 0.35

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.bar(x - w/2, true_counts, w, label="True")
    ax.bar(x + w/2, pred_counts, w, label="Predicted")
    ax.set_xticks(x); ax.set_xticklabels([str(c) for c in classes])
    ax.set_xlabel("Class"); ax.set_ylabel("Count")
    ax.set_title(condition + " – Predicted vs True (Logistic Regression)", pad=8)
    ax.legend()
    fig.tight_layout(); plt.show()

    best_by_acc = results_df["Accuracy"].idxmax()
    print(f"\nBest by Accuracy for {condition}: {best_by_acc}")
    return results_df

In [None]:
# Run the comparison once per condition, in order, keeping each summary table.
cmp_dep, cmp_anx, cmp_str = [
    compare_models(label_col)
    for label_col in ("Depression_Label", "Anxiety_Label", "Stress_Label")
]

In [None]:
import json
import pickle

In [None]:
# This section trains the final models and saves them to disk.
def train_and_save_model(label_name, model_filename, scaler_filename):
    """Fit a Logistic Regression for one label and pickle it with its scaler.

    The predictors are ``df_cleaned[feature_cols]`` and the target is
    ``df_cleaned[label_name]``. The data is split 80/20 (stratified,
    ``random_state=42``) and only the training portion is used to fit the
    scaler and the model; the held-out portion is not used in this function.

    Parameters
    ----------
    label_name : str
        Name of the label column in ``df_cleaned``.
    model_filename : str
        Path the fitted ``LogisticRegression`` is pickled to.
    scaler_filename : str
        Path the fitted ``StandardScaler`` is pickled to.

    Returns
    -------
    None
    """
    X = df_cleaned[feature_cols]
    y = df_cleaned[label_name]

    # Only the training portion is needed here; the test portion is discarded.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    model = LogisticRegression(max_iter=1000, class_weight='balanced', solver='lbfgs')
    model.fit(X_train_scaled, y_train)

    # The scaler is saved next to the model so inputs can be scaled the same way later.
    with open(model_filename, "wb") as f:
        pickle.dump(model, f)
    with open(scaler_filename, "wb") as f:
        pickle.dump(scaler, f)

# One model/scaler pair per condition.
for label_col, model_path, scaler_path in (
    ("Depression_Label", "depression_model.pkl", "depression_scaler.pkl"),
    ("Anxiety_Label",    "anxiety_model.pkl",    "anxiety_scaler.pkl"),
    ("Stress_Label",     "stress_model.pkl",     "stress_scaler.pkl"),
):
    train_and_save_model(label_col, model_path, scaler_path)

In [None]:
# Persist the feature column order that the models were trained on.
model_columns = feature_cols
columns_payload = json.dumps({"columns": model_columns}, indent=4)
with open("model_columns.json", "w", encoding="utf-8") as columns_file:
    columns_file.write(columns_payload)

In [None]:
# === DASS-42 question texts, in item order (index 0 is Q1, index 41 is Q42) ===
# This cell only defines the list; the trailing comment on each entry gives its item number.
dass42_questions = [
    "I found myself getting upset by quite trivial things.",  # Q1
    "I was aware of dryness of my mouth.",  # Q2
    "I couldn't seem to experience any positive feeling at all.",  # Q3
    "I experienced breathing difficulty (eg, excessively rapid breathing, breathlessness in the absence of physical exertion).",  # Q4
    "I just couldn't seem to get going.",  # Q5
    "I tended to over-react to situations.",  # Q6
    "I had a feeling of shakiness (eg, legs going to give way).",  # Q7
    "I found it difficult to relax.",  # Q8
    "I found myself in situations that made me so anxious I was most relieved when they ended.",  # Q9
    "I felt that I had nothing to look forward to.",  # Q10
    "I found myself getting upset rather easily.",  # Q11
    "I felt that I was using a lot of nervous energy.",  # Q12
    "I felt sad and depressed.",  # Q13
    "I found myself getting impatient when I was delayed in any way (eg, elevators, traffic lights, being kept waiting).",  # Q14
    "I had a feeling of faintness.",  # Q15
    "I felt that I had lost interest in just about everything.",  # Q16
    "I felt I wasn't worth much as a person.",  # Q17
    "I felt that I was rather touchy.",  # Q18
    "I perspired noticeably (eg, hands sweaty) in the absence of high temperatures or physical exertion.",  # Q19
    "I felt scared without any good reason.",  # Q20
    "I felt that life wasn't worthwhile.",  # Q21
    "I found it hard to wind down.",  # Q22
    "I had difficulty in swallowing.",  # Q23
    "I couldn't seem to get any enjoyment out of the things I did.",  # Q24
    "I was aware of the action of my heart in the absence of physical exertion (eg, sense of heart rate increase, heart missing a beat).",  # Q25
    "I felt down-hearted and blue.",  # Q26
    "I found that I was very irritable.",  # Q27
    "I felt I was close to panic.",  # Q28
    "I found it hard to calm down after something upset me.",  # Q29
    "I feared that I would be \"thrown\" by some trivial but unfamiliar task.",  # Q30
    "I was unable to become enthusiastic about anything.",  # Q31
    "I found it difficult to tolerate interruptions to what I was doing.",  # Q32
    "I was in a state of nervous tension.",  # Q33
    "I felt I was pretty worthless.",  # Q34
    "I was intolerant of anything that kept me from getting on with what I was doing.",  # Q35
    "I felt terrified.",  # Q36
    "I could see nothing in the future to be hopeful about.",  # Q37
    "I felt that life was meaningless.",  # Q38
    "I found myself getting agitated.",  # Q39
    "I was worried about situations in which I might panic and make a fool of myself.",  # Q40
    "I experienced trembling (eg, in the hands).",  # Q41
    "I found it difficult to work up the initiative to do things."  # Q42
]

In [None]:
# Map each response column name (Q1A ... Q42A) to its question text, following list order.
qmap = {f"Q{number}A": text for number, text in enumerate(dass42_questions, start=1)}
with open("dass42_questions.json", "w", encoding="utf-8") as f:
    json.dump(qmap, f, indent=4)

# The message lists only artifacts that are written by the cells shown in this
# notebook; no SHAP background file is written by them.
print("\nModels, scalers, columns, and questions saved.")

In [None]:
#SHAP Plots
# Reads the SHAP log; the columns used below are "feature", "shap_value",
# "condition", "predicted_class" and "timestamp".
# NOTE(review): this file is not written anywhere in the visible notebook —
# presumably it is produced by the prediction app; confirm it exists before running.
log_path = "shap_logs.csv"
log = pd.read_csv(log_path)


def _mean_abs_shap(rows):
    """Return the mean |SHAP| per feature for `rows`, sorted largest first."""
    if rows.empty:
        # Nothing to aggregate; callers treat an empty result as "skip this plot".
        return pd.Series(dtype=float)
    return (
        rows["shap_value"].abs()
            .groupby(rows["feature"])
            .mean()
            .sort_values(ascending=False)
    )


def _plot_importance(importance, top_n, title):
    """Draw a horizontal bar chart of the `top_n` largest values in `importance`."""
    fig, ax = plt.subplots(figsize=(7, 5))
    # Reverse the slice so the largest value ends up at the top of the chart.
    importance.head(top_n)[::-1].plot(kind="barh", ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Mean |SHAP|")
    fig.tight_layout(); plt.show()


# 1) Global importance (across all predictions & conditions)
global_imp = _mean_abs_shap(log)
if global_imp.empty:
    print(f"{log_path} contains no rows; nothing to plot.")
else:
    _plot_importance(global_imp, 20, "Global SHAP importance (mean |SHAP| across all predictions)")

# 2) Per-condition importance
for cond in ["Depression", "Anxiety", "Stress"]:
    cond_imp = _mean_abs_shap(log.loc[log["condition"] == cond])
    if cond_imp.empty:
        continue
    _plot_importance(cond_imp, 15, f"{cond} – SHAP importance (mean |SHAP|)")

# 3) Per-class importance (Normal/Moderate/Severe)
for c in [0, 1, 2]:
    class_imp = _mean_abs_shap(log.loc[log["predicted_class"] == c])
    if class_imp.empty:
        continue
    _plot_importance(class_imp, 15, f"All conditions – class {c} – SHAP importance (mean |SHAP|)")

# 4) One prediction example (Top-5) – last prediction in the log
if not log.empty:
    # NOTE(review): rows are selected by timestamp only; if several conditions share
    # one timestamp they are mixed in this chart — confirm against the log writer.
    example_ts = log["timestamp"].iloc[-1]
    ex = (log[log["timestamp"] == example_ts]
          .sort_values(by="shap_value", key=lambda s: s.abs(), ascending=False)
          .head(5))

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.barh(range(len(ex)), ex["shap_value"].values)
    ax.set_yticks(range(len(ex)))
    ax.set_yticklabels(ex["feature"].values, fontsize=9)
    # Put the largest |SHAP| at the top, matching the orientation of the charts above.
    ax.invert_yaxis()
    ax.set_title(f"Top-5 SHAP (example at {example_ts})")
    ax.set_xlabel("SHAP value")
    fig.tight_layout(); plt.show()