In [None]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

In [None]:
# === Load Dataset ===
# Read the raw CSV into `df`; later cells work on copies and leave it untouched.
df = pd.read_csv(filepath_or_buffer="data/mental_health_lite.csv")

In [None]:
# === Encode Categorical Columns ===
# Every object-dtype column is replaced by integer codes in a copy of `df`.
# The fitted encoder for each column is kept in `label_encoders` (keyed by
# column name) so the original string labels can be looked up afterwards.
categorical_cols = list(df.select_dtypes(include=["object"]).columns)
df_encoded = df.copy()
label_encoders = {column: LabelEncoder() for column in categorical_cols}

for column, encoder in label_encoders.items():
    df_encoded[column] = encoder.fit_transform(df_encoded[column])

In [None]:
# === Split Data ===
# Separate features from the target, then hold out 20% of the rows for testing.
target_column = "mental_health_risk"
X = df_encoded.drop(columns=[target_column])
y = df_encoded[target_column]
# Original string labels of the target, in the order of their integer codes.
y_class_names = label_encoders[target_column].classes_

# Stratify on the target so train and test keep the same class proportions;
# an unstratified split can leave a rare class under-represented (or absent)
# in the test set, which distorts the macro-F1 used for model selection.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
except ValueError as split_error:
    # scikit-learn refuses to stratify when a class is too small for the
    # requested split; fall back to the plain random split in that case.
    print(f"Stratified split not possible ({split_error}); using an unstratified split.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )


In [None]:
# === Train Models ===
# Candidate classifiers, keyed by the name that appears in the reports.
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42),
}

# For every candidate: fit on the training split, predict the test split and
# record the fitted pipeline together with its accuracy and macro-averaged F1.
results = {}
for model_name, estimator in models.items():
    fitted_pipeline = Pipeline(steps=[("model", estimator)])
    fitted_pipeline.fit(X_train, y_train)
    test_predictions = fitted_pipeline.predict(X_test)

    results[model_name] = {
        "model": fitted_pipeline,
        "accuracy": accuracy_score(y_test, test_predictions),
        "f1_score": f1_score(y_test, test_predictions, average="macro"),
        "y_pred": test_predictions,
    }


In [None]:
# === Select Best Model ===
# Pick the entry with the highest macro-F1; on a tie the model that was
# trained first wins, because max() returns the first maximal item.
best_model_name, best_result = max(
    results.items(),
    key=lambda entry: entry[1]["f1_score"],
)

In [None]:
# === Save Best Model ===
# Persist the winning fitted pipeline. The output directory is created first
# so the notebook also runs on a fresh checkout where "model/" does not exist
# yet (joblib.dump does not create missing parent directories).
model_dir = Path("model")
model_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(best_result["model"], model_dir / "final_model.pkl")

In [None]:
# === Save Metrics ===
# Write the selected model's name and test-set scores to a small text report.
# The directory is created up front because open(..., "w") fails with
# FileNotFoundError when "Results/" is missing.
results_dir = Path("Results")
results_dir.mkdir(parents=True, exist_ok=True)

with open(results_dir / "metrics.txt", "w") as f:
    f.write(f"Model: {best_model_name}\n")
    f.write(f"Accuracy: {best_result['accuracy']:.4f}\n")
    f.write(f"F1 Score: {best_result['f1_score']:.4f}\n")

In [None]:
# === Save Confusion Matrix ===
# Plot the confusion matrix of the selected model on the test split and save
# it as a PNG next to the metrics report.
results_dir = Path("Results")
results_dir.mkdir(parents=True, exist_ok=True)  # savefig does not create missing directories

# The target was label-encoded, so its classes are the integers 0..n-1 in the
# same order as y_class_names. Passing them explicitly keeps the matrix n x n
# even when a class appears in neither y_test nor the predictions; otherwise
# the matrix would shrink and no longer match display_labels.
class_indices = np.arange(len(y_class_names))
cm = confusion_matrix(y_test, best_result["y_pred"], labels=class_indices)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=y_class_names)

fig, ax = plt.subplots()
disp.plot(ax=ax)
ax.set_title(f"Confusion Matrix - {best_model_name}")
fig.savefig(results_dir / "confusion_matrix.png", dpi=120)
plt.close(fig)  # release the figure once it has been written to disk

print(f"Training complete. Best model: {best_model_name}")