In [5]:
# Importing required libraries.
import pandas as pd
import pickle
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Read the cleaned, already-encoded dataset from disk.
df_encoded = pd.read_csv('../data/processed/mental_health_cleaned.csv')

# Separate the target from the predictors.
# NOTE(review): 'treatment_Yes' is assumed to be the target (whether the respondent
# sought mental health treatment) — confirm against the preprocessing notebook.
y = df_encoded['treatment_Yes']
X = df_encoded.drop(columns=['treatment_Yes'])

# Hold out 20% of the rows for testing; the split is seeded and stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardize features for magnitude-sensitive models (e.g. Logistic Regression).
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the three classifiers with a fixed seed.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Fit each model in turn: Logistic Regression on the standardized features,
# the two tree ensembles on the raw features.
for banner, estimator, train_features in (
    ("Training Logistic Regression...", log_reg, X_train_scaled),
    ("Training Random Forest...", rf_clf, X_train),
    ("Training XGBoost...", xgb_clf, X_train),
):
    print(banner)
    estimator.fit(train_features, y_train)

# Defining a function to evaluate models.
def evaluate_model(model, X_test, y_test, scaled=False, feature_scaler=None):
    """Print hold-out metrics for a fitted binary classifier.

    Prints the confusion matrix, the classification report, the ROC AUC score
    (computed from the predicted probability of the second class) and a
    separator line.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test
        Unscaled test features. These are always the features that get
        evaluated; when ``scaled`` is true they are transformed first.
    y_test
        True labels aligned with ``X_test``.
    scaled : bool, default False
        Set to True for models that were trained on standardized features.
    feature_scaler : optional
        Fitted transformer exposing ``transform``. Only used when ``scaled``
        is true. When omitted, the notebook-level ``scaler`` (fitted on the
        training split) is used.

    Returns
    -------
    None
    """
    if scaled:
        # Transform the features that were actually passed in rather than
        # reading a precomputed global array, so the function evaluates the
        # data the caller asked for.
        transformer = feature_scaler if feature_scaler is not None else scaler
        features = transformer.transform(X_test)
    else:
        features = X_test

    preds = model.predict(features)
    # Column 1 of predict_proba is the probability of the second class.
    probas = model.predict_proba(features)[:, 1]

    print(confusion_matrix(y_test, preds))
    print(classification_report(y_test, preds))
    print(f"ROC AUC Score: {roc_auc_score(y_test, probas):.4f}")
    print("-" * 60)

# Report hold-out metrics for every trained model, in training order.
# Only Logistic Regression was fitted on standardized features, so it is the
# only model evaluated with scaled=True.
for heading, fitted_model, uses_scaled_features in (
    ("Evaluation: Logistic Regression", log_reg, True),
    ("Evaluation: Random Forest", rf_clf, False),
    ("Evaluation: XGBoost", xgb_clf, False),
):
    print(heading)
    evaluate_model(fitted_model, X_test, y_test, scaled=uses_scaled_features)

# Saving models using pickle.
# The fitted scaler is saved alongside the models: Logistic Regression was
# trained on standardized features, so its pickle cannot be applied to new
# data without the same scaler.
artifacts = {
    '../models/logistic_regression_model.pkl': log_reg,
    '../models/random_forest_model.pkl': rf_clf,
    '../models/xgboost_model.pkl': xgb_clf,
    '../models/scaler.pkl': scaler,
}

for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)


Training Logistic Regression...
Training Random Forest...
Training XGBoost...
Evaluation: Logistic Regression
[[96 28]
 [33 94]]
              precision    recall  f1-score   support

       False       0.74      0.77      0.76       124
        True       0.77      0.74      0.76       127

    accuracy                           0.76       251
   macro avg       0.76      0.76      0.76       251
weighted avg       0.76      0.76      0.76       251

ROC AUC Score: 0.8003
------------------------------------------------------------
Evaluation: Random Forest
[[94 30]
 [32 95]]
              precision    recall  f1-score   support

       False       0.75      0.76      0.75       124
        True       0.76      0.75      0.75       127

    accuracy                           0.75       251
   macro avg       0.75      0.75      0.75       251
weighted avg       0.75      0.75      0.75       251

ROC AUC Score: 0.8216
------------------------------------------------------------
Evalua

In [9]:
# Rebuild the XGBoost classifier from scratch and fit it on the unscaled
# training features; fit() returns the estimator, so the two steps are chained.
xgb_clf = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
).fit(X_train, y_train)

# Print the hold-out metrics for the freshly trained model.
print("Evaluation: XGBoost")
evaluate_model(xgb_clf, X_test, y_test)


Evaluation: XGBoost
[[96 28]
 [38 89]]
              precision    recall  f1-score   support

       False       0.72      0.77      0.74       124
        True       0.76      0.70      0.73       127

    accuracy                           0.74       251
   macro avg       0.74      0.74      0.74       251
weighted avg       0.74      0.74      0.74       251

ROC AUC Score: 0.8063
------------------------------------------------------------
