# Mental Health in Tech Survey Analysis
Analysis of factors influencing mental health treatment seeking in tech.

## Setup
Import required libraries and load data

In [9]:
# Core libraries
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

# ML libraries
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance

# MLflow imports
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Metrics
from sklearn.metrics import (
    make_scorer, accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report, 
    precision_recall_curve, roc_curve, auc
)

# Load data
df = pd.read_csv("../data/raw/survey.csv")
print(f"Dataset shape: {df.shape}")

# MLflow setup
mlflow.set_tracking_uri("http://localhost:5000")  # Adjust if using different URI
mlflow.set_experiment("mental-health-tech-prediction")

Dataset shape: (1259, 27)


<Experiment: artifact_location='mlflow-artifacts:/364658969276453932', creation_time=1762713856646, experiment_id='364658969276453932', last_update_time=1762713856646, lifecycle_stage='active', name='mental-health-tech-prediction', tags={'mlflow.experimentKind': 'custom_model_development'}>

## Section 2: Initial Feature Engineering
1. Clean and prepare features
2. Handle gender categories
3. Split features and target

In [10]:
# Define target and initial feature selection
target_col = "treatment"
features_to_drop = ['Timestamp', 'Country', 'state', 'comments', target_col]

# Prepare target (binary: Yes/No -> 1/0)
y = df[target_col].map({"Yes": 1, "No": 0}).astype(int)
X = df.drop(columns=features_to_drop)

# Helper function for gender cleaning
def clean_gender(gen):
    s = str(gen).strip().lower()
    s = re.sub(r"[\W_]+", " ", s).strip()
    
    if s in {"m", "male", "man", "make", "mal", "malr", "msle", "masc", "mail", "boy"}:
        return "Male"
    if s in {"f", "female", "woman", "femake", "femail", "femme", "girl"}:
        return "Female"
    return "Other"

# Apply gender cleaning
X['Gender'] = X['Gender'].apply(clean_gender)

# Separate features by type
numeric_features = X.select_dtypes(include=['number', 'bool']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
other_categoricals = [c for c in categorical_features if c != 'Gender']

print("Feature types:")
print(f"Numeric: {len(numeric_features)}")
print(f"Categorical: {len(categorical_features)}")

Feature types:
Numeric: 1
Categorical: 21


## Section 3: Preprocessing Pipeline Setup
Define preprocessing steps for different feature types

In [11]:
# Pipeline components
numeric_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine into preprocessing pipeline
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features)
], remainder='drop')

# Model definitions
models = {
    'LogisticRegression': (
        LogisticRegression(solver='liblinear', max_iter=5000),
        {
            'C': np.logspace(-3, 2, 20),
            'penalty': ['l1', 'l2']
        }
    ),
    'RandomForest': (
        RandomForestClassifier(random_state=42),
        {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 5, 10, 20],
            'min_samples_split': [2, 5],
            'class_weight': [None, 'balanced']
        }
    ),
    'XGBoost': (
        XGBClassifier(random_state=42),
        {
            'n_estimators': [200, 300, 400],
            'max_depth': [3, 4, 5, 6],
            'learning_rate': [0.01, 0.05, 0.1],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0]
        }
    )
}

## Section 4: Model Training & Initial Evaluation
Train models with cross-validation and hyperparameter tuning

In [12]:
def train_evaluate_model(X, y, model_name, model_tuple, cv=5, n_iter=20):
    """Train a model. If param grid is empty, fit pipeline and log CV metrics;
    otherwise run RandomizedSearchCV. Returns fitted estimator or search object."""
    model, param_grid = model_tuple
    pipeline = Pipeline([
        ('preprocess', preprocessor),
        ('model', model)
    ])

    # prefix params for pipeline if provided
    param_grid_prefixed = {f'model__{k}': v for k, v in (param_grid or {}).items()}

    scoring = {
        'accuracy': 'accuracy',
        'precision': make_scorer(precision_score, zero_division=0),
        'recall': make_scorer(recall_score, zero_division=0),
        'f1': make_scorer(f1_score, zero_division=0),
        'roc_auc': 'roc_auc'
    }

    run_name = f"cv_search_{model_name}"
    with mlflow.start_run(run_name=run_name):
        mlflow.log_param('dataset_shape', X.shape)
        mlflow.log_param('cv_folds', cv)

        # If no hyperparameter grid provided, do a single fit + CV evaluation
        if not param_grid_prefixed:
            # Fit pipeline on full data
            pipeline.fit(X, y)
            # Evaluate with cross-val (cv) for metrics to log
            cv_obj = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
            metrics = {}
            for metric_name, scorer in [('roc_auc','roc_auc'), ('accuracy','accuracy')]:
                try:
                    scores = []
                    if metric_name == 'roc_auc':
                        # use cross_val_score with scoring string
                        scores = pd.Series(np.mean(pd.np.zeros(1))) if False else None
                    # fallback to cross_val_score for available scoring strings
                    from sklearn.model_selection import cross_val_score
                    sc = cross_val_score(pipeline, X, y, cv=cv_obj, scoring=scorer, n_jobs=-1)
                    metrics[f'cv_mean_{metric_name}'] = float(np.mean(sc))
                except Exception:
                    # skip metric if CV scoring failed
                    metrics[f'cv_mean_{metric_name}'] = np.nan

            # Log metrics and model
            mlflow.log_metrics(metrics)
            mlflow.sklearn.log_model(pipeline, f'model_{model_name}')
            return pipeline

        # Otherwise perform randomized search
        search = RandomizedSearchCV(
            pipeline, param_grid_prefixed, n_iter=n_iter, cv=cv,
            scoring=scoring, refit='roc_auc',
            n_jobs=-1, random_state=42, verbose=1
        )
        search.fit(X, y)

        # Log best parameters and metrics (guarded)
        try:
            mlflow.log_params({f'best_{k}': v for k, v in search.best_params_.items()})
        except Exception:
            pass

        try:
            bi = search.best_index_
            mlflow.log_metric('best_roc_auc', float(search.cv_results_['mean_test_roc_auc'][bi]))
            mlflow.log_metric('best_f1', float(search.cv_results_['mean_test_f1'][bi]))
            mlflow.log_metric('best_precision', float(search.cv_results_['mean_test_precision'][bi]))
            mlflow.log_metric('best_recall', float(search.cv_results_['mean_test_recall'][bi]))
        except Exception:
            # if cv_results_ keys are missing, skip logging
            pass

        # Log best model
        try:
            mlflow.sklearn.log_model(search.best_estimator_, f'model_{model_name}')
        except Exception:
            pass

        return search

# Train and evaluate all models
results = []
trained_models = {}

for name, model_tuple in models.items():
    print(f"\nTraining {name}...")
    search = train_evaluate_model(X, y, name, model_tuple)
    trained_models[name] = search
    
    results.append({
        'model': name,
        'roc_auc': search.cv_results_['mean_test_roc_auc'][search.best_index_],
        'f1': search.cv_results_['mean_test_f1'][search.best_index_],
        'precision': search.cv_results_['mean_test_precision'][search.best_index_],
        'recall': search.cv_results_['mean_test_recall'][search.best_index_]
    })

# Show results
results_df = pd.DataFrame(results).sort_values('roc_auc', ascending=False)
print("\nModel Comparison:")
print(results_df)


Training LogisticRegression...
Fitting 5 folds for each of 20 candidates, totalling 100 fits




üèÉ View run cv_search_LogisticRegression at: http://localhost:5000/#/experiments/364658969276453932/runs/2bcc9aeee5ba432fb003274289784a4b
üß™ View experiment at: http://localhost:5000/#/experiments/364658969276453932

Training RandomForest...
Fitting 5 folds for each of 20 candidates, totalling 100 fits




üèÉ View run cv_search_RandomForest at: http://localhost:5000/#/experiments/364658969276453932/runs/ce827944c0aa48feb052ce9705c6cbe7
üß™ View experiment at: http://localhost:5000/#/experiments/364658969276453932

Training XGBoost...
Fitting 5 folds for each of 20 candidates, totalling 100 fits




üèÉ View run cv_search_XGBoost at: http://localhost:5000/#/experiments/364658969276453932/runs/8019da6dbe06453abc2e78eb276e8953
üß™ View experiment at: http://localhost:5000/#/experiments/364658969276453932

Model Comparison:
                model   roc_auc        f1  precision    recall
2             XGBoost  0.821815  0.752776   0.749365  0.758268
1        RandomForest  0.815084  0.760413   0.757337  0.766129
0  LogisticRegression  0.811529  0.755670   0.748625  0.764505


## Section 5: Feature Selection & Final Model
Identify and remove noisy features, then retrain best model

In [13]:
def identify_noisy_features(X, y, threshold=0.01):
    """Identify low-importance features using permutation importance"""
    try:
        # Get best model from initial evaluation
        best_model_name = results_df.iloc[0]['model']
        best_pipeline = trained_models[best_model_name].best_estimator_
        print(f"Using {best_model_name} for feature importance analysis")

        # Split data
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Get preprocessor and model
        preprocess = best_pipeline.named_steps['preprocess']
        model = best_pipeline.named_steps['model']

        # Fit preprocessor and transform
        preprocess.fit(X_train)
        X_train_processed = preprocess.transform(X_train)
        X_val_processed = preprocess.transform(X_val)

        # Fit model on processed data
        model.fit(X_train_processed, y_train)

        # Compute permutation importance on processed validation data
        result = permutation_importance(
            model, X_val_processed, y_val, n_repeats=10, random_state=42, n_jobs=-1
        )

        # Attempt to get feature names from the preprocessor
        try:
            feature_names = preprocess.get_feature_names_out()
        except Exception:
            feature_names = [f'feature_{i}' for i in range(X_val_processed.shape[1])]

        # Map importance back to original columns by matching substrings
        original_importance = {}
        for col in X.columns:
            col_imp = []
            for idx, feat in enumerate(feature_names):
                if col.lower() in feat.lower() or col.lower().replace(' ', '_') in feat.lower():
                    col_imp.append(result.importances_mean[idx])
            original_importance[col] = float(max(col_imp)) if col_imp else 0.0

        importances = pd.Series(original_importance)

        # Save plot artifact
        import os
        os.makedirs('mlruns', exist_ok=True)
        plt.figure(figsize=(12, 8))
        importances.sort_values().plot(kind='barh')
        plt.title('Feature Importance by Original Column')
        plt.xlabel('Mean Importance (Permutation)')
        plt.tight_layout()
        plt.savefig('mlruns/feature_importance.png')
        plt.close()

        noisy_cols = importances[importances < threshold].index.tolist()
        return noisy_cols, importances

    except Exception as e:
        print(f"Error in feature importance calculation: {str(e)}")
        import traceback
        traceback.print_exc()
        return [], None

# Run final model training with MLflow tracking (use best params, do NOT re-run RandomizedSearchCV)
with mlflow.start_run(run_name='final_model') as run:
    print('Starting feature selection process...')
    noisy_features, importances = identify_noisy_features(X, y, threshold=0.001)

    if noisy_features:
        print(f"\nRemoving {len(noisy_features)} noisy features...")
        X_clean = X.drop(columns=noisy_features)
        mlflow.log_param('removed_features', str(noisy_features))
    else:
        print('\nNo features to remove.')
        X_clean = X.copy()
        mlflow.log_param('removed_features', 'none')

    mlflow.log_param('final_feature_count', X_clean.shape[1])

    if importances is not None:
        mlflow.log_artifact('mlruns/feature_importance.png')

    # Rebuild preprocessor for cleaned data
    numeric_features = X_clean.select_dtypes(include=['number', 'bool']).columns.tolist()
    categorical_features = X_clean.select_dtypes(include=['object', 'category']).columns.tolist()
    preprocessor = ColumnTransformer([
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features)
    ], remainder='drop')

    # Build final estimator using best hyperparameters from earlier search (do not run CV again)
    best_model_name = results_df.iloc[0]['model']
    best_search = trained_models[best_model_name]

    # Extract model-level params (keys like 'model__param') and construct a fresh estimator
    best_params = {}
    if hasattr(best_search, 'best_params_') and best_search.best_params_:
        # remove 'model__' prefix
        best_params = {k.replace('model__', ''): v for k, v in best_search.best_params_.items()}

    base_estimator = models[best_model_name][0]  # an instance from models dict
    EstimatorClass = base_estimator.__class__
    try:
        final_estimator = EstimatorClass(**best_params) if best_params else EstimatorClass()
    except Exception as e:
        print('Warning: could not instantiate estimator with best_params, falling back to default. Error:', e)
        final_estimator = EstimatorClass()

    final_pipeline = Pipeline([('preprocess', preprocessor), ('model', final_estimator)])
    # Fit final pipeline on cleaned full data
    final_pipeline.fit(X_clean, y)
    # Log final pipeline as model artifact
    try:
        mlflow.sklearn.log_model(final_pipeline, f'final_model_{best_model_name}')
    except Exception as e:
        print('Could not log final model to MLflow:', e)

    # Final evaluation on holdout split
    X_tr, X_te, y_tr, y_te = train_test_split(X_clean, y, test_size=0.2, stratify=y, random_state=42)
    y_pred = final_pipeline.predict(X_te)
    # try predict_proba, fallback to decision_function scaled to [0,1]
    if hasattr(final_pipeline, 'predict_proba'):
        y_prob = final_pipeline.predict_proba(X_te)[:, 1]
    else:
        try:
            scores = final_pipeline.decision_function(X_te)
            y_prob = (scores - scores.min()) / (scores.max() - scores.min() + 1e-12)
        except Exception:
            y_prob = np.zeros(len(y_te))

    # Log final metrics
    mlflow.log_metrics({
        'final_accuracy': float(accuracy_score(y_te, y_pred)),
        'final_precision': float(precision_score(y_te, y_pred, zero_division=0)),
        'final_recall': float(recall_score(y_te, y_pred, zero_division=0)),
        'final_f1': float(f1_score(y_te, y_pred, zero_division=0)),
        'final_roc_auc': float(roc_auc_score(y_te, y_prob)) if y_prob.sum() else float('nan')
    })

    # Save and log ROC curve
    plt.figure(figsize=(8, 6))
    try:
        fpr, tpr, _ = roc_curve(y_te, y_prob)
        plt.plot(fpr, tpr, label=f'ROC (AUC = {roc_auc_score(y_te, y_prob):.3f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve - Final Model')
        plt.legend()
        plt.savefig('roc_curve.png')
        plt.close()
        mlflow.log_artifact('roc_curve.png')
    except Exception as e:
        print('Could not create/log ROC curve:', e)

    # Persist feature selection results
    try:
        with open('feature_selection_results.txt', 'w', encoding='utf-8') as fh:
            fh.write('Removed features:\n')
            for f in noisy_features:
                fh.write(f'- {f}\n')
            fh.write('\nRetained features:\n')
            for f in X_clean.columns:
                fh.write(f'- {f}\n')
        mlflow.log_artifact('feature_selection_results.txt')
    except Exception as e:
        print('Could not write/log feature_selection_results.txt:', e)

print('\nExperiment tracking completed. Check MLflow UI for detailed results.')

Starting feature selection process...
Using XGBoost for feature importance analysis

Removing 3 noisy features...




üèÉ View run final_model at: http://localhost:5000/#/experiments/364658969276453932/runs/ccce6cbda4b140e682b3b25febd2e46a
üß™ View experiment at: http://localhost:5000/#/experiments/364658969276453932

Experiment tracking completed. Check MLflow UI for detailed results.


In [19]:
# --- Inspect final model hyperparameters (XGBoost / scikit-learn) ---
from pprint import pprint

def _strip_model_prefix(params):
    """Remove 'model__' prefixes coming from Grid/RandomizedSearch inside a Pipeline."""
    return {k.replace('model__', ''): v for k, v in params.items()}

print("\n=== Final Model Hyperparameters Report ===")

# 1) Identify the best model name used throughout the script
best_model_name = None
try:
    best_model_name = results_df.iloc[0]['model']
    print(f"Best model (per earlier search): {best_model_name}")
except Exception as e:
    print("Could not determine best model name from results_df:", e)

# 2) Extract best search object and its tuned params (from the earlier CV)
best_params_from_search = {}
try:
    if best_model_name is not None and 'trained_models' in globals():
        best_search = trained_models[best_model_name]
        if hasattr(best_search, 'best_params_') and best_search.best_params_:
            best_params_from_search = _strip_model_prefix(best_search.best_params_)
            print("\n> Best params from earlier search (model-level):")
            pprint(best_params_from_search)
        else:
            print("\n> No best_params_ found on the stored search object.")
    else:
        print("\n> Could not access trained_models[best_model_name].")
except Exception as e:
    print("\nError while reading best_params_ from search object:", e)

# 3) Read the actual estimator that ended up inside the final pipeline
final_estimator = None
try:
    if 'final_pipeline' in globals():
        final_estimator = final_pipeline.named_steps['model']
        print(f"\nFinal estimator class: {type(final_estimator).__name__}")
    else:
        print("\nNo 'final_pipeline' found in memory.")
except Exception as e:
    print("\nError accessing final_pipeline.named_steps['model']:", e)

# 4) Print the final estimator's effective hyperparameters (authoritative)
try:
    if final_estimator is not None and hasattr(final_estimator, "get_params"):
        final_params = final_estimator.get_params(deep=False)
        print("\n> Hyperparameters actually set on the FINAL estimator (authoritative):")
        pprint(final_params)
    else:
        print("\nFinal estimator not available to read params from.")
except Exception as e:
    print("\nError reading params from final estimator:", e)

# 5) Optional: show differences between CV best params and final estimator params
try:
    if final_estimator is not None and best_params_from_search:
        final_params = final_estimator.get_params(deep=False)
        diffs = {k: (best_params_from_search.get(k, "<missing_in_CV>"), final_params.get(k, "<missing_in_final>"))
                 for k in set(best_params_from_search) | set(final_params)}
        changed = {k: v for k, v in diffs.items() if v[0] != v[1]}
        if changed:
            print("\n> Differences between CV best params and final estimator params (CV -> FINAL):")
            pprint(changed)
        else:
            print("\n> CV best params match the final estimator params.")
except Exception as e:
    print("\nError computing CV vs FINAL parameter differences:", e)

print("\n=== End of Report ===\n")



=== Final Model Hyperparameters Report ===
Best model (per earlier search): XGBoost

> Best params from earlier search (model-level):
{'colsample_bytree': 1.0,
 'learning_rate': 0.01,
 'max_depth': 6,
 'n_estimators': 400,
 'subsample': 0.8}

Final estimator class: XGBClassifier

> Hyperparameters actually set on the FINAL estimator (authoritative):
{'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': 1.0,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'feature_weights': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': 0.01,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 6,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': Non