In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

df = pd.read_csv('/kaggle/input/capstone-preprocessed/preprocessed_df_capstone.csv')

y = df['Age']

X = df.drop(columns=['Age', 'Country', 'state'])

le = LabelEncoder()
X['Gender'] = le.fit_transform(X['Gender'])
for col in X.columns:
    if X[col].isnull().any():
        X[col].fillna(X[col].median(), inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data successfully prepared for regression.")
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Data successfully prepared for regression.
Training set shape: (1000, 23)
Testing set shape: (250, 23)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].median(), inplace=True)


In [2]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso (L1)": Lasso(alpha=1.0, max_iter=2000),
    "Ridge (L2)": Ridge(alpha=1.0),
    "Elastic Net": ElasticNet(alpha=1.0, l1_ratio=0.5, max_iter=2000), # ✅ Added model
    "Support Vector Regressor (SVR)": SVR(),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42)
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    results[name] = {
        "MAE": mean_absolute_error(y_test, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
        "R² Score": r2_score(y_test, y_pred)
    }


results_df = pd.DataFrame(results).T
print("--- Model Performance Comparison (with Elastic Net) ---")
print(results_df)

--- Model Performance Comparison (with Elastic Net) ---
                                     MAE      RMSE  R² Score
Linear Regression               5.440932  7.055739  0.048365
Lasso (L1)                      5.596023  7.224105  0.002407
Ridge (L2)                      5.440798  7.055005  0.048563
Elastic Net                     5.571049  7.190006  0.011803
Support Vector Regressor (SVR)  5.503766  7.223146  0.002672
Random Forest Regressor         5.460840  7.104575  0.035146


In [3]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_leaf': [2, 4, 6],
    'max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42), 
                           param_grid=param_grid, 
                           cv=5, 
                           scoring='r2', 
                           n_jobs=-1, 
                           verbose=1)

grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_

y_pred_tuned = best_rf.predict(X_test)
print(f"Tuned R² Score: {r2_score(y_test, y_pred_tuned)}")

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Tuned R² Score: 0.06438477094962924


In [4]:
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

class MentalHealthRegressionPreprocessor(BaseEstimator, TransformerMixin):
    """
    A custom transformer for preprocessing the Mental Health in Tech Survey data for a regression task.
    
    Parameters:
    ----------
    target_col (str): The name of the target column to be predicted. This column, along with other
                      unnecessary columns, will be dropped from the feature set.
    """
    def __init__(self, target_col='Age'):
        self.target_col = target_col

    def fit(self, X, y=None):
        
        return self

    def transform(self, X):
        X = X.copy()

        cols_to_drop = ['Timestamp', 'state', 'Country', 'comments', self.target_col]
        X = X.drop(columns=[col for col in cols_to_drop if col in X.columns])

        binary_map = {'Yes': 1, 'No': 0}
        binary_cols = [
            'self_employed', 'family_history', 'treatment', 'remote_work', 'tech_company', 
            'benefits', 'care_options', 'wellness_program', 'seek_help', 
            'anonymity', 'mental_health_consequence', 'phys_health_consequence', 
            'mental_health_interview', 'phys_health_interview', 'obs_consequence'
        ]
        for col in binary_cols:
            if col in X.columns:
                X[col] = X[col].map(binary_map).fillna(0)

        trinary_map = {'Yes': 2, 'No': 0, "Don't know": 1, 'Some of them': 1, 'Not sure': 1, 'Maybe': 1}
        trinary_cols = ['supervisor', 'coworkers', 'mental_vs_physical']
        for col in trinary_cols:
             if col in X.columns:
                X[col] = X[col].map(trinary_map).fillna(1) # Fill NaNs with the neutral value (1)
                
        leave_map = {
            'Very difficult': 0, 'Somewhat difficult': 1, "Don't know": 2,
            'Somewhat easy': 3, 'Very easy': 4
        }
        if 'leave' in X.columns:
            X['leave'] = X['leave'].map(leave_map).fillna(2) # Fill NaNs with the neutral value (2)

        interfere_map = {
            'Never': 0, 'Rarely': 1, 'Sometimes': 2, 'Often': 3
        }
        if 'work_interfere' in X.columns:
            X['work_interfere'] = X['work_interfere'].map(interfere_map).fillna(2)

        size_map = {
            '1-5': 0, '6-25': 1, '26-100': 2,
            '100-500': 3, '500-1000': 4, 'More than 1000': 5
        }
        if 'no_employees' in X.columns:
            X['no_employees'] = X['no_employees'].map(size_map).fillna(2) # Fill NaNs with a medium size

    
        if 'Gender' in X.columns:
            def clean_gender(gender):
                if not isinstance(gender, str): 
                    return 'Other/Non-Binary'
                g = gender.lower().strip()
                if 'fem' in g or 'wom' in g or g == 'f':
                    return 'Female'
                elif 'mal' in g or 'man' in g or g == 'm' or 'guy' in g:
                    return 'Male'
                else:
                    return 'Other/Non-Binary'
            
            X['Gender'] = X['Gender'].apply(clean_gender)
            le = LabelEncoder()
            X['Gender'] = le.fit_transform(X['Gender'])

        return X

In [5]:
REGRESSION_TARGET = 'Age'

try:
    raw_df = pd.read_csv('/kaggle/input/mental-health-in-tech-survey/survey.csv')
except FileNotFoundError:
    print("Error: 'survey.csv' not found. Please download the dataset and place it in the correct directory.")
    exit()


filtered_df = raw_df[(raw_df['Age'] >= 18) & (raw_df['Age'] <= 65)].copy()
filtered_df.dropna(subset=[REGRESSION_TARGET], inplace=True)

y_final_regression = filtered_df[REGRESSION_TARGET]
    
X_final_regression = filtered_df
regression_deployment_pipeline = Pipeline(steps=[
        ('preprocessing', MentalHealthRegressionPreprocessor(target_col=REGRESSION_TARGET)),
        ('model', RandomForestRegressor(n_estimators=100, random_state=42))
    ])
print("Training the regression model pipeline...")
regression_deployment_pipeline.fit(X_final_regression, y_final_regression)
print("Training complete.")

pipeline_filename = 'mental_health_reg_pipeline.pkl'
joblib.dump(regression_deployment_pipeline, pipeline_filename)
print(f"Regression pipeline saved to '{pipeline_filename}'")

Training the regression model pipeline...
Training complete.
Regression pipeline saved to 'mental_health_reg_pipeline.pkl'
