# Importing Dependencies

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
mental_data = pd.read_csv("Mental Health Dataset.csv")

In [3]:
mental_data.head()

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
0,8/27/2014 11:29,Female,United States,Corporate,,No,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
1,8/27/2014 11:31,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,No
2,8/27/2014 11:32,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
3,8/27/2014 11:37,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes
4,8/27/2014 11:43,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes


In [22]:
# Drop the Timestamp column (since it's not needed for prediction)
mental_data = mental_data.drop(columns=['Timestamp'])

In [23]:
mental_data.shape

(292364, 16)

In [24]:
# Check dataset structure and missing values
mental_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292364 entries, 0 to 292363
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   Gender                   292364 non-null  object
 1   Country                  292364 non-null  object
 2   Occupation               292364 non-null  object
 3   self_employed            292364 non-null  object
 4   family_history           292364 non-null  object
 5   treatment                292364 non-null  object
 6   Days_Indoors             292364 non-null  object
 7   Growing_Stress           292364 non-null  object
 8   Changes_Habits           292364 non-null  object
 9   Mental_Health_History    292364 non-null  object
 10  Mood_Swings              292364 non-null  object
 11  Coping_Struggles         292364 non-null  object
 12  Work_Interest            292364 non-null  object
 13  Social_Weakness          292364 non-null  object
 14  mental_health_interv

In [25]:
mental_data.isnull().sum()

Gender                     0
Country                    0
Occupation                 0
self_employed              0
family_history             0
treatment                  0
Days_Indoors               0
Growing_Stress             0
Changes_Habits             0
Mental_Health_History      0
Mood_Swings                0
Coping_Struggles           0
Work_Interest              0
Social_Weakness            0
mental_health_interview    0
care_options               0
dtype: int64

# Data preprocessing 

# Encoding

In [7]:
# Filling numerical missing values with mean
mental_data.fillna(mental_data.select_dtypes(include=[np.number]).mean(), inplace=True)

In [8]:
# Filling categorical missing values with mode
for col in mental_data.select_dtypes(include=['object']).columns:
    mental_data[col].fillna(mental_data[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  mental_data[col].fillna(mental_data[col].mode()[0], inplace=True)


# OneHotEncoder

In [26]:
# Apply One-Hot Encoding to categorical features
mental_data_encoded = pd.get_dummies(mental_data, drop_first=True).astype(int)

In [27]:
mental_data_encoded.head()

Unnamed: 0,Gender_Male,Country_Belgium,Country_Bosnia and Herzegovina,Country_Brazil,Country_Canada,Country_Colombia,Country_Costa Rica,Country_Croatia,Country_Czech Republic,Country_Denmark,...,Mood_Swings_Medium,Coping_Struggles_Yes,Work_Interest_No,Work_Interest_Yes,Social_Weakness_No,Social_Weakness_Yes,mental_health_interview_No,mental_health_interview_Yes,care_options_Not sure,care_options_Yes
0,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,0,0,1


# Features and Label spliting 

In [28]:
print("\nColumn Names in Dataset:\n========================\n", mental_data.columns)


Column Names in Dataset:
 Index(['Gender', 'Country', 'Occupation', 'self_employed', 'family_history',
       'treatment', 'Days_Indoors', 'Growing_Stress', 'Changes_Habits',
       'Mental_Health_History', 'Mood_Swings', 'Coping_Struggles',
       'Work_Interest', 'Social_Weakness', 'mental_health_interview',
       'care_options'],
      dtype='object')


In [31]:
# Define independent variables (X) and target variable (y)
x = mental_data.drop(['Mental_Health_History'], axis=1)  # Drop the target from features
y = mental_data['Mental_Health_History']  # Define the target variable

In [33]:
# Display the shape of X and y
print("X shape:", x.shape)
print("y shape:", y.shape)

X shape: (292364, 15)
y shape: (292364,)


In [34]:
x

Unnamed: 0,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
0,Female,United States,Corporate,No,No,Yes,1-14 days,Yes,No,Medium,No,No,Yes,No,Not sure
1,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Medium,No,No,Yes,No,No
2,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Medium,No,No,Yes,No,Yes
3,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Medium,No,No,Yes,Maybe,Yes
4,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Medium,No,No,Yes,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292359,Male,United States,Business,Yes,Yes,Yes,15-30 days,No,Maybe,Low,Yes,No,Maybe,Maybe,Not sure
292360,Male,South Africa,Business,No,Yes,Yes,15-30 days,No,Maybe,Low,Yes,No,Maybe,No,Yes
292361,Male,United States,Business,No,Yes,No,15-30 days,No,Maybe,Low,Yes,No,Maybe,No,No
292362,Male,United States,Business,No,Yes,Yes,15-30 days,No,Maybe,Low,Yes,No,Maybe,No,Yes


In [15]:
y

0         Not sure
1               No
2              Yes
3              Yes
4              Yes
            ...   
292359    Not sure
292360         Yes
292361          No
292362         Yes
292363         Yes
Name: care_options, Length: 292364, dtype: object

# Encoding 

In [39]:
x = pd.get_dummies(x, drop_first=True).astype(int)

In [40]:
le = LabelEncoder()
y = le.fit_transform(y)

In [41]:
x

Unnamed: 0,Gender_Male,Country_Belgium,Country_Bosnia and Herzegovina,Country_Brazil,Country_Canada,Country_Colombia,Country_Costa Rica,Country_Croatia,Country_Czech Republic,Country_Denmark,...,Mood_Swings_Medium,Coping_Struggles_Yes,Work_Interest_No,Work_Interest_Yes,Social_Weakness_No,Social_Weakness_Yes,mental_health_interview_No,mental_health_interview_Yes,care_options_Not sure,care_options_Yes
0,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292359,1,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,1,0
292360,1,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,1,0,0,1
292361,1,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,1,0,0,0
292362,1,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,1,0,0,1


In [42]:
y

array([2, 2, 2, ..., 1, 1, 1], shape=(292364,))

# Train Test Split

In [43]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2)

In [44]:
# Display shapes of training and testing datasets
print("Training Features Shape:", x_train.shape)  
print("Testing Features Shape:", x_test.shape)  

print("Training Labels Shape:", y_train.shape)  
print("Testing Labels Shape:", y_test.shape)

Training Features Shape: (233891, 61)
Testing Features Shape: (58473, 61)
Training Labels Shape: (233891,)
Testing Labels Shape: (58473,)


# Feature Scaling (Only for SVR)

In [45]:
ss = StandardScaler()
X_train_scaled = ss.fit_transform(x_train)
X_test_scaled = ss.transform(x_test)

# Model Training 

## Support Vector Regression

In [None]:
svr_reg = SVR(kernel='rbf')
svr_reg.fit(X_train_scaled, y_train)

## Random Forest Regression Model

In [None]:
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)  # No scaling needed for Random Forest

# Predictions

In [None]:
y_pred_svr = svr_reg.predict(X_test_scaled)
y_pred_rf = rf_reg.predict(X_test)

#  Model Evaluation Function

In [None]:
def evaluate_model(y_test, y_pred, model_name):
    print(f"\nModel: {model_name}")
    print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_pred):.2f}")
    print(f"Mean Squared Error (MSE): {mean_squared_error(y_test, y_pred):.2f}")
    print(f"Root Mean Squared Error (RMSE): {np.sqrt(mean_squared_error(y_test, y_pred)):.2f}")
    print(f"R² Score: {r2_score(y_test, y_pred):.2f}")

# Evaluate models
evaluate_model(y_test, y_pred_svr, "Support Vector Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest Regression")


# Visualization - Actual vs Predicted Recovery Time

In [None]:
plt.figure(figsize=(10, 5))
plt.scatter(y_test, y_pred_rf, color='blue', label='Predicted Recovery Time', alpha=0.5)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='dashed', label='Perfect Fit')
plt.xlabel("Actual Recovery Time")
plt.ylabel("Predicted Recovery Time")
plt.title("Actual vs Predicted Mental Health Recovery Time (Random Forest)")
plt.legend()
plt.show()


# Save Model

In [None]:
import joblib
filename = "LR Mail Spam Prediction.pkl"
# save 
joblib.dump(lr_model, filename)

# Load Model

In [None]:
loaded_model = joblib.load(filename)

print("Model loaded successfully!")