In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import gender_guesser.detector as gender
from joblib import dump, load

In [2]:
df=pd.read_csv('depression_data.csv')

In [3]:
# encode the categorical variables to numeric values so that correlations can be searched for
label_encoder = LabelEncoder()

# Dictionary to store the original labels for each column
original_labels = {}

# custom mappings for the non-binary categorical variables, makes sure they're in a sensible (e.g., ascending) order
category_mappings = {
    'Education Level': {
        'High School': 0,
        'Associate Degree': 1,
        "Bachelor's Degree": 2,
        "Master's Degree": 3,
        'PhD': 4
    },
    'Smoking Status': {
        'Non-smoker': 0,
        'Former': 1,
        'Current': 2
    },
    'Physical Activity Level': {
        'Sedentary': 2,
        'Moderate': 1,
        'Active': 0
    },
    'Alcohol Consumption': {
        'High': 2,
        'Moderate': 1,
        'Low': 0
    },
    'Dietary Habits': {
        'Healthy': 0,
        'Moderate': 1,
        'Unhealthy': 2
    },
    'Sleep Patterns': {
        'Good': 0,
        'Fair': 1,
        'Poor': 2
    }
}

# copy the dataframe for safety
df_encoded = df.copy()
df_encoded = df_encoded.drop(columns=['Name'])  # name is not useful so drop it

# apply the custom mappings to the relevant categorical column
for col, mapping in category_mappings.items():
    df_encoded[col] = df_encoded[col].map(mapping)

# binary categorical variables are not included in the mappings
for col in df_encoded.columns:
    if col not in category_mappings:
        df_encoded[col] = label_encoder.fit_transform(df_encoded[col].astype(str))

In [4]:
df_numeric = df_encoded.select_dtypes(include=['number'])

In [5]:
# build the model to predict history of mental illness

X = df_numeric.drop(columns=['History of Mental Illness'])  # drop the target variable column
y = df_numeric['History of Mental Illness']  # target variable

# split into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)  

# train the random forest classifier
model = RandomForestClassifier(random_state=23)
model.fit(X_train, y_train)

# get the predictions for the test data
y_pred = model.predict(X_test)

In [6]:
# model performance evaluation

print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

# also get the cross validation
cv_scores = cross_val_score(model, X, y, cv=5)
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Score: {cv_scores.mean():.2f}")

Accuracy: 0.66
Confusion Matrix:
[[51868  5594]
 [22149  3143]]
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.90      0.79     57462
           1       0.36      0.12      0.18     25292

    accuracy                           0.66     82754
   macro avg       0.53      0.51      0.49     82754
weighted avg       0.60      0.66      0.60     82754

Cross-Validation Scores: [0.66491046 0.66621553 0.66507963 0.66703322 0.66439887]
Mean CV Score: 0.67


In [7]:
# save the model
#dump(model, 'mental_health_model.joblib')  # uncomment to save new model