In [79]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pickle
import streamlit as st

In [80]:
# Load the raw survey data from a CSV located in the notebook's working directory.
file_path = "depression_anxiety_data.csv"
data = pd.read_csv(file_path)
# Bare last expression: rendered as this cell's output.
data

Unnamed: 0,id,school_year,age,gender,bmi,who_bmi,phq_score,depression_severity,depressiveness,suicidal,depression_diagnosis,depression_treatment,gad_score,anxiety_severity,anxiousness,anxiety_diagnosis,anxiety_treatment,epworth_score,sleepiness
0,1,1,19,male,33.333333,Class I Obesity,9,Mild,False,False,False,False,11,Moderate,True,False,False,7.0,False
1,2,1,18,male,19.841270,Normal,8,Mild,False,False,False,False,5,Mild,False,False,False,14.0,True
2,3,1,19,male,25.102391,Overweight,8,Mild,False,False,False,False,6,Mild,False,False,False,6.0,False
3,4,1,18,female,23.738662,Normal,19,Moderately severe,True,True,False,False,15,Severe,True,False,False,11.0,True
4,5,1,18,male,25.617284,Overweight,6,Mild,False,False,False,False,14,Moderate,True,False,False,3.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
778,779,4,24,female,21.096191,Normal,6,Mild,False,False,False,False,1,None-minimal,False,False,False,3.0,False
779,780,4,22,male,25.308642,Overweight,4,None-minimal,False,False,False,False,6,Mild,False,False,False,3.0,False
780,781,4,22,male,22.720438,Normal,2,None-minimal,False,False,False,False,5,Mild,False,False,False,4.0,False
781,782,4,22,female,23.033168,Normal,17,Moderately severe,True,False,False,False,19,Severe,True,False,False,15.0,True


In [93]:
# List the distinct WHO BMI categories. The output shows the value is spelled
# 'Not Availble' in the data, so any inference input must use that exact spelling.
data['who_bmi'].unique()

array(['Class I Obesity', 'Normal', 'Overweight', 'Not Availble',
       'Class III Obesity', 'Underweight', 'Class II Obesity'],
      dtype=object)

## Preprocessing the Data

In [81]:
# Count missing values per column to see which columns need imputation.
data.isna().sum()

id                      0
school_year             0
age                     0
gender                  0
bmi                     0
who_bmi                 0
phq_score               0
depression_severity     4
depressiveness          3
suicidal                1
depression_diagnosis    1
depression_treatment    4
gad_score               0
anxiety_severity        0
anxiousness             6
anxiety_diagnosis       4
anxiety_treatment       2
epworth_score           8
sleepiness              8
dtype: int64

In [82]:
# Remove the row identifier; errors='ignore' means a missing "id" column is not an error
data = data.drop(columns=["id"], errors='ignore')

# Fill gaps with the most frequent value of each column.
# The same imputer object is re-fitted on one column at a time.
# NOTE(review): this also imputes the target ("depression_diagnosis") and is fitted
# on the full dataset before the train/test split - confirm that is intended.
imputer = SimpleImputer(strategy="most_frequent")
columns_with_missing = [
    "depression_severity",
    "depressiveness",
    "suicidal",
    "depression_diagnosis",
    "depression_treatment",
    "anxiousness",
    "anxiety_diagnosis",
    "anxiety_treatment",
    "epworth_score",
    "sleepiness",
]
for column in columns_with_missing:
    # fit_transform returns an (n_rows, 1) array; flatten it back into a column
    data[column] = imputer.fit_transform(data[[column]]).flatten()

In [83]:
# Encode categorical variables
TARGET_COLUMN = "depression_diagnosis"

# Every object-typed column except the target is treated as a categorical feature.
# Filtering (instead of list.remove) keeps this cell safe to re-run: it no longer
# raises ValueError when the target is absent from the object-typed columns.
categorical_columns = [
    column
    for column in data.select_dtypes(include=["object"]).columns
    if column != TARGET_COLUMN
]

# drop="first" removes one redundant indicator per feature. Combined with
# handle_unknown="ignore", a category that was not seen during fit is encoded as
# all zeros, i.e. it is indistinguishable from the dropped first category.
one_hot_encoder = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")
encoded_features = one_hot_encoder.fit_transform(data[categorical_columns])
encoded_feature_names = one_hot_encoder.get_feature_names_out(categorical_columns)

# Convert to DataFrame; reuse data's index so the concat below aligns row-for-row
# even if rows are ever filtered out of `data` upstream.
encoded_df = pd.DataFrame(encoded_features, columns=encoded_feature_names, index=data.index)

# Encode target variable into a separate Series rather than overwriting the column
# in `data`, so re-running this cell always fits the encoder on the original labels.
# LabelEncoder sorts its classes, so False -> 0 and True -> 1.
label_encoder = LabelEncoder()
encoded_target = pd.Series(
    label_encoder.fit_transform(data[TARGET_COLUMN]),
    index=data.index,
    name=TARGET_COLUMN,
)

# Combine processed data: age, encoded target, then the one-hot features.
# The other numeric columns of `data` are not carried over.
processed_data = pd.concat([data[["age"]], encoded_target, encoded_df], axis=1)

In [99]:
# Show which feature columns were one-hot encoded (the target has been excluded).
categorical_columns

['gender',
 'who_bmi',
 'depression_severity',
 'depressiveness',
 'suicidal',
 'depression_treatment',
 'anxiety_severity',
 'anxiousness',
 'anxiety_diagnosis',
 'anxiety_treatment',
 'sleepiness']

In [84]:
# Count distinct values per processed column; also lists the generated one-hot column names.
processed_data.nunique()

age                                      13
depression_diagnosis                      2
gender_male                               2
who_bmi_Class II Obesity                  2
who_bmi_Class III Obesity                 2
who_bmi_Normal                            2
who_bmi_Not Availble                      2
who_bmi_Overweight                        2
who_bmi_Underweight                       2
depression_severity_Moderate              2
depression_severity_Moderately severe     2
depression_severity_None-minimal          2
depression_severity_Severe                2
depression_severity_none                  2
depressiveness_True                       2
suicidal_True                             2
depression_treatment_True                 2
anxiety_severity_Mild                     2
anxiety_severity_Moderate                 2
anxiety_severity_None-minimal             2
anxiety_severity_Severe                   2
anxiousness_True                          2
anxiety_diagnosis_True          

## Splitting the data, training both models (Random Forest and XGBoost), and comparing their scores

In [85]:
# Separate the features from the target and hold out 20% of the rows for testing
X = processed_data.drop(columns=["depression_diagnosis"])
y = processed_data["depression_diagnosis"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit both candidate classifiers on the same training split
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

# Predict on the held-out rows
y_pred_rf = rf_model.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)

# The same four metrics are reported for each model, in this order
metric_functions = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1-score": f1_score,
}

rf_metrics = {
    metric_name: metric(y_test, y_pred_rf)
    for metric_name, metric in metric_functions.items()
}
print('Evaluaution of Random Forest Model is as follows:\n')
print(rf_metrics,'\n\n')

xgb_metrics = {
    metric_name: metric(y_test, y_pred_xgb)
    for metric_name, metric in metric_functions.items()
}
print('Evaluaution of XGBoost Model is as follows:\n')
print(xgb_metrics,'\n\n')


Evaluaution of Random Forest Model is as follows:

{'Accuracy': 0.9426751592356688, 'Precision': np.float64(0.7142857142857143), 'Recall': np.float64(0.4166666666666667), 'F1-score': np.float64(0.5263157894736842)} 


Evaluaution of XGBoost Model is as follows:

{'Accuracy': 0.9490445859872612, 'Precision': np.float64(0.7), 'Recall': np.float64(0.5833333333333334), 'F1-score': np.float64(0.6363636363636364)} 




In [90]:
# Display the feature matrix (processed_data without the target column).
X

Unnamed: 0,age,gender_male,who_bmi_Class II Obesity,who_bmi_Class III Obesity,who_bmi_Normal,who_bmi_Not Availble,who_bmi_Overweight,who_bmi_Underweight,depression_severity_Moderate,depression_severity_Moderately severe,...,suicidal_True,depression_treatment_True,anxiety_severity_Mild,anxiety_severity_Moderate,anxiety_severity_None-minimal,anxiety_severity_Severe,anxiousness_True,anxiety_diagnosis_True,anxiety_treatment_True,sleepiness_True
0,19,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,18,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,19,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,18,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
4,18,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
778,24,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
779,22,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
780,22,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
781,22,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0


### As the XGBoost model has higher accuracy, recall and F1-score (with nearly identical precision), we will use it for deployment

In [95]:
# Save model and encoders
# Each file is written inside a context manager so it is flushed and closed
# before anything tries to read it back.
artifacts = {
    "mental_health_model.pkl": xgb_model,
    "one_hot_encoder.pkl": one_hot_encoder,
    "label_encoder.pkl": label_encoder,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)

# Inference function
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # SECURITY: unpickling can execute arbitrary code. Only load artifact files
    # that were produced by this notebook / a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def predict_mental_health(symptoms_dict):
    """Predict the depression-diagnosis label for a single respondent.

    Args:
        symptoms_dict: Mapping of model feature name -> value. Keys are the
            columns the model was trained on: ``"age"`` plus the one-hot column
            names produced by the saved encoder (e.g. ``"gender_male"``).
            One-hot columns that are absent default to 0; keys that the model
            does not know are ignored.

    Returns:
        The predicted label, decoded back to its original value by the saved
        label encoder.

    Raises:
        ValueError: If a feature that is not a one-hot column (such as ``age``)
            is missing, because it has no meaningful default.
    """
    model = _load_pickle("mental_health_model.pkl")
    encoder = _load_pickle("one_hot_encoder.pkl")
    label_enc = _load_pickle("label_encoder.pkl")

    # Convert user input into a single-row DataFrame
    user_df = pd.DataFrame([symptoms_dict])

    # The model was trained on "age" followed by the one-hot columns, so align to
    # the model's own feature list. Aligning to the encoder's output names alone
    # would silently drop "age" and no longer match the training features.
    encoded_names = list(encoder.get_feature_names_out())
    expected_features = getattr(model, "feature_names_in_", None)
    if expected_features is None:
        expected_features = ["age", *encoded_names]
    expected_features = list(expected_features)

    # Only one-hot indicators may default to 0; anything else must be supplied.
    missing_required = [
        name
        for name in expected_features
        if name not in encoded_names and name not in user_df.columns
    ]
    if missing_required:
        raise ValueError(f"Missing required feature(s): {missing_required}")

    # Add absent one-hot columns as 0 and put the columns in training order
    user_df = user_df.reindex(columns=expected_features, fill_value=0)

    # Predict and return the result
    prediction = model.predict(user_df)
    return label_enc.inverse_transform(prediction)[0]


### Building the Streamlit UI

In [9]:
# Load the saved model and encoders
# Context managers close each file as soon as it has been read.
# SECURITY: unpickling can execute arbitrary code - only load artifact files
# produced by this notebook / a trusted source.
with open("mental_health_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)
with open("one_hot_encoder.pkl", "rb") as encoder_file:
    encoder = pickle.load(encoder_file)
with open("label_encoder.pkl", "rb") as label_encoder_file:
    label_enc = pickle.load(label_encoder_file)

# Streamlit app title
st.title("Depression Diagnosis Predictor")

# Collect user inputs
st.sidebar.header("User Input Features")

# Function to collect user input
def user_input_features():
    """Render the sidebar widgets and return the selections as a one-row DataFrame.

    Every widget value is one of the raw category values found in the training
    data (lower-case gender, boolean symptom flags, the dataset's own severity
    and BMI labels). This matters because the one-hot encoder was created with
    ``handle_unknown="ignore"``: a value it has never seen is silently encoded as
    all zeros, so a mismatched label would have no effect on the prediction.

    Returns:
        pd.DataFrame: One row with a numeric ``age`` column and object-typed
        categorical columns named like the raw training columns.
    """

    def yes_no(label):
        """Render a No/Yes selectbox and return the answer as a bool."""
        return st.sidebar.selectbox(label, ["No", "Yes"]) == "Yes"

    # The training data only contains 'male' and 'female' (lower case); any other
    # value would be encoded exactly like the dropped baseline category.
    gender = st.sidebar.selectbox("Select Gender", ["male", "female"], format_func=str.capitalize)
    age = st.sidebar.number_input("Age", min_value=0, max_value=100, value=25)
    depression_severity = st.sidebar.selectbox(
        "Depression Severity",
        ["None-minimal", "Mild", "Moderate", "Moderately severe", "Severe"],
    )
    # These columns hold True/False in the training data, so they are collected
    # as yes/no answers rather than free-form intensity labels.
    depressiveness = yes_no("Depressiveness")
    suicidal = yes_no("Suicidal Thoughts")
    depression_treatment = yes_no("Depression Treatment")
    anxiousness = yes_no("Anxiousness")
    anxiety_diagnosis = yes_no("Anxiety Diagnosis")
    anxiety_treatment = yes_no("Anxiety Treatment")
    sleepiness = yes_no("Sleepiness")
    anxiety_severity = st.sidebar.selectbox("Anxiety Severity", ['Moderate', 'Mild', 'Severe', 'None-minimal'])
    # The dataset spells this category 'Not Availble'; keep that exact value for
    # the encoder and only correct the spelling in the label shown to the user.
    who_bmi = st.sidebar.selectbox(
        "Select BMI",
        ['Class I Obesity', 'Normal', 'Overweight', 'Not Availble',
         'Class III Obesity', 'Underweight', 'Class II Obesity'],
        format_func=lambda category: "Not Available" if category == "Not Availble" else category,
    )

    # Create a dictionary of user inputs (key order mirrors the training columns)
    user_data = {
        "age": age,
        "gender": gender,
        "who_bmi": who_bmi,
        "depression_severity": depression_severity,
        "depressiveness": depressiveness,
        "suicidal": suicidal,
        "depression_treatment": depression_treatment,
        "anxiety_severity": anxiety_severity,
        "anxiousness": anxiousness,
        "anxiety_diagnosis": anxiety_diagnosis,
        "anxiety_treatment": anxiety_treatment,
        "sleepiness": sleepiness
    }

    user_df = pd.DataFrame([user_data])
    # Keep every categorical column object-typed (including the boolean flags) so
    # that dtype-based column selection downstream still treats them as categorical.
    categorical = {column: object for column in user_df.columns if column != "age"}
    return user_df.astype(categorical)

# Collect the current sidebar selections as a one-row DataFrame
input_df = user_input_features()
# Echo the raw (not yet encoded) inputs back to the user
st.subheader("User Input Features")
st.write(input_df)

# Preprocess user input
def preprocess_input(user_df):
    """Turn raw user inputs into the feature layout the model was trained on.

    Args:
        user_df: DataFrame holding the raw categorical columns the encoder was
            fitted on, plus any numeric features (e.g. ``age``).

    Returns:
        pd.DataFrame: Rows in the same order as ``user_df`` (index reset), with
        columns matching ``model.feature_names_in_`` exactly; features the input
        does not provide are filled with 0.

    Raises:
        ValueError: If a categorical column the encoder expects is missing.
    """
    user_df = user_df.reset_index(drop=True)

    # Take the categorical columns (and their order) from the fitted encoder
    # rather than guessing from dtypes: dtype sniffing misses bool/category
    # columns and depends on the caller's column order.
    categorical_columns = list(encoder.feature_names_in_)
    absent = [column for column in categorical_columns if column not in user_df.columns]
    if absent:
        raise ValueError(f"Missing categorical input column(s): {absent}")

    # One-hot encode categorical features using the pre-trained encoder.
    # Casting to object keeps bool values comparable with the fitted categories.
    encoded_features = encoder.transform(user_df[categorical_columns].astype(object))
    encoded_df = pd.DataFrame(
        encoded_features,
        columns=encoder.get_feature_names_out(),
        index=user_df.index,
    )

    # Combine the remaining (numerical) columns with the encoded categorical features
    numeric_df = user_df.drop(columns=categorical_columns)
    final_input = pd.concat([numeric_df, encoded_df], axis=1)

    # Add any feature the model expects but the input lacks (as 0) and put the
    # columns in training order.
    return final_input.reindex(columns=list(model.feature_names_in_), fill_value=0)

# Encode the inputs into the model's feature layout. This runs before the button
# check, so it executes whether or not Predict has been clicked.
processed_input = preprocess_input(input_df)

# Predict only when the sidebar button is pressed
if st.sidebar.button("Predict"):
    prediction = model.predict(processed_input)
    # Map the encoded class back to its original label via the saved label encoder
    prediction_label = label_enc.inverse_transform(prediction)[0]
    st.subheader("Prediction")
    st.write(f"The model predicts: **{prediction_label}**")


