Mental Health Resrouce: https://www.kaggle.com/datasets/anweshaghosh123/mental-health-synthetic-dataset

In [1]:
import pandas as pd

mental_df = pd.read_csv("mental_health.csv")
mental_df.columns

Index(['User ID', 'Age', 'Gender', 'Symptoms', 'Duration (weeks)',
       'Previous Diagnosis', 'Therapy History', 'Medication',
       'Diagnosis / Condition', 'Suggested Therapy', 'Self-care Advice',
       'Urgency Level', 'Mood', 'Stress Level'],
      dtype='object')

In [2]:
mental_df.head()
mental_df = mental_df[['Age', 'Symptoms', 'Diagnosis / Condition', 'Previous Diagnosis', 'Self-care Advice', 'Suggested Therapy', 'Duration (weeks)']]

In [3]:
mental_df = mental_df.rename(columns= {
    'Diagnosis / Condition' : 'Diagnosis',
    'Self-care Advice': 'Self_Care_Advice',
    'Suggested Therapy': 'Suggested_Therapy',
    'Duration (weeks)': 'Duration',
    'Previous Diagnosis': 'Prev_Diagnosis'
})

In [4]:
symptoms_unique_values = mental_df['Symptoms'].unique()
therapy_unique_values = mental_df['Suggested_Therapy'].unique()
diagnosis_unique_values = mental_df['Diagnosis'].unique()
print(symptoms_unique_values)
print(therapy_unique_values)
print(diagnosis_unique_values)

['feeling anxious' 'excessive worry' 'trouble sleeping'
 'loss of interest in activities' 'panic attacks' 'lack of concentration'
 'feeling irritable' 'feeling sad' 'feeling overwhelmed']
['Support Groups' 'Cognitive Behavioral Therapy' 'Psychotherapy'
 'Mindfulness-Based Therapy' 'No Therapy Needed']
['Panic Disorder' 'Depression' 'Anxiety' 'Burnout' 'Stress']


A label encoder is a technique that converts non-numerical data into numerical values, 
which is useful for machine learning and data analysis. It's often used when working with categorical data, such as ordinal data, where there's a hierarchy among the values

In [5]:
mental_df.columns
print(len(mental_df['Symptoms']))
print(len(mental_df['Diagnosis']))
print(len(mental_df['Self_Care_Advice']))
print(len(mental_df['Suggested_Therapy']))
print(len(mental_df['Duration']))
print(len(mental_df['Prev_Diagnosis']))

5000
5000
5000
5000
5000
5000


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

def build_model():
    le_diagnosis = LabelEncoder()
    le_symtoms = LabelEncoder()
    le_self_care = LabelEncoder()
    le_therapy = LabelEncoder()
    # mental_df['Duration'] is already int style, we don't have to encode it
    
    mental_df['Diagnosis_encoded'] = le_diagnosis.fit_transform(mental_df['Diagnosis'])
    mental_df['Symptoms_encoded'] = le_symtoms.fit_transform(mental_df['Symptoms'])
    mental_df['Self_Care_Advice_encoded'] = le_self_care.fit_transform(mental_df['Self_Care_Advice'])
    mental_df['Suggested_Therapy_encoded'] = le_therapy.fit_transform(mental_df['Suggested_Therapy'])

    # Training Data
    X = mental_df[['Diagnosis_encoded', 'Symptoms_encoded']]
    y_self_care = mental_df['Self_Care_Advice_encoded']
    y_therapy = mental_df['Suggested_Therapy_encoded']

    X_train, X_test, y_self_care_train, y_self_care_test, y_therapy_train, y_therapy_test = train_test_split(X, y_self_care, y_therapy, test_size=0.2, random_state=42)
    # Train models for Self Care Advice and Suggested Therapy
    model_self_care = RandomForestClassifier()
    model_therapy = RandomForestClassifier()

    model_self_care.fit(X_train, y_self_care_train)
    model_therapy.fit(X_train, y_therapy_train)

    # Make predictions
    self_care_pred = model_self_care.predict(X_test)
    therapy_pred = model_therapy.predict(X_test)

    # Display classification reports
    print("Self Care Advice Classification Report:")
    self_care_report = classification_report(y_self_care_test, self_care_pred, target_names=le_self_care.classes_)
    print(self_care_report)
    print("\nSuggested Therapy Classification Report:")
    self_therapy_report = classification_report(y_therapy_test, therapy_pred, target_names=le_therapy.classes_)
    print(self_therapy_report)
    return model_self_care , model_therapy, le_diagnosis, le_symtoms, le_self_care, le_therapy, self_care_report, self_therapy_report

# Initial Reports

### Self Care Advice Classification Report

| Self Care Advice       | Precision | Recall | F1-Score | Support |
|------------------------|-----------|--------|----------|---------|
| Breathing Exercises    | 0.20      | 0.17   | 0.18     | 166     |
| Exercise               | 0.17      | 0.24   | 0.20     | 181     |
| Journaling             | 0.32      | 0.14   | 0.20     | 191     |
| Meditation             | 0.11      | 0.03   | 0.04     | 116     |
| Take Breaks            | 0.17      | 0.33   | 0.23     | 164     |
| Talk to a Friend       | 0.22      | 0.21   | 0.21     | 182     |
| **Accuracy**           |           |        | 0.19     | 1000    |
| **Macro Avg**          | 0.20      | 0.19   | 0.18     | 1000    |
| **Weighted Avg**       | 0.21      | 0.19   | 0.18     | 1000    |

### Suggested Therapy Classification Report

| Suggested Therapy               | Precision | Recall | F1-Score | Support |
|---------------------------------|-----------|--------|----------|---------|
| Cognitive Behavioral Therapy    | 0.16      | 0.13   | 0.14     | 200     |
| Mindfulness-Based Therapy       | 0.14      | 0.03   | 0.05     | 189     |
| No Therapy Needed               | 0.17      | 0.09   | 0.11     | 187     |
| Psychotherapy                   | 0.17      | 0.31   | 0.22     | 202     |
| Support Groups                  | 0.24      | 0.34   | 0.28     | 222     |
| **Accuracy**                    |           |        | 0.19     | 1000    |
| **Macro Avg**                   | 0.17      | 0.18   | 0.16     | 1000    |
| **Weighted Avg**                | 0.18      | 0.19   | 0.17     | 1000    |



In [7]:
# model_self_care , model_therapy, le_diagnosis, le_symtoms, le_self_care, le_therapy, self_care_report, self_therapy_report = build_model()
# Save model for purpose
import joblib
def save_model(model_self_care , model_therapy, le_diagnosis, le_symtoms, le_self_care, le_therapy ):
    joblib.dump(model_self_care, "model_self_care.pkl")
    joblib.dump(model_therapy, "model_therapy.pkl")
    joblib.dump(le_diagnosis, "le_diagnosis.pkl")
    joblib.dump(le_symtoms, "le_symtoms.pkl")
    joblib.dump(le_self_care, "le_self_care.pkl")
    joblib.dump(le_therapy, "le_therapy.pkl")

# save_model(model_self_care , model_therapy, le_diagnosis, le_symtoms, le_self_care, le_therapy)

Self Care Advice Classification Report:
                     precision    recall  f1-score   support

Breathing Exercises       0.16      0.16      0.16       166
           Exercise       0.21      0.24      0.23       181
         Journaling       0.21      0.18      0.19       191
         Meditation       0.11      0.08      0.09       116
        Take Breaks       0.16      0.21      0.18       164
   Talk to a Friend       0.18      0.16      0.17       182

           accuracy                           0.18      1000
          macro avg       0.17      0.17      0.17      1000
       weighted avg       0.18      0.18      0.18      1000


Suggested Therapy Classification Report:
                              precision    recall  f1-score   support

Cognitive Behavioral Therapy       0.21      0.18      0.20       200
   Mindfulness-Based Therapy       0.17      0.15      0.16       189
           No Therapy Needed       0.22      0.21      0.21       187
               Psychothe

(RandomForestClassifier(),
 RandomForestClassifier(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 '                     precision    recall  f1-score   support\n\nBreathing Exercises       0.16      0.16      0.16       166\n           Exercise       0.21      0.24      0.23       181\n         Journaling       0.21      0.18      0.19       191\n         Meditation       0.11      0.08      0.09       116\n        Take Breaks       0.16      0.21      0.18       164\n   Talk to a Friend       0.18      0.16      0.17       182\n\n           accuracy                           0.18      1000\n          macro avg       0.17      0.17      0.17      1000\n       weighted avg       0.18      0.18      0.18      1000\n',
 '                              precision    recall  f1-score   support\n\nCognitive Behavioral Therapy       0.21      0.18      0.20       200\n   Mindfulness-Based Therapy       0.17      0.15      0.16       189\n           No Therapy Needed    

In [None]:


def group_diagnosis(row):
    if row['Diagnosis'] in ['Panic Disorder', 'Anxiety']:
        return 'Anxiety Disorders'
    elif row['Diagnosis'] in ['Depression', 'Burnout']:
        return 'Mood Disorders'
    elif row['Diagnosis'] == 'Stress':
        return 'Stress-Related Disorders'

def test_build_model(mental_df):
    # Diagnosis Group
    mental_df["Diagnosis_Group"] = mental_df.apply(group_diagnosis, axis=1)

    le_diagnosis = LabelEncoder()
    le_symtoms = LabelEncoder()
    le_self_care = LabelEncoder()
    le_therapy = LabelEncoder()
    le_prev_diagnosis = LabelEncoder()
    le_diagnosis_group = LabelEncoder()

    mental_df['Diagnosis_encoded'] = le_diagnosis.fit_transform(mental_df['Diagnosis'])
    mental_df['Symptoms_encoded'] = le_symtoms.fit_transform(mental_df['Symptoms'])
    mental_df['Self_Care_Advice_encoded'] = le_self_care.fit_transform(mental_df['Self_Care_Advice'])
    mental_df['Suggested_Therapy_encoded'] = le_therapy.fit_transform(mental_df['Suggested_Therapy'])
    mental_df['Prev_Diagnosis_encoded'] = le_prev_diagnosis.fit_transform(mental_df['Prev_Diagnosis'])
    mental_df['Diagnosis_Group_encoded'] = le_diagnosis_group.fit_transform(mental_df['Diagnosis_Group'])
    
    # mental_df['Duration'] It is already mental_df we don't have to label codes it

    # Training Data with duration
    X = mental_df[['Age', 'Diagnosis_Group_encoded', 'Symptoms_encoded']] 
    y_self_care = mental_df['Self_Care_Advice_encoded']
    y_therapy = mental_df['Suggested_Therapy_encoded']

    X_train, X_test, y_self_care_train, y_self_care_test, y_therapy_train, y_therapy_test = train_test_split(X, y_self_care, y_therapy, test_size=0.2, random_state=42)
    
    # Train models for Self Care Advice and Suggested Therapy
    model_self_care = RandomForestClassifier()
    model_therapy = RandomForestClassifier()

    model_self_care.fit(X_train, y_self_care_train)
    model_therapy.fit(X_train, y_therapy_train)

    # Make predictions
    self_care_pred = model_self_care.predict(X_test)
    therapy_pred = model_therapy.predict(X_test)

    # Display classification reports
    print("Self Care Advice Classification Report:")
    self_care_report = classification_report(y_self_care_test, self_care_pred, target_names=le_self_care.classes_)
    print(self_care_report)
    print("\nSuggested Therapy Classification Report:")
    self_therapy_report = classification_report(y_therapy_test, therapy_pred, target_names=le_therapy.classes_)
    print(self_therapy_report)
    return model_self_care , model_therapy, le_diagnosis, le_symtoms, le_self_care, le_therapy, self_care_report, self_therapy_report

test_build_model(mental_df)

In [30]:
import nltk
from nltk.corpus import wordnet
from itertools import chain
# What I am trying to do.
# Based on texts and topics, Create chains then finds vlaues.
def find_related_words():
    nltk.download('wordnet')

    words = {
        'Panic': 'panic',
        'Disorder': 'disorder',
        'Depression': 'depression',
        'Burnout': 'burnout',
        'Stress': 'stress'
    }

    # Initialize a dictionary to store the results
    related_words = {}

    for key, word in words.items():
        word_synsets = wordnet.synsets(word)
        similar_words = set(chain(*[synset.lemma_names() for synset in word_synsets]))
        related_words[key] = list(similar_words)

    return related_words

# Get the related words and print the dictionary
similar_words_dict = find_related_words()
print(similar_words_dict)

{'Panic': ['panic', 'scare', 'terror', 'affright'], 'Disorder': ['distract', 'upset', 'trouble', 'disorderliness', 'cark', 'disarray', 'disorder', 'perturb', 'disquiet', 'unhinge'], 'Depression': ['impression', 'clinical_depression', 'slump', 'Great_Depression', 'Depression', 'low', 'depressive_disorder', 'natural_depression', 'imprint', 'economic_crisis', 'depression'], 'Burnout': [], 'Stress': ['strain', 'emphasis', 'emphasize', 'tension', 'punctuate', 'focus', 'accentuate', 'try', 'tenseness', 'emphasise', 'stress', 'accent']}


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ykim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Result for related words
# {'Panic': ['panic', 'scare', 'terror', 'affright'], 
# 'Disorder': ['distract', 'upset', 'trouble', 'disorderliness', 'cark', 'disarray', 'disorder', 'perturb', 'disquiet', 'unhinge'], 
# 'Depression': ['impression', 'clinical_depression', 'slump', 'Great_Depression', 'Depression', 'low', 'depressive_disorder', 'natural_depression', 'imprint', 'economic_crisis', 'depression'], 
# 'Burnout': [], 
# 'Stress': ['strain', 'emphasis', 'emphasize', 'tension', 'punctuate', 'focus', 'accentuate', 'try', 'tenseness', 'emphasise', 'stress', 'accent']}


# Self Care Advice Classification Report with Based with durations.

| Self Care Advice       | Precision | Recall | F1-Score | Support |
|------------------------|-----------|--------|----------|---------|
| Breathing Exercises    | 0.16      | 0.15   | 0.15     | 166     |
| Exercise               | 0.18      | 0.18   | 0.18     | 181     |
| Journaling             | 0.20      | 0.20   | 0.20     | 191     |
| Meditation             | 0.09      | 0.07   | 0.08     | 116     |
| Take Breaks            | 0.14      | 0.14   | 0.14     | 164     |
| Talk to a Friend       | 0.16      | 0.18   | 0.17     | 182     |
| **Accuracy**           |           |        | 0.16     | 1000    |
| **Macro Avg**          | 0.15      | 0.15   | 0.15     | 1000    |
| **Weighted Avg**       | 0.16      | 0.16   | 0.16     | 1000    |

# Suggested Therapy Classification Report

| Suggested Therapy               | Precision | Recall | F1-Score | Support |
|---------------------------------|-----------|--------|----------|---------|
| Cognitive Behavioral Therapy    | 0.20      | 0.23   | 0.21     | 200     |
| Mindfulness-Based Therapy       | 0.17      | 0.16   | 0.16     | 189     |
| No Therapy Needed               | 0.21      | 0.17   | 0.19     | 187     |
| Psychotherapy                   | 0.22      | 0.23   | 0.22     | 202     |
| Support Groups                  | 0.26      | 0.27   | 0.26     | 222     |
| **Accuracy**                    |           |        | 0.21     | 1000    |
| **Macro Avg**                   | 0.21      | 0.21   | 0.21     | 1000    |
| **Weighted Avg**


In [83]:
counsel_df = pd.read_csv("counselchat-data.csv")

from collections import Counter

print("Orignal length" + str(len(counsel_df)))
counsel_df = counsel_df[['questionText', 'topics','answerText']]
all_words = ' '.join(counsel_df['topics'].astype(str)).replace(',', '').split()

# Count the frequency of each word
word_count = Counter(all_words)

# ['Panic Disorder' 'Depression' 'Anxiety' 'Burnout' 'Stress']

import nltk
from nltk.corpus import wordnet

# Ensure you've downloaded the WordNet corpus

# Display the word frequencies
print("\nWord frequencies in the 'topics' column:")
found_selected_count = 0
stress_count = 0
depression_count = 0
disorder_count = 0
anxiety_count = 0
burn_out_count = 0
for word, count in word_count.items():
    
    if word.__contains__("Stress"):
        stress_count +=count
    elif word.__contains__("Depression"):
        depression_count += count
    elif word.__contains__("Disorder"):
        disorder_count += count
    elif word.__contains__("Anxiety"):
        anxiety_count += count
    elif word.__contains__("Burnout"):
        burn_out_count += count
    else:
        print(f"{word} :  {count}")
found_selected_count = stress_count + depression_count + disorder_count + anxiety_count + burn_out_count
print("total counts" + str(len(counsel_df)))
print("stress_count", stress_count)
print("depression_count", depression_count)
print("disorder_count", disorder_count)
print("anxiety_count", anxiety_count)
print("burn_out_count", burn_out_count)
print("Found selcted count", found_selected_count)



Orignal length1482

Word frequencies in the 'topics' column:
Family :  119
Conflict :  91
Substance :  14
AbuseAddiction :  9
Behavioral :  49
ChangeSocial :  3
Relationships :  214
Relationship :  45
Dissolution :  68
Anger :  32
Management :  25
Sleep :  10
Improvement :  17
Professional :  34
EthicsLegal :  6
& :  45
Regulatory :  20
Social :  41
RelationshipsMarriage :  11
MarriageIntimacy :  26
Domestic :  9
ViolenceAnger :  2
ManagementFamily :  3
Human :  50
Sexuality :  33
ManagementSleep :  2
Military :  3
Issues :  10
RelationshipsDomestic :  3
Violence :  10
ViolenceRelationship :  1
Marriage :  25
Grief :  20
and :  23
Loss :  9
ConflictChildren :  1
Adolescents :  9
MarriageRelationship :  4
TraumaHuman :  1
RelationshipsIntimacy :  42
ManagementParenting :  1
Intimacy :  17
Workplace :  9
SexualityMarriage :  3
LGBTQ :  29
SpiritualityFamily :  2
Ethics :  23
ViolenceRelationships :  1
ConflictRelationships :  5
Self-esteem :  29
Self-esteemRelationships :  12
Parenting :

In [101]:
# https://my.clevelandclinic.org/health/diseases/22295-mental-health-disorders
counsel_df = pd.read_csv("counselchat-data.csv")
target_Keywords = {
    'Anxiety Disorders': ['panic disorder', 'anxiety'],
    'Mood Disorders': ['depression', 'burnout'],
    'Stress-Related Disorders': ['stress', 'PTSD']
}

similar_words_dict = {
    'panic': ['panic', 'scare', 'terror', 'affright'],
    'disorder': ['distract', 'upset', 'trouble', 'disorderliness', 'cark', 'disarray', 'disorder', 'perturb', 'disquiet', 'unhinge'],
    'depression': ['impression', 'clinical_depression', 'slump', 'Great_Depression', 'depression', 'low', 'depressive_disorder', 'natural_depression', 'imprint', 'economic_crisis', 'depression'],
    'burnout': [],
    'stress': ['strain', 'emphasis', 'emphasize', 'tension', 'punctuate', 'focus', 'accentuate', 'try', 'tenseness', 'emphasise', 'stress', 'accent']
}

def group_diagnosis(row):
    all_words = str(row['topics']).lower().split()

    for disorder, keywords in target_Keywords.items():
        for keyword in keywords:
            if any(keyword in word for word in all_words):
                return disorder
            if keyword in similar_words_dict:
                # Check if any of the similar words match
                if any(sim_word in all_words for sim_word in similar_words_dict[keyword]):
                    return disorder
    return None
# Original 120.
print(similar_words_dict)
counsel_df["topics_extracted"] = counsel_df.apply(group_diagnosis, axis=1)
print(counsel_df['topics_extracted'])
counsel_df_without_topic = counsel_df[counsel_df["topics_extracted"].isna()]
print(counsel_df_without_topic['topics'].unique())


{'panic': ['panic', 'scare', 'terror', 'affright'], 'disorder': ['distract', 'upset', 'trouble', 'disorderliness', 'cark', 'disarray', 'disorder', 'perturb', 'disquiet', 'unhinge'], 'depression': ['impression', 'clinical_depression', 'slump', 'Great_Depression', 'depression', 'low', 'depressive_disorder', 'natural_depression', 'imprint', 'economic_crisis', 'depression'], 'burnout': [], 'stress': ['strain', 'emphasis', 'emphasize', 'tension', 'punctuate', 'focus', 'accentuate', 'try', 'tenseness', 'emphasise', 'stress', 'accent']}
0                    None
1                    None
2                    None
3                    None
4       Anxiety Disorders
              ...        
1477                 None
1478                 None
1479                 None
1480       Mood Disorders
1481    Anxiety Disorders
Name: topics_extracted, Length: 1482, dtype: object
['Family Conflict' 'Substance Abuse,Addiction'
 'Behavioral Change,Social Relationships' 'Relationship Dissolution '
 'Anger M