<a href="https://colab.research.google.com/github/NasimSadeghi98/basic-chatbot/blob/main/notebooks/mental_health_topic_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# SECTION 1: SETUP & INSTALLATION
# Run this section once per session

In [None]:
# Install required packages
!pip install bertopic sentence-transformers umap-learn hdbscan plotly scikit-learn -q

# Import libraries
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import os
from google.colab import drive
import plotly.express as px

print("✓ Libraries installed and imported successfully!")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.7 kB[0m [31m9.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h

  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.


✓ Libraries installed and imported successfully!


In [None]:
# SECTION 2: MOUNT GOOGLE DRIVE & SETUP DIRECTORIES
# Run this section once per session

In [None]:
# Attach Google Drive so that everything written below outlives the Colab session
drive.mount('/content/drive')

# Folder that receives every artifact this notebook writes (edit to relocate);
# it is created on first use and reused afterwards.
save_dir = '/content/drive/MyDrive/MentalHealthResearch'
os.makedirs(save_dir, exist_ok=True)

# Confirm both steps in a single call; the output is the same two lines
print(
    "✓ Google Drive mounted!",
    f"✓ Results will be saved to: {save_dir}",
    sep="\n",
)

Mounted at /content/drive
✓ Google Drive mounted!
✓ Results will be saved to: /content/drive/MyDrive/MentalHealthResearch


In [None]:
# SECTION 3: LOAD DATA
# Run this section once per session

In [None]:
# Location of the input CSV on the mounted Drive (adjust path if needed)
file_path = '/content/drive/MyDrive/mental_health_balanced (1).csv'

# Read the dataset. A missing file is reported with a hint and then re-raised so
# the run stops here: swallowing the error would leave `df` undefined (or, worse,
# silently reuse a stale `df` from an earlier execution) and every later cell
# would fail or produce results for the wrong data.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"❌ File not found at: {file_path}")
    print("Please update the file_path variable with the correct location")
    raise
else:
    print(f"✓ Dataset loaded successfully!")
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {list(df.columns)}")
    print(f"\n  First few rows:")
    print(df.head())

✓ Dataset loaded successfully!
  Shape: (13000, 2)
  Columns: ['condition', 'text']

  First few rows:
    condition                                               text
0        PTSD  I am so fucking sad I dont know why Im writing...
1      Stress  When I get stressed at work I just want to hid...
2  Loneliness  why do i have to be always alone ? Im tired of...
3        ADHD  I just showered for the first time in A MONTH!...
4     Anxiety  I got a 4.0 this semester in college! I dont e...


In [None]:
# SECTION 4: DATA EXPLORATION
# Run this to understand your data

In [None]:
# Headline numbers: row count, per-condition counts and null counts per column
banner = "=" * 60
print("\n" + banner)
print("DATASET OVERVIEW")
print(banner)
print(f"Total posts: {len(df)}")

condition_counts = df['condition'].value_counts()
print("\nCondition distribution:")
print(condition_counts)

null_counts = df.isnull().sum()
print("\nMissing values:")
print(null_counts)

# Character length of every post, stored on `df` as the `text_length` column
df['text_length'] = df['text'].str.len()
length_summary = df['text_length'].describe()
print("\nText length statistics:")
print(length_summary)


DATASET OVERVIEW
Total posts: 13000

Condition distribution:
condition
PTSD              1000
Stress            1000
Loneliness        1000
ADHD              1000
Anxiety           1000
Schizophrenia     1000
OCD               1000
Suicidal          1000
Depression        1000
Normal            1000
EatingDisorder    1000
Addiction         1000
Bipolar           1000
Name: count, dtype: int64

Missing values:
condition    0
text         0
dtype: int64

Text length statistics:
count    13000.000000
mean       785.630231
std       1083.629130
min         51.000000
25%        184.000000
50%        492.000000
75%        969.000000
max      21798.000000
Name: text_length, dtype: float64


In [None]:
# SECTION 5: DATA PREPROCESSING
# Run this to clean your data

In [None]:
# SECTION 5: DATA PREPROCESSING
# The cleaned frame is built in a single chain so that `df_clean` is an
# independent DataFrame. Assigning a column onto a boolean-filtered slice (as a
# separate statement) can trigger pandas' SettingWithCopyWarning and makes it
# unclear whether the write reaches the intended object.
df_clean = (
    # Drop rows missing either the post text or its label
    df.dropna(subset=['text', 'condition'])
    # Keep only posts longer than 20 characters
    .loc[lambda frame: frame['text'].str.len() > 20]
    # Protect ED abbreviation: rewrite the standalone word as one placeholder token.
    # NOTE(review): case=False also rewrites lowercase "ed" and the name "Ed" —
    # confirm that this is intended.
    .assign(
        text=lambda frame: frame['text'].str.replace(
            r'\bED\b', 'eating_disorder', regex=True, case=False
        )
    )
)

# Parallel lists of post texts and their condition labels
docs = df_clean['text'].tolist()
conditions = df_clean['condition'].tolist()

print(f"✓ Data cleaned: {len(docs)} posts")
df_clean.to_csv(f'{save_dir}/cleaned_data.csv', index=False)

✓ Data cleaned: 13000 posts


In [None]:
# SECTION 6: SETUP BERTOPIC WITH STOPWORD REMOVAL
# Run this to configure the model

In [None]:
# ============================================================================
# SECTION 6: SETUP BERTOPIC WITH COMPREHENSIVE STOPWORD REMOVAL
# FINAL VERSION - No more updates needed!
# ============================================================================

# FINAL COMPREHENSIVE stopword list
# Hand-curated words to exclude from topic keywords, grouped by category.
# Several words appear under more than one category (e.g. 'is', 'time', 'well');
# the duplicates are harmless because the list is lower-cased and de-duplicated
# right after this literal.
stopwords = [
    # Articles, conjunctions, prepositions
    'the', 'and', 'to', 'of', 'a', 'in', 'that', 'is', 'it', 'for', 'on',
    'with', 'as', 'was', 'at', 'by', 'an', 'be', 'this', 'are', 'from',
    'or', 'have', 'but', 'not', 'all', 'can', 'has', 'had',
    'were', 'there', 'been', 'if', 'more', 'when', 'will', 'would',
    'who', 'so', 'no', 'what', 'up', 'out', 'about', 'into', 'than',
    'some', 'could', 'time', 'only', 'other', 'may', 'way',
    'these', 'two', 'also', 'which', 'do', 'their',

    # Pronouns
    'i', 'you', 'he', 'she', 'we', 'they', 'it', 'me', 'him', 'her', 'them', 'us',
    'my', 'your', 'his', 'her', 'its', 'our', 'their', 'mine', 'yours', 'hers', 'ours', 'theirs',
    'myself', 'yourself', 'himself', 'herself', 'itself', 'ourselves', 'themselves',

    # Common verbs (expanded)
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing', 'done',
    'feel', 'feels', 'felt', 'feeling',
    'like', 'liked', 'liking',
    'know', 'knows', 'knew', 'knowing',
    'think', 'thinks', 'thought', 'thinking',
    'want', 'wants', 'wanted', 'wanting',
    'need', 'needs', 'needed', 'needing',
    'try', 'tries', 'tried', 'trying',
    'get', 'gets', 'got', 'getting',
    'go', 'goes', 'went', 'going',
    'make', 'makes', 'made', 'making',
    'see', 'sees', 'saw', 'seeing',
    'come', 'comes', 'came', 'coming',
    'tell', 'tells', 'told', 'telling',
    'ask', 'asks', 'asked', 'asking',
    'say', 'says', 'said', 'saying',
    'use', 'uses', 'used', 'using',
    'give', 'gives', 'gave', 'giving',
    'find', 'finds', 'found', 'finding',
    'take', 'takes', 'took', 'taking',
    'look', 'looks', 'looked', 'looking',
    'work', 'works', 'worked', 'working',
    'call', 'calls', 'called', 'calling',
    'seem', 'seems', 'seemed', 'seeming',
    'leave', 'leaves', 'left', 'leaving',
    'put', 'puts', 'putting',
    'mean', 'means', 'meant', 'meaning',
    'keep', 'keeps', 'kept', 'keeping',
    'let', 'lets', 'letting',
    'begin', 'begins', 'began', 'beginning',
    'help', 'helps', 'helped', 'helping',
    'show', 'shows', 'showed', 'showing',
    'hear', 'hears', 'heard', 'hearing',
    'play', 'plays', 'played', 'playing',
    'run', 'runs', 'ran', 'running',
    'move', 'moves', 'moved', 'moving',
    'start', 'starts', 'started', 'starting',
    'live', 'lives', 'lived', 'living',
    'believe', 'believes', 'believed', 'believing',
    'bring', 'brings', 'brought', 'bringing',
    'happen', 'happens', 'happened', 'happening',
    'write', 'writes', 'wrote', 'writing',
    'sit', 'sits', 'sat', 'sitting',
    'stand', 'stands', 'stood', 'standing',
    'lose', 'loses', 'lost', 'losing',
    'pay', 'pays', 'paid', 'paying',
    'meet', 'meets', 'met', 'meeting',
    'include', 'includes', 'included', 'including',
    'continue', 'continues', 'continued', 'continuing',
    'set', 'sets', 'setting',
    'learn', 'learns', 'learned', 'learning',
    'change', 'changes', 'changed', 'changing',
    'lead', 'leads', 'led', 'leading',
    'understand', 'understands', 'understood', 'understanding',
    'watch', 'watches', 'watched', 'watching',
    'follow', 'follows', 'followed', 'following',
    'stop', 'stops', 'stopped', 'stopping',
    'create', 'creates', 'created', 'creating',
    'speak', 'speaks', 'spoke', 'speaking',
    'read', 'reads', 'reading',
    'spend', 'spends', 'spent', 'spending',
    'grow', 'grows', 'grew', 'growing',
    'open', 'opens', 'opened', 'opening',
    'walk', 'walks', 'walked', 'walking',
    'win', 'wins', 'won', 'winning',
    'teach', 'teaches', 'taught', 'teaching',
    'offer', 'offers', 'offered', 'offering',
    'remember', 'remembers', 'remembered', 'remembering',
    'consider', 'considers', 'considered', 'considering',
    'appear', 'appears', 'appeared', 'appearing',
    'buy', 'buys', 'bought', 'buying',
    'wait', 'waits', 'waited', 'waiting',
    'serve', 'serves', 'served', 'serving',
    'die', 'dies', 'died', 'dying',
    'send', 'sends', 'sent', 'sending',
    'expect', 'expects', 'expected', 'expecting',
    'build', 'builds', 'built', 'building',
    'stay', 'stays', 'stayed', 'staying',
    'fall', 'falls', 'fell', 'falling',
    'cut', 'cuts', 'cutting',
    'reach', 'reaches', 'reached', 'reaching',
    'kill', 'kills', 'killed', 'killing',
    'raise', 'raises', 'raised', 'raising',
    'pass', 'passes', 'passed', 'passing',
    'sell', 'sells', 'sold', 'selling',
    'decide', 'decides', 'decided', 'deciding',
    'return', 'returns', 'returned', 'returning',
    'explain', 'explains', 'explained', 'explaining',
    'hope', 'hopes', 'hoped', 'hoping',
    'develop', 'develops', 'developed', 'developing',
    'carry', 'carries', 'carried', 'carrying',
    'break', 'breaks', 'broke', 'breaking',
    'receive', 'receives', 'received', 'receiving',
    'agree', 'agrees', 'agreed', 'agreeing',
    'support', 'supports', 'supported', 'supporting',
    'hit', 'hits', 'hitting',
    'produce', 'produces', 'produced', 'producing',
    'cover', 'covers', 'covered', 'covering',
    'catch', 'catches', 'caught', 'catching',
    'draw', 'draws', 'drew', 'drawing',

    # Person references
    'people', 'person', 'someone', 'anyone', 'everyone', 'nobody', 'somebody',
    'guy', 'girl', 'man', 'woman', 'boy', 'kid', 'kids',
    'mom', 'dad', 'mother', 'father', 'parent', 'parents',
    'son', 'daughter', 'brother', 'sister', 'family', 'friend', 'friends',

    # Common nouns (vague)
    'thing', 'things', 'stuff', 'something', 'anything', 'nothing', 'everything',
    'way', 'ways', 'part', 'parts', 'place', 'places',
    'case', 'cases', 'point', 'points', 'group', 'groups',
    'problem', 'problems', 'fact', 'facts', 'hand', 'hands',
    'eye', 'eyes', 'head', 'face', 'side', 'sides',
    'life', 'end', 'world', 'area', 'body', 'book', 'business',
    'child', 'children', 'company', 'country', 'course', 'door',

    # Quantifiers
    'one', 'two', 'three', 'four', 'five',
    'first', 'second', 'third', 'last',
    'lot', 'lots', 'many', 'much', 'few', 'several', 'bit', 'little',
    'more', 'most', 'less', 'least', 'enough',
    'all', 'both', 'each', 'every', 'another', 'other', 'others',
    'such', 'same', 'own', 'certain',

    # Time/temporal
    'now', 'then', 'today', 'yesterday', 'tomorrow',
    'day', 'days', 'week', 'weeks', 'month', 'months', 'year', 'years',
    'time', 'times', 'moment', 'moments',
    'always', 'never', 'sometimes', 'often', 'usually', 'rarely',
    'already', 'yet', 'still', 'again', 'once', 'ever',
    'soon', 'later', 'early', 'late', 'long', 'ago',
    'before', 'after', 'during', 'while', 'since', 'until',

    # Place/location
    'here', 'there', 'where', 'everywhere', 'anywhere', 'nowhere', 'somewhere',
    'home', 'house', 'room', 'place', 'area', 'world',
    'back', 'front', 'top', 'bottom', 'inside', 'outside',

    # Adjectives (vague/common)
    'good', 'bad', 'better', 'best', 'worse', 'worst',
    'great', 'small', 'large', 'big', 'little',
    'high', 'low', 'long', 'short', 'old', 'new', 'young',
    'right', 'wrong', 'different', 'same', 'sure',
    'important', 'possible', 'able', 'available', 'likely',
    'clear', 'certain', 'hard', 'easy', 'difficult',
    'early', 'late', 'far', 'near', 'close',
    'free', 'simple', 'general', 'special', 'particular',
    'whole', 'full', 'true', 'real', 'similar',

    # Adverbs/modifiers
    'very', 'really', 'quite', 'too', 'so', 'much', 'well',
    'also', 'just', 'even', 'still', 'only', 'rather',
    'actually', 'basically', 'literally', 'honestly', 'definitely',
    'probably', 'perhaps', 'maybe', 'possibly',
    'almost', 'nearly', 'hardly', 'barely',
    'suddenly', 'finally', 'quickly', 'slowly',
    'exactly', 'directly', 'especially', 'particularly',
    'usually', 'generally', 'normally', 'simply',
    'completely', 'totally', 'entirely', 'absolutely',
    'fairly', 'pretty', 'highly',
    'clearly', 'certainly', 'obviously',
    'together', 'alone', 'apart',

    # Question words
    'how', 'why', 'what', 'when', 'where', 'who', 'which', 'whose', 'whom',

    # Conjunctions/connectors
    'because', 'since', 'though', 'although', 'while',
    'however', 'therefore', 'thus', 'hence', 'otherwise',
    'moreover', 'furthermore', 'besides', 'instead',
    'else', 'either', 'neither', 'whether',

    # Modal/auxiliary
    'can', 'could', 'may', 'might', 'must',
    'should', 'would', 'will', 'shall',

    # Contractions
    'im', 'ive', 'id', 'ill', 'youre', 'youve', 'youd', 'youll',
    'hes', 'shes', 'its', 'weve', 'theyre', 'theyve', 'theyd', 'theyll',
    'dont', 'doesnt', 'didnt', 'wont', 'wouldnt', 'shouldnt',
    'cant', 'couldnt', 'mustnt', 'mightnt',
    'isnt', 'arent', 'wasnt', 'werent', 'hasnt', 'havent', 'hadnt',

    # Negations
    'not', 'no', 'none', 'nothing', 'nobody', 'nowhere', 'never', 'neither',

    # Prepositions
    'in', 'on', 'at', 'to', 'for', 'of', 'with', 'from', 'by',
    'about', 'into', 'through', 'over', 'under', 'above', 'below',
    'between', 'among', 'around', 'against', 'across', 'along',
    'behind', 'beyond', 'beside', 'near', 'off', 'onto', 'upon',
    'toward', 'towards', 'within', 'without', 'throughout',

    # Demonstratives
    'this', 'that', 'these', 'those',

    # Reddit-specific
    'reddit', 'post', 'posts', 'comment', 'comments', 'sub', 'subreddit',
    'edit', 'update', 'deleted', 'removed', 'op', 'tldr',
    'please', 'thank', 'thanks', 'sorry', 'excuse',

    # Informal/slang
    'gonna', 'wanna', 'gotta', 'kinda', 'sorta', 'cause', 'cuz',
    'yeah', 'yep', 'yup', 'nah', 'nope', 'ok', 'okay', 'alright',
    'hey', 'hi', 'hello', 'bye', 'wow', 'lol', 'omg', 'tbh',

    # Filler/discourse markers
    'well', 'oh', 'ah', 'um', 'uh', 'er', 'hmm',
    'anyway', 'anyways',

    # Word fragments from tokenization (IMPORTANT!)
    'ed', 'ing', 've', 'll', 're', 's', 'd', 't', 'm',
    'nt', 'ly', 'er', 'en', 'es', 'al', 'le', 'an', 'ar',

    # Additional common words
    'love', 'hate', 'talk', 'job', 'age', 'men', 'fuck', 'fucking',
]

# Remove duplicates (case-insensitively). Sorting gives the list a reproducible
# order across runs, which a bare set -> list conversion does not guarantee.
stopwords = sorted({word.lower() for word in stopwords})

print(f"✓ FINAL comprehensive stopword list created!")
print(f"  Total stopwords: {len(stopwords)}")

# Create vectorizer
vectorizer_model = CountVectorizer(
    stop_words=stopwords,
    min_df=5,  # ignore terms that occur in fewer than 5 documents
    ngram_range=(1, 2),  # unigrams and bigrams
    # Tokens start with a letter and are at least 3 characters long. Underscores
    # are accepted after the first letter so that the 'eating_disorder'
    # placeholder inserted during preprocessing survives as one token: with the
    # stricter r'\b[a-z]{3,}\b' no word boundary exists next to '_', so the
    # placeholder could never match and was silently dropped from the vocabulary.
    token_pattern=r'\b[a-z][a-z_]{2,}\b'
)

# Embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

print("✓ BERTopic components configured!")
print("✓ This is the FINAL version - no more stopword updates needed!")

✓ FINAL comprehensive stopword list created!
  Total stopwords: 877


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✓ BERTopic components configured!
✓ This is the FINAL version - no more stopword updates needed!


In [None]:
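# SECTION 7A: RUN UNSUPERVISED BERTOPIC (ENTIRE DATASET)
# This discovers topics across ALL mental health conditions
# NOTE(review): no code cell follows this header, yet Section 7B uses
# `topic_model_all` and the final summary lists `bertopic_model_all_conditions/`.
# Restore the cell that fits and saves that model; without it Section 7B fails
# with a NameError on a fresh kernel.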

In [None]:
# SECTION 7B: VISUALIZE OVERALL TOPICS

In [None]:
# Export the three overview figures of the all-conditions model as standalone
# HTML pages under save_dir, confirming each one on stdout.
topics_html = f'{save_dir}/topics_visualization_all.html'
documents_html = f'{save_dir}/documents_visualization_all.html'
barchart_html = f'{save_dir}/barchart_all.html'

# Topic overview figure
fig_topics = topic_model_all.visualize_topics()
fig_topics.write_html(topics_html)
print("✓ Topic visualization saved!")

# Document-level figure (called with sample=0.3 and no precomputed reduced embeddings)
fig_docs = topic_model_all.visualize_documents(
    docs,
    reduced_embeddings=None,
    sample=0.3,
)
fig_docs.write_html(documents_html)
print("✓ Document visualization saved!")

# Keyword barchart for the top 15 topics
fig_barchart = topic_model_all.visualize_barchart(top_n_topics=15)
fig_barchart.write_html(barchart_html)
print("✓ Barchart saved!")

✓ Topic visualization saved!
✓ Document visualization saved!
✓ Barchart saved!


In [None]:
# SECTION 8: RUN CONDITION-SPECIFIC BERTOPIC
# This finds topics within EACH mental health condition separately

In [None]:
# ============================================================================
# SECTION 8: RUN CONDITION-SPECIFIC BERTOPIC
# This finds topics within EACH mental health condition separately
# ============================================================================
# For every condition: fit a separate BERTopic model on that condition's posts,
# save the model and a keyword barchart under save_dir, and record the results.
# NOTE(review): the recorded outputs show different topic counts for the same
# condition between runs; presumably no random seed is fixed — confirm, and
# supply a seeded dimensionality-reduction model if reproducibility matters.

print("\n" + "="*60)
print("RUNNING BERTOPIC FOR EACH CONDITION")
print("="*60)

# Fitted model per condition, and per-condition result details
condition_models = {}
condition_results = {}

# Get unique conditions
unique_conditions = df_clean['condition'].unique()

for condition in unique_conditions:
    print(f"\n--- Processing: {condition} ---")

    # Posts belonging to this condition only
    condition_docs = df_clean.loc[df_clean['condition'] == condition, 'text'].tolist()

    if len(condition_docs) < 50:  # Skip if too few documents
        print(f"  Skipping {condition} - only {len(condition_docs)} posts")
        continue

    try:
        # A FRESH vectorizer per condition: a fitted vectorizer must not be
        # shared between models. min_df=2 is deliberately lenient.
        cv = CountVectorizer(
            stop_words=stopwords,
            min_df=2,
            ngram_range=(1, 2)
        )

        # Create and fit model for this condition
        topic_model_cond = BERTopic(
            embedding_model=embedding_model,
            vectorizer_model=cv,
            min_topic_size=15,
            nr_topics='auto',
            verbose=False
        )

        topics_cond, probs_cond = topic_model_cond.fit_transform(condition_docs)

        # Save model
        topic_model_cond.save(f'{save_dir}/bertopic_model_{condition}')

        # Count real topics only. Removing -1 from the set (instead of always
        # subtracting 1) stays correct when no document was marked as an outlier.
        n_topics = len(set(topics_cond) - {-1})

        # Store results
        condition_models[condition] = topic_model_cond
        condition_results[condition] = {
            'topics': topics_cond,
            'probs': probs_cond,
            'n_topics': n_topics,
            'docs': condition_docs
        }

        print(f"  ✓ {condition}: {n_topics} topics found")

        # Save visualization. This is best-effort: a failure here must not
        # discard the fitted model, but it is reported instead of being hidden.
        try:
            fig = topic_model_cond.visualize_barchart(top_n_topics=10)
            fig.write_html(f'{save_dir}/barchart_{condition}.html')
        except Exception as viz_error:
            print(f"  ⚠️ Could not save barchart for {condition}: {viz_error}")

    except Exception as e:
        # Keep going with the remaining conditions; the failure is reported
        print(f"  X Error in {condition}: {e}")
        continue

print(f"\n✓ Processing complete!")
print(f"Successfully processed: {len(condition_models)} out of {len(unique_conditions)} conditions")

# Save summary
if condition_models:
    with open(f'{save_dir}/condition_models_summary.txt', 'w') as f:
        for cond, results in condition_results.items():
            f.write(f"{cond}: {results['n_topics']} topics\n")
    print("✓ Summary saved!")
else:
    print("❌ No conditions were successfully processed")


RUNNING BERTOPIC FOR EACH CONDITION

--- Processing: PTSD ---




  ✓ PTSD: 2 topics found

--- Processing: Stress ---




  ✓ Stress: 2 topics found

--- Processing: Loneliness ---




  ✓ Loneliness: 10 topics found

--- Processing: ADHD ---




  ✓ ADHD: 19 topics found

--- Processing: Anxiety ---




  ✓ Anxiety: 22 topics found

--- Processing: Schizophrenia ---




  ✓ Schizophrenia: 14 topics found

--- Processing: OCD ---




  ✓ OCD: 27 topics found

--- Processing: Suicidal ---




  ✓ Suicidal: 19 topics found

--- Processing: Depression ---




  ✓ Depression: 13 topics found

--- Processing: Normal ---




  ✓ Normal: 16 topics found

--- Processing: EatingDisorder ---




  ✓ EatingDisorder: 4 topics found

--- Processing: Addiction ---




  ✓ Addiction: 18 topics found

--- Processing: Bipolar ---




  ✓ Bipolar: 17 topics found

✓ Processing complete!
Successfully processed: 13 out of 13 conditions
✓ Summary saved!


In [None]:
# SECTION 8B: VISUALIZE CONDITION-SPECIFIC TOPICS

In [None]:
# ============================================================================
# CHECK CURRENT SECTION 8 RESULTS (Run immediately after Section 8)
# ============================================================================
# Prints, for every fitted condition model, the number of topics, the top 8
# keywords and post count of each topic, and the size of the outlier topic (-1).

print("="*60)
print("CHECKING CURRENT CONDITION MODELS")
print("="*60)

for condition, model in condition_models.items():
    rule = '=' * 40
    print(f"\n{rule}")
    print(f"{condition.upper()}")
    print(f"{rule}")

    topic_info = model.get_topic_info()
    is_outlier = topic_info['Topic'] == -1
    real_topics = topic_info[~is_outlier]
    print(f"Number of topics: {len(real_topics)}")

    # One entry per real topic, in the order of the topic-info table
    for topic_id, count in zip(real_topics['Topic'], real_topics['Count']):
        top_terms = [word for word, score in model.get_topic(topic_id)[:8]]
        keyword_str = ', '.join(top_terms)
        print(f"\n  Topic {topic_id} ({count} posts):")
        print(f"    Keywords: {keyword_str}")

    # Report the outlier topic only when the model produced one
    outliers = topic_info[is_outlier]
    if len(outliers) > 0:
        print(f"\n  Outliers: {outliers['Count'].iloc[0]} posts")

print("\n" + "="*60)

CHECKING CURRENT CONDITION MODELS

PTSD
Number of topics: 13

  Topic 0 (141 posts):
    Keywords: sexual, sex, raped, abuse, rape, assault, trauma, any

  Topic 1 (115 posts):
    Keywords: ptsd, trauma, traumatic, annoying, war, joke, talking, word

  Topic 2 (105 posts):
    Keywords: ptsd, down, panic, diagnosis, therapist, attack, symptoms, office

  Topic 3 (63 posts):
    Keywords: miss, personality, anybody, becoming, relate, loved, turned, july

  Topic 4 (59 posts):
    Keywords: victim, trauma, survivor, stronger, abusers, survivors, victims, abuse

  Topic 5 (56 posts):
    Keywords: trauma, triggered, triggers, streets, trigger, memory, occurred, theres

  Topic 6 (51 posts):
    Keywords: fireworks, professionals, medical professionals, flee, medical, reason, survival, environment

  Topic 7 (36 posts):
    Keywords: memory, childhood, memory loss, memories, loss, ptsd, forget, recall

  Topic 8 (29 posts):
    Keywords: sharp, stupid, mentally, ptsd, relate, functioning,

In [None]:


import plotly.graph_objects as go
from plotly.subplots import make_subplots

print("="*60)
print("CREATING CONDITION-SPECIFIC VISUALIZATIONS")
print("="*60)

# For each fitted condition model, write a keyword barchart (at most 10 topics)
# to save_dir and display it inline; errors are reported per condition.
for condition, model in condition_models.items():
    print(f"\nGenerating visualization for {condition}...")

    try:
        topic_info = model.get_topic_info()
        # Number of real topics, i.e. everything except the outlier topic -1
        n_topics = int((topic_info['Topic'] != -1).sum())

        if n_topics <= 0:
            print(f"  ⚠️ {condition}: No topics to visualize")
            continue

        shown_topics = min(10, n_topics)
        fig = model.visualize_barchart(top_n_topics=shown_topics)
        fig.write_html(f'{save_dir}/barchart_{condition}_UPDATED.html')
        fig.show()
        print(f"  ✓ {condition}: {n_topics} topics visualized")
    except Exception as e:
        print(f"  ✗ Error with {condition}: {e}")

print("\n✓ All condition visualizations saved!")

CREATING CONDITION-SPECIFIC VISUALIZATIONS

Generating visualization for PTSD...


  ✓ PTSD: 13 topics visualized

Generating visualization for Stress...


  ✓ Stress: 3 topics visualized

Generating visualization for Loneliness...


  ✓ Loneliness: 16 topics visualized

Generating visualization for ADHD...


  ✓ ADHD: 19 topics visualized

Generating visualization for Anxiety...


  ✓ Anxiety: 21 topics visualized

Generating visualization for Schizophrenia...


  ✓ Schizophrenia: 12 topics visualized

Generating visualization for OCD...


  ✓ OCD: 29 topics visualized

Generating visualization for Suicidal...


  ✓ Suicidal: 19 topics visualized

Generating visualization for Depression...


  ✓ Depression: 17 topics visualized

Generating visualization for Normal...


  ✓ Normal: 6 topics visualized

Generating visualization for EatingDisorder...


  ✓ EatingDisorder: 16 topics visualized

Generating visualization for Addiction...


  ✓ Addiction: 15 topics visualized

Generating visualization for Bipolar...


  ✓ Bipolar: 18 topics visualized

✓ All condition visualizations saved!


In [None]:
# Inspect the PTSD model: print its topic table, then the top 5
# (word, score) pairs of every topic except the outlier topic -1.
ptsd_model = condition_models['PTSD']
ptsd_info = ptsd_model.get_topic_info()

print("PTSD Topics (Current):")
print(ptsd_info)

print("\nPTSD Topic Keywords:")
non_outlier_ids = [tid for tid in ptsd_info['Topic'] if tid != -1]
for topic_id in non_outlier_ids:
    keywords = ptsd_model.get_topic(topic_id)[:5]
    print(f"Topic {topic_id}: {keywords}")

PTSD Topics (Current):
    Topic  Count                                               Name  \
0      -1    272                         -1_trauma_ptsd_husband_any   
1       0    141                           0_sexual_sex_raped_abuse   
2       1    115                   1_ptsd_trauma_traumatic_annoying   
3       2    105                        2_ptsd_down_panic_diagnosis   
4       3     63                3_miss_personality_anybody_becoming   
5       4     59                  4_victim_trauma_survivor_stronger   
6       5     56                5_trauma_triggered_triggers_streets   
7       6     51  6_fireworks_professionals_medical professional...   
8       7     36            7_memory_childhood_memory loss_memories   
9       8     29                       8_sharp_stupid_mentally_ptsd   
10      9     21                    9_suicide_policy_rptsd_suicidal   
11     10     18                10_dead_woke_flashbacks_grandmother   
12     11     17                       11_wish_survive

In [None]:
# SECTION 9: COMPARE TOPICS ACROSS CONDITIONS

In [None]:
print("\n" + "="*60)
print("TOPIC COMPARISON ACROSS CONDITIONS")
print("="*60)

# Build one row per (condition, topic) with the topic's size and top 5 words,
# save the table as CSV, and print the first three topics of each condition.
comparison_columns = ['Condition', 'Topic_ID', 'Count', 'Top_Words']
comparison_data = []

for condition in condition_results:
    model = condition_models[condition]
    topic_info = model.get_topic_info()

    for topic_id, count in zip(topic_info['Topic'], topic_info['Count']):
        if topic_id == -1:  # Skip outlier topic
            continue
        # BERTopic pads topics that have fewer keywords with empty strings;
        # dropping them avoids dangling ", " separators in the exported text.
        top_words = [word for word, _ in model.get_topic(topic_id)[:5] if word]
        comparison_data.append({
            'Condition': condition,
            'Topic_ID': topic_id,
            'Count': count,
            'Top_Words': ', '.join(top_words)
        })

# Explicit columns keep the frame well-formed (and the groupby below valid)
# even when no topics were collected at all.
comparison_df = pd.DataFrame(comparison_data, columns=comparison_columns)
comparison_df.to_csv(f'{save_dir}/topic_comparison.csv', index=False)

print("\n✓ Topic comparison saved!")
print(f"\nSample of topics by condition:")
print(comparison_df.groupby('Condition').head(3))


TOPIC COMPARISON ACROSS CONDITIONS

✓ Topic comparison saved!

Sample of topics by condition:
          Condition  Topic_ID  Count  \
0              PTSD         0    141   
1              PTSD         1    115   
2              PTSD         2    105   
13           Stress         0    940   
14           Stress         1     23   
15           Stress         2     19   
16       Loneliness         0    187   
17       Loneliness         1     39   
18       Loneliness         2     38   
32             ADHD         0    147   
33             ADHD         1     55   
34             ADHD         2     52   
51          Anxiety         0     61   
52          Anxiety         1     58   
53          Anxiety         2     47   
72    Schizophrenia         0    478   
73    Schizophrenia         1    136   
74    Schizophrenia         2     40   
84              OCD         0     62   
85              OCD         1     59   
86              OCD         2     55   
113        Suicidal      

In [None]:
# SECTION 10: EXTRACT TOP KEYWORDS PER CONDITION

In [None]:
print("\n" + "="*60)
print("TOP KEYWORDS BY CONDITION")
print("="*60)

# For each condition, pool the top 10 words of every real topic and count how
# many topics each word appears in; keep the 20 most frequent words.
keywords_by_condition = {}

for condition, model in condition_models.items():
    all_keywords = []
    topics = [t for t in model.get_topics().keys() if t != -1]

    for topic_id in topics:
        topic_words = model.get_topic(topic_id)
        # BERTopic pads short topics with empty-string words; skip them so the
        # empty string is never counted as a keyword.
        all_keywords.extend([word for word, score in topic_words[:10] if word])

    # Count frequency
    keyword_counts = pd.Series(all_keywords).value_counts().head(20)
    keywords_by_condition[condition] = keyword_counts

    print(f"\n{condition} - Top 10 keywords:")
    print(keyword_counts.head(10))

# Save keywords. The encoding is explicit because keywords come from
# user-written posts and may contain non-ASCII characters.
with open(f'{save_dir}/top_keywords_by_condition.txt', 'w', encoding='utf-8') as f:
    for condition, keywords in keywords_by_condition.items():
        f.write(f"\n{'='*60}\n")
        f.write(f"{condition}\n")
        f.write(f"{'='*60}\n")
        for word, count in keywords.items():  # already limited to 20 entries
            f.write(f"{word}: {count}\n")

print("\n✓ Keywords extracted and saved!")


TOP KEYWORDS BY CONDITION

PTSD - Top 10 keywords:
trauma        5
ptsd          4
abuse         3
any           2
survive       2
memory        2
relate        2
flashbacks    2
raped         1
sex           1
Name: count, dtype: int64

Stress - Top 10 keywords:
stress        2
sleep         2
health        2
relaxation    2
any           1
stressed      1
down          1
mind          1
stressful     1
anxiety       1
Name: count, dtype: int64

Loneliness - Top 10 keywords:
lonely          5
loneliness      4
social          3
any             3
care            3
sad             3
happy           3
relationship    3
hug             2
excited         2
Name: count, dtype: int64

ADHD - Top 10 keywords:
adhd         7
diagnosed    3
thoughts     3
brain        3
school       3
talking      3
thats        2
adderall     2
phone        2
fun          2
Name: count, dtype: int64

Anxiety - Top 10 keywords:
anxiety    7
anxious    3
panic      3
phone      2
fear       2
away       2
drivi

In [None]:
# FINAL SUMMARY

In [None]:
# Closing banner: where the results were saved and which artifacts to expect.
closing_lines = [
    "\n" + "="*60,
    "ANALYSIS COMPLETE!",
    "="*60,
    f"\nAll results saved to: {save_dir}",
    "\nFiles created:",
    "  1. cleaned_data.csv - Your cleaned dataset",
    "  2. bertopic_model_all_conditions/ - Overall topic model",
    "  3. bertopic_model_[condition]/ - Individual condition models",
    "  4. topic_comparison.csv - Topics across all conditions",
    "  5. top_keywords_by_condition.txt - Key terms per condition",
    "  6. Various .html visualization files",
    "\n✓ You can now close this notebook and come back anytime!",
    "  Just re-run Sections 1-2 to reconnect, then load saved models.",
]

# One print call per entry keeps the output identical line for line
for line in closing_lines:
    print(line)


ANALYSIS COMPLETE!

All results saved to: /content/drive/MyDrive/MentalHealthResearch

Files created:
  1. cleaned_data.csv - Your cleaned dataset
  2. bertopic_model_all_conditions/ - Overall topic model
  3. bertopic_model_[condition]/ - Individual condition models
  4. topic_comparison.csv - Topics across all conditions
  5. top_keywords_by_condition.txt - Key terms per condition
  6. Various .html visualization files

✓ You can now close this notebook and come back anytime!
  Just re-run Sections 1-2 to reconnect, then load saved models.


In [None]:
# VISUALIZATIONS FOR CONDITION-SPECIFIC TOPICS (Sections 8 & 9)
# Create charts to understand your topic modeling results

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

In [None]:
# VISUALIZATION 1: Number of Topics per Condition
# Shows topic diversity across mental health conditions

In [None]:
print("Creating Visualization 1: Topic Diversity by Condition...")

# One record per condition: how many topics its model reported ('n_topics').
topic_counts = [
    {'Condition': condition_name, 'Number_of_Topics': condition_result['n_topics']}
    for condition_name, condition_result in condition_results.items()
]

# Sorted ascending; this frame is reused by Visualization 2.
df_topic_counts = pd.DataFrame(topic_counts).sort_values(
    'Number_of_Topics', ascending=True
)

# Horizontal bars, coloured and labelled by the topic count itself.
axis_labels = {
    'Number_of_Topics': 'Number of Topics',
    'Condition': 'Mental Health Condition',
}
fig1 = px.bar(
    df_topic_counts,
    x='Number_of_Topics',
    y='Condition',
    orientation='h',
    title='Topic Diversity by Mental Health Condition',
    labels=axis_labels,
    color='Number_of_Topics',
    color_continuous_scale='Viridis',
    text='Number_of_Topics',
)

# Put the count labels outside the bars.
fig1.update_traces(textposition='outside')
fig1.update_layout(
    font=dict(size=12),
    height=600,
    showlegend=False,
    xaxis_title="Number of Distinct Topics",
    yaxis_title="",
)

# Persist an interactive copy next to the other results, then render inline.
fig1.write_html(f'{save_dir}/viz_topic_diversity.html')
fig1.show()
print("✓ Saved: viz_topic_diversity.html")

Creating Visualization 1: Topic Diversity by Condition...


✓ Saved: viz_topic_diversity.html


In [None]:
# VISUALIZATION 2: Topic Size Distribution
# Shows how posts are distributed across topics for each condition

In [None]:
print("\nCreating Visualization 2: Topic Size Distribution...")

# The (up to) six conditions with the most topics each get one panel.
top_conditions = (
    df_topic_counts
    .nlargest(6, 'Number_of_Topics')['Condition']
    .tolist()
)

fig2 = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=top_conditions,
    vertical_spacing=0.12,
    horizontal_spacing=0.1,
)

for panel_index, condition in enumerate(top_conditions):
    # Fill the 2x3 grid left-to-right, top-to-bottom; plotly grid positions are 1-based.
    grid_row, grid_col = divmod(panel_index, 3)

    topic_info = condition_models[condition].get_topic_info()
    clustered_topics = topic_info[topic_info['Topic'] != -1]  # drop the outlier bucket (-1)

    size_bars = go.Bar(
        x=clustered_topics['Topic'],
        y=clustered_topics['Count'],
        name=condition,
        marker_color='steelblue',
        showlegend=False,
    )
    fig2.add_trace(size_bars, row=grid_row + 1, col=grid_col + 1)

fig2.update_layout(
    title_text="Topic Size Distribution for Most Diverse Conditions",
    height=800,
    showlegend=False,
)
# Same axis titles on every panel.
fig2.update_xaxes(title_text="Topic ID")
fig2.update_yaxes(title_text="Number of Posts")

fig2.write_html(f'{save_dir}/viz_topic_size_distribution.html')
fig2.show()
print("✓ Saved: viz_topic_size_distribution.html")



Creating Visualization 2: Topic Size Distribution...


✓ Saved: viz_topic_size_distribution.html


In [None]:
# VISUALIZATION 3: Keyword Frequency Heatmap
# Compare top keywords across conditions

In [None]:
print("\nCreating Visualization 3: Keyword Comparison Heatmap...")


def _split_keywords(words_cell):
    """Split one 'Top_Words' cell ("a, b, c") into a list of exact keywords.

    Non-string cells (e.g. NaN read back from the CSV) yield an empty list
    instead of raising.
    """
    if not isinstance(words_cell, str):
        return []
    return [kw.strip() for kw in words_cell.split(',') if kw.strip()]


# Load the comparison data (one row per condition/topic with a 'Top_Words' string)
comparison_df = pd.read_csv(f'{save_dir}/topic_comparison.csv')
conditions_list = comparison_df['Condition'].unique()

# Parse every row once so all counting below works on whole keywords.
# Substring tests on the raw string would count "any" inside "anybody" and
# "anxiety" inside "anxiety physical", inflating the heatmap.
row_conditions = comparison_df['Condition'].tolist()
row_keywords = [_split_keywords(cell) for cell in comparison_df['Top_Words']]

# Number of topic rows in which each keyword appears (exact match)
word_counts = {}
for keywords in row_keywords:
    for word in set(keywords):
        word_counts[word] = word_counts.get(word, 0) + 1
all_words = set(word_counts)

# Top 20 keywords; ties are broken alphabetically so the column order is
# reproducible between runs (set iteration order is not).
ranked_words = sorted(word_counts.items(), key=lambda item: (-item[1], item[0]))
top_words = [word for word, _ in ranked_words[:20]]

# Matrix: rows = conditions, columns = top keywords,
# cell = number of times the keyword is listed among that condition's topics
matrix_data = []
for condition in conditions_list:
    cond_keywords = [
        kw
        for cond, keywords in zip(row_conditions, row_keywords)
        if cond == condition
        for kw in keywords
    ]
    matrix_data.append([cond_keywords.count(word) for word in top_words])

# Create heatmap
fig3 = go.Figure(data=go.Heatmap(
    z=matrix_data,
    x=top_words,
    y=conditions_list,
    colorscale='YlOrRd',
    text=matrix_data,
    texttemplate='%{text}',
    textfont={"size": 10},
    colorbar=dict(title="Frequency")
))

fig3.update_layout(
    title='Top Keywords Across Mental Health Conditions',
    xaxis_title='Keywords',
    yaxis_title='Condition',
    height=700,
    width=1200
)

fig3.write_html(f'{save_dir}/viz_keyword_heatmap.html')
fig3.show()
print(f"✓ Saved: viz_keyword_heatmap.html")


Creating Visualization 3: Keyword Comparison Heatmap...


✓ Saved: viz_keyword_heatmap.html


In [None]:
# VISUALIZATION 4: Topic Distribution Pie Charts
# Show topic proportions for selected conditions

In [None]:
print("\nCreating Visualization 4: Topic Distribution Pie Charts...")

# Four conditions to compare side by side; keep only those that have a model.
selected_conditions = ['Depression', 'Anxiety', 'ADHD', 'PTSD']
available = [name for name in selected_conditions if name in condition_models]

if len(available) < 4:
    # The 2x2 layout below needs all four conditions.
    print("⚠️ Not enough conditions available for pie charts")
else:
    pie_conditions = available[:4]

    fig4 = make_subplots(
        rows=2,
        cols=2,
        specs=[[{'type': 'pie'} for _ in range(2)] for _ in range(2)],
        subplot_titles=pie_conditions,
    )

    for idx, condition in enumerate(pie_conditions):
        info = condition_models[condition].get_topic_info()
        # First 8 non-outlier rows of the topic table.
        leading_topics = info[info['Topic'] != -1].head(8)

        # Row-major placement in the 2x2 grid (1-based positions).
        grid_row, grid_col = divmod(idx, 2)
        pie = go.Pie(
            labels=[f"Topic {topic_id}" for topic_id in leading_topics['Topic']],
            values=leading_topics['Count'].tolist(),
            name=condition,
        )
        fig4.add_trace(pie, row=grid_row + 1, col=grid_col + 1)

    fig4.update_layout(
        title_text="Topic Distribution Comparison (Top Topics Only)",
        height=800,
        showlegend=True,
    )

    fig4.write_html(f'{save_dir}/viz_topic_distribution_pies.html')
    fig4.show()
    print("✓ Saved: viz_topic_distribution_pies.html")


Creating Visualization 4: Topic Distribution Pie Charts...


✓ Saved: viz_topic_distribution_pies.html


In [None]:
# VISUALIZATION 5: Outlier Analysis
# Compare outlier rates across conditions

In [None]:
print("\nCreating Visualization 5: Outlier Analysis...")


def _outlier_record(condition_name, assigned_topics):
    """Build one summary row: share of posts whose assigned topic is -1 (outlier)."""
    n_posts = len(assigned_topics)
    n_outliers = sum(1 for topic_id in assigned_topics if topic_id == -1)
    return {
        'Condition': condition_name,
        'Outlier_Percentage': (n_outliers / n_posts) * 100,
        'Outliers': n_outliers,
        'Total': n_posts,
    }


# One row per condition, built from its per-post topic assignments.
outlier_data = [
    _outlier_record(condition, results['topics'])
    for condition, results in condition_results.items()
]

# Highest outlier rate first.
df_outliers = pd.DataFrame(outlier_data).sort_values(
    'Outlier_Percentage', ascending=False
)

fig5 = px.bar(
    df_outliers,
    x='Condition',
    y='Outlier_Percentage',
    title="Outlier Rate by Condition (Posts That Don't Fit Any Topic)",
    labels={
        'Outlier_Percentage': 'Outlier Percentage (%)',
        'Condition': 'Mental Health Condition',
    },
    color='Outlier_Percentage',
    color_continuous_scale='Reds',
    text='Outlier_Percentage',
)

# Show each bar's value with one decimal place, above the bar.
fig5.update_traces(textposition='outside', texttemplate='%{text:.1f}%')
fig5.update_layout(height=600, showlegend=False, xaxis_tickangle=-45)

fig5.write_html(f'{save_dir}/viz_outlier_analysis.html')
fig5.show()
print("✓ Saved: viz_outlier_analysis.html")


Creating Visualization 5: Outlier Analysis...


✓ Saved: viz_outlier_analysis.html


In [None]:
# VISUALIZATION 6: Top Keywords Summary Table

In [None]:
print("\nCreating Visualization 6: Top Keywords Summary...")

# One summary row per condition: topic count plus its most recurrent leading keywords.
summary_data = []
for condition in condition_results.keys():
    model = condition_models[condition]

    # Pool the first 3 keywords of every non-outlier topic.
    all_words = [
        word
        for topic_id in model.get_topics()
        if topic_id != -1
        for word, _score in model.get_topic(topic_id)[:3]
    ]

    # Keep the 5 words that occur most often in that pool.
    word_freq = pd.Series(all_words).value_counts().head(5)

    summary_data.append({
        'Condition': condition,
        'Topics': condition_results[condition]['n_topics'],
        'Top_Keywords': ', '.join(word_freq.index.tolist()),
    })

# Conditions with the most topics come first.
df_summary = pd.DataFrame(summary_data).sort_values('Topics', ascending=False)

# Render the summary as a plotly table.
table_header = dict(
    values=['<b>Condition</b>', '<b># Topics</b>', '<b>Characteristic Keywords</b>'],
    fill_color='steelblue',
    align='left',
    font=dict(color='white', size=14),
)
table_cells = dict(
    values=[
        df_summary['Condition'],
        df_summary['Topics'],
        df_summary['Top_Keywords'],
    ],
    fill_color='lavender',
    align='left',
    font=dict(size=12),
    height=30,
)
fig6 = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])

fig6.update_layout(
    title='Summary: Topics and Keywords by Condition',
    height=600,
    width=1000,
)

fig6.write_html(f'{save_dir}/viz_keywords_summary_table.html')
fig6.show()
print("✓ Saved: viz_keywords_summary_table.html")


Creating Visualization 6: Top Keywords Summary...


✓ Saved: viz_keywords_summary_table.html


In [None]:
# SUMMARY

In [None]:
# Closing banner for the visualization section: lists the HTML files the
# six visualization cells write into `save_dir`.
viz_rule = "=" * 70
print("\n" + viz_rule, "VISUALIZATION COMPLETE!", viz_rule, sep="\n")

# `save_dir` comes from Section 2; this line fails with NameError if that
# section has not been run in the current session.
print(f"\nAll visualizations saved to: {save_dir}")

viz_outputs = (
    "  1. viz_topic_diversity.html - Compare number of topics per condition",
    "  2. viz_topic_size_distribution.html - Topic sizes for diverse conditions",
    "  3. viz_keyword_heatmap.html - Keyword frequency across conditions",
    "  4. viz_topic_distribution_pies.html - Topic proportions comparison",
    "  5. viz_outlier_analysis.html - Outlier rates by condition",
    "  6. viz_keywords_summary_table.html - Summary table of all findings",
)
print("\nFiles created:", *viz_outputs, sep="\n")

print(
    "\n✓ Open these HTML files in your browser to explore interactively!",
    viz_rule,
    sep="\n",
)


VISUALIZATION COMPLETE!


NameError: name 'save_dir' is not defined

In [None]:
# ============================================================================
# GET ACTUAL KEYWORDS AND SAMPLE POSTS FOR ALL CONDITIONS
# ============================================================================

import pandas as pd

# Collect, for every condition, its three largest non-outlier topics with
# their keywords and one sample post; echo them and save them as a CSV.
all_findings = []

for condition, model in condition_models.items():
    section_rule = '=' * 70
    print(f"\n{section_rule}")
    print(condition.upper())
    print(section_rule)

    topic_info = model.get_topic_info()

    # First three rows of the topic table once the outlier bucket (-1) is removed
    main_topics = topic_info[topic_info['Topic'] != -1].head(3)

    for topic_id, count in zip(main_topics['Topic'], main_topics['Count']):
        # First 8 keywords of the topic, scores discarded
        keyword_list = [word for word, _score in model.get_topic(topic_id)[:8]]

        print(f"\nTopic {topic_id} ({count} posts):")
        print(f"  Keywords: {', '.join(keyword_list)}")

        # First representative document, truncated to 200 characters ("" if none)
        rep_docs = model.get_representative_docs(topic_id)
        sample = rep_docs[0][:200] if rep_docs else ""
        if rep_docs:
            print(f"  Example: {sample}...")

        all_findings.append({
            'Condition': condition,
            'Topic': topic_id,
            'Count': count,
            'Keywords': keyword_list,
            'Sample': sample,
        })

# Persist everything gathered above for later analysis
findings_df = pd.DataFrame(all_findings)
findings_df.to_csv(f'{save_dir}/all_condition_topics.csv', index=False)
print("\n✓ Saved all findings to all_condition_topics.csv")


PTSD

Topic 0 (928 posts):
  Keywords: ptsd, trauma, any, down, experience, away, thats, night
  Example: Is it possible to forget a trauma experience but have PTSD? Just as the title says. To preface, I AM diagnosed with PTSD and do have a set of trauma experiences I can recall vividly and experience PTS...

Topic 1 (51 posts):
  Keywords: fireworks, professionals, medical professionals, medical, flee, reason, survival, environment
  Example: ALL medical professionals should be required extensive training on PTSD Just a rant. Its long. Malware is software written specifically to harm and infect the host system. PTSD is the malware infectin...

Topic 2 (21 posts):
  Keywords: suicide, policy, rptsd, suicidal, crisis, worried, negative, serious
  Example: rptsd suicide  serious selfharm policy Hey rptsd community, I hope youre doing at least okay and that you had a good new year, be it with friends, with family, if youre isolating or just having me tim...

STRESS

Topic 0 (969 posts):


In [None]:
# ============================================================================
# SEARCH FOR SPECIFIC THEMES ACROSS CONDITIONS
# ============================================================================

# Keyword vocabularies for the three themes searched across the
# per-condition topic models below.

# Theme: sleep
sleep_keywords = [
    'sleep', 'sleeping', 'insomnia',
    'nightmare', 'nightmares',
    'dream', 'dreams',
    'tired', 'exhausted', 'rest',
]

# Theme: work
work_keywords = [
    'work', 'working',
    'job', 'jobs', 'career', 'boss',
    'office', 'deadline', 'project', 'workplace',
]

# Theme: relationships
relationship_keywords = [
    'relationship', 'relationships',
    'partner', 'boyfriend', 'girlfriend', 'spouse',
    'marriage', 'dating', 'breakup',
    'friends', 'friendship', 'family',
    'alone', 'lonely',
]

def find_theme_across_conditions(theme_name, keywords_to_find):
    """Print every topic, per condition, whose keywords include a theme word.

    Walks the module-level ``condition_models`` mapping (condition name ->
    topic model; presumably fitted BERTopic models). For each topic other
    than the outlier id -1, the topic's keywords are compared with
    ``keywords_to_find`` by exact membership, so 'sleep' does not match
    'sleeping'. Conditions without any matching topic print nothing.

    Args:
        theme_name: Label shown in the banner line.
        keywords_to_find: Iterable of keyword strings to look for; matches
            are reported in this iterable's order.

    Returns:
        None. Results are only printed.
    """
    rule = '=' * 70
    print(f"\n{rule}")
    print(f"SEARCHING FOR '{theme_name}' ACROSS CONDITIONS")
    print(rule)

    for condition, model in condition_models.items():
        topic_table = model.get_topic_info()

        # (topic id, matched theme words, first 8 topic keywords) per hit
        hits = []
        for topic_id in topic_table['Topic']:
            if topic_id == -1:
                continue  # outlier bucket

            topic_keywords = [word for word, _score in model.get_topic(topic_id)]
            matched = [kw for kw in keywords_to_find if kw in topic_keywords]
            if matched:
                # Matching uses all keywords; only the first 8 are displayed.
                hits.append((topic_id, matched, topic_keywords[:8]))

        if not hits:
            continue

        print(f"\n{condition}:")
        for topic_id, matched, shown_keywords in hits:
            print(f"  Topic {topic_id}: {', '.join(shown_keywords)}")
            print(f"    → Contains: {', '.join(matched)}")

# Run the three theme searches in a fixed order: sleep, work, relationships.
for theme_label, theme_terms in (
    ("SLEEP", sleep_keywords),
    ("WORK", work_keywords),
    ("RELATIONSHIPS", relationship_keywords),
):
    find_theme_across_conditions(theme_label, theme_terms)


SEARCHING FOR 'SLEEP' ACROSS CONDITIONS

Stress:
  Topic 0: stress, anxiety, any, sleep, down, mind, stressful, health
    → Contains: sleep

ADHD:
  Topic 0: tasks, hours, review, tired, bed, phone, productive, habits
    → Contains: tired

Anxiety:
  Topic 2: wake, bed, anxious, sleep, morning, amygdala, anxiety, anxiety physical
    → Contains: sleep

Schizophrenia:
  Topic 0: schizophrenia, art, psychosis, tired, hallucinations, schizophrenic, symptoms, diet
    → Contains: tired
  Topic 5: nice, improvement, supportive, hospital, sleep, shower, community, recently
    → Contains: sleep
  Topic 12: happy sunday, sunday, treat, sleeping, lazy, cat happy, movies, upcoming
    → Contains: sleeping

OCD:
  Topic 17: secretly, strong, controlled, deal, covid, planning, anybody, tired
    → Contains: tired
  Topic 22: bed, sleep, night, nights, obsessed, perform compulsions, perform, toll
    → Contains: sleep

Suicidal:
  Topic 2: suicidal, permanent, temporary, suicide, permanent solu

In [None]:
# 1. Compute, with the all-conditions model (`topic_model_all`), how each
#    topic is represented within each class of `classes`.
topics_per_class = topic_model_all.topics_per_class(docs, classes=classes)

# 2. Restrict the comparison to 'Normal' and a few clinical conditions.
#    `visualize_topics_per_class` has no `classes` keyword (passing one raises
#    TypeError), so the selection is made by filtering the frame on its
#    "Class" column before plotting.
classes_to_compare = ["Normal", "Addiction", "Depression", "ADHD"]
topics_per_class_subset = topics_per_class[
    topics_per_class["Class"].isin(classes_to_compare)
]

fig = topic_model_all.visualize_topics_per_class(
    topics_per_class_subset,
    top_n_topics=10
)

# 3. Show the interactive chart
fig.show()

In [None]:
# Per-condition model for the 'Normal' posts, looked up in `condition_models`.
normal_model = condition_models['Normal']

# Topic overview table for the Normal group (kept for inspection; the chart
# below does not use it).
normal_info = normal_model.get_topic_info()

# Bar chart of topic word scores for the Normal condition only (top_n_topics=10).
fig = normal_model.visualize_barchart(
    top_n_topics=10,
    title="Normal Condition: Topic Word Scores",
)
fig.show()

In [None]:
# Models to compare, looked up by condition name.
normal_model = condition_models['Normal']
depression_model = condition_models['Depression']
# NOTE(review): selected here but not plotted below; confirm whether an
# Addiction chart was intended.
addiction_model = condition_models['Addiction']

# Same top_n_topics for every chart so the comparison is like-for-like.
for heading, model_to_plot in (
    ("--- NORMAL CONDITION TOPICS ---", normal_model),
    ("--- DEPRESSION CONDITION TOPICS ---", depression_model),
):
    print(heading)
    model_to_plot.visualize_barchart(top_n_topics=8).show()

--- NORMAL CONDITION TOPICS ---


--- DEPRESSION CONDITION TOPICS ---
