# Data load and preprocessing

In [None]:
import os
import numpy as np
import pandas as pd
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from sklearn.model_selection import ParameterGrid

In [3]:
import kagglehub
import tqdm
# Download latest version
path = kagglehub.dataset_download("xinwangcs/stressor-cause-of-mental-health-problem-dataset")

FILES = []
for file in os.listdir(path):
    if file.endswith(".json"):
        PATH = os.path.join(path, file)
        FILES.append(PATH )


In [4]:
from nlprocess import load_data2dataframe

dataframes = [load_data2dataframe(f) for f in FILES]
df = pd.concat(dataframes, ignore_index=True)
df.head()

Unnamed: 0,text,stressor_class,stressor_word,interval
0,Why are there always trivial matters in life t...,T1,financial,8
1,"After celebrating my 21st birthday, I truly fe...",T1,urged to marry,8
2,"In the coming March, my work pressure will be ...",T1,work,8
3,"Indeed, the pressure of writing papers now is ...",T1,papers,8
4,"I can't learn it anymore, so let's just give u...",T1,can't learn it,8


In [5]:
df.dropna(inplace=True)
df.isna().sum()

text              0
stressor_class    0
stressor_word     0
interval          0
dtype: int64

In [6]:
from nlprocess import normalize_spacy
from nlprocess import to_corpus
import spacy
nlp = spacy.load("en_core_web_sm") 
Dict = to_corpus(df, "stressor_word", "bow" , nlp = nlp)

Normalizing: 100%|██████████| 3985/3985 [00:15<00:00, 263.11it/s]
Building BOW Corpus: 100%|██████████| 3885/3885 [00:00<00:00, 569471.97it/s]


# LDA Gridsearch


In [9]:
from tqdm import tqdm
ddf , texts ,dictionary ,corpus = Dict["df"], Dict["texts"], Dict["dictionary"], Dict["corpus"]
#定義參數網格
param_grid = {
     'num_topics' : [3 , 4],
     'alpha': np.arange(0.1 , 1.5 , 0.1),
     'eta': np.arange(0.1 , 1.5 , 0.1),
     
}

best_score = -1
best_params = None
best_model = None

for params in tqdm(list(ParameterGrid(param_grid)), desc="Grid Search"):
     lda = LdaModel(
          corpus = corpus,
          id2word = dictionary,
          num_topics = params["num_topics"],
          passes = 50,
          alpha = params['alpha'],
          eta = params['eta'],
          random_state = 42
     )
     cm = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
     score = cm.get_coherence()
     if score > best_score:
          best_score = score
          best_params = params
          best_model = lda

print("Best coherence score:", best_score)
print("Best params:", best_params)


Grid Search: 100%|██████████| 392/392 [2:06:30<00:00, 19.36s/it]  

Best coherence score: 0.7752085069724399
Best params: {'alpha': 0.30000000000000004, 'eta': 0.7000000000000001, 'num_topics': 4}





In [10]:
# 取得主題與描述
topics = best_model.print_topics()
topics

[(0,
  '0.108*"exam" + 0.087*"study" + 0.070*"job" + 0.038*"home" + 0.027*"go" + 0.017*"friend" + 0.017*"panic" + 0.015*"expectation" + 0.014*"brother" + 0.013*"project"'),
 (1,
  '0.077*"school" + 0.041*"entrance" + 0.040*"family" + 0.037*"mom" + 0.035*"start" + 0.033*"graduate" + 0.030*"year" + 0.028*"examination" + 0.027*"new" + 0.020*"high"'),
 (2,
  '0.026*"peer" + 0.025*"academic" + 0.022*"social" + 0.020*"people" + 0.019*"house" + 0.017*"partner" + 0.016*"interview" + 0.015*"buy" + 0.015*"second" + 0.015*"performance"'),
 (3,
  '0.326*"work" + 0.163*"life" + 0.028*"parent" + 0.026*"money" + 0.017*"marry" + 0.016*"paper" + 0.014*"sister" + 0.012*"teacher" + 0.010*"urge" + 0.009*"student"')]

In [11]:
import re
# 只擷取每個主題中的詞（不含機率）
topic_keywords = [re.findall(r'"(.*?)"', topic[1]) for topic in topics]

# 若想加上主題編號
topic_words = []
for idx, words in enumerate(topic_keywords):
    topic_words.append(words)

# 展平所有詞
all_words = [word for topic in topic_keywords for word in topic]
distinct_words = set(all_words)

kword = 10  # 每個主題取 10 個詞
Term = 4  # 共有 4 個主題
diversity = len(distinct_words) / (kword* Term)

print(f"Topic Diversity = {diversity:.3f}")


Topic Diversity = 1.000


# model save

In [12]:
#儲存模型
import os
os.makedirs("LDA", exist_ok=True)
best_model.save("LDA/topics_4_best_model.model")