# Install Packages and Dependencies

In [None]:
## Run once when you first open this script
!pip install emoji
!pip install nltk
!pip install pyLDAvis
!pip install sentence-transformers scikit-learn pandas
!pip install gensim
!pip install huggingface_hub

In [None]:
import re
import emoji
from emoji import demojize
from pprint import pprint

import nltk

import os
import pandas as pd
import numpy as np

import spacy
from spacy.util import compile_infix_regex
from spacy.tokenizer import Tokenizer
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from huggingface_hub import snapshot_download


# modeling
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Load Dataset


### Download Datasets

In [None]:
# Fetch the NLTK stop-word corpus.
nltk.download("stopwords")

# Pull a snapshot of the dataset repository from the Hugging Face Hub into
# ./datasets; the returned folder path is kept in `dataset_path`.
dataset_path = snapshot_download(
    local_dir="./datasets",
    repo_type="dataset",
    repo_id="Dragmoon/2025CalifoniaWildfire",
)


### Load Datasets

In [None]:
# Read the final posts table (with all labels) and preview its first five rows.
posts_csv = os.path.join(dataset_path, 'reddit/all_final_posts_multiple_label.csv')
posts_df = pd.read_csv(posts_csv)
posts_df.head(5)

In [None]:
# Distribution of word counts for posts (whitespace-separated tokens of 'Clean Text').
# Missing / non-string entries count as 0 words: str(NaN) would otherwise be
# split into the single word "nan" and skew the summary statistics.
posts_df['word_count'] = posts_df['Clean Text'].apply(
    lambda text: len(text.split()) if isinstance(text, str) else 0
)
posts_df['word_count'].describe()

In [None]:
# Load the spaCy pipeline named "en_core_web_sm" (the model package must already be installed).
nlp = spacy.load("en_core_web_sm")

# custom tokenizer — overriding how it splits tokens in the middle of strings
def custom_tokenizer(nlp):
    """Return a spaCy ``Tokenizer`` for ``nlp`` with a custom set of infix rules.

    Prefix search, suffix search and ``token_match`` are taken from the
    pipeline's current tokenizer, and the special-case rules come from
    ``nlp.Defaults.tokenizer_exceptions``. Only the infix patterns (the places
    where a token may be split in the middle of a string) are replaced.

    Args:
        nlp: A loaded spaCy pipeline.

    Returns:
        A new ``Tokenizer`` bound to ``nlp.vocab``.
    """
    # Arithmetic operator sitting between digits, e.g. "3+4".
    operator_between_digits = r"(?<=[0-9])[+\-\*^](?=[0-9-])"
    # Period preceded by a lowercase letter (or quote) and followed by an
    # uppercase letter (or quote), e.g. "end.Next".
    period_before_capital = r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
    )
    # Comma squeezed between two letters: "word,another" -> "word", ",", "another".
    comma_between_letters = r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA)
    # One of : < > = / after a letter or digit and directly before a letter.
    symbol_before_letter = r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA)

    extra_infixes = [
        operator_between_digits,
        period_before_capital,
        comma_between_letters,
        symbol_before_letter,
    ]
    infix_re = compile_infix_regex(LIST_ELLIPSES + LIST_ICONS + extra_infixes)

    base_tokenizer = nlp.tokenizer
    return Tokenizer(
        nlp.vocab,
        rules=nlp.Defaults.tokenizer_exceptions,
        prefix_search=base_tokenizer.prefix_search,
        suffix_search=base_tokenizer.suffix_search,
        infix_finditer=infix_re.finditer,
        token_match=base_tokenizer.token_match,
    )


# Replace the pipeline's tokenizer with the one returned by custom_tokenizer.
nlp.tokenizer = custom_tokenizer(nlp)

In [None]:
# keep only nouns, proper nouns, adjectives, verbs, and adverbs; stop words are dropped
def clean_review(review, output_list = True, postags = ['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN']):
    """Lemmatize ``review`` and keep only the content words.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is kept when its coarse part-of-speech tag is in ``postags`` and it is not
    flagged as a stop word; each kept token is replaced by its lower-cased
    lemma.

    Args:
        review: Text to process (must be a ``str``).
        output_list: If True (default), return a list of lemmas; otherwise
            return the lemmas joined by single spaces.
        postags: Part-of-speech tags to keep. The default list is only read,
            never mutated.

    Returns:
        A list of lemma strings, or one space-joined string when
        ``output_list`` is False.
    """
    doc = nlp(review)
    # Walk the tokens of the doc directly: this yields the same tokens in the
    # same order as walking doc.sents, avoids re-copying the growing result
    # list once per sentence, and does not depend on sentence boundaries.
    # token.is_stop reads the stop-word flag of the token's own lexeme, so no
    # extra vocab lookup per token is needed.
    result = [
        token.lemma_.lower()
        for token in doc
        if token.pos_ in postags and not token.is_stop
    ]

    if output_list:
        return result
    return ' '.join(result)

In [None]:
# Lemmatize every post. Missing / non-string entries become an empty token
# list (instead of being passed through as NaN) so every value in
# 'lemma_text' is a list and can be iterated when the corpus is built.
posts_df['lemma_text'] = posts_df['Clean Text'].apply(
    lambda text: clean_review(text) if isinstance(text, str) else []
)
posts_df[['Clean Text','lemma_text']] # compare clean text and lemmatize text

In [None]:
# Extend the stop-word list with words that are common across the wildfire posts
from nltk.corpus import stopwords

word_list = ['hughes', 'wildfire', 'fire', 'fires', 'la', 'california', 'angeles',
             'los', 'the', 'to', 'it', '*', '%', 'am', 'pm', 'pasadena','eaton',
             'palisades', 'altadena', '8th', 'l.a.']

docs = posts_df['lemma_text'].to_list()

stop_words = set(stopwords.words('english'))
stop_words.update(word_list)

# Drop stop words from every document. An entry that is not a token list
# (e.g. NaN for a post with missing text) is treated as an empty document, so
# iteration cannot fail and clean_docs stays aligned row-for-row with posts_df.
clean_docs = [
    [word for word in doc if word not in stop_words] if isinstance(doc, (list, tuple)) else []
    for doc in docs
]

In [None]:
# Build the token <-> id dictionary from the stop-word-filtered documents,
# then encode every document as a bag-of-words for the LDA model.
id2word = corpora.Dictionary(clean_docs)
corpus = list(map(id2word.doc2bow, clean_docs))

# LDA

Changing the random state leads to a different optimal number of topics,
e.g. random_state 188 -> 6 topics, random_state 42 -> 24 topics.

In [None]:
# tune K to find which one has the highest coherence score
def calculate_coherence_score(n, passes=5, iterations=100, chunksize=50, random_state=74):
    """Train an LDA model with ``n`` topics and return its c_v coherence.

    The model is fitted on the module-level ``corpus`` / ``id2word`` and the
    coherence is computed against the module-level ``clean_docs``.

    Args:
        n: Number of topics.
        passes: Number of passes over the corpus (more passes allow more
            complete training).
        iterations: Value forwarded to ``LdaModel(iterations=...)``.
        chunksize: Value forwarded to ``LdaModel(chunksize=...)``.
        random_state: Seed forwarded to ``LdaModel``. Exposed because the
            best topic count depends on it.

    Returns:
        The coherence value returned by ``CoherenceModel.get_coherence()``.
    """
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=n,
                                                passes=passes,
                                                iterations=iterations,
                                                chunksize=chunksize,
                                                random_state=random_state)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=clean_docs, dictionary=id2word, coherence='c_v')
    return coherence_model_lda.get_coherence()

# Sweep K = 5..30 and remember the topic count with the highest coherence.
topic_nums = list(np.arange(5, 31))
coherence_scores = []
best_n, best_score = 0, 0
for n in topic_nums:
    coherence_score = calculate_coherence_score(n)
    coherence_scores.append(coherence_score)
    print(f"n : {n} ;  Score : {coherence_score}")
    # Only a strictly better score replaces the current best.
    if coherence_score > best_score:
        best_n, best_score = n, coherence_score

print(best_n)

In [None]:
# Retrain the LDA model with the best topic count, keeping the same
# hyperparameters and seed that calculate_coherence_score uses.
lda_params = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=best_n,
    passes=5,             # more passes allow more complete training
    iterations=100,
    chunksize=50,
    random_state=74,
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_params)

In [None]:
# Check the keyword weights of every latent topic. num_topics=-1 lists all
# topics; the default (20) would show only a subset when the model has more
# topics than that, which the 5..30 sweep allows.
pprint(lda_model.print_topics(num_topics=-1))
# Apply the trained model to the corpus to get per-document topic assignments.
doc_lda = lda_model[corpus]

In [None]:
# Collect (and print) the top 20 words of every topic.
topics = []
for index, topic in lda_model.show_topics(num_topics=best_n, num_words=20, formatted=False):
    # Each entry of `topic` is a (word, weight) pair; keep only the word.
    topic_words = [word for word, _weight in topic]
    print('Topic: {} \nWords: {}'.format(index, topic_words))
    topics.append(topic_words)

In [None]:
# print('\nPerplexity: ', lda_model.log_perplexity(corpus)) #optional for perplexity if needed

# Recompute the c_v coherence of the final model as a sanity check.
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(
    coherence='c_v',
    dictionary=id2word,
    texts=clean_docs,
    model=lda_model,
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

The seed topic list below was derived from the latent topics above and refined by human reviewers who read the content of the posts.

In [None]:
# Seed keywords for six guided topics: one inner list per topic, ten seeds each.
# NOTE(review): "air quality", "burned down" and "pro bono" are multi-word
# seeds, while the lemmatized documents above are lists of single tokens —
# confirm that whatever consumes this list can match phrases.
seed_topic_list = [["watchduty", "calfire", "containment", "drone", "images", "active", "inmate", "wind", "spread", "superscoopers"],
                   ["air quality", "evacuate", "school", "ash", "smoke", "safety", "health", "selfies", "power", "medical"],
                   ["water", "temporary", "mask", "pump", "rental", "housing", "eggs", "hydrant", "food", "laundry"],
                   ["insurance", "law", "community", "relief", "donation", "restore", "clean", "mental", "rebuilding", "benefit"],
                   ["burned down", "gone", "damage", "structures", "survived", "cars", "destruction", "trails", "victim", "lost"],
                   ["responsibility", "pro bono", "influencer", "twitter", "trump", "mayor", "concert", "volunteer", "therapy", "celebrity"]
                   ]