# Install Packages and Dependencies

In [None]:
## Run once when you first open this script
!pip install emoji
!pip install nltk
!pip install pyLDAvis
!pip install sentence-transformers scikit-learn pandas
!pip install gensim
!pip install huggingface_hub

In [2]:
import re
import emoji
from emoji import demojize
from pprint import pprint

import nltk

import os
import pandas as pd
import numpy as np

import spacy
from spacy.util import compile_infix_regex
from spacy.tokenizer import Tokenizer
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from huggingface_hub import snapshot_download


# modeling
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Load Dataset


### Download Datasets

In [None]:
# Fetch the NLTK stopword corpus; it is read later through nltk.corpus.stopwords.
nltk.download('stopwords')

# Download the comments/posts dataset repository into ./datasets.
# The value returned by snapshot_download is kept in dataset_path and used
# below as the base folder when building CSV paths.
# NOTE(review): repo_id is passed verbatim to the Hub — do not "correct" its
# spelling without confirming the actual repository name.
dataset_path = snapshot_download(
    repo_id="Dragmoon/2025CalifoniaWildfire",
    repo_type="dataset",
    local_dir="./datasets"
)


### Load Datasets

In [4]:
# Load the final labelled posts (all label columns) from the downloaded dataset
# folder and preview the first five rows.
posts_df = pd.read_csv(os.path.join(dataset_path,'reddit/all_final_posts_multiple_label.csv'))
posts_df.head(5)

Unnamed: 0,post_id,Subreddit,author_id,Title,Score,Date,Number of Comments,Body,Text,Clean Text,Situational Awareness,Crisis Narrative,Grief,Mental,Equity,Notes
0,32bb0f842416be53,OmnibusCollectors,35932f26cc82a719,I need to vent.,683,2025-01-16 04:06:40,79,"Hey Folks, apologies in advance for airing my ...","I need to vent. Hey Folks, apologies in advanc...","I need to vent. Hey folks, apologies in advanc...",Loss and damage,Victim,checked,checked,,deep greif
1,9825fceb054651d1,LosAngeles,50a9f2a4f8943598,Elevated Lead + Chlorine Levels during LA Wild...,417,2025-01-20 17:55:53,120,Is anyone else spiraling from this article in ...,Elevated Lead + Chlorine Levels during LA Wild...,Elevated Lead and Chlorine Levels During LA Wi...,"Public health and safety,Influential figures","Victim,Blame",checked,checked,checked,environment health
2,476b776686279ea0,pasadena,a1cd563df83e6f3b,Does anyone feel like they have PTSD from the ...,330,2025-02-05 05:47:30,107,"Hey all, I've been having some feelings of che...",Does anyone feel like they have PTSD from the ...,Does anyone feel like they have PTSD from the ...,Public health and safety,Victim,checked,checked,,"mental health toll of the disaster, which may ..."
3,46d1241fb7e616cd,pasadena,7885d5b2862fa299,My home burned down in the Eaton Fire,1152,2025-01-10 10:54:44,89,We lost everything in the Eaton Fire on Tuesda...,My home burned down in the Eaton Fire We lost ...,My home burned down in the Eaton Fire. We lost...,Loss and damage,Victim,checked,checked,,
4,4832a2d387320f76,capricorns,52b3ec942e17a496,"It’s my birthday, and all I want to do is cry",804,2025-01-13 00:59:54,118,32. From evacuating my boyfriends place due to...,"It’s my birthday, and all I want to do is cry ...","It's my birthday, and all I want to do is cry....","Loss and damage,Public health and safety",Victim,checked,checked,,


In [5]:
# Check the distribution of word counts (whitespace-separated tokens of 'Clean Text').
# Missing / non-string entries count as 0 words instead of being stringified first
# (str(NaN) is 'nan', which would otherwise be counted as one word).
posts_df['word_count'] = posts_df['Clean Text'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)
posts_df['word_count'].describe()

Unnamed: 0,word_count
count,373.0
mean,119.563003
std,158.216095
min,4.0
25%,16.0
50%,53.0
75%,163.0
max,1025.0


In [6]:
# Load the spaCy "en_core_web_sm" pipeline; clean_review() relies on it for
# POS tags, lemmas and stop-word flags.
nlp = spacy.load("en_core_web_sm")

# Custom tokenizer: replaces the infix rules, i.e. where a token may be split in the middle of a string
def custom_tokenizer(nlp):
    """Create a spaCy ``Tokenizer`` for ``nlp`` with a hand-picked list of infix patterns.

    The prefix search, suffix search and ``token_match`` are copied from the
    tokenizer currently attached to ``nlp``; the special-case rules come from
    ``nlp.Defaults.tokenizer_exceptions``. Only the infix patterns (the places
    inside a string where a split is allowed) are defined here.

    Args:
        nlp: A loaded spaCy pipeline exposing ``vocab``, ``tokenizer`` and ``Defaults``.

    Returns:
        A new ``Tokenizer`` built on ``nlp.vocab``.
    """
    # '+', '-', '*' or '^' preceded by a digit and followed by a digit or '-'
    arithmetic_operator = r"(?<=[0-9])[+\-\*^](?=[0-9-])"
    # '.' preceded by an ALPHA_LOWER/CONCAT_QUOTES character and followed by an
    # ALPHA_UPPER/CONCAT_QUOTES character, e.g. "end.Next"
    period_between_words = r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
    )
    # ',' with an ALPHA character on both sides: "word,another" -> "word", ",", "another"
    comma_between_letters = r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA)
    # ':', '<', '>', '=' or '/' preceded by an ALPHA character or digit and followed by an ALPHA character
    punct_before_letter = r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA)

    infix_patterns = [
        *LIST_ELLIPSES,
        *LIST_ICONS,
        arithmetic_operator,
        period_between_words,
        comma_between_letters,
        punct_before_letter,
    ]
    compiled_infixes = compile_infix_regex(infix_patterns)

    base_tokenizer = nlp.tokenizer
    return Tokenizer(
        nlp.vocab,
        prefix_search=base_tokenizer.prefix_search,
        suffix_search=base_tokenizer.suffix_search,
        infix_finditer=compiled_infixes.finditer,
        token_match=base_tokenizer.token_match,
        rules=nlp.Defaults.tokenizer_exceptions,
    )


# Replace the pipeline's tokenizer with the one returned by custom_tokenizer().
nlp.tokenizer = custom_tokenizer(nlp)

In [7]:
# Keep only nouns, proper nouns, adjectives, verbs and adverbs (by default), lemmatized and lower-cased
def clean_review(review, output_list = True, postags = ['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN']):
    """Lemmatize ``review`` and keep only non-stop-word tokens with selected POS tags.

    The text is run through the module-level ``nlp`` pipeline. A token is kept
    when its ``pos_`` is in ``postags`` and it is not flagged as a stop word;
    the kept token's lemma is lower-cased. Token order is preserved.

    Args:
        review: Text to process (a string accepted by ``nlp``).
        output_list: If True, return a list of lemmas; otherwise return the
            lemmas joined by single spaces.
        postags: POS tags to keep. The default list is only read, never mutated.

    Returns:
        ``list[str]`` when ``output_list`` is True, else a single ``str``.
    """
    doc = nlp(review)
    # One pass over the tokens: avoids rebuilding the result list for every
    # sentence and uses the token's own stop-word flag instead of a second
    # vocabulary lookup by text.
    result = [
        token.lemma_.lower()
        for token in doc
        if token.pos_ in postags and not token.is_stop
    ]

    if output_list:
        return result
    return ' '.join(result)

In [8]:
# Lemmatize every post. Rows whose 'Clean Text' is missing / not a string get an
# empty token list (instead of the raw NaN) so that later per-document token
# filtering can iterate over every row.
posts_df['lemma_text'] = posts_df['Clean Text'].apply(lambda x: clean_review(x) if isinstance(x, str) else [])
posts_df[['Clean Text','lemma_text']] # compare clean text and lemmatized text

Unnamed: 0,Clean Text,lemma_text
0,"I need to vent. Hey folks, apologies in advanc...","[need, vent, folk, apology, advance, air, grie..."
1,Elevated Lead and Chlorine Levels During LA Wi...,"[elevated, lead, chlorine, levels, la, wildfir..."
2,Does anyone feel like they have PTSD from the ...,"[feel, ptsd, fire, have, feeling, chest, tight..."
3,My home burned down in the Eaton Fire. We lost...,"[home, burn, eaton, fire, lose, eaton, fire, t..."
4,"It's my birthday, and all I want to do is cry....","[birthday, want, cry, evacuate, boyfriend, pla..."
...,...,...
368,My wife and I lost our house in the Pacific Pa...,"[wife, lose, house, pacific, palisades, fire, ..."
369,It's so obvious when people exploit tragedies ...,"[obvious, people, exploit, tragedy, boost, res..."
370,Attorney General Rob Bonta to address price go...,"[attorney, general, rob, bonta, address, price..."
371,Free therapy services and mental health services.,"[free, therapy, service, mental, health, service]"


In [9]:
# Extend the stopword set with words that are common across the wildfire events
from nltk.corpus import stopwords

word_list = [
    'hughes', 'wildfire', 'fire', 'fires', 'la', 'california', 'angeles',
    'los', 'the', 'to', 'it', '*', '%', 'am', 'pm', 'pasadena','eaton',
    'palisades', 'altadena', '8th', 'l.a.',
]

# NLTK English stopwords plus the event-specific words above
stop_words = set(stopwords.words('english')).union(word_list)

# One list of lemmas per post
docs = posts_df['lemma_text'].to_list()

# Drop every stopword from every document, keeping token order
clean_docs = [[token for token in tokens if token not in stop_words] for tokens in docs]

In [10]:
# Build the gensim dictionary (id2word) from the stopword-filtered documents,
# then convert each document with doc2bow to form the corpus for the LDA model
id2word = corpora.Dictionary(clean_docs)
corpus = list(map(id2word.doc2bow, clean_docs))

# LDA

Changing the random state leads to a different optimal number of topics,
e.g. `random_state=188` → 6 topics, `random_state=42` → 24 topics.

In [11]:
# Tune K: find which number of topics has the highest coherence score
def calculate_coherence_score(n, random_state=74, passes=5, iterations=100, chunksize=50):
    """Train an LDA model with ``n`` topics and return its c_v coherence.

    The model is trained on the module-level ``corpus`` / ``id2word`` and is
    scored against the module-level ``clean_docs``.

    Args:
        n: Number of topics (passed as ``num_topics``).
        random_state: Seed passed to ``LdaModel``. The selected number of
            topics is sensitive to it, so it is exposed as a parameter.
        passes: Passed to ``LdaModel`` as ``passes``; increase it to allow
            more complete training.
        iterations: Passed to ``LdaModel`` as ``iterations``.
        chunksize: Passed to ``LdaModel`` as ``chunksize``.

    Returns:
        The value of ``CoherenceModel(..., coherence='c_v').get_coherence()``.
    """
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=n,
        passes=passes,
        iterations=iterations,
        chunksize=chunksize,
        random_state=random_state,
    )
    coherence_model_lda = CoherenceModel(
        model=lda_model, texts=clean_docs, dictionary=id2word, coherence='c_v'
    )
    return coherence_model_lda.get_coherence()

# Candidate topic counts: 5 through 30 inclusive
topic_nums = list(np.arange(5, 30 + 1, 1))
coherence_scores = []
best_n, best_score = 0, 0

# Score every candidate and remember the first one with the strictly highest coherence
for n in topic_nums:
    coherence_score = calculate_coherence_score(n)
    coherence_scores.append(coherence_score)
    if coherence_score > best_score:
        best_n, best_score = n, coherence_score
    print(f"n : {n} ;  Score : {coherence_score}")

print(best_n)

n : 5 ;  Score : 0.4238695530404587
n : 6 ;  Score : 0.44283045737445176
n : 7 ;  Score : 0.44513040287544353
n : 8 ;  Score : 0.4696847755326385
n : 9 ;  Score : 0.4916229017726637
n : 10 ;  Score : 0.45418351459791434
n : 11 ;  Score : 0.4633664548432336
n : 12 ;  Score : 0.4701045858930942
n : 13 ;  Score : 0.40923628394465006
n : 14 ;  Score : 0.42815204790977657
n : 15 ;  Score : 0.4108291458571635
n : 16 ;  Score : 0.45018124510571783
n : 17 ;  Score : 0.46693265967171316
n : 18 ;  Score : 0.4427343296579029
n : 19 ;  Score : 0.41025418498337696
n : 20 ;  Score : 0.44281565359999703
n : 21 ;  Score : 0.41391058046182216
n : 22 ;  Score : 0.4018494516645104
n : 23 ;  Score : 0.4149103431530753
n : 24 ;  Score : 0.4019034460131788
n : 25 ;  Score : 0.4225071525960664
n : 26 ;  Score : 0.4504779486261553
n : 27 ;  Score : 0.37970967994700716
n : 28 ;  Score : 0.41247802527608585
n : 29 ;  Score : 0.41457678381816765
n : 30 ;  Score : 0.4228474487788662
9


In [12]:
# Refit the LDA model with the best-scoring number of topics (best_n).
# passes / iterations / chunksize / random_state repeat the values used in
# calculate_coherence_score so the model selected during tuning is reproduced.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=best_n,
                                                passes=5,             # increase passes to allow more complete training
                                                iterations=100,
                                                chunksize=50,
                                                random_state=74)

In [13]:
# Print the weighted keywords of each latent topic
pprint(lda_model.print_topics())
# Apply the trained model to the bag-of-words corpus.
# NOTE(review): presumably this yields each document's topic distribution —
# confirm against the gensim LdaModel documentation before relying on it.
doc_lda = lda_model[corpus]

[(0,
  '0.019*"home" + 0.018*"significant" + 0.017*"leafs" + 0.015*"dog" + '
  '0.012*"girl" + 0.011*"lead" + 0.010*"emotional" + 0.010*"surround" + '
  '0.009*"trap" + 0.009*"able"'),
 (1,
  '0.035*"update" + 0.025*"available" + 0.020*"map" + 0.019*"emergency" + '
  '0.018*"information" + 0.013*"post" + 0.013*"include" + 0.013*"january" + '
  '0.012*"live" + 0.012*"air"'),
 (2,
  '0.023*"school" + 0.023*"provide" + 0.020*"resident" + 0.017*"support" + '
  '0.017*"help" + 0.016*"center" + 0.015*"community" + 0.014*"assistance" + '
  '0.013*"impact" + 0.013*"need"'),
 (3,
  '0.016*"lose" + 0.013*"area" + 0.011*"people" + 0.011*"house" + '
  '0.010*"church" + 0.009*"home" + 0.008*"park" + 0.008*"shop" + '
  '0.007*"evacuate" + 0.007*"new"'),
 (4,
  '0.027*"twitter" + 0.025*"destroy" + 0.024*"house" + 0.018*"mcnally" + '
  '0.013*"shitty" + 0.011*"image" + 0.010*"andrew" + 0.010*"depts" + '
  '0.009*"love" + 0.007*"steve"'),
 (5,
  '0.013*"madre" + 0.011*"acre" + 0.010*"lake" + 0.009*"aca

In [14]:
# Inspect each topic: collect its top-20 keywords (without weights) and print them
topics = []
for index, topic in lda_model.show_topics(num_topics=best_n, num_words=20, formatted=False):
    # Each entry of `topic` is indexed at 0 to take the keyword itself
    topics.append([entry[0] for entry in topic])
    print('Topic: {} \nWords: {}'.format(index, topics[-1]))

Topic: 0 
Words: ['home', 'significant', 'leafs', 'dog', 'girl', 'lead', 'emotional', 'surround', 'trap', 'able', 'grab', 'level', 'severity', 'dust', 'wife', 'stein', 'jersey', 'hockey', 'maple', 'recognizable']
Topic: 1 
Words: ['update', 'available', 'map', 'emergency', 'information', 'post', 'include', 'january', 'live', 'air', 'quality', 'official', 'water', 'affect', 'city', 'shelter', 'resource', 'area', 'offer', 'health']
Topic: 2 
Words: ['school', 'provide', 'resident', 'support', 'help', 'center', 'community', 'assistance', 'impact', 'need', 'county', 'insurance', 'donation', 'market', 'aid', 'people', 'recovery', 'class', 'effort', 'block']
Topic: 3 
Words: ['lose', 'area', 'people', 'house', 'church', 'home', 'park', 'shop', 'evacuate', 'new', 'want', 'elementary', 'know', 'pass', 'burn', 'state', 'high', 'friend', 'start', 'go']
Topic: 4 
Words: ['twitter', 'destroy', 'house', 'mcnally', 'shitty', 'image', 'andrew', 'depts', 'love', 'steve', 'publicly', 'know', 'away', 'm

In [15]:
# print('\nPerplexity: ', lda_model.log_perplexity(corpus)) #optional for perplexity if needed

# Verify the coherence value of the refitted model.
# CoherenceModel is already imported in the notebook's import cell, so it is
# not re-imported here.
coherence_model_lda = CoherenceModel(model=lda_model, texts=clean_docs, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4916229017726637


The seed topic list below was derived from the latent topics above, together with human reviewers' reading of the post content.

In [16]:
# Seed keywords for six topics: one inner list of ten terms per topic.
# Some seeds are multi-word phrases ("air quality", "burned down", "pro bono").
# NOTE(review): how these seeds are consumed is not visible in this part of the
# notebook — confirm the downstream model accepts multi-word seed terms.
seed_topic_list = [["watchduty", "calfire", "containment", "drone", "images", "active", "inmate", "wind", "spread", "superscoopers"],
                   ["air quality", "evacuate", "school", "ash", "smoke", "safety", "health", "selfies", "power", "medical"],
                   ["water", "temporary", "mask", "pump", "rental", "housing", "eggs", "hydrant", "food", "laundry"],
                   ["insurance", "law", "community", "relief", "donation", "restore", "clean", "mental", "rebuilding", "benefit"],
                   ["burned down", "gone", "damage", "structures", "survived", "cars", "destruction", "trails", "victim", "lost"],
                   ["responsibility", "pro bono", "influencer", "twitter", "trump", "mayor", "concert", "volunteer", "therapy", "celebrity"]
                   ]