## Install and Import Dependencies

In [1]:
!pip install seaborn
!pip install "numpy<=2.1.0"
!pip install openai
!pip install bertopic
!pip install nltk
!pip install matplotlib
!pip install sentence-transformers scikit-learn pandas
!pip install gensim
!pip install huggingface_hub
!pip install datamapplot

Collecting bertopic
  Downloading bertopic-0.17.4-py3-none-any.whl.metadata (24 kB)
Downloading bertopic-0.17.4-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bertopic
Successfully installed bertopic-0.17.4
Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0
Collecting datamapplot
  Downloading datamapplot-0.6.4-py3-none-any.whl.metadata (7.8 kB)
Collecting colorspacious>=1.1 (from datamapplot)
  Downloading colorspacious-1.1.2-py2.py3-none-any.whl.metadata (3.6 kB)
Collecting dask<2025.0.1,>=

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import re
from pprint import pprint

import nltk
from nltk.tokenize import sent_tokenize
from bertopic import BERTopic

import pandas as pd
import numpy as np
from datetime import datetime
from huggingface_hub import snapshot_download

# modeling
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

import matplotlib.pyplot as plt

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

  $\max \{ \mathrm{core}_k(a), \mathrm{core}_k(b), \frac{1}{\alpha} d(a,b) \}$.


## (Optional) Train BERTopic with OpenAI with the optimized hyperparameters from ModelSelect.ipynb

### Train BERTopic

The following outlines the process for training BERTopic using optimized hyperparameters derived from the Model Selection phase:
*   Steps 1–5: These steps are consistently reproducible across iterations.
*   Step 6: This step introduces random parameters and relies on OpenAI model output, which may result in inconsistent outcomes across iterations.
*   Alternative Approaches: Users may leverage alternative generative models to produce human-readable topic labels. For more information, please refer to https://maartengr.github.io/BERTopic/algorithm/algorithm.html#6-optional-fine-tune-topic-representation


In [None]:
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
import openai
from bertopic.representation import OpenAI

# Hyperparameters selected in the Model Selection step.
# The first pair is passed to UMAP, the second pair to HDBSCAN.
N_NEIGHBORS, MIN_DIST = 30, 0.01
MIN_CLUSTER_SIZE, MIN_SAMPLES = 300, 150

# Seed keywords handed to BERTopic (`seed_topic_list`): one inner list per seeded topic.
seed_topic_list = [
    [
        "watchduty", "calfire", "containment", "drone", "images",
        "active", "inmate", "wind", "spread", "superscoopers",
    ],
    [
        "air quality", "evacuate", "school", "ash", "smoke",
        "safety", "health", "selfies", "power", "medical",
    ],
    [
        "water", "temporary", "mask", "pump", "rental",
        "housing", "eggs", "hydrant", "food", "laundry",
    ],
    [
        "insurance", "law", "community", "relief", "donation",
        "restore", "clean", "mental", "rebuilding", "benefit",
    ],
    [
        "burned down", "gone", "damage", "structures", "survived",
        "cars", "destruction", "trails", "victim", "lost",
    ],
    [
        "responsibility", "pro bono", "influencer", "twitter", "trump",
        "mayor", "concert", "volunteer", "therapy", "celebrity",
    ],
]

#Step1 - Sentence-transformer model used to embed the documents
embedding_model = SentenceTransformer('all-mpnet-base-v2')

#Step2 - Dimensionality reduction; random_state is fixed so this step is repeatable
umap_model = UMAP(n_neighbors=N_NEIGHBORS, n_components=5, min_dist=MIN_DIST, metric='cosine', random_state=42)

#Step3 - Clustering of the reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=MIN_CLUSTER_SIZE, min_samples=MIN_SAMPLES,
                        gen_min_span_tree=True,
                        prediction_data=True)

#Step4 - Bag-of-words over unigrams and bigrams
vectorizer_model = CountVectorizer(ngram_range=(1, 2))

#Step5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# NOTE: `representation_model` is intentionally NOT built here. It needs `client`,
# `label_prompt` and `summarization_prompt`, which are only defined in Step 6 below;
# constructing it at this point raised NameError on a fresh kernel.

#Step6 - (Optional) Fine-tune topic representations with  a `bertopic.representation` model

# Create the OpenAI client used by the representation models.
# The key is read from the OPENAI_API_KEY environment variable so that it never
# has to be pasted into (and saved with) the notebook. Set it before running this cell.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Prompt for the short topic label; [DOCUMENTS] and [KEYWORDS] are placeholders
# left in the template for the representation model.
label_prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short topic label with five to ten words in the following format:
topic: <topic label>
"""


# Prompt for the longer, free-text topic description.
summarization_prompt = """
I have a topic that is described by the following keywords: [KEYWORDS]
In this topic, the following documents are a small but representative subset of all documents in the topic:
[DOCUMENTS]

Based on the information above, please give a description of this topic in the following format:
topic: <description>
"""

# Three named topic aspects: an MMR keyword list plus two OpenAI-generated texts
# (a label and a summary), each built from 5 documents per topic.
representation_model = {
    "MaxMargin": MaximalMarginalRelevance(diversity=0.3),
    "OpenAI": OpenAI(client, model="gpt-4o-mini", chat=True, prompt=label_prompt, nr_docs=5, delay_in_seconds=10),
    "Summary": OpenAI(client, model="gpt-4o-mini", chat=True, prompt=summarization_prompt, nr_docs=5, delay_in_seconds=10)
}

# Assemble the guided BERTopic pipeline from the components built above.
model_ft = BERTopic(
    seed_topic_list=seed_topic_list,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    top_n_words=10,
    language='english',
    calculate_probabilities=True,
    verbose=True,
    ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
    representation_model=representation_model # Step 6 - (Optional) Fine-tune topic representations
)

# NOTE(review): `corpus` is created in the "Load datasets" section further down
# in this notebook; run that cell before this optional training cell.
topic_ft = model_ft.fit(corpus)

# Drop the representation models before pickling.
# NOTE(review): presumably so the OpenAI client is not serialized with the model - confirm.
topic_ft.representation_model = None

save_file = "/content/models/opt_model_ft_v2"
# Make sure the target directory exists so the save cannot fail with
# FileNotFoundError after the long (and paid) training run.
os.makedirs(os.path.dirname(save_file), exist_ok=True)
topic_ft.save(save_file, serialization="pickle")

2025-03-28 00:14:37,218 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 2413/2413 [00:41<00:00, 58.12it/s] 
2025-03-28 00:15:19,939 - BERTopic - Embedding - Completed ✓
2025-03-28 00:15:19,940 - BERTopic - Guided - Find embeddings highly related to seeded topics.
Batches: 100%|██████████| 1/1 [00:00<00:00, 154.87it/s]
2025-03-28 00:15:20,296 - BERTopic - Guided - Completed ✓
2025-03-28 00:15:20,297 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-03-28 00:17:40,086 - BERTopic - Dimensionality - Completed ✓
2025-03-28 00:17:40,089 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-03-28 00:17:55,659 - BERTopic - Cluster - Completed ✓
2025-03-28 00:17:55,671 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 31/31 [05:30<00:00, 10.66s/it]
100%|██████████| 31/31 [06:43<00:00, 13.03s/it]
2025-03-28 00:30:17,228 - BERTopic - Representation - Com

## Load datasets

In [4]:
import os
from huggingface_hub import snapshot_download

# Download the comment datasets from the Hugging Face Hub into ./datasets.
dataset_path = snapshot_download(
    repo_id="Dragmoon/2025CalifoniaWildfire",
    repo_type="dataset",
    local_dir="./datasets"
)

comments = pd.read_csv(os.path.join(dataset_path, 'reddit/all_final_comments.csv'))

# Split on `corpus_length` at 10. Take explicit copies: later cells add columns
# to `long_comments`, which on a boolean-mask slice triggers SettingWithCopyWarning.
short_comments = comments[comments['corpus_length'] < 10].copy()
long_comments = comments[comments['corpus_length'] >= 10].copy()

# Documents used for topic modeling: the `corpus` column of the long comments.
corpus = long_comments['corpus'].to_list()
len(corpus)

Fetching 22 files:   0%|          | 0/22 [00:00<?, ?it/s]

77194

## Load Pretrained Model from Hugging Face

### Download Pretrained Optimized Model

In [5]:
# Fetch the pretrained BERTopic model repository from the Hugging Face Hub into ./models.
model_path = snapshot_download(repo_id="Dragmoon/BERTopicFire", local_dir="./models")

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

opt_model_ft_v2:   0%|          | 0.00/1.41G [00:00<?, ?B/s]

ctfidf_config.json:   0%|          | 0.00/23.8M [00:00<?, ?B/s]

ctfidf.safetensors:   0%|          | 0.00/23.7M [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

README.md:   0%|          | 0.00/33.0 [00:00<?, ?B/s]

safetensor/ctfidf.safetensors:   0%|          | 0.00/23.7M [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

safetensor/ctfidf_config.json:   0%|          | 0.00/23.8M [00:00<?, ?B/s]

safetensor/topic_embeddings.safetensors:   0%|          | 0.00/95.3k [00:00<?, ?B/s]

topic_embeddings.safetensors:   0%|          | 0.00/95.3k [00:00<?, ?B/s]

topics.json: 0.00B [00:00, ?B/s]

topics.json: 0.00B [00:00, ?B/s]

### Load Optimized Model

In [6]:
from bertopic import BERTopic
# Load the model from the `safetensor` sub-folder of the downloaded repository.
safetensor_dir = os.path.join(model_path, 'safetensor')
model_ft = BERTopic.load(safetensor_dir)



## OpenAI-Finetuned Model output visualization

In [7]:
# Build one display label per topic from the "OpenAI" aspect: take the first
# element of every entry stored for the topic and join them with " | ".
chatgpt_topic_labels = {}
for topic_key, aspect_entries in model_ft.topic_aspects_["OpenAI"].items():
    first_elements = list(zip(*aspect_entries))[0]
    chatgpt_topic_labels[topic_key] = " | ".join(first_elements)

# Topic -1 gets a fixed, human-readable name.
chatgpt_topic_labels[-1] = "Outlier Topic"
model_ft.set_topic_labels(chatgpt_topic_labels)

In [8]:
# Attach the model's per-document topic assignment to the long comments.
# NOTE(review): assumes `model_ft.topics_` is in the same order as `long_comments` - confirm.
long_comments['topic_id'] = model_ft.topics_

# Non-null counts per column, restricted to comments whose topic is not -1.
has_topic = long_comments['topic_id'] != -1
long_comments[has_topic].count()

Unnamed: 0,0
post_id,43609
comment_id,43609
author,43609
body,43609
score,43609
created_utc,43609
corpus,43609
corpus_length,43609
topic_id,43609


In [10]:
# Read the multiple-label table; its `Topic` column is the key the next cell merges on.
multiple_label_csv = os.path.join(dataset_path, 'reddit/all_final_comments_multiple_label.csv')
comments_df = pd.read_csv(multiple_label_csv)
comments_df.head()

Unnamed: 0,Topic,Count,MaxMargin,OpenAI,Summary,Representative_Docs,Situational Awareness,Crisis Narrative,Grief,Mental,Equity,Notes
0,18,796,"['evacuate', 'to evacuate', 'evacuation', 'pac...",['Preparing for Evacuation and Personal Safety...,['This topic focuses on the importance of prep...,['if you are thinking about leaving take photo...,"Public health and safety,Emergency resources",,,,,
1,3,3505,"['air', 'air quality', 'smoke', 'aqi', 'asbest...",['Air Quality and Health Concerns Post-Wildfir...,['The topic revolves around air quality issues...,['wednesday january 15th at pst webinar alert ...,"Public health and safety,Emergency resources,R...","Victim,Blame,Renewal",checked,checked,,
2,22,655,"['donations', 'donate', 'donation', 'to donate...",['Support and Donations for Those in Need'],['The topic revolves around the themes of supp...,['quick note on donations from someone who spe...,"Emergency resources,Recovery,Loss and damage","Renewal,Victim",checked,checked,,
3,2,4015,"['he', 'twitter', 'social', 'media', 'trump', ...","['Wealth Inequality, Greed, and Societal Solut...",['This topic centers around discussions of wea...,['im really amazed at a the rule based respons...,"Influential figures,Public health and safety,E...","Blame,Renewal,Victim",checked,checked,checked,
4,9,1384,"['housing', 'rebuild', 'wealthy', 'land', 'hou...","['Wealth, Housing Crisis, and Rebuilding Chall...",['The topic revolves around the challenges fac...,['couple things here we dont determine tactics...,"Recovery,Emergency resources,Loss and damage","Renewal,Blame,Victim",checked,checked,checked,


In [11]:
# Inner-join each long comment to the label row of its topic
# (`topic_id` on the left, `Topic` on the right); unmatched rows are dropped.
join_comments = long_comments.merge(
    comments_df,
    how='inner',
    left_on='topic_id',
    right_on='Topic',
)
join_comments.head()

Unnamed: 0,post_id,comment_id,author,body,score,created_utc,corpus,corpus_length,topic_id,Topic,...,MaxMargin,OpenAI,Summary,Representative_Docs,Situational Awareness,Crisis Narrative,Grief,Mental,Equity,Notes
0,8eb4a7f25baa08ab,8442054266b04ba2,9822bf44dcd5e160,Just want to say I’m proud of our neighborhood...,44,2025-01-08 06:50:05,just want to say im proud of our neighborhood ...,17,8,8,...,"['so sorry', 'heart', 'safe', 'and your', 'are...",['Support and Solidarity After Natural Disaste...,['This topic revolves around expressions of em...,['sending love from aus gday la just wanted to...,"Recovery,Loss and damage","Renewal,Victim",checked,checked,,
1,8eb4a7f25baa08ab,5ff82d58fa0a702e,01fdd047a8438057,Altadena here—Lake and NY Drive—just got an em...,19,2025-01-08 00:03:16,altadena herelake and ny drivejust got an emai...,17,25,25,...,"['outage', 'without power', 'no power', 'power...",['Power Outages and Restoration Experiences'],['This topic revolves around the experiences a...,['if youre without power bc of the wind and no...,"Public health and safety,Emergency resources,L...","Blame,Victim",checked,,,
2,8eb4a7f25baa08ab,510af723b1739122,c7c27e4424323933,Anyone watching ABC 7? Leanne Suter and this c...,18,2025-01-08 05:58:05,anyone watching abc 7 leanne suter and this ca...,17,19,19,...,"['ktla', 'abc7', 'reporters', 'nbc', 'channels...",['Local News Coverage and Reporter Safety Conc...,"[""This topic revolves around the coverage of l...",['abc7 was doing the same thing on their youtu...,"Fire operations,Public health and safety,Loss ...",,,,checked,journalist
3,8eb4a7f25baa08ab,3069cf3895db0adb,8fc9d31e14c45345,My neighbor's gardeners were out using leaf bl...,35,2025-01-07 23:33:18,my neighbors gardeners were out using leaf blo...,11,3,3,...,"['air', 'air quality', 'smoke', 'aqi', 'asbest...",['Air Quality and Health Concerns Post-Wildfir...,['The topic revolves around air quality issues...,['wednesday january 15th at pst webinar alert ...,"Public health and safety,Emergency resources,R...","Victim,Blame,Renewal",checked,checked,,
4,8eb4a7f25baa08ab,d489ed58a0c24339,10fc29e91baf34dc,I just wanted to share a good link for all of ...,15,2025-01-08 07:09:49,i just wanted to share a good link for all of ...,110,12,12,...,"['gusts', 'santa', 'santa ana', 'wind is', 'ex...",['Santa Ana Winds Forecast and Conditions Upda...,['This topic focuses on the weather conditions...,['forecast discussion from nws los angeles sho...,"Fire operations,Public health and safety",,,,,


In [12]:
# Number of joined comments per topic, most frequent topic first.
joined_topic_ids = join_comments['topic_id']
joined_topic_ids.value_counts()

Unnamed: 0_level_0,count
topic_id,Unnamed: 1_level_1
0,4456
1,4179
2,4015
3,3505
4,3272
5,2239
6,2058
7,1731
8,1408
9,1384


In [14]:
def _read_post_ids(relative_csv):
    """Return the `post_id` column of a CSV under `dataset_path` as a Python list."""
    return pd.read_csv(os.path.join(dataset_path, relative_csv))['post_id'].tolist()


# Persist the joined comments next to the downloaded dataset files.
join_comments.to_csv(os.path.join(dataset_path, 'reddit/comments_join_multiple_label.csv'), index=False)

# Post ids for each fire's post file.
eaton_posts = _read_post_ids('reddit/eaton_final_posts.csv')
hughes_posts = _read_post_ids('reddit/hughes_final_posts.csv')
palisades_posts = _read_post_ids('reddit/palisades_final_posts.csv')

In [15]:
# Define a function to assign fire_name based on post_id
def get_fire_name(post_id):
    """Classify a post id by which fire post lists contain it.

    Only the module-level `eaton_posts` and `palisades_posts` collections are
    consulted.

    Returns:
        'common' if the id is in both lists, 'eaton' or 'palisades' if it is
        in exactly one of them, and 'other' if it is in neither.
    """
    in_eaton = post_id in eaton_posts
    in_palisades = post_id in palisades_posts

    if in_eaton and in_palisades:
        return 'common'
    if in_eaton:
        return 'eaton'
    if in_palisades:
        return 'palisades'
    return 'other'

# Label every long comment with the fire its post belongs to, then show the label distribution.
post_ids = long_comments['post_id']
long_comments['fire_name'] = post_ids.map(get_fire_name)
long_comments['fire_name'].value_counts()

Unnamed: 0_level_0,count
fire_name,Unnamed: 1_level_1
common,28648
palisades,26038
eaton,17210
other,5298


In [16]:
# Truncate each comment's creation timestamp down to the start of its hour.
# Use the lowercase 'h' alias: the uppercase 'H' is deprecated in pandas >= 2.2,
# and pandas is installed unpinned in this notebook.
created_at = pd.to_datetime(long_comments['created_utc'], format="%Y-%m-%d %H:%M:%S")
long_comments['hour'] = created_at.dt.floor('h')
long_comments['hour']

Unnamed: 0,hour
0,2025-01-07 20:00:00
1,2025-01-08 06:00:00
2,2025-01-07 18:00:00
3,2025-01-07 19:00:00
4,2025-01-08 06:00:00
...,...
112406,2025-02-03 00:00:00
112407,2025-02-03 00:00:00
112408,2025-02-02 15:00:00
112411,2025-02-02 16:00:00
