## Install and Import Dependencies

In [None]:
!pip install seaborn
!pip install "numpy<=2.1.0"
!pip install openai
!pip install bertopic
!pip install nltk
!pip install matplotlib
!pip install sentence-transformers scikit-learn pandas
!pip install gensim
!pip install huggingface_hub
!pip install datamapplot

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import re
from pprint import pprint

import nltk
from nltk.tokenize import sent_tokenize
from bertopic import BERTopic

import pandas as pd
import numpy as np
from datetime import datetime
from huggingface_hub import snapshot_download

# modeling
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

import matplotlib.pyplot as plt

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

## (Optional) Train BERTopic with OpenAI using the optimized hyperparameters from ModelSelect.ipynb

### Train BERTopic

The following outlines the process for training BERTopic using optimized hyperparameters derived from the Model Selection phase:
*   Steps 1–5: These steps are consistently reproducible across iterations.
*   Step 6: This step introduces random parameters and relies on OpenAI model output, which may result in inconsistent outcomes across iterations.
*   Alternative Approaches: Users may leverage alternative generative models to produce human-readable topic labels. For more information, please refer to https://maartengr.github.io/BERTopic/algorithm/algorithm.html#6-optional-fine-tune-topic-representation


In [None]:
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
import openai
from bertopic.representation import OpenAI

# Hyperparameters selected in the Model Selection step.
# N_NEIGHBORS / MIN_DIST are passed to UMAP; MIN_CLUSTER_SIZE / MIN_SAMPLES to HDBSCAN.
N_NEIGHBORS, MIN_DIST = 30, 0.01
MIN_CLUSTER_SIZE, MIN_SAMPLES = 300, 150

# Seed keyword groups (six groups of ten terms) handed to BERTopic via `seed_topic_list`.
seed_topic_list = [
    [
        "watchduty", "calfire", "containment", "drone", "images",
        "active", "inmate", "wind", "spread", "superscoopers",
    ],
    [
        "air quality", "evacuate", "school", "ash", "smoke",
        "safety", "health", "selfies", "power", "medical",
    ],
    [
        "water", "temporary", "mask", "pump", "rental",
        "housing", "eggs", "hydrant", "food", "laundry",
    ],
    [
        "insurance", "law", "community", "relief", "donation",
        "restore", "clean", "mental", "rebuilding", "benefit",
    ],
    [
        "burned down", "gone", "damage", "structures", "survived",
        "cars", "destruction", "trails", "victim", "lost",
    ],
    [
        "responsibility", "pro bono", "influencer", "twitter", "trump",
        "mayor", "concert", "volunteer", "therapy", "celebrity",
    ],
]

# Step 1 - sentence-embedding model
embedding_model = SentenceTransformer('all-mpnet-base-v2')

# Step 2 - UMAP projection to 5 components (cosine metric, fixed seed)
umap_model = UMAP(
    n_neighbors=N_NEIGHBORS,
    n_components=5,
    min_dist=MIN_DIST,
    metric='cosine',
    random_state=42,
)

# Step 3 - HDBSCAN clustering
hdbscan_model = HDBSCAN(
    min_cluster_size=MIN_CLUSTER_SIZE,
    min_samples=MIN_SAMPLES,
    gen_min_span_tree=True,
    prediction_data=True,
)

# Step 4 - bag-of-words vectorizer over unigrams and bigrams
vectorizer_model = CountVectorizer(ngram_range=(1, 2))

#Step5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
# `representation_model` is deliberately NOT built here. It needs `client`,
# `label_prompt` and `summarization_prompt`, which are only defined in Step 6
# below; constructing it at this point raised NameError on a fresh kernel.
# The Step 6 definition is the one that is passed to BERTopic.

#Step6 - (Optional) Fine-tune topic representations with  a `bertopic.representation` model

# Create your representation model with OpenAI.
# The API key is read from the OPENAI_API_KEY environment variable so that no
# secret is ever stored in the notebook. Fail early with a clear message
# instead of discovering a missing key only when fitting reaches the API calls.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Prompt used for the short topic label; [DOCUMENTS] and [KEYWORDS] are
# placeholders left in the text for the representation model.
label_prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short topic label with five to ten words in the following format:
topic: <topic label>
"""


# Prompt used for the longer topic description.
summarization_prompt = """
I have a topic that is described by the following keywords: [KEYWORDS]
In this topic, the following documents are a small but representative subset of all documents in the topic:
[DOCUMENTS]

Based on the information above, please give a description of this topic in the following format:
topic: <description>
"""

# One entry per topic "aspect". The "OpenAI" aspect is the one read back later
# through `model_ft.topic_aspects_["OpenAI"]` to build the custom topic labels.
representation_model = {
    "MaxMargin": MaximalMarginalRelevance(diversity=0.3),
    "OpenAI": OpenAI(client, model="gpt-4o-mini", chat=True, prompt=label_prompt, nr_docs=5, delay_in_seconds=10),
    "Summary": OpenAI(client, model="gpt-4o-mini", chat=True, prompt=summarization_prompt, nr_docs=5, delay_in_seconds=10)
}

# Assemble the BERTopic pipeline from the components configured in Steps 1-6.
model_ft = BERTopic(
    seed_topic_list=seed_topic_list,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    top_n_words=10,
    language='english',
    calculate_probabilities=True,
    verbose=True,
    ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
    representation_model=representation_model # Step 6 - (Optional) Fine-tune topic representations
)

# NOTE: `corpus` is created in the "Load datasets" cell further down. On a
# fresh kernel that cell must be run before this one, otherwise the next line
# raises NameError.
topic_ft = model_ft.fit(corpus)

# Detach the representation models before pickling -- presumably so that the
# OpenAI client is not serialized into the saved file (confirm).
topic_ft.representation_model = None

save_file = "/content/models/opt_model_ft_v2"
# Create the target directory first; saving fails if it does not exist yet.
os.makedirs(os.path.dirname(save_file), exist_ok=True)
topic_ft.save(save_file, serialization="pickle")

## Load datasets

In [None]:
import os
from huggingface_hub import snapshot_download

# Download the comment dataset snapshot from the Hugging Face Hub into ./datasets.
dataset_path = snapshot_download(
    repo_id="Dragmoon/2025CalifoniaWildfire",
    repo_type="dataset",
    local_dir="./datasets"
)

comments = pd.read_csv(os.path.join(dataset_path, 'reddit/all_final_comments.csv'))

# Split on `corpus_length` at 10. `.copy()` gives each subset its own frame so
# that later cells can add columns (topic_id, fire_name, hour) without
# triggering pandas' SettingWithCopyWarning on a filtered slice.
short_comments = comments[comments['corpus_length'] < 10].copy()
long_comments = comments[comments['corpus_length'] >= 10].copy()

# Only the longer comments are used as the topic-model corpus.
corpus = long_comments['corpus'].to_list()
len(corpus)

## Load Pretrained Model from Hugging Face

### Download Pretrained Optimized Model

In [None]:
# Download the pretrained BERTopic model snapshot from the Hugging Face Hub into ./models.
model_path = snapshot_download(repo_id="Dragmoon/BERTopicFire", local_dir="./models")

### Load Optimized Model

In [None]:
from bertopic import BERTopic

# The model files live in the 'safetensor' sub-directory of the downloaded snapshot.
safetensor_dir = os.path.join(model_path, 'safetensor')
model_ft = BERTopic.load(safetensor_dir)

## OpenAI-Finetuned Model output visualization

In [None]:
# Build one display label per topic from the "OpenAI" aspect: take the first
# element of every (term, score) pair and join the terms with " | ".
chatgpt_topic_labels = dict(
    (topic_id, " | ".join(list(zip(*term_scores))[0]))
    for topic_id, term_scores in model_ft.topic_aspects_["OpenAI"].items()
)
# Topic -1 gets a fixed label.
chatgpt_topic_labels.update({-1: "Outlier Topic"})
model_ft.set_topic_labels(chatgpt_topic_labels)

In [None]:
# Attach the model's per-document topic ids as a new column.
# `long_comments` is a filtered slice of `comments`; `.assign` returns a new
# frame instead of writing into that slice, which avoids SettingWithCopyWarning.
# Assumes `model_ft.topics_` has one entry per row of `long_comments`, in the
# same order -- TODO confirm the loaded model was fitted on exactly this corpus.
long_comments = long_comments.assign(topic_id=model_ft.topics_)

# Per-column non-null counts over the rows whose topic id is not -1.
long_comments[long_comments['topic_id'] != -1].count()

In [None]:
# Read the multiple-label CSV from the downloaded dataset and preview it.
multiple_label_csv = os.path.join(dataset_path, 'reddit/all_final_comments_multiple_label.csv')
comments_df = pd.read_csv(multiple_label_csv)
comments_df.head()

In [None]:
# Inner-join each comment to the label table by matching the comment's
# `topic_id` against the table's `Topic` column; unmatched rows are dropped.
join_comments = long_comments.merge(
    comments_df,
    how='inner',
    left_on='topic_id',
    right_on='Topic',
)
join_comments.head()

In [None]:
# Number of joined rows per topic id.
join_comments['topic_id'].value_counts()

In [None]:
# Persist the joined table next to the downloaded dataset files.
join_comments.to_csv(os.path.join(dataset_path, 'reddit/comments_join_multiple_label.csv'), index=False)

# Load the `post_id` column of each fire's post file as a plain list.
eaton_posts, hughes_posts, palisades_posts = (
    pd.read_csv(os.path.join(dataset_path, csv_rel_path))['post_id'].tolist()
    for csv_rel_path in (
        'reddit/eaton_final_posts.csv',
        'reddit/hughes_final_posts.csv',
        'reddit/palisades_final_posts.csv',
    )
)

In [None]:
# Define a function to assign fire_name based on post_id
def get_fire_name(post_id, eaton_ids=None, palisades_ids=None):
    """Classify a post by which fire's post-id collection(s) contain it.

    Args:
        post_id: Identifier to look up.
        eaton_ids: Collection of Eaton post ids. When omitted, the
            notebook-level `eaton_posts` is used.
        palisades_ids: Collection of Palisades post ids. When omitted, the
            notebook-level `palisades_posts` is used.

    Returns:
        'common' if the id is in both collections, 'eaton' or 'palisades' if
        it is in exactly one, and 'other' if it is in neither.
    """
    # NOTE(review): `hughes_posts` is loaded alongside the other two lists but
    # is not consulted here, so Hughes-only posts are labelled 'other' --
    # confirm this is intended.
    if eaton_ids is None:
        eaton_ids = eaton_posts
    if palisades_ids is None:
        palisades_ids = palisades_posts

    in_eaton = post_id in eaton_ids
    in_palisades = post_id in palisades_ids

    if in_eaton and in_palisades:
        return 'common'
    if in_eaton:
        return 'eaton'
    if in_palisades:
        return 'palisades'
    return 'other'

# Create a new column 'fire_name' in the DataFrame.
# `long_comments` is a filtered slice of `comments`; `.assign` returns a new
# frame instead of writing into that slice, which avoids SettingWithCopyWarning.
long_comments = long_comments.assign(
    fire_name=long_comments['post_id'].apply(get_fire_name)
)
long_comments['fire_name'].value_counts()

In [None]:
# Parse `created_utc` and truncate each timestamp down to the start of its hour.
# The lowercase 'h' alias is used because the uppercase 'H' frequency alias is
# deprecated in recent pandas releases. `.assign` returns a new frame rather
# than writing into the filtered slice (avoids SettingWithCopyWarning).
long_comments = long_comments.assign(
    hour=pd.to_datetime(long_comments['created_utc'], format="%Y-%m-%d %H:%M:%S").dt.floor('h')
)
long_comments['hour']