In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import VisualRepresentation

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the dataset. The relative path is resolved against the notebook's
# working directory. Later cells read the 'image' and 'text' columns.
dataset_path = 'WildFireCan-MMD.csv'
dataset = pd.read_csv(dataset_path)

In [None]:
# Drop the first 20 characters of every image entry.
# NOTE(review): presumably this strips a fixed directory prefix so the paths
# resolve locally -- confirm against the CSV contents.
images = dataset['image'].map(lambda path: path[20:])

# Raw text of each row; used as the documents for topic modelling.
docs = dataset['text']

In [None]:
import matplotlib.pyplot as plt

# Whitespace-separated token count for every document.
word_counts = docs.map(lambda line: len(line.split()))

# Draw the distribution on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(word_counts, bins=30, color='skyblue', edgecolor='black')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Word Counts per Line in Docs')

fig.tight_layout()
plt.show()

# Test: multimodal (CLIP) embeddings

In [113]:
from bertopic.backend import MultiModalBackend

# CLIP backend used to embed both the texts and the images.
model = MultiModalBackend('clip-ViT-B-32', batch_size=32)

# Embed both images and documents, then average them.
# `docs` and `images` are pandas Series, so pass them as plain Python lists.
# NOTE(review): MultiModalBackend.embed appears to embed images only when
# `images` is a `list`; a Series would be skipped silently and the result
# would be text-only embeddings -- confirm against the installed BERTopic.
doc_image_embeddings = model.embed(docs.tolist(), images.tolist())

Token indices sequence length is longer than the specified maximum sequence length for this model (78 > 77). Running this sequence through the model will result in indexing errors


In [118]:
# Alternative kept for reference (explicit captioning model):
# representation_model = {
#     "Visual_Aspect": VisualRepresentation(image_to_text_model="nlpconnect/vit-gpt2-image-captioning")
# }

# Visual representation built with the library defaults.
visual_model = VisualRepresentation()

# BERTopic aspect name -> representation model.
representation_model = dict(Visual_Aspect=visual_model)

In [121]:
# Topic model that reuses the CLIP backend and the visual representation.
topic_model = BERTopic(
    embedding_model=model,
    representation_model=representation_model,
    min_topic_size=30,
)

# Fit on the texts and images, passing the embeddings computed earlier.
topic_model.fit(
    documents=docs,
    images=images,
    embeddings=doc_image_embeddings,
)

100%|██████████| 29/29 [00:05<00:00,  5.55it/s]


<bertopic._bertopic.BERTopic at 0x1949cf30110>

In [125]:
# Only the hierarchy view is rendered; the other views are left commented out
# so they can be swapped in one at a time.
#topic_model.visualize_barchart()
#topic_model.visualize_heatmap()
#topic_model.visualize_topics()
topic_model.visualize_hierarchy()
# topic_model.get_topic_info()

# Old way: BERTopic without a multimodal embedding backend

In [4]:
# Visual representation with library defaults, registered as "Visual_Aspect".
visual_model = VisualRepresentation()
representation_model = {"Visual_Aspect": visual_model}

# Stop words are removed at the vectorizer stage, i.e. after embedding.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

# No embedding_model is passed here, so BERTopic falls back to its default.
# nr_topics='auto' lets BERTopic decide how many topics to keep.
# NOTE(review): n_gram_range is passed alongside an explicit vectorizer_model;
# presumably the vectorizer's own ngram_range is what takes effect -- confirm.
topic_model = BERTopic(
    language='english',
    n_gram_range=(1, 2),
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    calculate_probabilities=True,
    nr_topics='auto',
    # verbose=True,
)

In [92]:
# Fit on the raw texts and images; no precomputed embeddings are supplied.
# NOTE(review): with no multimodal embedding model configured, the images
# presumably only feed the Visual_Aspect representation -- confirm.
topic_model.fit(documents=docs, images=images)

100%|██████████| 123/123 [00:20<00:00,  6.03it/s]
100%|██████████| 17/17 [00:03<00:00,  5.14it/s]


<bertopic._bertopic.BERTopic at 0x1950c63ef10>

In [93]:
# topic_model.save("bertopic_models/bcab&jasper_auto", serialization="safetensors")

In [None]:
import base64
from io import BytesIO
from IPython.display import HTML
from PIL import Image

def get_thumbnail(image_path, size=(100, 100)):
    """Open an image and shrink it in place to fit within ``size``.

    Args:
        image_path: Passed straight to ``Image.open``.
        size: Maximum ``(width, height)`` handed to ``Image.thumbnail``.

    Returns:
        The thumbnailed image, or ``None`` if opening or resizing failed.
        The error is printed rather than raised.
    """
    try:
        thumb = Image.open(image_path)
        thumb.thumbnail(size)
    except Exception as exc:
        # Best-effort: report the problem and hand None back to the caller.
        print(f"Error generating thumbnail: {str(exc)}")
        thumb = None
    return thumb

def image_base64(im):
    """Encode an image as a base64 JPEG string.

    Args:
        im: An image object exposing ``save`` (e.g. a PIL image), or a path
            string, which is first turned into a thumbnail via
            ``get_thumbnail``.

    Returns:
        The base64 text of the JPEG-encoded image, or ``""`` when no image is
        available (``im`` is ``None`` or the thumbnail could not be created).
    """
    if isinstance(im, str):
        im = get_thumbnail(im)
    if im is None:
        # get_thumbnail returns None on failure; calling None.save would
        # raise AttributeError and abort rendering of the whole table.
        return ""
    # JPEG cannot store alpha or palette modes (RGBA, LA, P, ...); saving
    # them raises OSError, so convert to RGB first.
    if getattr(im, "mode", "RGB") not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        im = im.convert("RGB")
    with BytesIO() as buffer:
        im.save(buffer, 'jpeg')
        return base64.b64encode(buffer.getvalue()).decode()

def image_formatter(im):
    """Build an inline ``<img>`` tag for ``im``.

    The image is base64-encoded through ``image_base64`` and embedded as a
    JPEG ``data:`` URI, so the resulting HTML needs no external files.
    """
    encoded = image_base64(im)
    return '<img src="data:image/jpeg;base64,' + encoded + '">'

import html  # standard library; only needed by this cell


def _escape_text(value):
    """Return ``value`` as HTML-escaped text.

    Lists and tuples are shown as a bracketed, comma-separated sequence of
    their escaped items.
    """
    if isinstance(value, (list, tuple)):
        return "[" + ", ".join(_escape_text(item) for item in value) + "]"
    return html.escape(str(value))


# Extract the per-topic summary table.
df = topic_model.get_topic_info()

# escape=False is required so the <img> tags survive, but it turns off
# escaping for every column. The text columns hold raw post text, so they
# are escaped explicitly to stop stray markup being injected into the page.
formatters = {
    column: _escape_text for column in df.columns if column != 'Visual_Aspect'
}
formatters['Visual_Aspect'] = image_formatter

# Visualize the images
HTML(df.to_html(formatters=formatters, escape=False))

In [None]:
# Show the topics of the model fitted above; the other inspection calls are
# left commented out so they can be swapped in one at a time.
topic_model.get_topics()
# topic_model.visualize_barchart()
# topic_model.visualize_heatmap()
# topic_model.visualize_topics()
# topic_model.visualize_hierarchy()
# topic_model.get_topic_info()

# 15 topics

In [5]:
# Rebind topic_model to the saved "bcab&jasper_15" model; the cells in this
# section act on whichever model was loaded last.
topic_model = BERTopic.load("bertopic_models/bcab&jasper_15")

In [6]:
# Bar-chart view of the currently loaded model (15-topics section).
topic_model.visualize_barchart()

In [7]:
# Heatmap view of the currently loaded model (15-topics section).
topic_model.visualize_heatmap()

In [8]:
# Topic-map view of the currently loaded model (15-topics section).
topic_model.visualize_topics()

In [70]:
# Hierarchy view of the currently loaded model (15-topics section).
topic_model.visualize_hierarchy()

In [71]:
# Per-topic summary table: Topic, Count, Name, Representation, Visual_Aspect,
# Representative_Docs. Topic -1 is listed first in the recorded output.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Visual_Aspect,Representative_Docs
0,-1,1281,-1_abwildfire_smoke_alberta_abfires,"[abwildfire, smoke, alberta, abfires, abfire, ...",<PIL.Image.Image image mode=RGB size=345x600 a...,[Here's a look at the current wildfires burnin...
1,0,2232,0_bcwildfire_wildfire_smoke_abfire,"[bcwildfire, wildfire, smoke, abfire, alberta,...",<PIL.Image.Image image mode=RGB size=326x600 a...,[Weather update on the wildfire situation in A...
2,1,359,1_jasper_jasperwildfire_town_jaspernationalpark,"[jasper, jasperwildfire, town, jaspernationalp...",<PIL.Image.Image image mode=RGB size=495x600 a...,[So sad to hear of the devastating fires up in...
3,2,197,2_abwildfire_alberta_support_albertans,"[abwildfire, alberta, support, albertans, ucp,...",<PIL.Image.Image image mode=RGB size=484x600 a...,"[Today, Prime Minister Justin Trudeau met with..."
4,3,178,3_edson_abfires_abwildfire_abwildfires,"[edson, abfires, abwildfire, abwildfires, abfi...",<PIL.Image.Image image mode=RGB size=793x600 a...,"[#ABFire Edson Airport #yeg , #ABFire Edson Ai..."
5,4,134,4_insurance_questions_amp share_share,"[insurance, questions, amp share, share, amp, ...",<PIL.Image.Image image mode=RGB size=899x600 a...,[If you have insurance questions related to ev...
6,5,60,5_calgary_calgary yyc_yycnow_abfires abwildfire,"[calgary, calgary yyc, yycnow, abfires abwildf...",<PIL.Image.Image image mode=RGB size=559x600 a...,[Feeling like you're rowing upstream with your...
7,6,50,6_eds_emergency_pack_grabandgo,"[eds, emergency, pack, grabandgo, kit, make, p...",<PIL.Image.Image image mode=RGB size=273x600 a...,"[Our EDS crew arrived in High Level, AB, today..."
8,7,46,7_smith_danielle smith_danielle_ableg,"[smith, danielle smith, danielle, ableg, abpol...",<PIL.Image.Image image mode=RGB size=281x600 a...,[Danielle Smith with an update from wildfire t...
9,8,39,8_kamalaharris_paris2024_biden2024_onstorm,"[kamalaharris, paris2024, biden2024, onstorm, ...",<PIL.Image.Image image mode=RGB size=603x600 a...,[#WEF #Covid19Ab #OnPoli #AbPoli #AbLeg #CndPo...


# 20 topics

In [9]:
# Rebind topic_model to the saved "bcab&jasper_20" model; the cells in this
# section act on whichever model was loaded last.
topic_model = BERTopic.load("bertopic_models/bcab&jasper_20")

In [58]:
# Bar-chart view of the currently loaded model (20-topics section).
topic_model.visualize_barchart()

In [59]:
# Heatmap view of the currently loaded model (20-topics section).
topic_model.visualize_heatmap()

In [61]:
# Topic-map view of the currently loaded model (20-topics section).
topic_model.visualize_topics()

In [62]:
# Hierarchy view of the currently loaded model (20-topics section).
topic_model.visualize_hierarchy()

In [63]:
# Per-topic summary table: Topic, Count, Name, Representation, Visual_Aspect,
# Representative_Docs.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Visual_Aspect,Representative_Docs
0,-1,1312,-1_abwildfire_wildfire_wildfires_alberta,"[abwildfire, wildfire, wildfires, alberta, bcw...",<PIL.Image.Image image mode=RGB size=872x600 a...,[The quickest and most reliable way to receive...
1,0,1097,0_bcwildfire_wildfire_service_bc,"[bcwildfire, wildfire, service, bc, creek, bcw...",<PIL.Image.Image image mode=RGB size=545x600 a...,[Crews are monitoring Fire Danger Rating close...
2,1,696,1_alberta_abwildfire_abfire_wildfire,"[alberta, abwildfire, abfire, wildfire, calgar...",<PIL.Image.Image image mode=RGB size=347x600 a...,[Weather update on the wildfire situation in A...
3,2,400,2_smoke_air_air quality_quality,"[smoke, air, air quality, quality, abfires, mo...",<PIL.Image.Image image mode=RGB size=360x600 a...,"[How bad is the air quality in Alberta? Well, ..."
4,3,373,3_jasper_jasperwildfire_town_jaspernationalpark,"[jasper, jasperwildfire, town, jaspernationalp...",<PIL.Image.Image image mode=RGB size=608x600 a...,[The beautiful Jasper National Park is on fire...
5,4,169,4_abfires_abwildfire_abwildfires_yyc,"[abfires, abwildfire, abwildfires, yyc, abfire...",<PIL.Image.Image image mode=RGB size=384x600 a...,[In anticipation of the upcoming heatwave in A...
6,5,162,5_forest_area_abwildfire_ban,"[forest, area, abwildfire, ban, wildfire, fore...",<PIL.Image.Image image mode=RGB size=852x600 a...,[A fire restriction is in effect for the Rocky...
7,6,118,6_insurance_questions_amp share_share,"[insurance, questions, amp share, share, share...",<PIL.Image.Image image mode=RGB size=898x600 a...,[If you have insurance questions related to ev...
8,7,69,7_emergency_evacuation_grabandgo_plan,"[emergency, evacuation, grabandgo, plan, bcwil...",<PIL.Image.Image image mode=RGB size=450x600 a...,[The first step to getting prepared is knowing...
9,8,46,8_yycnow_abfires abwildfire_yycbusiness_yycbus...,"[yycnow, abfires abwildfire, yycbusiness, yycb...",<PIL.Image.Image image mode=RGB size=788x600 a...,[Feeling like you're rowing upstream with your...


# 25 topics

In [3]:
# Rebind topic_model to the saved "bcab&jasper_25" model; the cells in this
# section act on whichever model was loaded last.
topic_model = BERTopic.load("bertopic_models/bcab&jasper_25")

In [3]:
# Bar-chart view of the currently loaded model (25-topics section).
topic_model.visualize_barchart()

In [76]:
# Heatmap view of the currently loaded model (25-topics section).
topic_model.visualize_heatmap()

In [77]:
# Topic-map view of the currently loaded model (25-topics section).
topic_model.visualize_topics()

In [78]:
# Hierarchy view of the currently loaded model (25-topics section).
topic_model.visualize_hierarchy()

In [6]:
# Keep the per-topic summary table in `df` so single cells can be inspected.
# Note: this rebinds `df`, which the HTML-rendering cell also assigns.
df = topic_model.get_topic_info()

In [9]:
# Full keyword list of the row labelled 12, read with a single .loc lookup.
df.loc[12, 'Representation']

['yycnow',
 'yycbusiness',
 'yycbusiness yycnow',
 'yyc yycbusiness',
 'abfires abwildfire',
 'calgary',
 'startups',
 'wordify',
 'smallbusiness',
 'smallbusiness startups']

# 30 topics

In [11]:
# Rebind topic_model to the saved "bcab&jasper_30" model; the cells in this
# section act on whichever model was loaded last.
topic_model = BERTopic.load("bertopic_models/bcab&jasper_30")

In [100]:
# Bar-chart view of the currently loaded model (30-topics section).
topic_model.visualize_barchart()

In [12]:
# Heatmap view of the currently loaded model (30-topics section).
topic_model.visualize_heatmap()

In [102]:
# Topic-map view of the currently loaded model (30-topics section).
topic_model.visualize_topics()

In [103]:
# Hierarchy view of the currently loaded model (30-topics section).
topic_model.visualize_hierarchy()

In [104]:
# Per-topic summary table. In the recorded output for this model the
# Representative_Docs column is empty.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Visual_Aspect,Representative_Docs
0,-1,1306,-1_abfire_abwildfire_alberta_smoke,"[abfire, abwildfire, alberta, smoke, bcwildfir...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,
1,0,764,0_bcwildfire_wildfire_service_creek,"[bcwildfire, wildfire, service, creek, evacuat...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,
2,1,645,1_alberta_wildfires_abwildfire_abfire,"[alberta, wildfires, abwildfire, abfire, wildf...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,
3,2,424,2_smoke_abfires_sunset_morning,"[smoke, abfires, sunset, morning, sun, smoky, ...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,
4,3,361,3_jasper_jasperwildfire_town_jaspernationalpark,"[jasper, jasperwildfire, town, jaspernationalp...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,
5,4,204,4_bcfire_kamloops_new bcfire_new,"[bcfire, kamloops, new bcfire, new, bcstorm, b...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,
6,5,199,5_abwildfire_support_alberta_albertans,"[abwildfire, support, alberta, albertans, ucp,...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,
7,6,158,6_safe_abfire_crews_edson,"[safe, abfire, crews, edson, town, eds, contin...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,
8,7,144,7_insurance_questions_amp share_share,"[insurance, questions, amp share, share, amp, ...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,
9,8,57,8_sturgeon lake_sturgeon_lake_wildfire,"[sturgeon lake, sturgeon, lake, wildfire, vall...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,


# 35 topics

In [13]:
# Rebind topic_model to the saved "bcab&jasper_35" model; the cells in this
# section act on whichever model was loaded last.
topic_model = BERTopic.load("bertopic_models/bcab&jasper_35")

In [44]:
# Bar-chart view of the currently loaded model (35-topics section).
topic_model.visualize_barchart()

In [14]:
# Heatmap view of the currently loaded model (35-topics section).
topic_model.visualize_heatmap()

In [15]:
# Topic-map view of the currently loaded model (35-topics section).
topic_model.visualize_topics()

In [50]:
# Hierarchy view of the currently loaded model (35-topics section).
topic_model.visualize_hierarchy()

In [51]:
# Per-topic summary table: Topic, Count, Name, Representation, Visual_Aspect,
# Representative_Docs.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Visual_Aspect,Representative_Docs
0,-1,1212,-1_abwildfire_abfire_alberta_wildfire,"[abwildfire, abfire, alberta, wildfire, bcwild...",<PIL.Image.Image image mode=RGB size=606x600 a...,[Weather update on the wildfire situation in A...
1,0,692,0_bcwildfire_wildfire_service_bcwildfire service,"[bcwildfire, wildfire, service, bcwildfire ser...",<PIL.Image.Image image mode=RGB size=409x600 a...,[Crews are monitoring Fire Danger Rating close...
2,1,635,1_alberta_wildfires_abwildfire_abfire,"[alberta, wildfires, abwildfire, abfire, wildf...",<PIL.Image.Image image mode=RGB size=694x600 a...,[Weather update on the wildfire situation in A...
3,2,369,2_jasper_jasperwildfire_town_jaspernationalpark,"[jasper, jasperwildfire, town, jaspernationalp...",<PIL.Image.Image image mode=RGB size=578x600 a...,[The beautiful Jasper National Park is on fire...
4,3,262,3_smoke_morning_today_air,"[smoke, morning, today, air, smoky, abfires, a...",<PIL.Image.Image image mode=RGB size=593x600 a...,[#ABWildfire smoke in the air this Thursday mo...
5,4,225,4_firesmart_forest_wildfire_bcwildfire,"[firesmart, forest, wildfire, bcwildfire, area...",<PIL.Image.Image image mode=RGB size=361x600 a...,[A fire restriction is in effect for the Rocky...
6,5,163,5_evacuation_order_evacuation order_alert,"[evacuation, order, evacuation order, alert, e...",<PIL.Image.Image image mode=RGB size=451x600 a...,"[Our EDS crew arrived in High Level, AB, today..."
7,6,126,6_abfires_yyc_abwildfires_abwildfire,"[abfires, yyc, abwildfires, abwildfire, yeg, s...",<PIL.Image.Image image mode=RGB size=403x600 a...,[Back in #yeg after a whirlwind of days in Nor...
8,7,110,7_support_abwildfire_donate_minister,"[support, abwildfire, donate, minister, help, ...",<PIL.Image.Image image mode=RGB size=600x600 a...,"[Today, Prime Minister Justin Trudeau met with..."
9,8,109,8_ucp_albertans_abwildfire_alberta,"[ucp, albertans, abwildfire, alberta, emergenc...",<PIL.Image.Image image mode=RGB size=387x600 a...,[Albertans evacuated due to #ABWildfire can st...


# auto topics

In [16]:
# Rebind topic_model to the saved "bcab&jasper_auto" model; the cells in this
# section act on whichever model was loaded last.
topic_model = BERTopic.load("bertopic_models/bcab&jasper_auto")

In [106]:
# Bar-chart view of the currently loaded model (auto-topics section).
topic_model.visualize_barchart()

In [17]:
# Heatmap view of the currently loaded model (auto-topics section).
topic_model.visualize_heatmap()

In [18]:
# Topic-map view of the currently loaded model (auto-topics section).
topic_model.visualize_topics()

In [112]:
# Hierarchy view of the currently loaded model (auto-topics section).
topic_model.visualize_hierarchy()

In [111]:
# Per-topic summary table. In the recorded output for this model the
# Representative_Docs column is empty.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Visual_Aspect,Representative_Docs
0,-1,1276,-1_abwildfire_alberta_abfire_smoke,"[abwildfire, alberta, abfire, smoke, wildfires...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,
1,0,3015,0_wildfire_bcwildfire_abwildfire_alberta,"[wildfire, bcwildfire, abwildfire, alberta, ab...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,
2,1,117,1_insurance_questions_amp share_share page,"[insurance, questions, amp share, share page, ...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,
3,2,45,2_yycnow_abfires abwildfire_yycbusiness yycnow...,"[yycnow, abfires abwildfire, yycbusiness yycno...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,
4,3,32,3_paris2024_kamalaharris_biden2024_onstorm,"[paris2024, kamalaharris, biden2024, onstorm, ...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,
5,4,30,4_edson_town edson_town_abfire,"[edson, town edson, town, abfire, edson reside...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,
6,5,27,5_365 days_days dnvfrs_365_247 365,"[365 days, days dnvfrs, 365, 247 365, safe 247...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,
7,6,23,6_smith_danielle smith_danielle_ableg,"[smith, danielle smith, danielle, ableg, yyc a...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,
8,7,20,7_transmission_lines_line_structures,"[transmission, lines, line, structures, damage...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,
9,8,17,8_filters_respirator_filter_air,"[filters, respirator, filter, air, wear, parti...",<PIL.JpegImagePlugin.JpegImageFile image mode=...,
