In [1]:
from bertopic import BERTopic
import pandas as pd
from datasets import load_from_disk
import re
import csv
import base64
from io import BytesIO
from IPython.display import HTML
from PIL import Image

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [29]:
# Text:
def remove_links(text):
    """Return ``text`` with link-like substrings stripped out.

    Every occurrence of ``http`` or ``https`` that is followed by at least one
    non-whitespace character is removed, together with all characters up to
    the next whitespace. Surrounding whitespace is left untouched.
    """
    link_pattern = re.compile(r'https?\S+')
    return link_pattern.sub('', text)

def extract_text_from_csv(csv_file):
    """Read tweet texts and their IDs from a CSV file.

    The file is expected to have ``text`` and ``tweet_id`` columns. Rows whose
    text is empty or missing are skipped; every kept text has its links
    stripped with ``remove_links``.

    Args:
        csv_file: Path of the CSV file to read (decoded as ``utf-8-sig``).

    Returns:
        A ``(texts, ids)`` tuple of two parallel lists: the cleaned texts and
        the matching ``tweet_id`` values.

    Raises:
        KeyError: If a row lacks the ``text`` or ``tweet_id`` column.
    """
    texts = []
    ids = []
    # newline='' hands line-ending handling to the csv module, which is what
    # it needs to parse quoted fields containing line breaks correctly.
    with open(csv_file, 'r', encoding='utf-8-sig', newline='') as file:
        for row in csv.DictReader(file):
            text = row['text']
            tweet_id = row['tweet_id']
            # DictReader fills missing trailing fields with None, so test
            # truthiness rather than comparing against '' only.
            if not text:
                continue
            texts.append(remove_links(text))
            ids.append(tweet_id)
    return texts, ids

In [23]:
# Multimodal: load the pre-trained BERTopic model from the path below.
# NOTE(review): the "Text" load cell also assigns `topic_model`, so whichever
# of the two load cells ran last decides which model later cells operate on.
topic_model = BERTopic.load('NS_wildfire_2023~10,017(model)')


In [27]:
# Text: load the pre-trained BERTopic model from the path below and capture
# its per-document topic assignments.
# NOTE(review): this rebinds `topic_model` and `topics`, replacing whatever the
# "Multimodal" cells assigned to those names.
topic_model = BERTopic.load('ottawa_tornado_2018~10,009(text_model)')
topics = topic_model.topics_

In [24]:
# Multimodal: load the saved dataset and pull out its "text" and "image"
# columns, then capture the topic assignments of the current `topic_model`.
# NOTE(review): `topics` comes from whichever model `topic_model` currently
# refers to — run the multimodal model-load cell first so it matches `docs`.
loaded_dataset = load_from_disk('NS_wildfire_2023~10,017(dataset)')
docs = loaded_dataset["text"]
images = loaded_dataset["image"]
topics = topic_model.topics_

In [30]:
# Text: load the dataset. TEXTS holds the link-stripped tweet texts and IDS the
# matching tweet IDs (two parallel lists returned by extract_text_from_csv).
TEXTS, IDS = extract_text_from_csv('ottawa_tornado_2018~10,009.csv')

In [25]:
# Multimodal: write one CSV row per document with its image, topic number,
# topic representation and a "possibly useful" flag.
csv_file_name = 'NS_wildfire_2023~10,017(multimodal_processed).csv'
# Topic numbers flagged as possibly useful in the output.
useful_topics = [0,1,2,3,4,5,6,7,8,9,10,13,15,16,18,19,20,21,22,23,24,26,27,28,29,30,31,32]

# Build the per-document table once and read both columns from it, instead of
# asking the model to recompute it for every column.
_doc_info = topic_model.get_document_info(docs)
representations = _doc_info.get("Representation")
visual_aspects = _doc_info.get("Visual_Aspect")

# Set membership keeps the per-row lookup constant-time.
_useful_lookup = frozenset(useful_topics)

# The 'Visual Aspect' column is currently disabled, so `visual_aspects` is kept
# available above but is not part of the rows written to the CSV.
fieldnames = ['text',
              'image',
              'topic_num',
              'Representation',
              'possibly_useful']
data = [{'text': text,
         'image': image,
         'topic_num': num,
         'Representation': rep,
         # None is written by the csv module as an empty cell.
         'possibly_useful': 'True' if num in _useful_lookup else None}
        for text, image, num, rep in zip(docs, images, topics, representations)]

with open(csv_file_name, 'w', newline='', encoding='utf-8-sig') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

In [32]:
# Text: write one CSV row per tweet text with its topic number, topic
# representation and a "possibly useful" flag.
csv_file_name = 'ottawa_tornado_2018~10,009(text_processed).csv'
useful_topics = [0,1,2,3,4,5,6,7,8,10,11,13,15,16,17,18,21,22,24,25,26,27,28,30,32]
representations = topic_model.get_document_info(TEXTS).get("Representation")

# Column order of the output file; each row dict is built in this same order.
fieldnames = ['text', 'topic_num', 'Representation', 'possibly_useful']
data = [
    dict(zip(fieldnames,
             (text, num, rep, 'True' if num in useful_topics else None)))
    for text, num, rep in zip(TEXTS, topics, representations)
]

with open(csv_file_name, 'w', newline='', encoding='utf-8-sig') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

In [33]:
# Attach the hand-written topic labels to the processed CSV as a
# 'description' column, looked up by 'topic_num'.
name = 'ottawa_tornado_2018~10,009(text_processed).csv'
# utf-8-sig matches the encoding the processed CSV was written with; it reads
# files with or without a BOM.
df1 = pd.read_csv('ottawa_tornado_text_labels.csv', encoding='utf-8-sig')
df2 = pd.read_csv(name, encoding='utf-8-sig')

# This cell overwrites its own input file, so a previous run may already have
# added 'description'. Dropping it first keeps the cell idempotent instead of
# accumulating duplicate 'description' columns on every re-run.
# validate='many_to_one' fails loudly if the labels file lists a topic_num more
# than once, which would otherwise silently duplicate document rows.
merged_df = (
    df2
    .drop(columns=['description'], errors='ignore')
    .merge(df1[['topic_num', 'label']], on='topic_num', how='left',
           validate='many_to_one')
    .rename(columns={'label': 'description'})
)

# Save the result back over the processed CSV, keeping its original encoding.
merged_df.to_csv(name, index=False, encoding='utf-8-sig')


In [None]:
# Multimodal:
def get_thumbnail(image_path, size=(100, 100)):
    """Open ``image_path`` and shrink it in place to fit within ``size``.

    Returns the resized image object. Any exception raised while opening or
    resizing is reported with ``print`` and ``None`` is returned instead.
    """
    try:
        thumb = Image.open(image_path)
        thumb.thumbnail(size)
    except Exception as err:
        print(f"Error generating thumbnail: {str(err)}")
        return None
    else:
        return thumb

def image_base64(im):
    """Encode an image as a base64 JPEG string.

    Args:
        im: Either an image object exposing ``mode``, ``convert`` and ``save``
            (as PIL images do), or a string path, in which case a thumbnail
            is generated from it with ``get_thumbnail`` first.

    Returns:
        The base64 text of the JPEG bytes, or an empty string when no image is
        available (``im`` is ``None`` or the thumbnail could not be created).
    """
    if isinstance(im, str):
        im = get_thumbnail(im)
    # get_thumbnail returns None on failure; degrade to an empty payload
    # instead of crashing the whole table render with an AttributeError.
    if im is None:
        return ''
    # JPEG cannot store alpha or palette modes (e.g. RGBA, P, LA); convert
    # those to RGB so saving does not raise.
    if im.mode not in ('RGB', 'L', 'CMYK'):
        im = im.convert('RGB')
    with BytesIO() as buffer:
        im.save(buffer, 'jpeg')
        return base64.b64encode(buffer.getvalue()).decode()

def image_formatter(im):
    """Render ``im`` as an HTML ``<img>`` tag with an inline base64 JPEG source."""
    encoded = image_base64(im)
    return f'<img src="data:image/jpeg;base64,{encoded}">'

# Per-document table without the columns that are not shown in the rendering.
# Columns are dropped by keyword: passing the axis positionally
# (`.drop("Name", 1)`) is rejected by pandas 2.0 and later.
document_info = topic_model.get_document_info(docs).drop(
    columns=["Name", "Representative_Docs", "Top_n_words", "Representative_document"]
)
# For a quick preview, slice first, e.g.: document_info = document_info[:3]
df = pd.DataFrame(document_info)
# Visualize the images: the 'Visual_Aspect' column is rendered through
# image_formatter, and escape=False lets the generated <img> tags through.
HTML(df.to_html(formatters={'Visual_Aspect': image_formatter}, escape=False))

In [None]:
# Show the topic visualization for whichever model `topic_model` currently
# refers to; as the cell's last expression, the result is rendered as output.
topic_model.visualize_topics()

In [None]:
# Display the topics of whichever model `topic_model` currently refers to.
topic_model.get_topics()

In [None]:
# Show the topic hierarchy visualization for whichever model `topic_model`
# currently refers to.
topic_model.visualize_hierarchy()

In [26]:
# Merge topics 37, 6, 65, 10 and 39 of the current `topic_model`, using TEXTS
# as the documents.
# NOTE(review): TEXTS is only assigned in the "Text: Load the dataset" cell and
# `topic_model` must be the text model for the two to match — on a fresh kernel
# run those cells first. Presumably this changes the model's topic assignments,
# in which case the earlier `topics = topic_model.topics_` snapshot should be
# refreshed afterwards — confirm against the BERTopic documentation.
topic_model.merge_topics(TEXTS, topics_to_merge=[37,6,65,10,39])