# Step 1: Install And Import Python Libraries

In [None]:
# Install bertopic
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━

After installing `bertopic`, importing the `BERTopic` class raised a `TypeError` about an unexpected keyword argument `cachedir`.

This `TypeError` is caused by an incompatibility between `joblib` and `HDBSCAN`. At the time this tutorial was created, `joblib` had a new release that was not yet supported by `HDBSCAN`. `HDBSCAN` has a fix for it, but the fix had not been released yet. So if you are watching this tutorial on YouTube or reading it on Medium.com at a later time, you may not encounter this error message.

In [None]:
!pip install --upgrade tensorflow


Collecting tensorflow
  Downloading tensorflow-2.15.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow
  Attempting uninstall: tensorflow
    Found existing installation: tensorflow 2.15.0
    Uninstalling tensorflow-2.15.0:
      Successfully uninstalled tensorflow-2.15.0
Successfully installed tensorflow-2.15.0.post1


In [None]:
# Try to import BERTopic
from bertopic import BERTopic

In [None]:
# Data processing
import pandas as pd
import numpy as np
# Dimension reduction
from umap import UMAP

import csv
import os
import jieba
import re

import sys
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive at /content/drive using the Colab-specific
# `google.colab.drive` helper.
from google.colab import drive
drive.mount("/content/drive")
# Make the Drive root the working directory: the CSV files and saved models
# in the cells below are referenced by bare relative names.
%cd "/content/drive/My Drive"

Mounted at /content/drive
/content/drive/My Drive


In [None]:
# NOTE: this whole cell is commented out and does nothing when run.
# It is the disabled loader for the pickled embeddings ('embeddings_zh.pkl')
# and the documents/timestamps table ('contents_zh.csv'); the three printed
# lengths were a sanity check on the loaded objects.
# SECURITY: if re-enabled, only unpickle files from a trusted source --
# pickle.load can execute arbitrary code.
# import pickle
# with open('embeddings_zh.pkl', "rb") as file:
#   embeddings_zh = pickle.load(file)
# contents_zh = pd.read_csv('contents_zh.csv')

# contents_zh_list = contents_zh['combined_content'].to_list()
# # print(embeddings_zh)
# # embeddings_zh_list = embeddings_zh.to_list()

# timestamps = contents_zh['timestamp'].to_list()
# print(len(embeddings_zh))
# print(len(contents_zh))
# print(len(timestamps))

In [None]:
# NOTE: this whole cell is commented out and does nothing when run.
# Split the data into three parts: embeddings, documents and timestamps are
# cut at the same indices into three contiguous slices of
# `len(embeddings_zh) // 3` items each, with the third slice also taking any
# remainder. It depends on the variables produced by the (also disabled)
# loading cell above.
# total_len = len(embeddings_zh)
# part_len = total_len // 3

# embeddings_zh_part_1 = embeddings_zh[:part_len]
# embeddings_zh_part_2 = embeddings_zh[part_len:2 * part_len]
# embeddings_zh_part_3 = embeddings_zh[2 * part_len:]

# contents_zh_list_part_1 = contents_zh_list[:part_len]
# contents_zh_list_part_2 = contents_zh_list[part_len:2 * part_len]
# contents_zh_list_part_3 = contents_zh_list[2 * part_len:]

# timestamps_part_1 = timestamps[:part_len]
# timestamps_part_2 = timestamps[part_len:2 * part_len]
# timestamps_part_3 = timestamps[2 * part_len:]

# print(len(embeddings_zh_part_1))
# print(len(contents_zh_list_part_1))
# print(len(timestamps_part_1))

# Visualization

In [None]:
# Part 1: read the exported topics-over-time table and the saved BERTopic
# model (both resolved relative to the current working directory), then
# render the topics-over-time figure as the cell's output.
topics_over_time_1 = pd.read_csv('zeroshot_event_part_1_vis.csv')
topic_model_1 = BERTopic.load("zeroshot_event_part_1")

# An explicit title distinguishes this figure from the other per-part
# figures, which would otherwise all share the same default heading.
# To restrict the plot to the most frequent topics, add e.g. `top_n_topics=10`.
topic_model_1.visualize_topics_over_time(
    topics_over_time_1,
    title="<b>Topics over Time (part 1)</b>",
)

In [None]:
# Part 2: read the exported topics-over-time table and the saved BERTopic
# model (both resolved relative to the current working directory), then
# render the topics-over-time figure as the cell's output.
topics_over_time_2 = pd.read_csv('zeroshot_event_part_2_vis.csv')
topic_model_2 = BERTopic.load("zeroshot_event_part_2")

# An explicit title distinguishes this figure from the other per-part
# figures, which would otherwise all share the same default heading.
# To restrict the plot to the most frequent topics, add e.g. `top_n_topics=10`.
topic_model_2.visualize_topics_over_time(
    topics_over_time_2,
    title="<b>Topics over Time (part 2)</b>",
)

In [None]:
# Part 3: read the exported topics-over-time table and the saved BERTopic
# model (both resolved relative to the current working directory), then
# render the topics-over-time figure as the cell's output.
topics_over_time_3 = pd.read_csv('zeroshot_event_part_3_vis.csv')
topic_model_3 = BERTopic.load("zeroshot_event_part_3")

# An explicit title distinguishes this figure from the other per-part
# figures, which would otherwise all share the same default heading.
# To restrict the plot to the most frequent topics, add e.g. `top_n_topics=30`.
topic_model_3.visualize_topics_over_time(
    topics_over_time_3,
    title="<b>Topics over Time (part 3)</b>",
)