# Step 1: Install And Import Python Libraries

In [None]:
# Install bertopic
!pip install bertopic



After installing `bertopic`, when we tried to import the `BERTopic` method, a type error about an unexpected keyword argument `cachedir` came up.

This `TypeError` is caused by the incompatibility between `joblib` and `HDBSCAN`. At the time this tutorial was created, `joblib` has a new release that is not supported by `HDBSCAN`. HDBSCAN does have a fix for it but has not been rolled out. So if you are watching this tutorial on YouTube or reading this tutorial on Medium.com at a later time, you may not encounter this error message.

In [None]:
!pip install --upgrade tensorflow




In [None]:
# Try to import BERTopic
from bertopic import BERTopic

In [None]:
# Data processing
import pandas as pd
import numpy as np
# Dimension reduction
from umap import UMAP

import csv
import os
import jieba
import re

import sys
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount("/content/drive")
%cd "/content/drive/My Drive"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive


In [None]:
import pickle
with open('embeddings_zh.pkl', "rb") as file:
  embeddings_zh = pickle.load(file)
contents_zh = pd.read_csv('contents_zh.csv')

contents_zh_list = contents_zh['combined_content'].to_list()
# print(embeddings_zh)
# embeddings_zh_list = embeddings_zh.to_list()

timestamps = contents_zh['timestamp'].to_list()
print(len(embeddings_zh))
print(len(contents_zh))
print(len(timestamps))

919312
919312
919312


In [None]:
# Split the data into three parts
total_len = len(embeddings_zh)
part_len = total_len // 3

embeddings_zh_part_1 = embeddings_zh[:part_len]
embeddings_zh_part_2 = embeddings_zh[part_len:2 * part_len]
embeddings_zh_part_3 = embeddings_zh[2 * part_len:]

contents_zh_list_part_1 = contents_zh_list[:part_len]
contents_zh_list_part_2 = contents_zh_list[part_len:2 * part_len]
contents_zh_list_part_3 = contents_zh_list[2 * part_len:]

timestamps_part_1 = timestamps[:part_len]
timestamps_part_2 = timestamps[part_len:2 * part_len]
timestamps_part_3 = timestamps[2 * part_len:]

print(len(embeddings_zh_part_1))


print(len(timestamps_part_1))

306437
306437
306437


In [None]:
print(len(contents_zh_list_part_1))
print(len(contents_zh_list_part_2))

print(len(contents_zh_list_part_3))

306437
306437
306438


#Zero shot


In [None]:
# geo_topic_list = [
#     ['香港', '港幣', '港交所', 'HSI', '恆指', '恆生指數'],
#     ['美國', '美元', '美聯儲', '納指', '道指'],
#     ['中國', '人民幣', '中國央行', '上證指數', '深成指'],
#     ['英國', '英鎊', '英國央行', '倫交所', 'FTSE'],
#     ['日本', '日元', '日本央行', '東證', '日經225'],
#     ['歐洲', '歐盟', '歐元', '歐洲央行', '歐洲股市', 'DAX', 'CAC'],
#     ['東南亞', '東協', '東南亞國家'],
#     ['全球', '全球經濟', '國際市場', '全球股市', '全球貿易']
# ]
# geo_zeroshot_topics_list = [' '.join(sub_list) for sub_list in geo_topic_list]


In [None]:
event_topic_list = [
    ['加息', '降息', '鮑威爾', '港元', '拆息', '金管局','利率', '國債', '匯率', '外匯', '央行','聯儲局', '美元', '美聯儲', '基點', '美聯'],
    ['股市', '指期', '指數','股價', '本港', '香港'],
    ['預算案', '減稅', '財政', '支持', '人民', '政府', '融資', '政策', '財政赤字', '財政措施','本港', '香港'],
    ['增長', '衰退', '經濟', '增速', '消費', '預測', '通脹', '市場', '購買力', '信貸', '經濟指標', 'GDP', '本港', '香港', '中國'],
    ['房地產', '地產', '房地產投資信託', '房地產開發', '樓市', '樓價', '住宅市場', '房屋貸款', '本港', '香港', '中國'],
    ['貸款', '銀行', '貸款產品', '信貸', '貸款利率', '貸款條件', '按揭', '本港', '香港'],
    ['數字貨幣', '比特幣', '區塊鏈技術', '加密貨幣交易所', 'ICO', '加密貨幣監管', '虛擬貨幣', '加密貨幣市場'],
    ['黃金', '黃金價格波動', '黃金儲備', '避險資產', '金價', '黃金市場'],
    ['原材料', '油價','原油', '石油', '每桶' ,'減產', '庫存', '需求' '大宗商品', '農產品', '金屬', '能源', '期貨交易', '天然氣']
]
event_zeroshot_topics_list = [' '.join(sub_list) for sub_list in event_topic_list]


In [None]:
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Initiate UMAP
umap_model = UMAP(n_neighbors=15,
                  n_components=5,
                  min_dist=0.0,
                  metric='cosine',
                  random_state=42)
# Initiate clustering model
hdbscan_model = HDBSCAN(min_cluster_size=60, metric='euclidean', cluster_selection_method='eom', prediction_data=False)

# Initiate BERTopic
topic_model = BERTopic(verbose = True,  umap_model=umap_model, hdbscan_model=hdbscan_model, language="multilingual", calculate_probabilities=False, zeroshot_topic_list=event_zeroshot_topics_list,
    zeroshot_min_similarity=.50, embedding_model = embedding_model)


# Run BERTopic model
topics, probabilities = topic_model.fit_transform(contents_zh_list_part_3, embeddings_zh_part_3)


# Updata topic naming
# topic_model.update_topics(articles['content'], topics, n_gram_range=(1, 3))

# Save the model
topic_model.save("zeroshot_event_part_3")

2024-01-12 08:43:01,386 - BERTopic - Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics
2024-01-12 08:43:03,667 - BERTopic - Zeroshot Step 1 - Completed ✓
2024-01-12 08:43:03,677 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-12 08:45:54,100 - BERTopic - Dimensionality - Completed ✓
2024-01-12 08:45:54,105 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-12 08:46:02,401 - BERTopic - Cluster - Completed ✓
2024-01-12 08:46:02,433 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-12 08:47:14,305 - BERTopic - Representation - Completed ✓
2024-01-12 08:47:29,912 - BERTopic - Zeroshot Step 2 - Clustering documents that were not found in the zero-shot model...
2024-01-12 08:47:30,024 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-12 08:47:30,040 - BERTopic - Dimensionality - Completed ✓
2024-01-12 08:47

In [61]:
topic_model = BERTopic.load("zeroshot_event_part_3")

info_df = topic_model.get_topic_info()

doc_info = topic_model.get_document_info(contents_zh_list_part_3)

#Sentiment Ana Prepare

In [62]:
docs_per_topics = doc_info.groupby(["Topic"]).apply(lambda x: x.index).to_dict()
filtered_items = list(docs_per_topics.items())[:9]
filtered_dict = dict(filtered_items)
for i in range(9):
  filtered_dict[i] = filtered_dict[i].tolist()
print(filtered_dict)
# info_df.to_csv('zeroshot_event_part_3_topic_info.csv', index=False)




{0: [2, 9, 11, 13, 14, 16, 23, 25, 26, 27, 30, 32, 36, 38, 43, 52, 57, 70, 77, 78, 84, 105, 107, 108, 109, 110, 115, 118, 119, 120, 124, 126, 128, 131, 132, 133, 136, 139, 140, 146, 152, 153, 157, 163, 164, 165, 173, 174, 177, 181, 195, 199, 205, 220, 223, 227, 234, 235, 243, 244, 259, 262, 263, 264, 266, 304, 306, 315, 316, 317, 319, 320, 323, 328, 329, 332, 334, 342, 343, 344, 348, 353, 360, 361, 363, 365, 366, 377, 378, 379, 395, 397, 399, 406, 409, 410, 422, 425, 435, 451, 463, 467, 476, 481, 486, 501, 507, 511, 516, 519, 537, 546, 551, 556, 565, 574, 577, 580, 587, 594, 602, 603, 606, 615, 617, 620, 621, 622, 624, 626, 630, 631, 632, 636, 639, 640, 645, 646, 648, 649, 657, 665, 667, 676, 677, 679, 680, 686, 692, 697, 705, 708, 709, 713, 724, 727, 734, 735, 736, 737, 741, 742, 745, 748, 749, 751, 752, 757, 759, 769, 770, 773, 776, 778, 780, 790, 801, 817, 826, 835, 836, 837, 842, 843, 844, 847, 850, 856, 860, 865, 866, 867, 868, 877, 879, 881, 886, 887, 891, 893, 902, 906, 917, 927

In [49]:
len(filtered_dict[1])

26929

In [57]:
dict1 = filtered_dict

In [60]:
dict2 = filtered_dict

In [63]:
dict3 = filtered_dict

In [66]:
dict3[1][44464]


919309

In [68]:
combined_dict = {}
offset = 306437
for key in dict2.keys():
  dict2[key] = [x + offset for x in dict2[key]]

  # Update the dictionary with the new value
offsetplus = offset*2
for key in dict3.keys():
  dict3[key] = [x + offsetplus for x in dict3[key]]
for key in dict2.keys():
  combined_dict[key] = dict1[key] + dict2[key] + dict3[key]

  # Update the dictionary with the new value
# for key in dict1.keys():
#     combined_dict[key] = dict1[key] + [x + offset for x in dict2[key]] + [x + offset*2 for x in dict3[key]]
    # offset += len(dict1[key])
# for key in dict2.keys():
#     combined_dict[key] += [x + offset for x in dict3[key]]
    # offset += len(dict2[key])
# print(combined_dict)
import json
# File path to save the dictionary
file_path = 'topic_docidx_dict_2.json'

# Convert dictionary to JSON string
json_data = json.dumps(combined_dict)

# Write JSON string to file
with open(file_path, 'w') as file:
    file.write(json_data)


In [None]:
for key, value in combined_dict.items():
    # Get the length of each list value
    value_length = len(value)

    print("Length of list under key", key, ":", value_length)

Length of list under key 0 : 177509
Length of list under key 1 : 104838
Length of list under key 2 : 67985
Length of list under key 3 : 51549
Length of list under key 4 : 29043
Length of list under key 5 : 15377
Length of list under key 6 : 9629
Length of list under key 7 : 8459
Length of list under key 8 : 6275
