# Step 1: Install And Import Python Libraries

In [None]:
# Install bertopic
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━

After installing `bertopic`, importing the `BERTopic` class raised a `TypeError` about an unexpected keyword argument `cachedir`.

This `TypeError` is caused by an incompatibility between `joblib` and `HDBSCAN`. At the time this tutorial was created, `joblib` had published a new release that `HDBSCAN` did not yet support. `HDBSCAN` has a fix for it, but the fix had not been released yet. So if you are watching this tutorial on YouTube or reading it on Medium.com at a later time, you may not encounter this error message.

In [None]:
!pip install --upgrade tensorflow


Collecting tensorflow
  Downloading tensorflow-2.15.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow
  Attempting uninstall: tensorflow
    Found existing installation: tensorflow 2.15.0
    Uninstalling tensorflow-2.15.0:
      Successfully uninstalled tensorflow-2.15.0
Successfully installed tensorflow-2.15.0.post1


In [None]:
# Try to import BERTopic
from bertopic import BERTopic

In [None]:
# Data processing
import pandas as pd
import numpy as np
# Dimension reduction
from umap import UMAP

import csv
import os
import jieba
import re

import sys
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive at /content/drive (Colab-only: relies on google.colab).
from google.colab import drive
drive.mount("/content/drive")
# Make "My Drive" the working directory so that the relative file names used by
# the later cells (e.g. 'zh_5.csv') are resolved inside Drive.
%cd "/content/drive/My Drive"

Mounted at /content/drive
/content/drive/My Drive


# Tokenization





In [None]:
!pip install TCSP


Collecting TCSP
  Downloading TCSP-0.0.9-py3-none-any.whl (5.7 kB)
Installing collected packages: TCSP
Successfully installed TCSP-0.0.9


In [None]:
from TCSP import read_stopwords_list

# Load the stopword collection shipped with the TCSP package. pretty_cut() filters
# jieba tokens against this module-level name.
# NOTE(review): presumably a list of Traditional Chinese stopwords - confirm against the TCSP docs.
stopwords = read_stopwords_list()


In [None]:
# Parsing and remove stopwords
def pretty_cut(sentence):
    """Segment the Chinese text in ``sentence`` with jieba and drop stopwords.

    Only characters in the range U+4E00-U+9FA5 are kept before segmentation;
    every other character (Latin letters, digits, punctuation, whitespace) is
    discarded.

    Args:
        sentence: Text to tokenise. Non-string values (for example the float
            NaN that pandas yields for an empty CSV cell, or None) are treated
            as empty text.

    Returns:
        list[str]: The jieba tokens in their original order, excluding every
        token found in the module-level ``stopwords`` collection.
    """
    if not isinstance(sentence, str):
        # re.findall raises TypeError on non-strings such as NaN; a missing
        # field simply contributes no tokens instead of aborting the whole run.
        return []
    chinese_only = ''.join(re.findall('[\u4e00-\u9fa5]', sentence))
    return [
        token
        for token in jieba.lcut(chinese_only, cut_all=False)
        if token not in stopwords
    ]

# ————————————————
# 版权声明：本文为CSDN博主「银河小铁骑plus」的原创文章，遵循CC 4.0 BY-SA版权协议，转载请附上原文出处链接及本声明。
# 原文链接：https://blog.csdn.net/weixin_47113960/article/details/125373275

In [None]:
# Load the raw articles; the 'headline', 'content' and 'pubdate' columns are used below.
articles = pd.read_csv('zh_5.csv')


def _tokenised_text(row):
    """Return the space-joined headline tokens, a space, then the space-joined content tokens."""
    headline_tokens = pretty_cut(row['headline'])
    content_tokens = pretty_cut(row['content'])
    return " ".join(headline_tokens) + " " + " ".join(content_tokens)


# Combine the tokenised text with the publication time in a new DataFrame.
new_df = pd.DataFrame({
    'combined_content': articles.apply(_tokenised_text, axis=1),
    'timestamp': articles['pubdate'],
})
print(new_df.head())



Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.800 seconds.
DEBUG:jieba:Loading model cost 0.800 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


                                    combined_content            timestamp
0  拜登 周末 簽署 暫停 債務 上限 法案 指違 約 災難性 拜登 指債務 違約將 災難性 美...  2023-06-03 07:22:00
1  愛情 生育 婚前 檢查 揭同 患 地中海 貧血女 醫生 未婚夫 憂禍 下一代 忍痛 分手 拆...  2023-06-03 08:00:00
2  奧 斯汀 美中 軍事 領導人 溝通 避免 誤解 誤判 奧 斯汀 指美 中國防 軍事 領導人 ...  2023-06-03 09:28:00
3  墨西哥 米 深 谷底 發現 袋 屍塊疑 接連 失踪 人員 搜救 人員運 走 尸塊 橙 訊 墨...  2023-06-03 09:53:00
4  白宮 拜登 最快 周六 簽署 債務 上限 法案 拜登 圖片 外媒報 道白 宮新聞 秘書 表示...  2023-06-03 10:11:19


In [None]:
# Save the tokenised articles for the embedding step; index=False keeps the
# DataFrame's row index out of the CSV file.
new_df.to_csv('zh_5_parsed.csv', index=False)


In [None]:
# Target topics: each inner list groups the keywords of one topic. The English
# label above each list is a translation of that list's first entry.
target_topics_list = [
    # Stock market
    ['股市', '股票交易', '股票指數', '股票投資策略', '股票市場分析',
     '藍籌股', '成長股', '價值股', '小盤股', '大盤股'],
    # Venture capital
    ['風險投資', '創業公司', '風險投資基金', '投資回報', '創業生態系統', '投資評估'],
    # Debt
    ['債務', '個人債務管理', '企業債務結構', '債券市場', '債務融資', '債務違約'],
    # US dollar
    ['美元', '美元匯率', '美元指數', '美元政策', '美元供求關係', '美元對其他貨幣的影響'],
    # Real estate
    ['房地產', '住宅市場', '商業地產', '房地產投資信託', '房地產開發', '房地產市場趨勢'],
    # Financing
    # NOTE(review): '債務融資' also appears in the Debt group above - confirm the overlap is intended.
    ['融資', '企業融資', '創業融資', '債務融資', '股權融資'],
    # Loans
    ['貸款', '個人貸款', '商業貸款', '房屋貸款', '汽車貸款', '學生貸款'],
    # Banking
    ['銀行', '銀行服務', '存款', '貸款審批', '金融與銀行業'],
    # Interest rates
    ['利率', '中央銀行利率政策', '借貸利率', '存款利率', '利率預測', '利率決策',
     '貨幣政策對利率的影響'],
    # Monetary policy
    ['貨幣政策', '貨幣供應', '央行操作', '貨幣政策工具', '通脹目標'],
    # Trade war
    ['貿易戰', '關稅', '貿易協定', '貿易平衡', '關稅戰略', '貿易限制措施'],
    # Capital flows
    ['資本流動', '直接投資', '資本管制', '資本流出', '資本流入', '跨境資金流動'],
    # Fintech
    ['金融科技', '數字支付', '金融科技創新', '電子交易平台', '金融科技監管'],
    # Digital currency
    ['數字貨幣', '比特幣', '區塊鏈技術', '加密貨幣交易所', 'ICO', '加密貨幣監管'],
    # Gold
    ['黃金', '黃金市場', '黃金投資', '黃金價格波動', '黃金儲備', '黃金交易策略'],
    # Raw materials
    ['原材料', '油價', '大宗商品交易', '農產品', '金屬', '能源產品'],
    # Global economy
    ['全球經濟', '經濟增長預測', '國際貿易關係', '全球經濟政策', '外國直接投資', '全球供應鏈'],
    # Consumer confidence
    ['消費者信心', '消費者支出', '消費者調查', '消費者情緒指數', '消費者信貸', '消費者購買力'],
    # Government spending
    ['政府支出', '基礎設施投資', '社會福利支出', '國防開支', '公共教育支出', '公共衛生支出'],
    # Foreign-exchange market
    ['外匯市場', '匯率決定', '外匯交易策略', '外匯市場參與者', '外匯交易類型', '外匯市場風險管理'],
    # Pensions
    ['退休金', '養老金計劃', '退休金積累', '退休金投資組合', '養老金領取方式', '養老金改革'],
    # Social security
    ['社會保障', '社會保障制度', '社會保障福利', '社會保障稅', '社會保障援助', '社會保障可持續性'],
]

In [None]:
# Collapse every topic's keyword list into a single space-separated string,
# giving one string per entry of target_topics_list.
zeroshot_topics_list = list(map(' '.join, target_topics_list))


# Embedding TM5

In [None]:
# Reload the tokenised zh_5 articles from the CSV written in the tokenization step.
df = pd.read_csv('zh_5_parsed.csv')

# Documents and their timestamps as two parallel Python lists.
contents, timestamps = (
    df[column].tolist() for column in ('combined_content', 'timestamp')
)

# Sanity check: print both lengths, one per line.
print(len(contents), len(timestamps), sep='\n')

153218
153218


In [None]:
# Name of the sentence-transformers checkpoint used to embed the documents.
MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
embedding_model = SentenceTransformer(MODEL_NAME)

# Encode every document in `contents`; the resulting embeddings are saved to
# disk by the next cell. The progress bar shows how far the encoding has got.
embeddings = embedding_model.encode(contents, show_progress_bar=True)


.gitattributes:   0%|          | 0.00/968 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/4789 [00:00<?, ?it/s]

In [None]:
import pickle

# Persist the zh_5 embeddings so the encoding step does not have to be repeated.
with open('zh_5_embeddings.pkl', mode='wb') as embeddings_file:
    pickle.dump(embeddings, embeddings_file)

# To restore them later (only unpickle files you created yourself):
# with open('zh_5_embeddings.pkl', mode='rb') as embeddings_file:
#     embeddings = pickle.load(embeddings_file)

# Embedding TM0

In [None]:
# Load the tokenised zh_0 articles. This rebinds df, contents and timestamps,
# replacing the zh_5 data loaded in the previous section.
# NOTE(review): no cell in view writes 'zh_0_parsed.csv' - presumably it was
# produced by an earlier run of the tokenization cells on zh_0; confirm.
df = pd.read_csv('zh_0_parsed.csv')

# Documents and their timestamps as two parallel Python lists.
contents, timestamps = (
    df[column].tolist() for column in ('combined_content', 'timestamp')
)

# Sanity check: print both lengths, one per line.
print(len(contents), len(timestamps), sep='\n')

153219
153219


In [None]:
# Encode the current `contents` (the zh_0 documents when the cells are run top to
# bottom) with the embedding_model created in the TM5 section. This rebinds
# `embeddings`, replacing the zh_5 vectors computed earlier.
embeddings = embedding_model.encode(contents, show_progress_bar=True)


Batches:   0%|          | 0/4789 [00:00<?, ?it/s]

In [None]:
import pickle

# Persist the zh_0 embeddings so the encoding step does not have to be repeated.
with open('zh_0_embeddings.pkl', mode='wb') as embeddings_file:
    pickle.dump(embeddings, embeddings_file)

# To restore them later (only unpickle files you created yourself):
# with open('zh_0_embeddings.pkl', mode='rb') as embeddings_file:
#     embeddings = pickle.load(embeddings_file)