<a href="https://colab.research.google.com/github/Jasoniee/paper_analysis_bzy/blob/main/bertopic_topic_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Topic Modeling with BERTopic on Children's Career Development Literature
This notebook performs topic modeling using BERTopic and addresses the following research questions:

1. What specific topics have emerged from the empirical literature on children's career development, and what terms are associated with each topic?
2. How have these topics changed over time?
3. Which topics show increasing or decreasing trends, and which remain consistently popular?


In [1]:
# Import necessary libraries
!pip install bertopic
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import matplotlib.pyplot as plt
import random
from datetime import datetime


Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloa

In [5]:
print(len(df))

104


In [3]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

# 下载 NLTK 所需资源
nltk.download('punkt')
nltk.download('stopwords')

# === 第一步：加载数据 ===
file_path = '/content/sample_data/Topic Modeling Articles_0731.xlsx'
df = pd.read_excel(file_path, sheet_name='Topic Modeling Articles')
df['text'] = df['Title'] + ' ' + df['Abstract Note'].fillna('')

# === 第二步：文本预处理（自定义领域停用词） ===
domain_stopwords = set(stopwords.words('english')).union({
    'career', 'child', 'development', 'student', 'study',
    'research', 'school', 'childrens', 'education', 'learning',
    'paper', 'article', 'find', 'result', 'literature'
})

def advanced_preprocess(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # 保留英文字符和空格
    tokens = [word for word in text.split()
              if len(word) > 3 and word not in domain_stopwords]
    return tokens

df['processed'] = df['text'].apply(advanced_preprocess)
df['processed_str'] = df['processed'].apply(lambda x: ' '.join(x))

# === 第三步：计算嵌入向量 ===
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(df['processed_str'].tolist(), show_progress_bar=True)

# === 第四步：配置 CountVectorizer（支持 bi-gram，去除低频/高频词） ===
vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=1,
    max_df=0.95
)

# === 第五步：使用 KMeans 聚类器，指定为 6 个主题 ===
kmeans_model = KMeans(n_clusters=6, random_state=42)

# === 第六步：构建 BERTopic 模型 ===
topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    hdbscan_model=kmeans_model,  # 注意：此处其实是用作 clustering_model 而非真正 HDBSCAN
    top_n_words=10,
    calculate_probabilities=False,
    language="english",
    verbose=True
)

# === 第七步：训练模型 ===
topics, _ = topic_model.fit_transform(df['processed_str'].tolist(), embeddings)

# === 第八步：查看主题概览 ===
topic_info = topic_model.get_topic_info()
print(topic_info)
pd.set_option("display.max_colwidth", None)
# === 第九步：打印每个主题的关键词 ===
for topic_id in topic_info['Topic']:
    if topic_id != -1:
        print(f"\n--- Topic {topic_id} ---")
        for word, weight in topic_model.get_topic(topic_id):
            print(f"{word}: {weight:.4f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2025-08-02 02:48:07,743 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-08-02 02:48:12,813 - BERTopic - Dimensionality - Completed ✓
2025-08-02 02:48:12,814 - BERTopic - Cluster - Start clustering the reduced embeddings
Exception ignored on calling ctypes callback function: <function ThreadpoolController._find_libraries_with_dl_iterate_phdr.<locals>.match_library_callback at 0x7a3ca3247f60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 1005, in match_library_callback
    self._make_controller_from_path(filepath)
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 1187, in _make_controller_from_path
    lib_controller = controller_class(
                     ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 114, in __init__
    self.dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

   Topic  Count                                               Name  \
0      0     24           0_students_model_selfefficacy_elementary   
1      1     22             1_stem_students_engineering_elementary   
2      2     20                   2_science_girls_students_careers   
3      3     16                    3_health_rural_primary_students   
4      4     15  4_occupational_occupational aspirations_knowle...   
5      5      7                5_family_parents_influence_communal   

                                      Representation  \
0  [students, model, selfefficacy, elementary, re...   
1  [stem, students, engineering, elementary, scie...   
2  [science, girls, students, careers, stereotype...   
3  [health, rural, primary, students, careers, he...   
4  [occupational, occupational aspirations, knowl...   
5  [family, parents, influence, communal, involve...   

                                 Representative_Docs  
0  [effectiveness quartet card game increasing kn...  
1  [st

In [4]:
#print topic
print(topic_info)


   Topic  Count  \
0      0     24   
1      1     22   
2      2     20   
3      3     16   
4      4     15   
5      5      7   

                                                                     Name  \
0                                0_students_model_selfefficacy_elementary   
1                                  1_stem_students_engineering_elementary   
2                                        2_science_girls_students_careers   
3                                         3_health_rural_primary_students   
4  4_occupational_occupational aspirations_knowledge_vocational knowledge   
5                                     5_family_parents_influence_communal   

                                                                                                                                       Representation  \
0                            [students, model, selfefficacy, elementary, readiness, knowledge, childhood, academic, significant, elementary students]   
1                   

In [6]:

topics_over_time = topic_model.topics_over_time(df['processed_str'], df['Publication Year'])

# 可视化变化趋势
topic_model.visualize_topics_over_time(topics_over_time)

20it [00:00, 81.00it/s]


In [7]:
import pandas as pd
from scipy.stats import linregress
import matplotlib.pyplot as plt

# 获取 topic 时间分布表
topics_over_time_df = pd.DataFrame(topics_over_time)

# 保证 Publication Year 是整数类型
topics_over_time_df['Timestamp'] = topics_over_time_df['Timestamp'].astype(int)

# 创建空字典记录每个主题的线性趋势斜率
trend_slopes = {}

# 对每个 topic 分别回归分析
for topic in topics_over_time_df['Topic'].unique():
    topic_data = topics_over_time_df[topics_over_time_df['Topic'] == topic]
    if len(topic_data) >= 3:  # 至少三个点才计算趋势
        slope, intercept, r_value, p_value, std_err = linregress(
            topic_data['Timestamp'], topic_data['Frequency']
        )
        trend_slopes[topic] = slope

# 分类结果容器
increasing = []
decreasing = []
stable = []

# 分类标准（你可以调整阈值）
for topic, slope in trend_slopes.items():
    if slope > 0.1:
        increasing.append((topic, slope))
    elif slope < -0.1:
        decreasing.append((topic, slope))
    else:
        stable.append((topic, slope))

# 打印结果
print("📈 Increasing Topics:")
for t in increasing:
    print(f"  Topic {t[0]} - Slope: {t[1]:.2f}")

print("\n📉 Decreasing Topics:")
for t in decreasing:
    print(f"  Topic {t[0]} - Slope: {t[1]:.2f}")

print("\n➖ Stable Topics:")
for t in stable:
    print(f"  Topic {t[0]} - Slope: {t[1]:.2f}")

📈 Increasing Topics:
  Topic 0 - Slope: 0.13

📉 Decreasing Topics:
  Topic 4 - Slope: -0.11

➖ Stable Topics:
  Topic 3 - Slope: 0.01
  Topic 1 - Slope: 0.06
  Topic 5 - Slope: -0.01
  Topic 2 - Slope: 0.08
