# Topic Modeling with BERTopic on Children's Career Development Literature
This notebook performs topic modeling using BERTopic and addresses the following research questions:

1. What specific topics have emerged from the empirical literature on children's career development, and what terms are associated with each topic?
2. How have these topics changed over time?
3. Which topics show increasing or decreasing trends, and which remain consistently popular?


In [1]:
# Import necessary libraries
!pip install bertopic
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import matplotlib.pyplot as plt
import random
from datetime import datetime




In [7]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [21]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

# 下载 NLTK 所需资源
nltk.download('punkt')
nltk.download('stopwords')

# === 第一步：加载数据 ===
file_path = '/content/sample_data/Topic Modeling Articles_0601.xlsx'
df = pd.read_excel(file_path, sheet_name='Topic Modeling Articles')
df['text'] = df['Title'] + ' ' + df['Abstract Note'].fillna('')

# === 第二步：文本预处理（自定义领域停用词） ===
domain_stopwords = set(stopwords.words('english')).union({
    'career', 'child', 'development', 'student', 'study',
    'research', 'school', 'childrens', 'education', 'learning',
    'paper', 'article', 'find', 'result', 'literature'
})

def advanced_preprocess(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # 保留英文字符和空格
    tokens = [word for word in text.split()
              if len(word) > 3 and word not in domain_stopwords]
    return tokens

df['processed'] = df['text'].apply(advanced_preprocess)
df['processed_str'] = df['processed'].apply(lambda x: ' '.join(x))

# === 第三步：计算嵌入向量 ===
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(df['processed_str'].tolist(), show_progress_bar=True)

# === 第四步：配置 CountVectorizer（支持 bi-gram，去除低频/高频词） ===
vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=1,
    max_df=0.95
)

# === 第五步：使用 KMeans 聚类器，指定为 6 个主题 ===
kmeans_model = KMeans(n_clusters=6, random_state=42)

# === 第六步：构建 BERTopic 模型 ===
topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    hdbscan_model=kmeans_model,  # 注意：此处其实是用作 clustering_model 而非真正 HDBSCAN
    top_n_words=10,
    calculate_probabilities=False,
    language="english",
    verbose=True
)

# === 第七步：训练模型 ===
topics, _ = topic_model.fit_transform(df['processed_str'].tolist(), embeddings)

# === 第八步：查看主题概览 ===
topic_info = topic_model.get_topic_info()
print(topic_info)
pd.set_option("display.max_colwidth", None)
# === 第九步：打印每个主题的关键词 ===
for topic_id in topic_info['Topic']:
    if topic_id != -1:
        print(f"\n--- Topic {topic_id} ---")
        for word, weight in topic_model.get_topic(topic_id):
            print(f"{word}: {weight:.4f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2025-07-25 11:43:07,342 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-25 11:43:07,541 - BERTopic - Dimensionality - Completed ✓
2025-07-25 11:43:07,543 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-25 11:43:07,555 - BERTopic - Cluster - Completed ✓
2025-07-25 11:43:07,562 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-25 11:43:07,628 - BERTopic - Representation - Completed ✓


   Topic  Count                                               Name  \
0      0     24                    0_stem_engineering_science_math   
1      1     20            1_model_selfefficacy_readiness_academic   
2      2     20  2_occupational_family_occupational aspirations...   
3      3     15        3_exploration_qualitative_influences_eacute   
4      4     15                 4_health_rural_field_professionals   
5      5     10             5_science_stereotypes_occupations_male   

                                      Representation  \
0  [stem, engineering, science, math, computer, e...   
1  [model, selfefficacy, readiness, academic, ach...   
2  [occupational, family, occupational aspiration...   
3  [exploration, qualitative, influences, eacute,...   
4  [health, rural, field, professionals, medical,...   
5  [science, stereotypes, occupations, male, earl...   

                                 Representative_Docs  
0  [measuring stem awareness interest middle chil...  
1  [ma

In [23]:
#print topic
print(topic_info)


   Topic  Count                                                       Name  \
0      0     24                            0_stem_engineering_science_math   
1      1     20                    1_model_selfefficacy_readiness_academic   
2      2     20  2_occupational_family_occupational aspirations_vocational   
3      3     15                3_exploration_qualitative_influences_eacute   
4      4     15                         4_health_rural_field_professionals   
5      5     10                     5_science_stereotypes_occupations_male   

                                                                                                                                Representation  \
0                                             [stem, engineering, science, math, computer, early, media, design, understanding, participation]   
1                               [model, selfefficacy, readiness, academic, achievement, significant, elementary students, skills, scale, ccca]   
2  [occupationa

In [19]:

topics_over_time = topic_model.topics_over_time(df['processed_str'], df['Publication Year'])

# 可视化变化趋势
topic_model.visualize_topics_over_time(topics_over_time)

20it [00:01, 15.43it/s]


In [20]:
import pandas as pd
from scipy.stats import linregress
import matplotlib.pyplot as plt

# 获取 topic 时间分布表
topics_over_time_df = pd.DataFrame(topics_over_time)

# 保证 Publication Year 是整数类型
topics_over_time_df['Timestamp'] = topics_over_time_df['Timestamp'].astype(int)

# 创建空字典记录每个主题的线性趋势斜率
trend_slopes = {}

# 对每个 topic 分别回归分析
for topic in topics_over_time_df['Topic'].unique():
    topic_data = topics_over_time_df[topics_over_time_df['Topic'] == topic]
    if len(topic_data) >= 3:  # 至少三个点才计算趋势
        slope, intercept, r_value, p_value, std_err = linregress(
            topic_data['Timestamp'], topic_data['Frequency']
        )
        trend_slopes[topic] = slope

# 分类结果容器
increasing = []
decreasing = []
stable = []

# 分类标准（你可以调整阈值）
for topic, slope in trend_slopes.items():
    if slope > 0.1:
        increasing.append((topic, slope))
    elif slope < -0.1:
        decreasing.append((topic, slope))
    else:
        stable.append((topic, slope))

# 打印结果
print("📈 Increasing Topics:")
for t in increasing:
    print(f"  Topic {t[0]} - Slope: {t[1]:.2f}")

print("\n📉 Decreasing Topics:")
for t in decreasing:
    print(f"  Topic {t[0]} - Slope: {t[1]:.2f}")

print("\n➖ Stable Topics:")
for t in stable:
    print(f"  Topic {t[0]} - Slope: {t[1]:.2f}")

📈 Increasing Topics:
  Topic 0 - Slope: 0.16
  Topic 5 - Slope: 0.16

📉 Decreasing Topics:

➖ Stable Topics:
  Topic 1 - Slope: -0.10
  Topic 4 - Slope: 0.02
  Topic 2 - Slope: 0.04
  Topic 3 - Slope: 0.07
