# Topic Modeling with BERTopic on Children's Career Development Literature
This notebook performs topic modeling using BERTopic and addresses the following research questions:

1. What specific topics have emerged from the empirical literature on children's career development, and what terms are associated with each topic?
2. How have these topics changed over time?
3. Which topics show increasing or decreasing trends, and which remain consistently popular?


In [1]:
# Import necessary libraries
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import matplotlib.pyplot as plt
import random
from datetime import datetime


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load data: read the article sheet from the Excel workbook.
file_path = 'Topic Modeling Articles_0601.xlsx'
df = pd.read_excel(file_path, sheet_name='Topic Modeling Articles')

# Combine title and abstract into one text field. Stripping turns rows that
# have neither a title nor an abstract into '' so they can be filtered below
# (after fillna('') the column can never be NaN, so dropna would not catch them).
df['text'] = (df['Title'].fillna('') + ' ' + df['Abstract Note'].fillna('')).str.strip()

# Derive the publication year from 'Date'.
# The values are converted to text first because pd.to_datetime interprets bare
# integers (e.g. a cell holding 2015) as nanoseconds since 1970-01-01, which
# would silently turn every such row into year 1970.
date_as_text = df['Date'].astype(str)
year_from_digits = pd.to_numeric(
    date_as_text.str.extract(r'\b((?:19|20)\d{2})\b', expand=False),
    errors='coerce',
)
# Fallback for date strings without a standalone four-digit year.
year_from_parser = pd.to_datetime(date_as_text, errors='coerce').dt.year
df['Year'] = year_from_digits.fillna(year_from_parser)

# Keep only rows that have both a usable year and some text, then store the
# year as an integer so it groups and plots as 2015 rather than 2015.0.
df = (
    df[df['text'].ne('')]
    .dropna(subset=['Year'])
    .assign(Year=lambda frame: frame['Year'].astype(int))
    .reset_index(drop=True)
)

: 

In [None]:
import os

# Request a single OpenMP thread; this is set before the imports below run.
os.environ["OMP_NUM_THREADS"] = "1"

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Sentence-embedding backbone for BERTopic.
# 'all-mpnet-base-v2' or 'paraphrase-MiniLM-L6-v2' can be swapped in here.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Configure the topic model, then fit it on the combined title + abstract text.
topic_model = BERTopic(
    embedding_model=embedding_model,
    verbose=True,
    low_memory=True,
)
documents = df['text'].tolist()
topics, probs = topic_model.fit_transform(documents)

2025-07-25 16:57:17,758 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 4/4 [00:01<00:00,  2.31it/s]
2025-07-25 16:57:19,546 - BERTopic - Embedding - Completed ✓
2025-07-25 16:57:19,547 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [None]:
# Question 1: Which topics emerged, and which terms are associated with them?
# Fetch the fitted model's topic summary table and preview its first five rows.
topic_info = topic_model.get_topic_info()
topic_info.head(5)

In [None]:
# Question 2: How do topics change over time?
# Rebuild the fitted documents and their publication years from df so this
# cell does not depend on variables that no cell defines.
docs = df['text'].tolist()
dates = df['Year'].astype(int).tolist()

# BERTopic needs the per-timestamp topic table computed first;
# visualize_topics_over_time() takes that table, not the raw documents.
# Keyword arguments are used because the positional order of
# topics/timestamps differs between BERTopic releases.
topics_over_time = topic_model.topics_over_time(docs=docs, timestamps=dates, topics=topics)
topic_model.visualize_topics_over_time(topics_over_time)

In [None]:
# Question 3: Identify increasing, decreasing, and stable topics
# Pair each document's publication year with its assigned topic. The result is
# kept in its own frame so the article data in df is not overwritten and the
# earlier cells stay re-runnable.
topic_years = pd.DataFrame({
    'Year': df['Year'].astype(int).tolist(),
    'Topic': topics,
})

# Aggregate topic frequency by year: rows are years, columns are topic ids,
# values are document counts (0 where a topic has no documents that year).
trend = topic_years.groupby(['Year', 'Topic']).size().unstack(fill_value=0)

fig, ax = plt.subplots(figsize=(12, 6))
trend.plot(ax=ax, title='Topic Trends Over Time')
ax.set_ylabel('Document Count')
ax.set_xlabel('Year')
ax.grid(True)
plt.show()