In [None]:
import pandas as pd
import jieba
import re
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# --------- Data loading and stop words ---------
df = pd.read_csv('steam_reviews.csv')
# Keep only rows whose `language` column equals 'schinese'
# (presumably Steam's code for Simplified Chinese -- confirm against the data source).
# .copy() detaches the filtered frame from `df`.
df_cn = df[df['language'] == 'schinese'].copy()
# Drop missing reviews and coerce the remaining values to plain strings.
comments = df_cn['review'].dropna().astype(str).tolist()

# Every line of the file becomes one stop word after stripping surrounding whitespace.
with open('stopwords1893.txt', encoding='utf-8') as f:
    stopwords = set(line.strip() for line in f)
# Extra stop words added on top of the file's list.
stopwords.update(["感觉", "真的", "游戏", " "])

def preprocess_text(text, stopwords):
    """Segment *text* with jieba and return the tokens worth keeping.

    A token is kept when it is not in *stopwords*, is longer than one
    character, and contains at least one CJK character (U+4E00-U+9FA5),
    ASCII letter or ASCII digit.
    """
    kept = []
    for token in jieba.lcut(text):
        # Cheap rejections first: stop words and single-character tokens.
        if token in stopwords or len(token) <= 1:
            continue
        if re.search('[\u4e00-\u9fa5A-Za-z0-9]', token):
            kept.append(token)
    return kept

# --------- LDA modelling + coherence evaluation ---------
def lda_with_coherence(texts, k_values, stop_words):
    """Train one LDA model per topic count and score it with C_v and U_Mass.

    Args:
        texts: Iterable of raw review strings.
        k_values: Iterable of topic counts to try.
        stop_words: Collection of tokens forwarded to ``preprocess_text``.

    Returns:
        A tuple ``(cv_scores, umass_scores)`` of two lists, each holding one
        score per entry of ``k_values`` in the same order.
    """
    tokenized_texts = [preprocess_text(t, stop_words) for t in texts]
    # The dictionary and bag-of-words corpus depend only on the tokens, not on
    # k, so they are built once instead of once per topic count.
    dictionary = corpora.Dictionary(tokenized_texts)
    corpus = [dictionary.doc2bow(text) for text in tokenized_texts]
    cv_scores, umass_scores = [], []
    for k in k_values:
        lda_model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=k,
            random_state=42,
            passes=10,
            iterations=50,
        )
        # Top-10 words of every topic; these word lists are what gets scored.
        lda_topics = [[word for word, _ in lda_model.show_topic(tid, topn=10)] for tid in range(k)]
        # C_v (evaluated against the tokenised texts)
        cm_cv = CoherenceModel(topics=lda_topics, texts=tokenized_texts, dictionary=dictionary, coherence='c_v')
        cv_score = cm_cv.get_coherence()
        cv_scores.append(cv_score)
        # U_Mass (evaluated against the bag-of-words corpus)
        cm_umass = CoherenceModel(topics=lda_topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')
        umass_score = cm_umass.get_coherence()
        umass_scores.append(umass_score)
        print(f"LDA 主题数k={k}: C_v={cv_score:.4f}, U_Mass={umass_score:.4f}")
    return cv_scores, umass_scores

# --------- BERTopic modelling + coherence evaluation ---------
def bertopic_with_coherence(texts, k_values, stop_words, platform_name):
    """Fit one BERTopic model per topic count and score it with C_v and U_Mass.

    Coherence is computed on each topic's top-10 c-TF-IDF keywords, using the
    individual tokenised documents as the reference corpus. This mirrors how
    ``lda_with_coherence`` scores its models, so the two result tables are
    comparable.

    Args:
        texts: Iterable of raw review strings.
        k_values: Iterable of values passed to BERTopic as ``nr_topics``.
        stop_words: Collection of tokens forwarded to ``preprocess_text``.
        platform_name: Label used only in the progress messages.

    Returns:
        A tuple ``(cv_scores, umass_scores)`` of two lists aligned with
        ``k_values``. An entry is ``None`` when the run for that k raised.
    """
    processed_texts = [" ".join(preprocess_text(text, stop_words)) for text in texts]
    embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    # Sentence embeddings do not depend on k: encode once and reuse them for
    # every fit instead of re-embedding the whole corpus on each iteration.
    embeddings = embedding_model.encode(processed_texts, show_progress_bar=False)
    # Reference corpus for the coherence measures. It is built lazily from the
    # first fitted model's vectorizer so that the tokens are produced exactly
    # the way BERTopic produces its topic words (same lower-casing/tokenising).
    tokenized_docs = dictionary = corpus = None
    cv_scores, umass_scores = [], []
    for k in k_values:
        print(f"\n{platform_name}: 正在计算BERTopic主题数={k}")
        try:
            topic_model = BERTopic(
                embedding_model=embedding_model,
                nr_topics=k,
                verbose=False,
                calculate_probabilities=False,
                language="chinese (simplified)"
            )
            topic_model.fit_transform(processed_texts, embeddings=embeddings)
            if dictionary is None:
                analyzer = topic_model.vectorizer_model.build_analyzer()
                # Documents that end up with no tokens carry no co-occurrence
                # information, so they are left out of the reference corpus.
                tokenized_docs = [tokens for tokens in map(analyzer, processed_texts) if tokens]
                dictionary = corpora.Dictionary(tokenized_docs)
                corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]
            # Collect the keyword list of every real topic (-1 is the outlier topic).
            topic_words = []
            for tid in sorted(topic_model.get_topics()):
                if tid == -1:
                    continue
                # Keep only words known to the dictionary; this also removes the
                # empty-string padding BERTopic may use for short topics.
                words = [w for w, _ in topic_model.get_topic(tid)[:10] if w in dictionary.token2id]
                # A single word has no word pair to score.
                if len(words) >= 2:
                    topic_words.append(words)
            if not topic_words:
                raise ValueError("没有包含至少两个有效关键词的主题")
            # C_v
            cm_cv = CoherenceModel(topics=topic_words, texts=tokenized_docs, dictionary=dictionary, coherence='c_v')
            cv_score = cm_cv.get_coherence()
            # U_Mass
            cm_umass = CoherenceModel(topics=topic_words, corpus=corpus, dictionary=dictionary, coherence='u_mass')
            umass_score = cm_umass.get_coherence()
            # Append both scores only after both succeeded, so the two lists
            # always stay aligned with k_values.
            cv_scores.append(cv_score)
            umass_scores.append(umass_score)
            print(f"BERTopic k={k}: C_v={cv_score:.4f}, U_Mass={umass_score:.4f}")

        except Exception as e:
            # Best effort: one failing k must not abort the whole sweep.
            print(f"主题数k={k} 计算出错: {e}")
            cv_scores.append(None)
            umass_scores.append(None)
    return cv_scores, umass_scores

# --------- Print a comparison table ---------
def print_coherence_table(k_values, cv, umass, model_name):
    """Print a tab-separated table of coherence scores for one model.

    One row is printed per ``(k, C_v, U_Mass)`` triple. Scores are shown with
    four decimals; a score of ``None`` is shown as the text ``None``.
    """
    print(f"\n{model_name}主题一致性得分：")
    print("K\tC_v\t\tU_Mass")
    for k, *scores in zip(k_values, cv, umass):
        cells = ["None" if score is None else f"{score:.4f}" for score in scores]
        print("\t".join([f"{k}", *cells]))

# --------- Main flow ---------
# Candidate topic counts: 2 through 12 inclusive.
k_values = list(range(2, 13))

print("======LDA批量建模与一致性评估======")
lda_cv, lda_umass = lda_with_coherence(comments, k_values, stopwords)
print_coherence_table(k_values, lda_cv, lda_umass, "LDA")

print("\n======BERTopic批量建模与一致性评估======")
# "STEAM" is only used as a label in the progress messages.
bertopic_cv, bertopic_umass = bertopic_with_coherence(comments, k_values, stopwords, "STEAM")
print_coherence_table(k_values, bertopic_cv, bertopic_umass, "BERTopic")

# --------- Optional: collect the scores in a DataFrame and print it ---------
# BERTopic entries are None for any k whose run raised an exception.
result_df = pd.DataFrame({
    "主题数": k_values,
    "LDA_C_v": lda_cv,
    "LDA_U_Mass": lda_umass,
    "BERTopic_C_v": bertopic_cv,
    "BERTopic_U_Mass": bertopic_umass
})
print("\n======一致性得分汇总表======")
print(result_df)

In [None]:
import requests
import re

# System prompt (written in Chinese) for the topic-naming model. It asks the
# model to summarise a topic from its keywords and representative reviews into
# a short Chinese title (6-12 characters suggested) and to output only the title.
system_message = '''
你是一个中文文本聚类主题命名助手。
请根据输入的主题关键词和代表性评论，总结该主题的内容，提炼成一个简短且准确的中文主题标题（6-12字为宜）。
只输出主题标题，不要添加任何多余解释。
'''

def generate_topic_title_ollama(keywords, examples, system_message, model="qwen3:8b", timeout=300):
    """Ask a local Ollama chat model for a short title describing one topic.

    Args:
        keywords: Top keywords of the topic (list of strings).
        examples: Representative reviews of the topic (list of strings).
        system_message: System prompt sent ahead of the user prompt.
        model: Name of the Ollama model to query.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled server cannot hang the notebook indefinitely.

    Returns:
        The first line of the model's answer with any ``<think>...</think>``
        section removed, cut to at most 12 characters. The fixed fallback
        title "主题命名失败" is returned when the request fails, the response
        is malformed, or the model produces no title.
    """
    example_lines = "\n".join(f"- {c}" for c in examples)
    user_prompt = (
        f"本主题的Top关键词有：{'、'.join(keywords)}。\n"
        f"部分代表性评论包括：\n"
        + example_lines
        + "\n请为该主题总结一个简明主题标题。"
    )
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt}
        ],
        "stream": False
    }
    try:
        response = requests.post("http://localhost:11434/api/chat", json=payload, timeout=timeout)
        # Surface HTTP errors directly instead of failing later with an
        # opaque KeyError on the missing 'message' field.
        response.raise_for_status()
        resp_json = response.json()
        title = resp_json['message']['content'].strip()
        # Remove the <think>...</think> section and everything inside it.
        title = re.sub(r"<think>.*?</think>", "", title, flags=re.DOTALL).strip()
        # Keep only the first line, limited to 12 characters.
        title = title.split('\n')[0][:12]
        if not title:
            # Route an empty answer through the same fallback as other failures.
            raise ValueError("empty title returned by model")
        return title
    except Exception as e:
        # Best effort: naming one topic must never abort the calling loop.
        print(f"API error for topic: {keywords}, error: {e}")
        return "主题命名失败"


In [None]:
import pandas as pd
import jieba
import re
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# 1. Data and stop-word preparation
df = pd.read_csv('steam_reviews.csv')
# Keep only rows whose `language` column equals 'schinese'
# (presumably Steam's code for Simplified Chinese -- confirm against the data source).
df_cn = df[df['language'] == 'schinese'].copy()
# Drop missing reviews and coerce the remaining values to plain strings.
comments = df_cn['review'].dropna().astype(str).tolist()

# Every line of the file becomes one stop word after stripping surrounding whitespace.
with open('stopwords1893.txt', encoding='utf-8') as f:
    stopwords = set(line.strip() for line in f)
# Extra stop words added on top of the file's list.
stopwords.update(["感觉", "真的", "游戏", " "])

def preprocess_text(text, stopwords):
    """Segment *text* with jieba and return the tokens worth keeping.

    A token is kept when it is not in *stopwords*, is longer than one
    character, and contains at least one CJK character (U+4E00-U+9FA5) or
    ASCII letter.

    Args:
        text: Raw review string.
        stopwords: Collection of tokens to discard.

    Returns:
        List of kept tokens in their original order.
    """
    return [
        w for w in jieba.lcut(text)
        if w not in stopwords
        and len(w) > 1
        # Requiring a CJK character or ASCII letter already rejects purely
        # numeric tokens such as "2025" or "3.5", so no separate number
        # check is needed.
        and re.search('[\u4e00-\u9fa5A-Za-z]', w)
    ]

# 2. Preprocessing: tokenise every review and re-join the kept tokens with spaces
processed_comments = [" ".join(preprocess_text(t, stopwords)) for t in comments]

# 3. Embedding model
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# 4. BERTopic modelling with nr_topics=5. The captured run log reports a
#    reduction to 5 topics while only topics 0-3 receive a title, so one of the
#    five appears to be the skipped topic -1 -- NOTE(review): confirm for the
#    BERTopic version in use.
bertopic_model = BERTopic(
    embedding_model=embedding_model,
    nr_topics=5,
    language="chinese (simplified)",
    verbose=True
)
topics, probs = bertopic_model.fit_transform(processed_comments)

# 5. For every topic, collect its top keywords and sample reviews, then name it
topic_titles = {}
for tid in bertopic_model.get_topics().keys():
    # Topic -1 is skipped (BERTopic's outlier topic).
    if tid == -1: continue
    keywords = [w for w, s in bertopic_model.get_topic(tid)[:10]]
    # Positions of all documents assigned to this topic.
    idxs = [i for i, t in enumerate(topics) if t == tid]
    # Up to five raw (unprocessed) reviews, in corpus order, serve as examples.
    examples = [comments[idx] for idx in idxs[:5]]

    # Ask the local LLM for a topic title
    title = generate_topic_title_ollama(keywords, examples, system_message)
    topic_titles[tid] = title
    print(f"\n=== 主题 {tid}：{title} ===")

# Visualisation
fig = bertopic_model.visualize_barchart(top_n_topics=4, n_words=10, per_row=2)
# Swap each subplot title for the generated one; the topic id is parsed from
# the last space-separated token of the existing title.
for i, annotation in enumerate(fig['layout']['annotations']):
    topic_id = int(annotation.text.split(" ")[-1])
    if topic_id in topic_titles:
        annotation.text = topic_titles[topic_id]
fig.show() # bar charts of the top keyword weights for the four topics
fig.write_html("bertopic主题关键词柱状图.html")

2025-06-28 21:18:59,537 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/580 [00:00<?, ?it/s]

2025-06-28 21:20:14,657 - BERTopic - Embedding - Completed ✓
2025-06-28 21:20:14,659 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-28 21:20:18,763 - BERTopic - Dimensionality - Completed ✓
2025-06-28 21:20:18,764 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-28 21:20:20,995 - BERTopic - Cluster - Completed ✓
2025-06-28 21:20:20,996 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-06-28 21:20:21,565 - BERTopic - Representation - Completed ✓
2025-06-28 21:20:21,566 - BERTopic - Topic reduction - Reducing number of topics
2025-06-28 21:20:21,605 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-28 21:20:21,852 - BERTopic - Representation - Completed ✓
2025-06-28 21:20:21,855 - BERTopic - Topic reduction - Reduced number of topics from 283 to 5



=== 主题 0：剧情精彩好评如潮 ===

=== 主题 1：现实DLC扩展 ===

=== 主题 2：Steam游戏评价与争议 ===

=== 主题 3：2025年度最佳游戏 ===


In [20]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Example: custom colours for the four topics (pick your own if preferred)
bar_colors = ["#E24A33", "#348ABD", "#988ED5", "#FBC15E"]

# Topic ids and their generated titles; at most four are kept to fit the 2x2 grid
topic_ids = list(topic_titles.keys())[:4]
subplot_titles = [topic_titles[tid] for tid in topic_ids]

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=subplot_titles,
    vertical_spacing=0.1 
)

for i, tid in enumerate(topic_ids):
    # Top-10 (word, score) pairs of this topic
    keywords_scores = bertopic_model.get_topic(tid)[:10]
    keywords = [w for w, s in keywords_scores]
    scores = [s for w, s in keywords_scores]

    bar = go.Bar(
        x=scores,
        y=keywords,
        orientation='h',
        marker=dict(color=bar_colors[i]),
        showlegend=False
    )

    # Map the running index onto the 2x2 grid (row-major, 1-based)
    row, col = i//2 + 1, i%2 + 1
    fig.add_trace(bar, row=row, col=col)

# Global styling
fig.update_layout(
    height=600,
    width=1000,
    font=dict(family="SimHei", size=14)
)
fig.update_yaxes(autorange="reversed")  # put the most important keyword at the top
fig.show()
fig.write_html("bertopic_2x2_colored.html")


In [None]:
import os
import pandas as pd
import jieba
import re
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from datetime import timedelta
import plotly.graph_objects as go
from sentence_transformers import SentenceTransformer

# Path configuration
DATA_PATH = "steam_reviews.csv"      # review data
STOPWORDS_PATH = "stopwords1893.txt"     
# Extra stop words merged with the entries read from STOPWORDS_PATH
CUSTOM_STOPWORDS = ["感觉", "真的", "游戏", " "]


# Load the custom plus the general-purpose stop words
def load_stopwords(path, custom_list):
    """Return the set of stop words from *custom_list* and the file at *path*.

    Each non-blank line of the file contributes one word, stripped of
    surrounding whitespace. When the file does not exist, only the custom
    words are returned.
    """
    merged = set(custom_list)
    if not os.path.exists(path):
        return merged
    with open(path, "r", encoding="utf-8") as handle:
        # filter(None, ...) drops lines that are empty after stripping.
        merged.update(filter(None, map(str.strip, handle)))
    return merged

# Stop-word set for this cell: the custom words plus the file's entries when the file exists.
stopwords = load_stopwords(STOPWORDS_PATH, CUSTOM_STOPWORDS)

# Text preprocessing
def preprocess_text(text, stop_words_set):
    """Clean *text* and return its kept jieba tokens joined by single spaces.

    URLs are removed, every character outside U+4E00-U+9FA5 is replaced by a
    space, and tokens that are blank or contained in *stop_words_set* are
    dropped.
    """
    without_urls = re.sub(r"http[s]?://\S+", "", str(text))
    chinese_only = re.sub(r"[^\u4e00-\u9fa5]", " ", without_urls)
    return " ".join(
        token
        for token in jieba.lcut(chinese_only)
        if token.strip() and token not in stop_words_set
    )

# Load the review data
df = pd.read_csv(DATA_PATH)
# Read `timestamp_updated` as epoch seconds and shift it by +8 hours
# (presumably UTC -> China Standard Time; confirm the source timezone).
df['datetime'] = pd.to_datetime(df['timestamp_updated'], unit='s') + timedelta(hours=8)
# Keep rows between the two bounds. NOTE(review): the upper bound compares
# against '2025-06-23' with no time part, so reviews later on 23 June look
# excluded -- confirm that this is intended.
df = df[(df['datetime'] >= '2025-06-19') & (df['datetime'] <= '2025-06-23')]
df['processed_text'] = df['review'].apply(lambda x: preprocess_text(x, stopwords))
# Drop rows whose processed text is empty or whitespace only.
df = df[df['processed_text'].str.strip().astype(bool)]

# Split into daily periods
df['time_period'] = df['datetime'].dt.to_period('D')
min_docs_per_period = 20  # minimum number of reviews a day needs before it is modelled
K = 5                    # number of topics (passed as nr_topics)
SIMILARITY_THRESHOLD = 0.05  # similarity threshold for keeping a link

# Train one BERTopic model per day
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
# Split on whitespace only: processed_text is already a space-joined token list.
vectorizer_model = CountVectorizer(tokenizer=lambda x: x.split(), token_pattern=None)
period_models = {}
valid_periods = []

for period, group in df.groupby('time_period'):
    texts = group['processed_text'].tolist()
    # Days with too few reviews are skipped entirely.
    if len(texts) < min_docs_per_period:
        continue
    topic_model = BERTopic(
        embedding_model=embedding_model,
        nr_topics=K,
        vectorizer_model=vectorizer_model,
        calculate_probabilities=True,
        verbose=False
    )
    topic_model.fit(texts)
    # Models are keyed by the period's string form; valid_periods keeps the groupby order.
    period_models[str(period)] = topic_model
    valid_periods.append(str(period))

# Topic similarity (JSD or overlap would both work; the word-overlap ratio is used here for simplicity)
def topic_overlap(model1, model2, k1, k2):
    """Return the (k1, k2) matrix of keyword-set overlap between two models.

    Entry ``[i, j]`` is ``|A & B| / |A | B|`` where ``A`` holds the words of
    topic ``i`` in *model1* and ``B`` the words of topic ``j`` in *model2*.
    A topic whose ``get_topic`` result is falsy counts as an empty set, and
    a pair of empty sets scores 0.
    """
    def _word_sets(model, count):
        # One keyword set per topic id in range(count).
        collected = []
        for tid in range(count):
            entries = model.get_topic(tid)
            collected.append({word for word, _ in entries} if entries else set())
        return collected

    left_sets = _word_sets(model1, k1)
    right_sets = _word_sets(model2, k2)
    sim_matrix = np.zeros((k1, k2))
    for i, left in enumerate(left_sets):
        for j, right in enumerate(right_sets):
            combined = left | right
            # Cells with an empty union keep their initial value of 0.
            if combined:
                sim_matrix[i, j] = len(left & right) / len(combined)
    return sim_matrix

# Build the Sankey diagram data
# plotly.express is not imported by this cell's import block; it is needed
# below for the px.colors.qualitative.Plotly palette.
import plotly.express as px

sankey_links = []
all_nodes = []
# Compare every valid period with the one that follows it.
for p1, p2 in zip(valid_periods, valid_periods[1:]):
    model1, model2 = period_models[p1], period_models[p2]
    sim_matrix = topic_overlap(model1, model2, K, K)
    for s in range(K):
        for t in range(K):
            sim = sim_matrix[s, t]
            # Only sufficiently similar topic pairs become links.
            if sim >= SIMILARITY_THRESHOLD:
                src = f"{p1}_T{s}"
                tgt = f"{p2}_T{t}"
                all_nodes.extend([src, tgt])
                sankey_links.append({'source': src, 'target': tgt, 'value': sim})

if not sankey_links:
    print("没有超过相似度阈值的主题链接，桑基图将为空。")

# Unique node labels ordered by period, then by numeric topic id.
all_nodes = sorted(list(set(all_nodes)), key=lambda x: (x.split('_T')[0], int(x.split('_T')[1])))
node_map = {label: i for i, label in enumerate(all_nodes)}
# Explicit columns keep the lookups below valid even when no link was found.
sankey_df = pd.DataFrame(sankey_links, columns=['source', 'target', 'value'])
sankey_df['source_idx'] = sankey_df['source'].map(node_map)
sankey_df['target_idx'] = sankey_df['target'].map(node_map)

# Colour each node by its topic id so equal ids share a colour across days.
color_palette = px.colors.qualitative.Plotly
node_colors = []
for node_label in all_nodes:
    topic_id = int(node_label.split('_T')[1])
    node_colors.append(color_palette[topic_id % len(color_palette)])
# Draw the Sankey diagram
node_labels = [x.split('_T')[0]+" 主题"+x.split('_T')[1] for x in all_nodes]
link = dict(
    source=sankey_df['source_idx'].tolist(),
    target=sankey_df['target_idx'].tolist(),
    value=sankey_df['value'].tolist(),
    color='rgba(180,180,180,0.35)'
)
node = dict(
    pad=12,
    thickness=18,
    line=dict(color="black", width=0.6),
    label=node_labels,
    color=node_colors
)
fig = go.Figure(go.Sankey(node=node, link=link))
fig.update_layout(
    title_text="BERTopic Steam评论主题随时间演化桑基图",
    font=dict(size=12),
    height=600,
    width=1100
)
fig.show()