In [1]:
from gensim import corpora, models
import jieba.posseg as jp, jieba
from load_document import load_documents, get_content


[使用LDA进行文档主题建模](https://github.com/duoergun0729/nlp/blob/master/%E4%BD%BF%E7%94%A8LDA%E8%BF%9B%E8%A1%8C%E6%96%87%E6%A1%A3%E4%B8%BB%E9%A2%98%E5%BB%BA%E6%A8%A1.md)

[Python+gensim【中文LDA】](https://blog.csdn.net/Yellow_python/article/details/83097994)

[scikit-learn LDA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html)

In [None]:
# Root directory of the document set to model.
demo_path = "/home/junetheriver/codes/qa_generation/huawei/data/UNC_20.9.5"
# Load the documents under demo_path, then pass them through get_content;
# the tokenisation step later reads each entry's 'content' field.
files = get_content(load_documents(demo_path))

In [18]:
# Hand-picked tokens to exclude, extended below with the stop-word list
# loaded from disk.
stop_words = ['html', 'cn', 'Document', 'zh', 'png', 'G']

stopwords_file = "/home/junetheriver/codes/qa_generation/huawei/data/stopwords/all_stopwords.txt"
# Decode explicitly instead of relying on the platform's default locale
# encoding. NOTE(review): assumes the list is stored as UTF-8 - confirm.
with open(stopwords_file, 'r', encoding='utf-8') as f:
    for line in f:
        word = line.strip()
        # Blank lines carry no stop word; skip them rather than adding ''.
        if word:
            stop_words.append(word)

# A set gives constant-time membership tests in the filter below.
stop_words = set(stop_words)

words_ls = []      # one token list per document
metadata_ls = []   # the source record of each document, same order as words_ls
flags = ('n', 'nr', 'ns', 'nt', 'eng', 'v', 'd')
# noun, proper noun for a person's name, proper noun for a place, proper noun for a organization, English word, verb, adverb

for file in files:
    # Keep only tokens whose part-of-speech tag is whitelisted and that are
    # not stop words.
    words = [w.word for w in jp.cut(file['content']) if w.flag in flags and w.word not in stop_words]
    words_ls.append(words)
    metadata_ls.append(file)

In [21]:
import pickle as pkl

# Persist the tokenised documents next to the notebook so the segmentation
# step does not have to be repeated.
with open('words_ls.pkl', 'wb') as pkl_out:
    pkl.dump(words_ls, pkl_out)

In [22]:
# Build the vocabulary (token <-> integer id mapping) from the tokenised documents.
dictionary = corpora.Dictionary(words_ls)
# Convert every document into its bag-of-words form, i.e. the rows of the
# document-term matrix based on raw term counts.
corpus = list(map(dictionary.doc2bow, words_ls))
# Re-weight the bag-of-words corpus with TF-IDF; per the original author's
# note this can further improve the LDA results.
texts_tf_idf = models.TfidfModel(corpus)[corpus]

In [24]:
num_topics = 10
# Train on the TF-IDF weighted corpus. random_state fixes the seed used by the
# model so that repeated runs start from the same initialisation.
lda = models.LdaMulticore(
    corpus=texts_tf_idf,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=0,
)

# Show the 20 highest-weighted words of each topic.
for topic in lda.print_topics(num_words=20):
    print(topic)

# Infer the per-document topic weights on the SAME representation the model
# was trained on (TF-IDF), not on the raw-count corpus. inference() returns a
# (gamma, sstats) pair; only gamma (one row per document, one column per
# topic) is of interest here.
doc_topic_gamma, _ = lda.inference(list(texts_tf_idf))
print(doc_topic_gamma)

(0, '0.011*"统计" + 0.010*"性能" + 0.009*"预留" + 0.006*"承载" + 0.006*"采集" + 0.005*"参数" + 0.005*"计算公式" + 0.005*"测量点" + 0.005*"相关" + 0.005*"单位" + 0.005*"消息" + 0.005*"APN" + 0.005*"次数" + 0.004*"说明书" + 0.004*"指标" + 0.004*"MME" + 0.004*"数" + 0.004*"失败" + 0.004*"指定" + 0.003*"SGW"')
(1, '0.006*"参数" + 0.005*"统计" + 0.004*"消息" + 0.003*"数" + 0.003*"SMF" + 0.003*"信元" + 0.003*"接收" + 0.003*"采集" + 0.003*"命令" + 0.003*"测量点" + 0.003*"计算公式" + 0.003*"接口" + 0.003*"MME" + 0.003*"发送" + 0.003*"次数" + 0.003*"单位" + 0.003*"PDU" + 0.003*"相关" + 0.003*"UPF" + 0.003*"指标"')
(2, '0.015*"性能" + 0.013*"预留" + 0.011*"统计" + 0.007*"说明书" + 0.006*"参数" + 0.006*"采集" + 0.005*"项" + 0.005*"计算公式" + 0.005*"测量点" + 0.005*"参考" + 0.005*"相关" + 0.005*"单位" + 0.004*"指标" + 0.004*"命令" + 0.004*"次数" + 0.003*"消息" + 0.003*"指定" + 0.003*"版本" + 0.003*"UE" + 0.003*"取"')
(3, '0.006*"统计" + 0.006*"参数" + 0.004*"消息" + 0.004*"次数" + 0.004*"测量点" + 0.004*"计算公式" + 0.004*"采集" + 0.003*"单位" + 0.003*"命令" + 0.003*"请求" + 0.003*"发送" + 0.003*"相关" + 0.003*"附着" + 0.003*"指标" + 0

In [9]:
# Print the 10 highest-weighted words for each of (up to) 10 topics.
for topic_summary in lda.print_topics(num_topics=10, num_words=10):
    print(topic_summary)

(0, '0.046*"消息" + 0.031*"UE" + 0.023*"流程" + 0.017*"发送" + 0.017*"MME" + 0.017*"Request" + 0.016*"承载" + 0.014*"PGW" + 0.013*"信元" + 0.012*"SMF"')
(1, '0.029*"配置" + 0.019*"特性" + 0.015*"用户" + 0.015*"ADD" + 0.013*"计费" + 0.012*"MML" + 0.011*"时" + 0.011*"场景" + 0.010*"业务" + 0.009*"支持"')
(2, '0.030*"用户" + 0.026*"网络" + 0.022*"MME" + 0.017*"SGSN" + 0.014*"路由" + 0.012*"模式" + 0.010*"G" + 0.010*"位置" + 0.010*"eNodeB" + 0.009*"业务"')
(3, '0.027*"统计" + 0.026*"次数" + 0.025*"消息" + 0.023*"参考" + 0.022*"类型" + 0.022*"指标" + 0.021*"相关" + 0.020*"KPI" + 0.017*"单位" + 0.017*"计算公式"')
(4, '0.019*"Document" + 0.013*"信息" + 0.012*"cn" + 0.011*"zh" + 0.011*"MML" + 0.008*"名称" + 0.008*"操作" + 0.008*"参数" + 0.008*"告警" + 0.007*"VNF"')


# Sklearn的LDA实现

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def identity_analyzer(tokens):
    """Return an already-tokenised document unchanged.

    words_ls holds one list of tokens per document. CountVectorizer's default
    analyzer expects a raw string and lower-cases it, which fails on a list,
    so a callable analyzer is used to hand the existing tokens through as-is.
    """
    return tokens


# Build the document-term count matrix directly from the pre-segmented tokens.
vectorizer = CountVectorizer(analyzer=identity_analyzer)
X = vectorizer.fit_transform(words_ls)

num_topics = 10
lda = LatentDirichletAllocation(n_components=num_topics, random_state=0)
lda.fit(X)

# Topic distribution of every document: one row per document, one column per topic.
doc_topic_dist = lda.transform(X)

# For each topic, find the documents with the highest probability on that topic.
n_top_docs = 2  # number of most relevant documents to list per topic

for topic_idx in range(num_topics):
    print(f"主题 {topic_idx}:")
    # Probability of topic `topic_idx` for every document.
    topic_doc_probs = doc_topic_dist[:, topic_idx]
    # Sort descending by probability and keep the indices of the top documents.
    top_doc_indices = topic_doc_probs.argsort()[::-1][:n_top_docs]
    for doc_idx in top_doc_indices:
        print(f"文档 {doc_idx}（概率 {topic_doc_probs[doc_idx]:.4f}）:")
        # metadata_ls[doc_idx] holds the source record of this document
        # (it is filled in the same order as words_ls).
    print("\n")