In [1]:
import pandas as pd
import numpy as np
import jieba
from doc.pycurl.examples.quickstart.response_headers import encoding
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.font_manager import FontProperties

In [2]:
import numpy as np
from scipy.special import psi, polygamma

# 初始化参数
def initialize_parameters(documents, vocab_size, num_topics):
    phi = np.random.rand(len(documents), vocab_size, num_topics)
    gamma = np.random.rand(len(documents), num_topics)
    lambda_ = np.random.rand(num_topics, vocab_size)
    alpha = np.ones(num_topics)
    eta = np.ones(vocab_size)
    return phi, gamma, lambda_, alpha, eta

# 更新phi
def update_phi(d, n, k, documents, phi, gamma, lambda_):
    sum1 = 0
    for i in range(vocab_size):
        sum1 += documents[d][n][i] * (psi(lambda_[k][i]) - psi(np.sum(lambda_[k])))

    sum2 = psi(gamma[d][k]) - psi(np.sum(gamma[d]))

    phi_dnk = np.exp(sum1 + sum2)
    return phi_dnk

# 更新gamma
def update_gamma(k, d, alpha, phi):
    gamma_kd = alpha[k] + np.sum(phi[d][:, k])
    return gamma_kd

# 更新lambda_
def update_lambda(k, i, eta, phi, documents):
    lambda_ki = eta[i] + np.sum([np.sum([phi[d][n][k] * documents[d][n][i] for n in range(len(documents[d]))]) for d in range(len(documents))])
    return lambda_ki

# 更新alpha和eta
def update_alpha_eta(alpha, eta, phi, gamma, lambda_, documents, vocab_size, num_topics):
    grad_alpha = compute_grad_alpha(phi, gamma, lambda_)
    hessian_alpha = compute_hessian_alpha(phi, gamma, lambda_)
    new_alpha = alpha + grad_alpha / hessian_alpha

    grad_eta = compute_grad_eta(phi, lambda_)
    hessian_eta = compute_hessian_eta(phi, lambda_)
    new_eta = eta + grad_eta / hessian_eta

    return new_alpha, new_eta

# 计算ELBO
def compute_elbo(documents, vocab_size, num_topics, alpha, eta, phi, gamma, lambda_):
    elbo_part1 = 0
    for k in range(num_topics):
        elbo_part1 += np.sum((psi(lambda_[k]) - psi(np.sum(lambda_[k]))) * (lambda_[k] - eta))

    elbo_part2 = 0
    for d in range(len(documents)):
        for n in range(len(documents[d])):
            for k in range(num_topics):
                elbo_part2 += phi[d][n][k] * (psi(gamma[d][k]) - psi(np.sum(gamma[d])))

    elbo_part3 = 0
    for d in range(len(documents)):
        for k in range(num_topics):
            elbo_part3 += (gamma[d][k] - alpha[k]) * (psi(gamma[d][k]) - psi(np.sum(gamma[d])))

    elbo_part4 = 0
    for d in range(len(documents)):
        for n in range(len(documents[d])):
            for k in range(num_topics):
                elbo_part4 += phi[d][n][k] * np.log(lambda_[k][documents[d][n]])

    elbo_part5 = 0
    for k in range(num_topics):
        elbo_part5 -= np.sum((lambda_[k] - 1) * (psi(lambda_[k]) - psi(np.sum(lambda_[k]))))

    elbo_part6 = 0
    for d in range(len(documents)):
        for n in range(len(documents[d])):
            for k in range(num_topics):
                elbo_part6 -= phi[d][n][k] * (psi(gamma[d][k]) - psi(np.sum(gamma[d])))

    elbo_part7 = 0
    for d in range(len(documents)):
        for k in range(num_topics):
            elbo_part7 -= (gamma[d][k] - alpha[k]) * (psi(gamma[d][k]) - psi(np.sum(gamma[d])))

    elbo = elbo_part1 + elbo_part2 + elbo_part3 + elbo_part4 + elbo_part5 + elbo_part6 + elbo_part7
    return elbo

# 计算 $\alpha$ 的一阶导数
def compute_grad_alpha(phi, gamma, lambda_):
    M = len(gamma)
    K = len(gamma[0])

    grad_alpha = np.zeros(K)
    for k in range(K):
        sum_alpha = np.sum(gamma[:, k])
        grad_alpha[k] = M * (polygamma(0, sum_alpha) - psi(gamma[:, k]))

        for d in range(M):
            sum_gamma_d = np.sum(gamma[d])
            grad_alpha[k] += psi(gamma[d][k]) - psi(sum_gamma_d)

    return grad_alpha

# 计算 $\alpha$ 的二阶导数
def compute_hessian_alpha(phi, gamma, lambda_):
    M = len(gamma)
    K = len(gamma[0])

    hessian_alpha = np.zeros((K, K))
    for k in range(K):
        for j in range(K):
            sum_alpha = np.sum(gamma[:, k])
            if k == j:
                hessian_alpha[k, j] = M * (polygamma(1, sum_alpha) - polygamma(1, gamma[:, k]))
            else:
                hessian_alpha[k, j] = M * polygamma(1, sum_alpha)

    return hessian_alpha

# 计算 $\eta$ 的一阶导数
def compute_grad_eta(phi, lambda_):
    V = lambda_.shape[1]
    K = lambda_.shape[0]

    grad_eta = np.zeros(V)
    for i in range(V):
        sum_eta = np.sum(lambda_[:, i])
        grad_eta[i] = K * (polygamma(0, sum_eta) - psi(lambda_[:, i]))

        for k in range(K):
            sum_lambda_k = np.sum(lambda_[k])
            grad_eta[i] += psi(lambda_[k][i]) - psi(sum_lambda_k)

    return grad_eta

# 计算 $\eta$ 的二阶导数
def compute_hessian_eta(phi, lambda_):
    V = lambda_.shape[1]
    K = lambda_.shape[0]

    hessian_eta = np.zeros((V, V))
    for i in range(V):
        for j in range(V):
            sum_eta = np.sum(lambda_[:, i])
            if i == j:
                hessian_eta[i, j] = K * (polygamma(1, sum_eta) - polygamma(1, lambda_[:, i]))
            else:
                hessian_eta[i, j] = K * polygamma(1, sum_eta)

    return hessian_eta

# 检查是否收敛
def all_converged(phi, gamma, lambda_, alpha, eta, prev_alpha, prev_eta, elbos, param_tol=1e-3, elbo_tol=1e-3, patience=5):
    # 检查参数变化是否小于阈值
    alpha_diff = np.linalg.norm(alpha - prev_alpha)
    eta_diff = np.linalg.norm(eta - prev_eta)
    param_converged = alpha_diff < param_tol and eta_diff < param_tol

    # 检查ELBO的变化是否小于阈值
    if len(elbos) < patience + 1:
        elbo_converged = False
    else:
        recent_gains = [elbos[-i] - elbos[-i-1] for i in range(1, patience+1)]
        max_gain = max(recent_gains)
        elbo_converged = max_gain < elbo_tol

    return param_converged and elbo_converged

# 主函数
def lda_variational_em(documents, vocab_size, num_topics, alpha, eta):
    phi, gamma, lambda_, alpha, eta = initialize_parameters(documents, vocab_size, num_topics)
    elbos = []
    prev_alpha = alpha.copy()
    prev_eta = eta.copy()

    while True:
        # E步迭代循环
        for d in range(len(documents)):
            for n in range(len(documents[d])):
                for k in range(num_topics):
                    phi[d][n][k] = update_phi(d, n, k, documents, phi, gamma, lambda_)
                    phi[d][n] /= np.sum(phi[d][n])

        # 更新gamma
        for d in range(len(documents)):
            for k in range(num_topics):
                gamma[d][k] = update_gamma(k, d, alpha, phi)

        # 更新lambda_
        for k in range(num_topics):
            for i in range(vocab_size):
                lambda_[k][i] = update_lambda(k, i, eta, phi, documents)

        # 更新alpha和eta
        alpha, eta = update_alpha_eta(alpha, eta, phi, gamma, lambda_, documents, vocab_size, num_topics)

        # 计算ELBO
        elbo = compute_elbo(documents, vocab_size, num_topics, alpha, eta, phi, gamma, lambda_)
        elbos.append(elbo)

In [3]:
with open('./nlp_test1.txt', 'r', encoding='utf-8') as f3:
    res1 = f3.read()

print(res1)

沙 瑞金 赞叹 易 学习 的 胸怀 ， 是 金山 的 百姓 有福 ， 可是 这件 事对 李达康 的 触动 很大 。 易 学习 又 回忆起 他们 三人 分开 的 前一晚 ， 大家 一起 喝酒 话别 ， 易 学习 被 降职 到 道口 县当 县长 ， 王 大路 下海经商 ， 李达康 连连 赔礼道歉 ， 觉得 对不起 大家 ， 他 最 对不起 的 是 王 大路 ， 就 和 易 学习 一起 给 王 大路 凑 了 5 万块 钱 ， 王 大路 自己 东挪西撮 了 5 万块 ， 开始 下海经商 。 没想到 后来 王 大路 竟然 做 得 风生水 起 。 沙 瑞金 觉得 他们 三人 ， 在 困难 时期 还 能 以沫 相助 ， 很 不 容易 。


In [4]:
with open('./nlp_test3.txt', 'r', encoding='utf-8') as f4:
    res2 = f4.read()
print(res2)

沙瑞金 向 毛娅 打听 他们 家 在 京州 的 别墅 ， 毛娅 笑 着 说 ， 王大路 事业有成 之后 ， 要 给 欧阳 菁 和 她 公司 的 股权 ， 她们 没有 要 ， 王大路 就 在 京州 帝豪园 买 了 三套 别墅 ， 可是 李达康 和 易学习 都 不要 ， 这些 房子 都 在 王大路 的 名下 ， 欧阳 菁 好像 去 住 过 ， 毛娅 不想 去 ， 她 觉得 房子 太大 很 浪费 ， 自己 家住 得 就 很 踏实 。


In [5]:
# 文本预处理函数
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text.lower())
    filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
    return ' '.join(filtered_words)

# 读取并预处理文件内容
def load_and_preprocess_files(file_paths):
    documents = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
            processed_text = preprocess_text(text)
            documents.append(processed_text)
    return documents

# 创建文档-词项矩阵
def create_document_term_matrix(documents, vocab_size=None):
    vectorizer = CountVectorizer(max_features=vocab_size)
    dtm = vectorizer.fit_transform(documents).toarray()
    vocabulary = vectorizer.get_feature_names_out()
    return dtm, vocabulary

# 初始化参数
def initialize_parameters(dtm, num_topics):
    phi = np.random.rand(len(dtm), dtm.shape[1], num_topics)
    gamma = np.random.rand(len(dtm), num_topics)
    lambda_ = np.random.rand(num_topics, dtm.shape[1])
    alpha = np.ones(num_topics)
    eta = np.ones(dtm.shape[1])
    return phi, gamma, lambda_, alpha, eta


In [13]:
import jieba
from sklearn.feature_extraction.text import CountVectorizer

# 从文件导入停用词表
def load_stopwords(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        stopwords = f.read().splitlines()
    return set(stopwords)  # 使用集合提高查找效率

# 文本预处理函数（使用 jieba 进行分词）
def preprocess_text(text, stop_words):
    words = jieba.lcut(text)  # 使用 jieba 进行分词
    filtered_words = [word for word in words if word.strip() and word not in stop_words]
    return ' '.join(filtered_words)

# 读取并预处理文件内容
def load_and_preprocess_files(file_paths, stop_words):
    documents = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
            processed_text = preprocess_text(text, stop_words)
            documents.append(processed_text)
    return documents

# 创建文档-词项矩阵
def create_document_term_matrix(documents, vocab_size=None):
    vectorizer = CountVectorizer(max_features=vocab_size, token_pattern=r"(?u)\b\w+\b")
    dtm = vectorizer.fit_transform(documents).toarray()
    vocabulary = vectorizer.get_feature_names_out()
    return dtm, vocabulary

# 示例应用
stpwrdpath = "stop_words.txt"
stop_words = load_stopwords(stpwrdpath)
file_paths = ['./nlp_test1.txt', './nlp_test3.txt']
documents = load_and_preprocess_files(file_paths, stop_words)
dtm, vocabulary = create_document_term_matrix(documents)

print("Vocabulary:", vocabulary)
print("Document-Term Matrix:\n", dtm)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 6: invalid start byte

In [10]:
corpus = [res1, res2]
cntVector = CountVectorizer(stop_words=stpwrdlst)
cntTf = cntVector.fit_transform(corpus)
print(cntTf)
corpus = [res1,res2]
vector = TfidfVectorizer(stop_words=stpwrdlst)
tfidf = vector.fit_transform(corpus)
print (tfidf)

  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)


NameError: name 'stpwrdlst' is not defined