In [1]:
from gensim.test.utils import common_texts
from gensim.models import TfidfModel, LdaModel, LsiModel
from gensim.corpora import Dictionary

In [10]:
# 稀疏的向量转换为正常向量
def sparse_2_norm_vector(vector, ndim):
    result = [0.0] * ndim
    for idx, value in vector:
        result[idx] = value
    return result

# 一、数据加载

In [2]:
# 构建字典
common_dictionary = Dictionary(common_texts)
# 各个文本对应的词袋法的值
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
print("原始数据:\n{}".format(common_texts))
print("\n词袋法后的值:\n{}".format(common_corpus))

原始数据:
[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]

词袋法后的值:
[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]


In [3]:
other_texts = [
    ['computer', 'time', 'graph'],
    ['survey', 'response', 'eps'],
    ['human', 'system', 'computer']
]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
print("测试数据对应的词袋法的值:\n{}".format(other_corpus))

测试数据对应的词袋法的值:
[[(0, 1), (6, 1), (10, 1)], [(3, 1), (4, 1), (8, 1)], [(0, 1), (1, 1), (5, 1)]]


# 二、TF-IDF Model

In [4]:
# 模型构建
model = TfidfModel(corpus=common_corpus)

In [11]:
# 预测
vectors = model[other_corpus]
print("稀疏表达形式:")
for vector in vectors:
    print(vector)

print("\n正常向量表达形式:")
for vector in vectors:
    print(sparse_2_norm_vector(vector, ndim=len(common_dictionary.token2id)))

稀疏表达形式:
[(0, 0.6282580468670046), (6, 0.6282580468670046), (10, 0.45889394536615247)]
[(3, 0.5773502691896257), (4, 0.5773502691896257), (8, 0.5773502691896257)]
[(0, 0.6282580468670046), (1, 0.6282580468670046), (5, 0.45889394536615247)]

正常向量表达形式:
[0.6282580468670046, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6282580468670046, 0.0, 0.0, 0.0, 0.45889394536615247, 0.0]
[0.0, 0.0, 0.0, 0.5773502691896257, 0.5773502691896257, 0.0, 0.0, 0.0, 0.5773502691896257, 0.0, 0.0, 0.0]
[0.6282580468670046, 0.6282580468670046, 0.0, 0.0, 0.0, 0.45889394536615247, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


# 三、LDA Model

In [12]:
# 模型构建
model = LdaModel(common_corpus, num_topics=10, random_state=0)

In [13]:
# 模型保存
model.save('./datas/lda_model.pkl')

In [14]:
# 模型加载
lda = LdaModel.load('./datas/lda_model.pkl')

In [18]:
# 模型结果获取
vectors = lda[other_corpus]
print("稀疏表达形式:")
for vector in vectors:
    print(vector)

print("\n正常向量表达形式:")
ndim = model.num_topics
for vector in vectors:
    print(sparse_2_norm_vector(vector, ndim=ndim))

稀疏表达形式:
[(0, 0.41142505), (1, 0.025000041), (2, 0.38856864), (3, 0.025000043), (4, 0.025000047), (5, 0.025000047), (6, 0.025006022), (7, 0.025000047), (8, 0.025000047), (9, 0.025000041)]
[(0, 0.025000062), (1, 0.025011465), (2, 0.5249875), (3, 0.025000066), (4, 0.02500007), (5, 0.02500007), (6, 0.025012966), (7, 0.02500007), (8, 0.02500007), (9, 0.27498767)]
[(0, 0.34045696), (1, 0.4595319), (2, 0.025008548), (3, 0.025000054), (4, 0.025000058), (5, 0.025000058), (6, 0.025000054), (7, 0.025000058), (8, 0.025000058), (9, 0.025002237)]

正常向量表达形式:
[0.41107586, 0.025000043, 0.3889178, 0.025000045, 0.025000049, 0.025000049, 0.02500604, 0.025000049, 0.025000049, 0.025000043]
[0.025000062, 0.025012575, 0.52498734, 0.025000066, 0.02500007, 0.02500007, 0.025013164, 0.02500007, 0.02500007, 0.27498654]
[0.34064132, 0.45934758, 0.025008533, 0.025000056, 0.02500006, 0.02500006, 0.025000056, 0.02500006, 0.02500006, 0.025002243]


In [19]:
# 更新模型（在当前模型基础上继续更新模型参数<不能增加单词数目，也就是字典不能变>）
lda.update(other_corpus)

In [20]:
# 更新后模型结果获取
vectors = lda[other_corpus]
for vector in vectors:
    print(vector)

[(0, 0.4935978), (1, 0.025000028), (2, 0.3064001), (3, 0.02500003), (4, 0.025000032), (5, 0.025000032), (6, 0.025001919), (7, 0.025000032), (8, 0.025000032), (9, 0.025000028)]
[(0, 0.025000026), (1, 0.025000902), (2, 0.524994), (3, 0.025000028), (4, 0.02500003), (5, 0.02500003), (6, 0.025002683), (7, 0.02500003), (8, 0.02500003), (9, 0.27500224)]
[(0, 0.30352688), (1, 0.49646953), (2, 0.025002938), (3, 0.02500002), (4, 0.02500002), (5, 0.02500002), (6, 0.025000019), (7, 0.02500002), (8, 0.02500002), (9, 0.025000548)]


# 四、Other

官网文档：https://radimrehurek.com/gensim/apiref.html