In [2]:
# 安装方式: pip install gensim -i https://pypi.tuna.tsinghua.edu.cn/simple/
import numpy as np
import gensim
from gensim.models import TfidfModel
from gensim.corpora import Dictionary # 字典，构建单词和序号id之间的映射关系

# 一、加载数据(数据预处理)

In [3]:
# Read the whole corpus file into memory as one string.
with open('./datas/text8', 'r', encoding='utf-8') as reader:
    content = reader.read()

# Split on single spaces, drop pieces that are empty or whitespace-only, and
# encode every remaining token to UTF-8 bytes.
words = [token.encode("utf-8") for token in content.split(" ") if token.strip()]
total_words = len(words)
print("总单词数目:{}".format(total_words))
print("【前10个单词】:{}".format(words[:10]))

# The TF-IDF conversion in gensim expects documents (each one a list of
# several tokens), so the flat token list is cut into consecutive chunks.
# A fixed chunk size is assumed here; real documents would differ in length.
word_per_doc = 10000
docs = [
    words[start_idx:start_idx + word_per_doc]
    for start_idx in range(0, total_words, word_per_doc)  # every slice is non-empty
]
print("总文档数目:{}".format(len(docs)))

总单词数目:17005207
【前10个单词】:[b'anarchism', b'originated', b'as', b'a', b'term', b'of', b'abuse', b'first', b'used', b'against']
总文档数目:1701


# 二、构建词典

In [5]:
# Toy example: build a dictionary and a bag-of-words encoding for three tiny
# hand-written documents, then expand the result into a dense count matrix.
# All toy objects carry the t_ prefix (like t_docs) so that running this cell
# never rebinds the `dct` / `corpus` names used for the full corpus.
t_docs = [
    ['我', '是', '来自', '湖南', '张家界', '的', '小明'],  # first document
    ['张家界', '张家界', '天门山', '是', '一个', '非常', '不错', '的', '旅游景点'],  # second document
    ['小明', '非常', '喜欢', '去', '张家界', '天门山', '游玩']  # third document
]
t_dct = Dictionary(t_docs)
print(t_dct.token2id)  # mapping from token to integer id
# (t_dct.id2token is the reverse mapping, from id to token)
t_corpus = [t_dct.doc2bow(line) for line in t_docs]  # bag of words: (id, count) pairs
print(t_corpus)

# Turn the sparse (id, count) pairs into the final feature matrix:
# one row per document, one column per dictionary token.
tf_result = np.zeros((len(t_docs), len(t_dct.token2id)))
for line_idx, line in enumerate(t_corpus):
    for idx, value in line:
        tf_result[line_idx, idx] = value
print(tf_result)

{'小明': 0, '张家界': 1, '我': 2, '是': 3, '来自': 4, '湖南': 5, '的': 6, '一个': 7, '不错': 8, '天门山': 9, '旅游景点': 10, '非常': 11, '去': 12, '喜欢': 13, '游玩': 14}
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(1, 2), (3, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)], [(0, 1), (1, 1), (9, 1), (11, 1), (12, 1), (13, 1), (14, 1)]]
[[1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 2. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 1.]]


In [6]:
# Build the dictionary for the full corpus.
# `docs` has to be a list of documents, and each document a list of
# individual tokens, e.g. [['a', 'bv', 'c'], ['a', 'c'], ['d', 'f', 'f']].
dct = Dictionary(documents=docs)

In [7]:
# Report how many distinct tokens the dictionary holds.
vocab_size = len(dct.token2id)
print("总单词数目(去重后):{}".format(vocab_size))

总单词数目(去重后):253854


# 三、BOW词袋法转换

In [8]:
# Bag-of-words conversion: the tokens known to `dct` are the features and the
# number of occurrences inside each document is the feature value.
corpus = list(map(dct.doc2bow, docs))

# 四、TF-IDF构建

In [9]:
# Build the TF-IDF model from the bag-of-words corpus.
model = TfidfModel(corpus)

# 五、TF-IDF应用

In [10]:
# Each pair reads (position, count): the first element is the token's
# position and the second is how often it occurs in the document.
first_bow = corpus[0]
print("第一个文本的词袋法结构:", first_bow, sep="\n")

第一个文本的词袋法结构:
[(0, 184), (1, 1), (2, 1), (3, 3), (4, 7), (5, 1), (6, 1), (7, 1), (8, 12), (9, 2), (10, 2), (11, 2), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 2), (18, 2), (19, 2), (20, 1), (21, 1), (22, 1), (23, 3), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 2), (32, 2), (33, 9), (34, 3), (35, 4), (36, 1), (37, 1), (38, 7), (39, 4), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 2), (46, 1), (47, 3), (48, 2), (49, 6), (50, 3), (51, 3), (52, 1), (53, 2), (54, 1), (55, 1), (56, 1), (57, 5), (58, 1), (59, 16), (60, 7), (61, 2), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 4), (69, 3), (70, 1), (71, 2), (72, 2), (73, 1), (74, 1), (75, 1), (76, 1), (77, 14), (78, 2), (79, 1), (80, 2), (81, 1), (82, 2), (83, 2), (84, 2), (85, 2), (86, 3), (87, 1), (88, 32), (89, 1), (90, 1), (91, 1), (92, 1), (93, 13), (94, 1), (95, 2), (96, 3), (97, 1), (98, 6), (99, 1), (100, 1), (101, 3), (102, 1), (103, 1), (104, 1), (105, 44), (106, 1), (107, 3), (108, 9)

In [11]:
# Transform the first document once and reuse the result for both the shape
# report and the value displayed by the cell.
first_tfidf = model[corpus[0]]
print("第一个文本的TFIDF的结果:")
print("维度大小:{}".format(np.shape(first_tfidf)))
first_tfidf

第一个文本的TFIDF的结果:
维度大小:(2505, 2)


[(1, 0.006704047545684609),
 (2, 0.0030255603220721273),
 (3, 0.003156168449586299),
 (4, 0.0036673470201144674),
 (5, 0.004575122435127926),
 (6, 0.0028052608258295926),
 (7, 0.004064820137019515),
 (8, 0.00014963587508918375),
 (9, 0.0007492665180478759),
 (10, 0.004142807322609117),
 (11, 0.004149816941645728),
 (12, 0.0077498817493309525),
 (13, 0.00656024165742503),
 (14, 0.003891486499758776),
 (15, 0.005476877392392166),
 (16, 0.0018233938817994433),
 (17, 0.0032209070754237084),
 (18, 0.0017737283389229173),
 (19, 0.0023373507198140124),
 (20, 0.003725514968930464),
 (21, 0.00590342512385848),
 (22, 0.003072401062545206),
 (23, 0.0006668171096292247),
 (24, 0.0017594266221832493),
 (25, 0.004202080158963513),
 (26, 0.002967397324595724),
 (27, 0.004709756138185673),
 (28, 0.0014819657487289912),
 (29, 0.0031562459553171694),
 (30, 0.0031999829254611097),
 (31, 0.001215574949729317),
 (32, 0.003843126241898761),
 (33, 0.006499414537896336),
 (34, 0.004546489373863172),
 (35, 0.0

In [12]:
# Convert additional token lists into TF-IDF vectors with the model built above.
others = [
    ['my', 'name', 'name', 'is', 'gerry'],
    ['my', 'name', 'is', 'xiaoming'],
]
other_corpus = list(map(dct.doc2bow, others))  # bag-of-words step
vectors = model[other_corpus]  # apply the already-built TF-IDF model
print(*vectors, sep="\n")  # one vector per line

[(1215, 0.00015939590460450057), (1480, 0.17345090437798794), (1485, 0.024112029486193044), (19266, 0.9845472910924394)]
[(1215, 0.0009167561950501072), (1480, 0.9975927017705476), (1485, 0.0693394615801007)]


In [13]:
# Display the bag-of-words result: one list of (id, count) pairs per document.
other_corpus

[[(1215, 1), (1480, 1), (1485, 2), (19266, 1)],
 [(1215, 1), (1480, 1), (1485, 1)]]

In [17]:
# Look up the token stored under a given id.
# NOTE(review): 19266 is hard-coded; it is one of the ids in `other_corpus`
# and presumably changes if the dictionary is rebuilt from other data.
dct[19266]

'gerry'

In [20]:
# Look up the id assigned to a given token.
dct.token2id['gerry']

19266

# 六、模型持久化以及恢复

In [28]:
# Persist the TF-IDF model and the dictionary to disk.
fname = "./datas/tf_idf.model.pkl"
model.save(fname)

# The dictionary is written twice: once in binary form and once as a text file.
dict_binary_path = "./datas/dictionary.model.pkl"
dict_text_path = "./datas/dictionary.model.txt"
dct.save(dict_binary_path)
dct.save_as_text(dict_text_path)

In [29]:
# Restore the persisted TF-IDF model and dictionary.
# NOTE(review): gensim's save()/load() are pickle-based - only load files
# that come from a trusted source.
fname = "./datas/tf_idf.model.pkl"

# Load each dictionary format under its own name so the binary copy is not
# discarded by the text one. `re_dct` (text format) is the one used afterwards.
re_dct_binary = Dictionary.load("./datas/dictionary.model.pkl")  # binary format
re_dct = Dictionary.load_from_text("./datas/dictionary.model.txt")  # text format

# Both files were written from the same dictionary, so the token -> id
# mappings must agree; fail loudly if one of the files is stale.
assert re_dct_binary.token2id == re_dct.token2id, \
    "binary and text dictionary files do not describe the same mapping"

re_model = TfidfModel.load(fname)  # restore the TF-IDF model

In [30]:
# Repeat the conversion of the extra token lists, this time using the
# dictionary and TF-IDF model that were restored from disk.
others = [
    ['my', 'name', 'name', 'is', 'gerry'],
    ['my', 'name', 'is', 'xiaoming'],
]
other_corpus = list(map(re_dct.doc2bow, others))  # bag-of-words step
vectors = re_model[other_corpus]
print(*vectors, sep="\n")  # one vector per line

[(1215, 0.00015939590460450057), (1480, 0.17345090437798794), (1485, 0.024112029486193044), (19266, 0.9845472910924394)]
[(1215, 0.0009167561950501072), (1480, 0.9975927017705476), (1485, 0.0693394615801007)]
