In [1]:
import jieba
from gensim import corpora, models, similarities

In [106]:
l1 = ['所有者权益合计','股东权益合计','所有者权益（或股东权益)合计']
l1

['所有者权益合计', '股东权益合计', '所有者权益（或股东权益)合计']

In [107]:

all_doc_list = []  #
for doc in l1:
    # 利用jieba分词将语料库中的每一个问题切割
    doc_list = [word for word in jieba.cut(doc)]
    all_doc_list.append(doc_list)
print(all_doc_list)

[['所有者', '权益', '合计'], ['股东权益', '合计'], ['所有者', '权益', '（', '或', '股东权益', ')', '合计']]


In [108]:
# 用户问的问题
a = "归属于母公司所有者权益合计"
doc_test_list = [word for word in jieba.cut(a)]
print(doc_test_list)

['归属于', '母公司', '所有者', '权益', '合计']


In [109]:
# 制作语料库
dictionary = corpora.Dictionary(all_doc_list)  # 制作词袋
# 词袋:是根据当前所有的问题即列表all_doc_list中每一个列表中的每一个元素(就是字)为他们做一个唯一的标志,形成一个key:velue的字典
print("token2id", dictionary.token2id)

token2id {'合计': 0, '所有者': 1, '权益': 2, '股东权益': 3, ')': 4, '或': 5, '（': 6}


In [110]:
# 制作语料库
# 这里是将all_doc_list 中的每一个列表中的词语 与 dictionary 中的Key进行匹配
# doc2bow文本变成id,这个词在当前的列表中出现的次数
# ['你', '的', '名字', '是', '什么'] ==>(1,1),(4,1),(2,1),(3,1),(0,1)
# 1是你 1代表出现一次, 4是的  1代表出现了一次, 以此类推 2是名字 , 3是是,0是什么
corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]
print("corpus", corpus, type(corpus))

corpus [[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]] <class 'list'>


In [111]:
# ['你', '今年', '多大', '了']  词袋中没有多大165
# 所以需要词向量化

# 将需要寻找相似度的分词列表 做成 语料库 doc_test_vec
# ['你', '今年', '多大', '了']  (1, 1), (5, 1), (6, 1)
doc_test_vec = dictionary.doc2bow(doc_test_list)
print("doc_test_vec", doc_test_vec, type(doc_test_vec))

doc_test_vec [(0, 1), (1, 1), (2, 1)] <class 'list'>


In [112]:
# 将corpus语料库(初识语料库) 使用Lsi模型进行训练,将语料库变成计算机可识别可读的数字
lsi = models.LsiModel(corpus)
print("lsi", lsi, type(lsi))

lsi LsiModel(num_terms=7, num_topics=200, decay=1.0, chunksize=20000) <class 'gensim.models.lsimodel.LsiModel'>


In [113]:
# 语料库corpus的训练结果
print("lsi[corpus]", lsi[corpus])

lsi[corpus] <gensim.interfaces.TransformedCorpus object at 0x7ff5f2ad9950>


In [114]:
# 将问题放到放到已经训练好的语料库模型一个一个匹配,获取匹配分值
# 获得语料库doc_test_vec 在 语料库corpus的训练结果 中的 向量表示
print("lsi[doc_test_vec]", lsi[doc_test_vec])

lsi[doc_test_vec] [(0, 1.3688940110590953), (1, -1.0079078146174605), (2, -0.33204069587867663)]


In [115]:
# lsi[corpus]==>Lsi训练好的语料库模型
# index是设定的匹配相识度的条件
index = similarities.SparseMatrixSimilarity(lsi[corpus], num_features=len(dictionary.keys()))
print("index", index, type(index))

index <gensim.similarities.docsim.SparseMatrixSimilarity object at 0x7ff5f70bd890> <class 'gensim.similarities.docsim.SparseMatrixSimilarity'>


In [116]:
# 将 语料库doc_test_vec 在 语料库corpus的训练结果 中的 向量表示 与 语料库corpus的 向量表示 做矩阵相似度计算
gongyinshi = lsi[doc_test_vec]
print(gongyinshi)
sim = index[gongyinshi]

print("sim", sim, type(sim))

[(0, 1.3688940110590953), (1, -1.0079078146174605), (2, -0.33204069587867663)]
sim [1.         0.40824825 0.65465367] <class 'numpy.ndarray'>


In [117]:
# 对下标和相似度结果进行一个排序,拿出相似度最高的结果
# cc = sorted(enumerate(sim), key=lambda item: item[1],reverse=True)
cc = sorted(enumerate(sim), key=lambda item: -item[1])
print(cc)

[(0, 1.0), (2, 0.65465367), (1, 0.40824825)]


In [118]:
text = l1[cc[0][0]]
if cc[0][1] > 0:
    print(a, text)

归属于母公司所有者权益合计 所有者权益合计


In [119]:
from collections import defaultdict
from gensim import corpora

In [120]:
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [121]:
# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

In [122]:
texts

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]

In [124]:
# remove words that appear only once
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
print(frequency)

defaultdict(<class 'int'>, {'human': 2, 'machine': 1, 'interface': 2, 'lab': 1, 'abc': 1, 'computer': 2, 'applications': 1, 'survey': 2, 'user': 3, 'opinion': 1, 'system': 4, 'response': 2, 'time': 2, 'eps': 2, 'management': 1, 'engineering': 1, 'testing': 1, 'relation': 1, 'perceived': 1, 'error': 1, 'measurement': 1, 'generation': 1, 'random': 1, 'binary': 1, 'unordered': 1, 'trees': 3, 'intersection': 1, 'graph': 3, 'paths': 1, 'minors': 2, 'iv': 1, 'widths': 1, 'well': 1, 'quasi': 1, 'ordering': 1})


In [125]:
texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]

In [126]:
texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [127]:
dictionary = corpora.Dictionary(texts)

In [128]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x7ff5f0afe490>

In [130]:
dictionary.token2id

{'computer': 0,
 'human': 1,
 'interface': 2,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'user': 7,
 'eps': 8,
 'trees': 9,
 'graph': 10,
 'minors': 11}

In [131]:
corpus = [dictionary.doc2bow(text) for text in texts]
corpus

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]

In [136]:
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

In [138]:
doc = "Human computer interaction"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]  # convert the query to LSI space
print(vec_lsi)

[(0, 0.46182100453271613), (1, -0.07002766527900031)]


In [139]:
index = similarities.MatrixSimilarity(lsi[corpus])  # transform corpus to LSI space and index it

In [140]:
sims = index[vec_lsi]  # perform a similarity query against the corpus
print(list(enumerate(sims)))  # print (document_number, document_similarity) 2-tuples

[(0, 0.998093), (1, 0.93748635), (2, 0.9984453), (3, 0.9865886), (4, 0.90755945), (5, -0.12416792), (6, -0.10639259), (7, -0.09879464), (8, 0.050041765)]


In [141]:
sims = sorted(enumerate(sims), key=lambda item: -item[1])
for i, s in enumerate(sims):
    print(s, documents[i])

(2, 0.9984453) Human machine interface for lab abc computer applications
(0, 0.998093) A survey of user opinion of computer system response time
(3, 0.9865886) The EPS user interface management system
(1, 0.93748635) System and human system engineering testing of EPS
(4, 0.90755945) Relation of user perceived response time to error measurement
(8, 0.050041765) The generation of random binary unordered trees
(7, -0.09879464) The intersection graph of paths in trees
(6, -0.10639259) Graph minors IV Widths of trees and well quasi ordering
(5, -0.12416792) Graph minors A survey
