Gensim is a Python library for topic modelling, document indexing and similarity retrieval with large corpora. 
Its target audience is the natural language processing (NLP) and information retrieval (IR) communities.

https://radimrehurek.com/gensim/auto_examples/index.html

In [2]:
from gensim import corpora, models, similarities
from pprint import pprint

### 向量空间

#### dictionary和corpus

In [7]:
def GenDictandCorpus():
    """Build the toy dictionary and bag-of-words corpus used in this notebook.

    Returns:
        A ``(dictionary, corpus)`` pair: the ``corpora.Dictionary`` built from
        the lower-cased, whitespace-split sample sentences, and a list with
        one ``doc2bow`` result per sentence.
    """
    raw_sentences = (
        "Human machine interface for lab abc computer applications",
        "A survey of user opinion of computer system response time",
        "The EPS user interface management system",
        "System and human system engineering testing of EPS",
        "Relation of user perceived response time to error measurement",
        "The generation of random binary unordered trees",
        "The intersection graph of paths in trees",
        "Graph minors IV Widths of trees and well quasi ordering",
        "Graph minors A survey",
    )

    # Tokenise: lower-case each sentence and split it on whitespace.
    tokenised = [sentence.lower().split() for sentence in raw_sentences]

    # The dictionary assigns an integer id to every distinct token.
    vocab = corpora.Dictionary(tokenised)

    # One row per document; each row is a list of (token id, count) pairs.
    bow_corpus = [vocab.doc2bow(tokens) for tokens in tokenised]
    return vocab, bow_corpus


dictionary, corpus = GenDictandCorpus()
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)], [(5, 1), (13, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(4, 1), (9, 1), (13, 2), (16, 1), (19, 1), (20, 1), (21, 1)], [(9, 1), (11, 1), (14, 1), (15, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)], [(9, 1), (18, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)], [(9, 1), (18, 1), (30, 1), (32, 1), (33, 1), (34, 1), (35, 1)], [(9, 1), (19, 1), (30, 1), (32, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1)], [(8, 1), (12, 1), (32, 1), (37, 1)]]


#### TF-IDF

In [12]:
def Tfidf():
    """Return the sample corpus re-weighted by a TF-IDF model.

    The model is fitted on the bag-of-words corpus produced by
    ``GenDictandCorpus()`` and then applied to that same corpus.

    Returns:
        The result of indexing the fitted ``models.TfidfModel`` with the
        whole bag-of-words corpus.
    """
    _, bow_corpus = GenDictandCorpus()

    # Fit the model; from here on it is used only as a transformation from
    # the count representation to real-valued TF-IDF weights.
    weighting = models.TfidfModel(bow_corpus)

    # A single bag-of-words vector can be converted the same way, e.g.
    #     weighting[[(0, 1), (1, 1)]]

    # Apply the transformation to the whole corpus.
    return weighting[bow_corpus]


# Note: this global holds the transformed corpus, not the TfidfModel itself.
tfidf = Tfidf()

#### LDA
https://radimrehurek.com/gensim/models/ldamodel.html

In [46]:
def LDA():
    """Train a two-topic LDA model on the sample corpus and print its topics.

    Returns:
        The trained ``models.LdaModel``.
    """
    vocab, bow_corpus = GenDictandCorpus()

    # NOTE(review): no random_state is passed, so the printed topics may
    # differ from run to run -- confirm whether reproducibility matters here.
    topic_model = models.LdaModel(bow_corpus, id2word=vocab, num_topics=2)

    topics = topic_model.print_topics()
    pprint(topics)
    return topic_model


lda = LDA()

[(0,
  '0.096*"of" + 0.045*"system" + 0.044*"graph" + 0.044*"trees" + 0.036*"user" '
  '+ 0.035*"the" + 0.033*"response" + 0.033*"time" + 0.033*"a" + '
  '0.032*"minors"'),
 (1,
  '0.060*"interface" + 0.044*"system" + 0.043*"eps" + 0.041*"human" + '
  '0.038*"the" + 0.036*"user" + 0.036*"abc" + 0.036*"computer" + 0.036*"lab" + '
  '0.036*"management"')]


In [18]:
# Extra tokenised documents, converted to bag-of-words with the shared
# dictionary so the LDA model can score them.
other_texts = [
    ['computer', 'time', 'graph'],
    ['survey', 'response', 'eps'],
    ['human', 'system', 'computer'],
]
other_corpus = list(map(dictionary.doc2bow, other_texts))

# Topic distribution of the first extra document.
unseen_doc = other_corpus[0]
vector = lda[unseen_doc]
vector

[(0, 0.14493868), (1, 0.8550613)]

### 潜在语义索引
潜在语义索引（Latent Semantic Indexing，以下简称LSI），有的文章也称为Latent Semantic Analysis（LSA）。
LSI是基于奇异值分解（SVD）来得到文本主题的方法。

In [23]:
def LSI():
    """Fit a two-topic LSI model on the TF-IDF corpus and print the results.

    Prints the two topics, then the LSI vector of every document.

    Returns:
        The trained ``models.LsiModel``.
    """
    vocab, _ = GenDictandCorpus()
    weighted_corpus = Tfidf()

    # Initialise the LSI transformation from TF-IDF weights (not raw counts).
    model = models.LsiModel(weighted_corpus, id2word=vocab, num_topics=2)
    pprint(model.print_topics(2))

    # Iterating is where the bow -> tfidf and tfidf -> lsi conversions
    # are actually carried out.
    for doc_vector in model[weighted_corpus]:
        print(doc_vector)

    # To persist and restore the model:
    #     model.save('/tmp/model.lsi')
    #     model = models.LsiModel.load('/tmp/model.lsi')
    return model


lsi = LSI()

[(0,
  '0.331*"system" + 0.329*"a" + 0.329*"survey" + 0.241*"user" + 0.234*"minors" '
  '+ 0.217*"opinion" + 0.215*"eps" + 0.212*"graph" + 0.205*"response" + '
  '0.205*"time"'),
 (1,
  '0.330*"minors" + -0.313*"eps" + -0.301*"system" + 0.288*"graph" + 0.274*"a" '
  '+ 0.274*"survey" + -0.268*"management" + -0.262*"interface" + '
  '-0.208*"human" + -0.189*"testing"')]
[(0, 0.25053350643882555), (1, -0.36663044623753294)]
[(0, 0.7126217528750018), (1, 0.11068666534656395)]
[(0, 0.4918324800213249), (1, -0.5557409774957023)]
[(0, 0.45394917960264897), (1, -0.5034737767110555)]
[(0, 0.3371561211589134), (1, -0.02013888023727338)]
[(0, 0.1602526256061259), (1, 0.03339166766627265)]
[(0, 0.2620936417258959), (1, 0.22806265366020462)]
[(0, 0.32989319215286433), (1, 0.40778939730084907)]
[(0, 0.5563151509435186), (1, 0.5788024705523834)]


### 随机投影

In [25]:
# Random Projections (RP) aim to reduce the dimensionality of the vector space.
# It is a very efficient way to approximate the TF-IDF distances between
# documents by throwing in a little randomness.
# The recommended target dimensionality is in the hundreds or thousands,
# depending on the dataset.
def RP():
    """Fit a two-dimensional random projection on the TF-IDF corpus.

    Prints the projected vector of every document.

    Returns:
        The trained ``models.RpModel``.
    """
    weighted_corpus = Tfidf()
    projection = models.RpModel(weighted_corpus, num_topics=2)

    for doc_vector in projection[weighted_corpus]:
        print(doc_vector)
    return projection


rp = RP()

[(0, 0.29435908794403076), (1, -0.08813542127609253)]
[(0, 0.11182543635368347), (1, 0.14982929825782776)]
[(0, 0.21567857265472412), (1, -1.078392505645752)]
[(0, -0.6272317171096802), (1, -0.503377377986908)]
[(0, 0.1940629482269287), (1, 0.5821888446807861)]
[(0, -0.393359512090683), (1, 0.7254387140274048)]
[(0, -0.6123313307762146), (1, -0.841675341129303)]
[(0, 0.1240825355052948), (1, 0.05129450559616089)]
[(0, -0.8537415266036987), (1, -0.1014062762260437)]


## 相似性

In [27]:
from gensim import similarities

In [29]:
# Transform the corpus into LSI space and build a similarity index over it.
# NOTE(review): `lsi` was trained on the TF-IDF corpus inside LSI(), but it is
# applied here to the raw bag-of-words `corpus` -- confirm this is intended.
lsi_corpus = lsi[corpus]
index = similarities.MatrixSimilarity(lsi_corpus)

In [31]:
# Search: score every indexed document against a free-text query.
doc = "human computer interaction"  # query
vec_bow = dictionary.doc2bow(doc.lower().split())

# Convert the query to LSI space.
# NOTE(review): the query is passed to `lsi` as raw bag-of-words, while `lsi`
# was trained on TF-IDF vectors inside LSI() -- confirm this is intended.
vec_lsi = lsi[vec_bow]

# Pair each similarity with its document position, best match first.
sims = index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
sims

[(3, 0.99545485),
 (2, 0.991444),
 (0, 0.96337396),
 (4, 0.8171174),
 (1, 0.7188792),
 (5, 0.65916455),
 (6, 0.27311423),
 (8, 0.08344132),
 (7, 0.0708783)]

### Doc2Vec
In Gensim, we refer to the Paragraph Vector model as Doc2Vec

In [40]:
from gensim.models import doc2vec

In [41]:

# Toy training set: every document is already a list of tokens.
documents = [
    ['computer', 'time', 'graph'],
    ['survey', 'response', 'eps'],
    ['human', 'system', 'computer'],
]

# Tag each document with its position (as a string), so tags and documents
# correspond one-to-one.
train_corpus = [
    doc2vec.TaggedDocument(words, [str(tag)])
    for tag, words in enumerate(documents)
]

# min_count=1: in this corpus every token except 'computer' occurs only once,
# so the previous min_count=2 discarded almost the entire vocabulary.
model = doc2vec.Doc2Vec(vector_size=50, min_count=1, epochs=40)
model.build_vocab(train_corpus)

# Train the model.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [45]:
# Infer a vector for a new tokenised document with the trained Doc2Vec model.
# NOTE(review): none of these tokens occur in the training `documents`, so they
# are presumably all out-of-vocabulary -- confirm the example is meaningful.
new_doc_tokens = ['only', 'you', 'can', 'prevent', 'forest', 'fires']
vector = model.infer_vector(new_doc_tokens)
print(vector)

[-0.00875808 -0.00194624  0.00502243 -0.00473971 -0.00615937 -0.00218707
 -0.00687608  0.00440042  0.00566705 -0.00725835 -0.0080782  -0.00899301
  0.00635291 -0.00534122 -0.00673877  0.00519126 -0.00184545 -0.00062682
  0.00760184  0.0056194   0.00812614  0.00067861  0.00808792  0.00721509
 -0.00352193  0.00754973  0.00659988  0.00525765  0.00172301  0.00030624
 -0.00013697  0.00058433  0.00882175  0.00545469 -0.00337192  0.00120288
  0.00437443  0.0084803  -0.00418913 -0.00756418 -0.0009385  -0.00829048
 -0.00253594 -0.00590744  0.00769454 -0.00261715  0.00033818 -0.0016226
 -0.00664085  0.00788883]


### fasttext

In [47]:
# `common_texts` is a small tokenised sample corpus shipped with gensim's
# test utilities; display its first document to show the expected input shape.
from gensim.test.utils import common_texts
common_texts[0]

['human', 'interface', 'computer']

In [None]:
# FastText is reached through the already-imported `models` package; the bare
# name `FastText` is never imported in this notebook and raised NameError.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` (and `iter` to `epochs`) -- adjust to the installed version.
model = models.FastText(size=4, window=3, min_count=1)

# The corpus is passed positionally because its keyword name differs between
# gensim 3.x (`sentences`) and 4.x (`corpus_iterable`).
model.build_vocab(common_texts)
model.train(common_texts, total_examples=len(common_texts), epochs=10)  # train

# Alternatively, pass all of the above to the constructor to do everything
# in a single line:
# model2 = models.FastText(size=4, window=3, min_count=1, sentences=common_texts, iter=10)