In [1]:
import logging
# Configure the root logger once, up front: every record at INFO level or above is
# emitted as "<timestamp> : <level> : <message>", so informational log output is
# visible while the following cells run.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Reuse the corpus from tutorial 1: the dictionary and the bag-of-words corpus are
# loaded from files under tmp/ (a path relative to the working directory), which are
# presumably written by that tutorial - run it first if the files are missing.
# NOTE(review): `similarities` is imported but not used in the cells visible here.

from gensim import corpora, models, similarities
dictionary = corpora.Dictionary.load('tmp/deerwester.dict')
corpus = corpora.MmCorpus('tmp/deerwester.mm')
# Printing the corpus shows a one-line summary: documents / features / non-zero entries.
print(corpus)

MmCorpus(9 documents, 12 features, 28 non-zero entries)


In [4]:
# Goal: transform documents from one vector representation into another.
# This serves two purposes:
# 1. To bring out hidden structure in the corpus - discover relationships between words
#    and use them to describe the documents in a new and (hopefully) more semantic way.
# 2. To make the document representation more compact - this improves both efficiency
#    (the new representation consumes fewer resources) and efficacy (marginal data
#    trends are ignored, i.e. noise reduction).

# Creating a transformation: transformations are standard Python objects, initialized
# by means of a training corpus (here the bag-of-words corpus loaded earlier).
tfidf = models.TfidfModel(corpus) # step 1: initialize a model

In [6]:
# Transforming vectors
# TF-IDF = Term Frequency * Inverse Document Frequency.
# From here on `tfidf` is treated as a read-only object that converts any vector from the
# old representation (bag-of-words integer counts) to the new representation
# (TF-IDF real-valued weights). Indexing the model with a document applies the transformation.
doc_bow = [(0,1), (1,1)] # (feature id, count) pairs: features 0 and 1, each counted once
print(tfidf[doc_bow]) # step 2: use the model to transform vectors

[(0, 0.7071067811865476), (1, 0.7071067811865476)]


In [9]:
# The same indexing syntax applies the transformation to a whole corpus.
# Each printed document is a list of (feature id, TF-IDF weight) pairs.
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.44424552527467476), (6, 0.3244870206138555), (7, 0.3244870206138555)]
[(2, 0.5710059809418182), (6, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (6, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (5, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


In [10]:
# Transformations can also be chained, one on top of another.
# Here the LSI model is trained on the TF-IDF corpus rather than on the raw counts;
# id2word=dictionary supplies the id -> word mapping, and num_topics=2 asks for two topics.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

# The TF-IDF corpus is now mapped via Latent Semantic Indexing into a latent 2-D space
# (2-D because num_topics=2 was requested).

In [11]:
lsi.print_topics(2)
# According to LSI, "trees", "graph" and "minors" are all related words: they carry the
# largest weights in topic 0 of the output, while topic 1 is dominated by the remaining
# words ("system", "user", "eps", "interface", ...).

[(0,
  '0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"response" + 0.060*"time" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'),
 (1,
  '-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')]

In [12]:
# Iterating over corpus_lsi is what triggers the work: both the bow->tfidf and the
# tfidf->lsi transformations are actually executed here, on the fly.
# Each printed document is a list of (topic id, weight) pairs, one per LSI topic.
for doc in corpus_lsi:
    print(doc)

[(0, 0.066007833960904816), (1, -0.52007033063618446)]
[(0, 0.19667592859142757), (1, -0.76095631677000519)]
[(0, 0.089926399724465672), (1, -0.72418606267525032)]
[(0, 0.075858476521782681), (1, -0.63205515860034223)]
[(0, 0.10150299184980338), (1, -0.5737308483002963)]
[(0, 0.70321089393783076), (1, 0.16115180214026009)]
[(0, 0.87747876731198271), (1, 0.1675890686465964)]
[(0, 0.90986246868185749), (1, 0.14086553628719223)]
[(0, 0.6165825350569285), (1, -0.053929075663892836)]


In [13]:
# Model persistence is handled by the save() and load() methods:
lsi.save('tmp/model.lsi') # same for tfidf, lda, ..
# Reload the model from the file just written; this rebinds `lsi` to the loaded copy.
lsi = models.LsiModel.load('tmp/model.lsi')