# Configuration

In [1]:
import logging
# Emit timestamped INFO-level log records so library progress messages
# appear in the cell output below each step.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Generate model

In [2]:
from gensim import corpora
# Toy corpus: nine short titles, one string per document.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

2017-12-10 09:34:12,541 : INFO : 'pattern' package not found; tag filters are not available for English


In [3]:
from collections import defaultdict

# Tokenize: lowercase each document, split on whitespace, drop common stop words.
stoplist = set('for a of the and to in'.split())
texts = [
    [term for term in raw_doc.lower().split() if term not in stoplist]
    for raw_doc in documents
]

# Count how often every remaining token occurs across the whole corpus.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Keep only tokens that occur more than once corpus-wide.
texts = [
    [term for term in tokens if frequency[term] > 1]
    for tokens in texts
]

In [4]:
# Build the token -> integer id mapping from the filtered token lists.
dictionary = corpora.Dictionary(texts)
# Raw string: '\d' is not a recognised escape sequence, so the plain literal
# only kept its backslash by accident (DeprecationWarning since Python 3.6,
# SyntaxWarning in 3.12). The runtime value of the path is unchanged.
dictionary.save(r'temp\deerwester.dict')  # store the dictionary, for future reference
print(dictionary)

2017-12-10 09:34:14,152 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-12-10 09:34:14,155 : INFO : built Dictionary(12 unique tokens: ['user', 'human', 'survey', 'response', 'system']...) from 9 documents (total 29 corpus positions)
2017-12-10 09:34:14,157 : INFO : saving Dictionary object under temp\deerwester.dict, separately None
2017-12-10 09:34:14,167 : INFO : saved temp\deerwester.dict


Dictionary(12 unique tokens: ['user', 'human', 'survey', 'response', 'system']...)


In [5]:
# Show the token -> integer id mapping held by the dictionary.
print(dictionary.token2id)

{'user': 4, 'human': 1, 'survey': 6, 'response': 7, 'system': 5, 'computer': 2, 'eps': 8, 'trees': 9, 'interface': 0, 'minors': 11, 'time': 3, 'graph': 10}


In [6]:
# Convert an unseen document into (token_id, count) pairs using the existing
# dictionary; tokens that have no id in the dictionary do not appear in the result.
new_doc = "Human computer interaction"
new_tokens = new_doc.lower().split()
new_vec = dictionary.doc2bow(new_tokens)
new_vec

[(1, 1), (2, 1)]

In [7]:
# Display the filtered token lists, one list per document.
texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [8]:
# Convert every tokenized document into its sparse (token_id, count) vector.
corpus = [dictionary.doc2bow(text) for text in texts]
# Raw string: '\d' is not a recognised escape sequence, so the plain literal
# only kept its backslash by accident (DeprecationWarning since Python 3.6,
# SyntaxWarning in 3.12). The runtime value of the path is unchanged.
corpora.MmCorpus.serialize(r'temp\deerwester.mm', corpus)  # store to disk, for later use
corpus

2017-12-10 09:34:16,265 : INFO : storing corpus in Matrix Market format to temp\deerwester.mm
2017-12-10 09:34:16,267 : INFO : saving sparse matrix to temp\deerwester.mm
2017-12-10 09:34:16,269 : INFO : PROGRESS: saving document #0
2017-12-10 09:34:16,271 : INFO : saved 9x12 matrix, density=25.926% (28/108)
2017-12-10 09:34:16,281 : INFO : saving MmCorpus index to temp\deerwester.mm.index


[[(0, 1), (1, 1), (2, 1)],
 [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(0, 1), (4, 1), (5, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (4, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(6, 1), (10, 1), (11, 1)]]

# TF-IDF

In [9]:
import os

In [10]:
from gensim import corpora, models, similarities
# Reload the dictionary and corpus that the earlier cells wrote to disk.
# Raw strings keep the backslash literal ('\d' is an invalid escape sequence
# that newer Python versions warn about); the path values are unchanged.
dict_path = r'temp\deerwester.dict'
corpus_path = r'temp\deerwester.mm'

# Both files are loaded below, so require both before attempting to load;
# otherwise fall through to the hint instead of failing on a missing corpus.
if os.path.exists(dict_path) and os.path.exists(corpus_path):
    dictionary = corpora.Dictionary.load(dict_path)
    corpus = corpora.MmCorpus(corpus_path)
    print("Used files generated from first tutorial")
else:
    print("Please run first tutorial to generate data set")

2017-12-10 09:34:18,841 : INFO : loading Dictionary object from temp\deerwester.dict
2017-12-10 09:34:18,844 : INFO : loaded temp\deerwester.dict
2017-12-10 09:34:18,845 : INFO : loaded corpus index from temp\deerwester.mm.index
2017-12-10 09:34:18,847 : INFO : initializing corpus reader from temp\deerwester.mm
2017-12-10 09:34:18,849 : INFO : accepted corpus with 9 documents, 12 features, 28 non-zero entries


Used files generated from first tutorial


In [11]:
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)
# Apply the fitted model to the same corpus and print each document's
# (token_id, weight) pairs.
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

2017-12-10 09:34:19,302 : INFO : collecting document frequencies
2017-12-10 09:34:19,305 : INFO : PROGRESS: processing document #0
2017-12-10 09:34:19,306 : INFO : calculating IDF weights for 9 documents and 11 features (28 matrix non-zeros)


[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(2, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.3244870206138555), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.44424552527467476)]
[(0, 0.5710059809418182), (4, 0.4170757362022777), (5, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (4, 0.45889394536615247), (7, 0.6282580468670046)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(6, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


In [12]:
# Print each TF-IDF document with token strings in place of integer ids.
# The weight is bound to `weight` rather than `tfidf` so the comprehension
# cannot shadow (or, under Python 2 scoping, overwrite) the fitted `tfidf`
# model, and the loop variable is no longer rebound inside its own loop.
for doc in corpus_tfidf:
    readable_doc = [(dictionary.get(idx), weight) for idx, weight in doc]
    print(readable_doc)

[('interface', 0.5773502691896257), ('human', 0.5773502691896257), ('computer', 0.5773502691896257)]
[('computer', 0.44424552527467476), ('time', 0.44424552527467476), ('user', 0.3244870206138555), ('system', 0.3244870206138555), ('survey', 0.44424552527467476), ('response', 0.44424552527467476)]
[('interface', 0.5710059809418182), ('user', 0.4170757362022777), ('system', 0.4170757362022777), ('eps', 0.5710059809418182)]
[('human', 0.49182558987264147), ('system', 0.7184811607083769), ('eps', 0.49182558987264147)]
[('time', 0.6282580468670046), ('user', 0.45889394536615247), ('response', 0.6282580468670046)]
[('trees', 1.0)]
[('trees', 0.7071067811865475), ('graph', 0.7071067811865475)]
[('trees', 0.5080429008916749), ('graph', 0.5080429008916749), ('minors', 0.695546419520037)]
[('survey', 0.6282580468670046), ('graph', 0.45889394536615247), ('minors', 0.6282580468670046)]


# Cosine Similarity

In [13]:
# Build a MatrixSimilarity index over the TF-IDF-weighted corpus.
index = similarities.MatrixSimilarity(corpus_tfidf)

2017-12-10 09:34:20,686 : INFO : creating matrix with 9 documents and 12 features


In [14]:
import numpy as np
# Materialize the index as an array; the displayed result is a square,
# symmetric matrix with ~1.0 on the diagonal (presumably pairwise cosine
# similarity between the indexed documents -- confirm against gensim docs).
np.array(index)

array([[ 0.99999994,  0.25648525,  0.32967046,  0.28395563,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.25648525,  1.        ,  0.27067134,  0.23313783,  0.70710683,
         0.        ,  0.        ,  0.        ,  0.27910084],
       [ 0.32967046,  0.27067134,  1.        ,  0.58049643,  0.19139352,
         0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.28395563,  0.23313783,  0.58049643,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710683,  0.19139352,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.70710677,  0.50804287,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.70710677,  0.99999994,  0.71848112,  0.32448703],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0