In [54]:
import datetime

In [55]:
from gensim import corpora
from gensim import models
from pprint import pprint  # pretty-printer
from gensim import similarities

In [56]:
import re

In [57]:
from nltk.corpus import stopwords
from nltk import PorterStemmer

In [58]:
# Wall-clock timestamp taken before the model is built; it is the start point
# of the execution-time report printed at the end of the notebook.
init_t: datetime.datetime = datetime.datetime.now()

In [59]:
# Toy corpus: one string per document. The position of a document in this list
# is the position used later to report its similarity score.
documents = [
    "Human machine survey computer interface interface eps time for lab abc computer applications user",
    "A survey of user opinion of computer system user response time computer user interface interface",
    "The EPS user users interfaces interface human interface computer human management system user",
    "System and human interface interface engineering testing of EPS computer user",
    "Relation of users perceived response time to error measurement trees",
    "The generation of random binary unordered paths minors user user computer",
    "The intersection graph of paths in trees paths trees",
    "Graph minors IV Widths of trees and well quasi ordering graph paths",
    "Graph minors A tree paths binary trees graphs",
]

In [60]:
# Stemmer shared by the corpus preprocessing and the query preprocessing.
porter = PorterStemmer()

Remove common (stop) words, tokenize, and stem each document

In [61]:
# English stop words that are dropped before stemming.
stoplist = stopwords.words('english')


def _stems_of(document):
    """Lowercase a document, split it on whitespace, drop stop words and stem the rest."""
    kept_words = filter(lambda word: word not in stoplist, document.lower().split())
    return [porter.stem(word) for word in kept_words]


# One list of stems per document, in the same order as `documents`.
texts = list(map(_stems_of, documents))

In [62]:
# Show the stemmed tokens kept for every document.
print("Tokens of each document:")
pprint(texts)

Tokens of each document:
[['human',
  'machin',
  'survey',
  'comput',
  'interfac',
  'interfac',
  'ep',
  'time',
  'lab',
  'abc',
  'comput',
  'applic',
  'user'],
 ['survey',
  'user',
  'opinion',
  'comput',
  'system',
  'user',
  'respons',
  'time',
  'comput',
  'user',
  'interfac',
  'interfac'],
 ['ep',
  'user',
  'user',
  'interfac',
  'interfac',
  'human',
  'interfac',
  'comput',
  'human',
  'manag',
  'system',
  'user'],
 ['system',
  'human',
  'interfac',
  'interfac',
  'engin',
  'test',
  'ep',
  'comput',
  'user'],
 ['relat', 'user', 'perceiv', 'respons', 'time', 'error', 'measur', 'tree'],
 ['gener',
  'random',
  'binari',
  'unord',
  'path',
  'minor',
  'user',
  'user',
  'comput'],
 ['intersect', 'graph', 'path', 'tree', 'path', 'tree'],
 ['graph',
  'minor',
  'iv',
  'width',
  'tree',
  'well',
  'quasi',
  'order',
  'graph',
  'path'],
 ['graph', 'minor', 'tree', 'path', 'binari', 'tree', 'graph']]


create mapping keyword-id

In [63]:
# Build the token <-> integer id mapping from the stemmed corpus.
dictionary = corpora.Dictionary(texts)

In [64]:
# Show the token -> id mapping held by the dictionary.
print()
print("Mapping keyword-id:")
pprint(dictionary.token2id)


Mapping keyword-id:
{'abc': 0,
 'applic': 1,
 'binari': 22,
 'comput': 2,
 'engin': 15,
 'ep': 3,
 'error': 17,
 'gener': 23,
 'graph': 28,
 'human': 4,
 'interfac': 5,
 'intersect': 29,
 'iv': 30,
 'lab': 6,
 'machin': 7,
 'manag': 14,
 'measur': 18,
 'minor': 24,
 'opinion': 11,
 'order': 31,
 'path': 25,
 'perceiv': 19,
 'quasi': 32,
 'random': 26,
 'relat': 20,
 'respons': 12,
 'survey': 8,
 'system': 13,
 'test': 16,
 'time': 9,
 'tree': 21,
 'unord': 27,
 'user': 10,
 'well': 33,
 'width': 34}


Create the bag-of-words vector for each doc

In [65]:
# Convert every tokenized document to its bag-of-words vector via the dictionary.
model_bow = [dictionary.doc2bow(text) for text in texts]

create tfidf model

In [66]:
# Fit the TF-IDF model on the bag-of-words corpus ...
tfidf = models.TfidfModel(model_bow)
# ... and apply it to that same corpus to get one TF-IDF vector per document.
tfidf_vectors = tfidf[model_bow]

In [67]:
# Plain-dict copy of the dictionary's items.
# NOTE(review): the `id2token` variable is not read by any later visible cell —
# confirm whether this cell is still needed (e.g. for a side effect of iterating
# the dictionary) before removing it.
id2token = dict(dictionary.items())

In [68]:
def convert(match):
    """Return the token whose id is captured by a ``[0-9]+,`` regex match.

    Meant to be used as the replacement callback of ``re.sub``: the matched
    text is a token id immediately followed by a comma (e.g. ``"12,"``).

    Args:
        match: regex match object whose group 0 is ``"<digits>,"``.

    Returns:
        The token that ``dictionary`` maps to that id.

    Raises:
        KeyError: if the id is not present in ``dictionary``.
    """
    token_id = int(match.group(0)[:-1])  # drop the trailing comma
    # Go through the mapping interface instead of reading the `id2token`
    # attribute directly, so the lookup does not depend on some other cell
    # having populated that attribute beforehand.
    return dictionary[token_id]

In [69]:
print()
print("Vectors for documents (the positions with zeros are not shown):")
# Each "<id>," in the printed vector is swapped for the token text by `convert`,
# so the output reads "(token weight)" instead of "(id, weight)".
_id_prefix = re.compile("[0-9]+,")
for doc in tfidf_vectors:
    readable = _id_prefix.sub(convert, str(doc))
    print(readable)


Vectors for documents (the positions with zeros are not shown):
[(abc 0.40542866711280406), (applic 0.40542866711280406), (comput 0.21691507236538704), (ep 0.20271433355640203), (human 0.20271433355640203), (interfac 0.29926331616103624), (lab 0.40542866711280406), (machin 0.40542866711280406), (survey 0.2775301625966611), (time 0.20271433355640203), (user 0.07481582904025906)]
[(comput 0.2829774401730927), (interfac 0.39040517665048174), (survey 0.36205310274675534), (time 0.2644518085841349), (user 0.2928038824878613), (opinion 0.5289036171682698), (respons 0.36205310274675534), (system 0.2644518085841349)]
[(comput 0.1320516506982879), (ep 0.24681329955011175), (human 0.4936265991002235), (interfac 0.5465486717208087), (user 0.27327433586040434), (system 0.24681329955011175), (manag 0.4936265991002235)]
[(comput 0.1450694414669421), (ep 0.2711444145000151), (human 0.2711444145000151), (interfac 0.4002853435089738), (user 0.10007133587724346), (system 0.2711444145000151), (engin 0.5

In [70]:
# Similarity index over the TF-IDF vectors of the corpus; it is queried later
# to score a new document against every document of the corpus.
matrix_tfidf = similarities.MatrixSimilarity(tfidf_vectors)

In [71]:
# Timestamp taken right after the similarity matrix is built: end of the
# model-creation interval and start of the comparison interval.
end_creation_model_t: datetime.datetime = datetime.datetime.now()

In [72]:
# Printing the index shows a one-line summary (document and feature counts),
# not the individual similarity values.
print()
print("Matrix similarities")
print(matrix_tfidf)


Matrix similarities
MatrixSimilarity<9 docs, 35 features>


obtain tfidf vector for the following doc

In [80]:
# Query document to compare against the corpus.
doc = "trees graph human"
# Same preprocessing as the corpus: lowercase, split, drop stop words, stem.
doc_s = list(map(porter.stem, (word for word in doc.lower().split() if word not in stoplist)))

In [81]:
# Map the query tokens to a bag-of-words vector with the corpus dictionary,
# then weight it with the TF-IDF model fitted on the corpus.
vec_bow = dictionary.doc2bow(doc_s)
vec_tfidf = tfidf[vec_bow]

Calculate the cosine similarity between the query doc and each document of the corpus, using their TF-IDF vectors

In [75]:
# Similarity of the query vector to every document of the corpus, indexed by
# the document's position in the corpus.
sims = matrix_tfidf[vec_tfidf]

sort similarities in descending order

In [76]:
# Pair every score with its document position and order the pairs from the
# highest score to the lowest; ties keep their original relative order.
sims = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)

In [77]:
# Report the query, its TF-IDF vector, and the corpus documents ranked by similarity.
print()
print(f"Given the doc: {doc}")
print(f"whose tfidf vector is: {vec_tfidf}")
print()
print("The Similarities between this doc and the documents of the corpus are:")
for doc_position, doc_score in sims:
    matched_document = documents[doc_position]
    print(doc_score, matched_document)


Given the doc: trees graph human
whose tfidf vector is: [(4, 0.6268574434403934), (21, 0.46270886225222435), (28, 0.6268574434403934)]

The Similarities between this doc and the documents of the corpus are:
0.6251458 Graph minors A tree paths binary trees graphs
0.42819637 The intersection graph of paths in trees paths trees
0.31231773 Graph minors IV Widths of trees and well quasi ordering graph paths
0.30943352 The EPS user users interfaces interface human interface computer human management system user
0.1699689 System and human interface interface engineering testing of EPS computer user
0.12707299 Human machine survey computer interface interface eps time for lab abc computer applications user
0.07723485 Relation of users perceived response time to error measurement trees
0.0 A survey of user opinion of computer system user response time computer user interface interface
0.0 The generation of random binary unordered paths minors user user computer


In [78]:
# Timestamp marking the end of the program; closes the comparison interval.
end_t: datetime.datetime = datetime.datetime.now()

get execution time

In [79]:
# Subtracting two datetimes yields a timedelta, not a datetime.
# Interval from notebook start to the end of similarity-matrix creation.
elapsed_time_model_creation: datetime.timedelta = end_creation_model_t - init_t
# Interval from the end of similarity-matrix creation to the end of the program.
elapsed_time_comparison: datetime.timedelta = end_t - end_creation_model_t
print()
# Print a plain number of seconds so the value matches the 'seconds' label
# (a raw timedelta would render as H:MM:SS.ffffff).
print('Execution time model:', elapsed_time_model_creation.total_seconds(), 'seconds')
print('Execution time comparison:', elapsed_time_comparison.total_seconds(), 'seconds')


Execution time model: 0:00:00.249571 seconds
Execution time comparison: 0:00:00.109482 seconds
