In [2]:
import re
import datetime

In [1]:
from gensim import corpora
from gensim import models
from gensim import similarities
from pprint import pprint  # pretty-printer

In [3]:
from nltk import PorterStemmer
from nltk.corpus import stopwords

In [4]:
# Wall-clock timestamp taken before the corpus is processed; it is the start
# point of the model-creation timing computed at the end of the notebook.
# The annotation names the class (datetime.datetime), not the datetime module.
init_t: datetime.datetime = datetime.datetime.now()

In [5]:
# Alternative sample corpus (nine short titles).
# NOTE(review): nothing in the visible notebook reads `documents1`; the pipeline
# below is built from `documents` instead — confirm whether this cell is still needed.
documents1 = [
    "Human machine survey computer interface interface eps time for lab abc computer applications user",
    "A survey of user opinion of computer system user response time computer user interface interface",
    "The EPS user users interfaces interface human interface computer human management system user",
    "System and human interface interface engineering testing of EPS computer user",
    "Relation of users perceived response time to error measurement trees",
    "The generation of random binary unordered paths minors user user computer",
    "The intersection graph of paths in trees paths trees",
    "Graph minors IV Widths of trees and well quasi ordering graph paths",
    "Graph minors A tree paths binary trees graphs",
]

another corpus (example in slide)

In [6]:
# Corpus used for the rest of the notebook: eight short documents,
# one per line, in the order they are later indexed and reported.
documents = [
    "eat turkey on turkey day holiday",
    "i like to eat cake on holiday",
    "turkey trot race on thanksgiving holiday",
    "snail race the turtle",
    "time travel space race",
    "movie on thanksgiving",
    "movie at air and space museum is cool movie",
    "aspiring movie star",
]

In [7]:
# Single stemmer instance, reused for both the corpus and the query document.
porter = PorterStemmer()

Remove common (stop) words, tokenize, and stem each document

In [8]:
stoplist = stopwords.words('english')
# For every document: lower-case, split on whitespace, drop stop words,
# then stem whatever is left. The result is one token list per document.
texts = [
    list(map(porter.stem, (token for token in raw_doc.lower().split() if token not in stoplist)))
    for raw_doc in documents
]
texts

[['eat', 'turkey', 'turkey', 'day', 'holiday'],
 ['like', 'eat', 'cake', 'holiday'],
 ['turkey', 'trot', 'race', 'thanksgiv', 'holiday'],
 ['snail', 'race', 'turtl'],
 ['time', 'travel', 'space', 'race'],
 ['movi', 'thanksgiv'],
 ['movi', 'air', 'space', 'museum', 'cool', 'movi'],
 ['aspir', 'movi', 'star']]

create mapping keyword-id

In [9]:
# Build the token <-> integer id dictionary from the preprocessed corpus.
dictionary = corpora.Dictionary(texts)

In [10]:
# Show the token -> integer id mapping held by the dictionary.
print()
print("Mapping keyword-id:")
pprint(dictionary.token2id)


Mapping keyword-id:
{'air': 15,
 'aspir': 18,
 'cake': 4,
 'cool': 16,
 'day': 0,
 'eat': 1,
 'holiday': 2,
 'like': 5,
 'movi': 14,
 'museum': 17,
 'race': 6,
 'snail': 9,
 'space': 11,
 'star': 19,
 'thanksgiv': 7,
 'time': 12,
 'travel': 13,
 'trot': 8,
 'turkey': 3,
 'turtl': 10}


In [11]:
# Plain-dict copy of the reverse mapping (integer id -> token).
id2token = {token_id: token for token_id, token in dictionary.items()}

Create the bag-of-words vector for each document

In [12]:
# One bag-of-words vector per preprocessed document, in corpus order.
model_bow = list(map(dictionary.doc2bow, texts))

create the LDA model from bow vectors

In [13]:
# random_state is pinned so that every execution produces the same results.
lda = models.LdaModel(model_bow, num_topics=2, id2word=dictionary, random_state=30)
# Project each bag-of-words vector into topic space, keeping corpus order.
lda_vectors = [lda[bow_vector] for bow_vector in model_bow]

In [14]:
print()
print("LDA vectors for docs (in terms of topics):")
# lda_vectors was built in the same order as documents, so pair them positionally.
for topic_vector, source_text in zip(lda_vectors, documents):
    print(topic_vector, source_text)


LDA vectors for docs (in terms of topics):
[(0, 0.89354247), (1, 0.1064575)] eat turkey on turkey day holiday
[(0, 0.88823485), (1, 0.1117652)] i like to eat cake on holiday
[(0, 0.19518845), (1, 0.80481154)] turkey trot race on thanksgiving holiday
[(0, 0.1410767), (1, 0.8589233)] snail race the turtle
[(0, 0.1557929), (1, 0.84420705)] time travel space race
[(0, 0.20982431), (1, 0.7901757)] movie on thanksgiving
[(0, 0.12486146), (1, 0.8751385)] movie at air and space museum is cool movie
[(0, 0.15395802), (1, 0.846042)] aspiring movie star


In [15]:
# Build the similarity index over the per-document topic vectors.
# num_features is passed explicitly (one feature per LDA topic). Without it the
# index width is inferred from the largest topic id that appears in lda_vectors,
# which would be too small if some topic were absent from every document vector.
matrix_lda = similarities.MatrixSimilarity(lda_vectors, num_features=lda.num_topics)
print()
print("Matrix similarities")
print(matrix_lda)


Matrix similarities
MatrixSimilarity<8 docs, 2 features>


In [16]:
def convert(match):
    """Translate a quoted numeric token id matched by a regex into its token.

    Intended as the replacement callback for ``re.sub('"[0-9]+"', convert, ...)``.

    Args:
        match: regex match whose full text is a double-quoted integer, e.g. ``"14"``.

    Returns:
        The token stored in ``dictionary`` under that id (without surrounding quotes).

    Raises:
        KeyError: if the id is not present in ``dictionary``.
    """
    # Strip the surrounding double quotes before parsing the id.
    token_id = int(match.group(0)[1:-1])
    # Index the dictionary itself rather than its `id2token` attribute: that
    # attribute is built lazily, so reading it directly can hit an empty mapping.
    return dictionary[token_id]

In [17]:
print("LDA Topics:")
# Any quoted numeric id left in a topic description is replaced by its token.
for topic in lda.print_topics(num_words=30):
    topic_text = str(topic)
    print(re.sub('"[0-9]+"', convert, topic_text))

LDA Topics:
(0, '0.099*"holiday" + 0.090*"eat" + 0.088*"turkey" + 0.070*"movi" + 0.056*"race" + 0.055*"like" + 0.053*"cake" + 0.052*"space" + 0.048*"thanksgiv" + 0.045*"day" + 0.043*"air" + 0.041*"cool" + 0.040*"time" + 0.037*"museum" + 0.037*"travel" + 0.032*"star" + 0.032*"trot" + 0.031*"aspir" + 0.025*"turtl" + 0.025*"snail"')
(1, '0.119*"movi" + 0.095*"race" + 0.068*"turkey" + 0.066*"thanksgiv" + 0.062*"space" + 0.058*"holiday" + 0.050*"snail" + 0.050*"turtl" + 0.045*"aspir" + 0.044*"trot" + 0.044*"star" + 0.040*"travel" + 0.039*"museum" + 0.037*"time" + 0.036*"cool" + 0.034*"air" + 0.033*"day" + 0.029*"eat" + 0.026*"cake" + 0.024*"like"')


In [18]:
# Timestamp marking the end of model creation and the start of the query phase.
# The annotation names the class (datetime.datetime), not the datetime module.
end_creation_model_t: datetime.datetime = datetime.datetime.now()

In [19]:
# Emits a single blank line; purely a visual separator in the output.
print()




Obtain the LDA vector for the following query document:<br>
doc = "trees graph human"

In [20]:
doc = "trees graph human"
# Same preprocessing as the corpus: lower-case, split on whitespace,
# drop stop words, then stem the remaining tokens.
doc_s = list(map(porter.stem, (token for token in doc.lower().split() if token not in stoplist)))
doc_s

['tree', 'graph', 'human']

In [21]:
# Bag-of-words vector of the query; tokens missing from the dictionary are dropped.
vec_bow = dictionary.doc2bow(doc_s)
if not vec_bow:
    # With no known token the model has nothing to infer from, so the topic
    # vector (and every similarity derived from it) says nothing about the query.
    print("Warning: no token of the query document is in the dictionary; "
          "its LDA vector and the similarities below are uninformative.")
# Topic-space representation of the query.
vec_lda = lda[vec_bow]

Calculate the cosine similarity between the query document and each document of the corpus, using their LDA vectors

In [22]:
# Query the similarity index with the query's topic vector: one score per indexed document.
sims = matrix_lda[vec_lda]

sort similarities in descending order

In [23]:
# Rank (document position, score) pairs from most to least similar.
# The scores are taken from the index again instead of from `sims`, so this cell
# can be re-run safely: sorting the already-sorted list of pairs would fail when
# the key tries to negate a tuple.
sims = sorted(enumerate(matrix_lda[vec_lda]), key=lambda item: -item[1])

In [24]:
# Report the query, its topic vector, and the corpus ranked by similarity.
print()
print("Given the doc: " + doc)
print("whose LDA vector is: " + str(vec_lda))
print()
print("The Similarities between this doc and the documents of the corpus are:")
for ranked_index, similarity in sims:
    matched_document = documents[ranked_index]
    print(similarity, matched_document)


Given the doc: trees graph human
whose LDA vector is: [(0, 0.5), (1, 0.5)]

The Similarities between this doc and the documents of the corpus are:
0.8648992 movie on thanksgiving
0.85384667 turkey trot race on thanksgiving holiday
0.82369024 time travel space race
0.82227826 aspiring movie star
0.812363 snail race the turtle
0.7998936 movie at air and space museum is cool movie
0.7898527 i like to eat cake on holiday
0.7857948 eat turkey on turkey day holiday


In [25]:
# Timestamp marking the end of the query/comparison phase.
# The annotation names the class (datetime.datetime), not the datetime module.
end_t: datetime.datetime = datetime.datetime.now()

get execution time

In [26]:
# Differences of two timestamps are durations (timedelta), not datetimes.
# Each timestamp was taken in its own cell, so these are wall-clock intervals
# between cell executions.
elapsed_time_model_creation: datetime.timedelta = end_creation_model_t - init_t
elapsed_time_comparison: datetime.timedelta = end_t - end_creation_model_t
print()
# total_seconds() makes the printed number agree with the "seconds" label;
# printing the timedelta itself shows H:MM:SS.ffffff instead.
print('Execution time model:', elapsed_time_model_creation.total_seconds(), 'seconds')
print('Execution time comparison:', elapsed_time_comparison.total_seconds(), 'seconds')


Execution time model: 0:00:27.097211 seconds
Execution time comparison: 0:00:10.055980 seconds
