In [3]:
from nltk.corpus import brown

In [23]:
data = []

for fileid in brown.fileids():
    document = ' '.join(brown.words(fileid))
    data.append(document)

NO_DOCUMENTS = len(data)

In [25]:
print(NO_DOCUMENTS)
data[0]

500




In [5]:
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords

In [6]:
NUM_TOPICS = 10
STOPWORDS = stopwords.words('english')
 
def clean_text(text):
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
    return cleaned_text
 
# For gensim we need to tokenize the data and filter out stopwords
tokenized_data = []
for text in data:
    tokenized_data.append(clean_text(text))
 
 
# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)
 
# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]
 
# Have a look at how the 20th document looks like: [(word_id, count), ...]
print(corpus[20])
# [(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2),  ...
 
# Build the LDA model
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)
 
# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

[(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2), (44, 2), (45, 2), (46, 2), (47, 2), (49, 1), (50, 1), (53, 1), (56, 1), (59, 1), (60, 1), (66, 1), (75, 1), (80, 1), (98, 1), (101, 1), (106, 1), (117, 1), (129, 1), (130, 2), (132, 2), (135, 2), (140, 1), (141, 2), (143, 4), (144, 2), (145, 2), (166, 1), (195, 1), (198, 3), (219, 1), (220, 4), (221, 3), (223, 1), (229, 4), (230, 4), (231, 2), (235, 1), (236, 1), (242, 2), (246, 2), (255, 1), (263, 1), (269, 1), (270, 5), (271, 2), (275, 5), (276, 1), (278, 4), (280, 2), (281, 1), (307, 2), (310, 1), (311, 3), (313, 1), (314, 5), (318, 4), (322, 1), (336, 1), (338, 3), (339, 1), (340, 1), (341, 1), (345, 1), (346, 1), (351, 1), (354, 1), (355, 1), (366, 3), (368, 13), (370, 1), (372, 1), (374, 3), (377, 3), (381, 3), (386, 1), (392, 6), (396, 1), (401, 1), (412, 2), (426, 2), (428, 2), (431, 2), (434, 2), (439, 2), (444, 1), (450, 1), (452, 1), (462, 1), (465, 1), (467, 1), (470, 1), (478, 1), (483, 1), (

In [7]:
print("LDA Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))
 
print("=" * 20)
 
print("LSI Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lsi_model.print_topic(idx, 10))
 
print("=" * 20)

LDA Model:
Topic #0: 0.005*"one" + 0.005*"would" + 0.003*"could" + 0.003*"said" + 0.003*"may" + 0.003*"two" + 0.003*"first" + 0.002*"man" + 0.002*"even" + 0.002*"time"
Topic #1: 0.005*"one" + 0.004*"would" + 0.003*"first" + 0.003*"could" + 0.003*"time" + 0.003*"like" + 0.002*"new" + 0.002*"even" + 0.002*"state" + 0.002*"may"
Topic #2: 0.006*"one" + 0.005*"said" + 0.004*"would" + 0.003*"new" + 0.003*"could" + 0.003*"first" + 0.002*"time" + 0.002*"man" + 0.002*"many" + 0.002*"like"
Topic #3: 0.005*"one" + 0.005*"would" + 0.004*"said" + 0.004*"time" + 0.004*"could" + 0.003*"new" + 0.003*"two" + 0.003*"man" + 0.003*"may" + 0.002*"also"
Topic #4: 0.005*"would" + 0.005*"one" + 0.003*"could" + 0.003*"said" + 0.002*"like" + 0.002*"new" + 0.002*"man" + 0.002*"first" + 0.002*"two" + 0.002*"people"
Topic #5: 0.007*"one" + 0.004*"would" + 0.003*"could" + 0.003*"new" + 0.003*"time" + 0.003*"said" + 0.002*"made" + 0.002*"two" + 0.002*"like" + 0.002*"first"
Topic #6: 0.007*"one" + 0.005*"would" + 0.0

In [8]:
text = "The economy is working better than ever"
bow = dictionary.doc2bow(clean_text(text))

In [12]:
lsi_model[bow]

[(0, 0.09161351260282195),
 (1, 0.008819727618577222),
 (2, -0.016118265372387402),
 (3, 0.04131788800760472),
 (4, -0.01733679755339647),
 (5, -0.01348466567999982),
 (6, -0.028928239998879016),
 (7, -0.02051340226344353),
 (8, 0.055182751211272946),
 (9, 0.026334911722331957)]

In [10]:
lda_model[bow]

[(0, 0.02000558),
 (1, 0.020005524),
 (2, 0.8199463),
 (3, 0.020006185),
 (4, 0.020006342),
 (5, 0.020005887),
 (6, 0.020006057),
 (7, 0.020007392),
 (8, 0.02000546),
 (9, 0.020005269)]

In [18]:
from gensim import similarities
 
lda_index = similarities.MatrixSimilarity(lda_model[corpus])
 
# Let's perform some queries
similarities = lda_index[lda_model[bow]]
# Sort the similarities
similarities = sorted(enumerate(similarities), key=lambda item: -item[1])
 
# Top most similar documents:
print(similarities[:10])
 
# Let's see what's the most similar document
document_id, similarity = similarities[0]
print('======')
print(similarity, document_id)
data[document_id]

[(162, 0.9979093), (199, 0.9978355), (441, 0.99783456), (438, 0.9976915), (404, 0.99766356), (260, 0.9976427), (176, 0.9976212), (36, 0.99760944), (156, 0.99760485), (15, 0.99760133)]
0.9979093 162


"In Ireland's County Limerick , near the River Shannon , there is a quiet little suburb by the name of Garryowen , which means `` Garden of Owen '' . Undoubtedly none of the residents realize the influence their town has had on American military history , or the deeds of valor that have been done in its name . The cry `` Garryowen '' ! ! Bursting from the lips of a charging cavalry trooper was the last sound heard on this earth by untold numbers of Cheyennes , Sioux and Apaches , Mexican banditos under Pancho Villa , Japanese in the South Pacific , and Chinese and North Korean Communists in Korea . Garryowen is the battle cry of the 7th U.S. Cavalry Regiment , `` The Fighting Seventh '' . Today a battle cry may seem an anachronism , for in the modern Army , esprit de corps has been sacrificed to organizational charts and tables . But don't tell that to a veteran of the Fighting Seventh , especially in a saloon on Saturday night . Of all the thousands of men who have served in the 7th C