# Using Gensim for Topic Modeling


In [1]:
from nltk.corpus import brown
 
# One entry per Brown corpus file: all of the file's words joined by single spaces.
data = [' '.join(brown.words(fileid)) for fileid in brown.fileids()]

# Corpus size, followed by a peek at the first document (still wrapped in a list).
NO_DOCUMENTS = len(data)
print(NO_DOCUMENTS)
print(data[:1])

500


In [2]:
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords
 
# Number of topics requested from every model built in this notebook section.
NUM_TOPICS = 10
# Kept as a frozenset: clean_text tests every token against this collection, and a
# hash lookup avoids scanning the whole stopword list for each token.
STOPWORDS = frozenset(stopwords.words('english'))
 
def clean_text(text):
    """Lowercase and tokenize ``text``, keeping only word-like, non-stopword tokens.

    Args:
        text: The raw document as a single string.

    Returns:
        A list of lowercase tokens, in their original order, that are not in
        STOPWORDS and that begin with at least three characters drawn from
        ASCII letters and hyphens.
    """
    # Raw string: in a plain literal '\-' is an invalid escape sequence, which
    # Python reports with a warning. The pattern itself is unchanged.
    token_pattern = r'[a-zA-Z\-][a-zA-Z\-]{2,}'
    return [
        token
        for token in word_tokenize(text.lower())
        # re.match anchors at the start only, so the test is on the token's prefix.
        if token not in STOPWORDS and re.match(token_pattern, token)
    ]
 
# For gensim we need to tokenize the data and filter out stopwords.
# Clean and tokenize every article in "data" (one token list per document).
tokenized_data = [clean_text(text) for text in data]


# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)

# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]

# Have a look at the document at index 20: [(word_id, count), ...]
print(corpus[20])
# [(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2),  ...

# Build the LDA model.
# LDA training is randomised; a fixed random_state makes the learned topics
# reproducible when the notebook is restarted and run from the top.
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary, random_state=42)

# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

[(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2), (44, 2), (45, 2), (46, 2), (47, 2), (49, 1), (50, 1), (53, 1), (56, 1), (59, 1), (60, 1), (66, 1), (75, 1), (80, 1), (98, 1), (101, 1), (106, 1), (117, 1), (129, 1), (130, 2), (132, 2), (135, 2), (140, 1), (141, 2), (143, 4), (144, 2), (145, 2), (166, 1), (195, 1), (198, 3), (219, 1), (220, 4), (221, 3), (223, 1), (229, 4), (230, 4), (231, 2), (235, 1), (236, 1), (242, 2), (246, 2), (255, 1), (263, 1), (269, 1), (270, 5), (271, 2), (275, 5), (276, 1), (278, 4), (280, 2), (281, 1), (307, 2), (310, 1), (311, 3), (313, 1), (314, 5), (318, 4), (322, 1), (336, 1), (338, 3), (339, 1), (340, 1), (341, 1), (345, 1), (346, 1), (351, 1), (354, 1), (355, 1), (366, 3), (368, 13), (370, 1), (372, 1), (374, 3), (377, 3), (381, 3), (386, 1), (392, 6), (396, 1), (401, 1), (412, 2), (426, 2), (428, 2), (431, 2), (434, 2), (439, 2), (444, 1), (450, 1), (452, 1), (462, 1), (465, 1), (467, 1), (470, 1), (478, 1), (483, 1), (

In [3]:
# Both gensim models are queried the same way, so one loop produces both reports.
for heading, topic_model in (("LDA Model:", lda_model), ("LSI Model:", lsi_model)):
    print(heading)

    for idx in range(NUM_TOPICS):
        # Show the 10 highest-weighted words of this topic
        print("Topic #%s:" % idx, topic_model.print_topic(idx, 10))

    print("=" * 20)

LDA Model:
Topic #0: 0.006*"would" + 0.005*"one" + 0.004*"said" + 0.003*"man" + 0.003*"first" + 0.003*"time" + 0.003*"even" + 0.002*"could" + 0.002*"like" + 0.002*"two"
Topic #1: 0.006*"one" + 0.005*"would" + 0.003*"said" + 0.002*"two" + 0.002*"first" + 0.002*"could" + 0.002*"even" + 0.002*"time" + 0.002*"like" + 0.002*"many"
Topic #2: 0.006*"one" + 0.004*"would" + 0.004*"said" + 0.004*"could" + 0.003*"new" + 0.003*"time" + 0.003*"man" + 0.003*"may" + 0.003*"back" + 0.002*"two"
Topic #3: 0.007*"one" + 0.005*"would" + 0.003*"may" + 0.003*"new" + 0.003*"like" + 0.003*"even" + 0.003*"could" + 0.003*"said" + 0.003*"time" + 0.002*"man"
Topic #4: 0.006*"one" + 0.005*"would" + 0.005*"could" + 0.003*"new" + 0.003*"said" + 0.003*"time" + 0.003*"made" + 0.003*"two" + 0.003*"like" + 0.002*"may"
Topic #5: 0.006*"one" + 0.004*"would" + 0.003*"could" + 0.003*"first" + 0.003*"new" + 0.003*"two" + 0.002*"like" + 0.002*"many" + 0.002*"may" + 0.002*"said"
Topic #6: 0.005*"would" + 0.004*"one" + 0.003*"n

In [4]:
# Turn an unseen sentence into a bag-of-words using the existing dictionary.
text = "The economy is working better than ever"
query_tokens = clean_text(text)
bow = dictionary.doc2bow(query_tokens)

# LSI projection first, then the LDA topic distribution.
for topic_model in (lsi_model, lda_model):
    print(topic_model[bow])

# A higher number means a stronger topic representation in the text.

[(0, -0.09161471952089408), (1, -0.008782254457165925), (2, 0.016647141189534434), (3, -0.04113837665978207), (4, -0.015373807441144426), (5, -0.01242852340310437), (6, 0.02970666877392193), (7, 0.017709786066823906), (8, 0.05885988815681925), (9, -0.02363984825329518)]
[(0, 0.02001273), (1, 0.020012094), (2, 0.020012347), (3, 0.81988513), (4, 0.020015262), (5, 0.020012345), (6, 0.020013463), (7, 0.020012164), (8, 0.020012202), (9, 0.020012237)]


In [5]:
from gensim import similarities
 
# Index every corpus document by its LDA representation so it can be queried by similarity.
lda_index = similarities.MatrixSimilarity(lda_model[corpus])

# Let's perform some queries.
# The scores get their own name: rebinding "similarities" would replace the gensim
# module imported for this cell with a plain list.
query_scores = lda_index[lda_model[bow]]
# Sort the (document_id, score) pairs, highest score first
ranked_documents = sorted(enumerate(query_scores), key=lambda item: -item[1])

# Top most similar documents:
print(ranked_documents[:10])
# e.g. [(49, 0.9978277), (286, 0.99762076), (329, 0.9975964), ...]
# (the ids and scores depend on the trained model, so they can differ between runs)

# Let's see what's the most similar document
document_id, similarity = ranked_documents[0]
print(data[document_id][:1000])

[(49, 0.9978277), (286, 0.99762076), (329, 0.9975964), (176, 0.9974461), (85, 0.99732953), (273, 0.99732953), (297, 0.99732953), (301, 0.99732953), (302, 0.99732953), (305, 0.99732953)]
The study of the St. Louis area's economic prospects prepared for the Construction Industry Joint Conference confirms and reinforces both the findings of the Metropolitan St. Louis Survey of 1957 and the easily observed picture of the Missouri-Illinois countryside . St. Louis sits in the center of a relatively slow-growing and in some places stagnant mid-continent region . Slackened regional demand for St. Louis goods and services reflects the region's relative lack of purchasing power . Not all St. Louis industries , of course , have a market area confined to the immediate neighborhood . But for those which do , the slow growth of the area has a retarding effect on the metropolitan core . The city has a stake in stimulating growth and purchasing power throughout outstate Missouri and Southern Illinois 

# Using Scikit-Learn for Topic Modeling


In [6]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
 
NUM_TOPICS = 10
# One seed shared by the estimators below so a fresh run reproduces the same factors.
RANDOM_STATE = 42

# Raw string for token_pattern: in a plain literal '\-' is an invalid escape sequence.
vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words='english', lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)

# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10,
                                      learning_method='online', random_state=RANDOM_STATE)
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS, random_state=RANDOM_STATE)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS, random_state=RANDOM_STATE)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)


# Let's see what the first document in the corpus looks like in different topic spaces
print(lda_Z[0])
print(nmf_Z[0])
print(lsi_Z[0])

(500, 10)




(500, 10)
(500, 10)
[1.05620426e-04 1.05596659e-04 9.99049482e-01 1.05615452e-04
 1.05597171e-04 1.05615378e-04 1.05647466e-04 1.05619889e-04
 1.05608269e-04 1.05597185e-04]
[0.         0.         2.11904296 0.07690385 0.         0.5426182
 1.06761047 0.         0.         0.24642583]
[ 23.30684276   1.59527306  21.82111349  -0.04368105   0.82938678
  11.71793141   4.48208711  -0.79963766   0.79188486 -13.74734331]


In [7]:
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted vocabulary terms of every topic.

    Args:
        model: Fitted decomposition model exposing ``components_``, iterated
            as one weight vector per topic.
        vectorizer: Fitted vectorizer whose feature names line up with the
            positions of each weight vector.
        top_n: Number of terms to print per topic.

    For each topic this prints a ``Topic <idx>:`` line followed by a list of
    ``(term, weight)`` tuples ordered from highest to lowest weight.
    """
    # Look the vocabulary up once instead of once per printed term.
    # get_feature_names() was removed in scikit-learn 1.2, so prefer its
    # replacement when the vectorizer provides it.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed slice walks the top_n largest weights.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        # str() keeps the printed term a plain string even when names come from an array.
        print([(str(feature_names[i]), topic[i]) for i in top_indices])
 
# Report the top terms of each sklearn model using the same layout for all three.
for heading, fitted_model in (
    ("LDA Model:", lda_model),
    ("NMF Model:", nmf_model),
    ("LSI Model:", lsi_model),
):
    print(heading)
    print_topics(fitted_model, vectorizer)
    print("=" * 20)

LDA Model:
Topic 0:
[('used', 315.4289206780137), ('use', 235.84194611789135), ('time', 186.61150245896397), ('small', 173.55902319618693), ('state', 170.03214810741395), ('rate', 155.42757611292194), ('number', 153.37932570397058), ('cost', 148.64752427484214), ('new', 146.86693842923162), ('water', 141.64242897882406)]
Topic 1:
[('new', 0.43006929889847484), ('time', 0.32185642663431885), ('people', 0.301923715049103), ('mrs', 0.29799772042473305), ('years', 0.28775919962031804), ('god', 0.27260842198068275), ('american', 0.270286846489791), ('world', 0.2656942501882045), ('way', 0.2579092836279975), ('like', 0.25057321695927715)]
Topic 2:
[('new', 452.61433840858916), ('state', 393.3174445268265), ('said', 381.1231791103862), ('year', 345.48874333386505), ('years', 264.310119606956), ('school', 261.627247112802), ('states', 236.26682342633507), ('president', 203.30932818402474), ('mrs', 202.1634742863243), ('american', 199.67307857378262)]
Topic 3:
[('man', 358.4748022531354), ('new

In [8]:
# Transforming an unseen document

text = "The economy is working better than ever"
# Vectorize with the already-fitted vectorizer, then map the single row into NMF topic space.
text_vectorized = vectorizer.transform([text])
x = nmf_model.transform(text_vectorized)[0]
print(x)

[0.0028982  0.         0.         0.         0.         0.00438561
 0.         0.         0.         0.00467117]


In [9]:
# Similarity

from sklearn.metrics.pairwise import euclidean_distances
 
def most_similar(x, Z, top_n=5):
    """Return the ``top_n`` rows of ``Z`` nearest to ``x`` by Euclidean distance.

    Args:
        x: Query vector; it is reshaped to a single row before comparison.
        Z: Matrix whose rows are the candidate vectors.
        top_n: Maximum number of results to return.

    Returns:
        A list of ``(row_index, distance)`` pairs sorted by ascending distance.
    """
    # euclidean_distances returns a 1 x n_rows matrix; keep its only row.
    distances_to_rows = euclidean_distances(x.reshape(1, -1), Z)[0]
    ranked = sorted(enumerate(distances_to_rows), key=lambda pair: pair[1])
    return ranked[:top_n]
 
# Rank corpus documents by their distance to the query vector in NMF topic space.
similarities = most_similar(x, nmf_Z)

# Unpack the nearest hit and preview the first 1000 characters of that document.
document_id, similarity = similarities[0]
closest_document = data[document_id]
print(closest_document[:1000])

Livery stable -- J. Vernon , prop. '' . Coaching had declined considerably by 1905 , but the sign was still there , near the old Wells Fargo building in San Francisco , creaking in the fog as it had for thirty years . John Vernon had had all the patronage he cared for -- he had prospered , but he could not retire from horsedom . Coaching was in his blood . He had two interests in life : the pleasures of the table and driving . Twice a week he drove his tallyho over the Santa Cruz road , upland and through the redwood forest , with orchards below him at one hand , and glimpses of the Pacific at the other . The journey back he made along the coast road , traveling hell-for-leather , every lantern of the tallyho ablaze . The southward route was the classic run in California , and the most fashionable . His patronage on this stretch was made up largely of San Franciscans -- regulars , most of them , and trenchermen like himself . They did not complain at the inhuman hour of starting ( seve

# Plotting words and documents in 2D with SVD

In [10]:
import pandas as pd
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
# Configure Bokeh for notebook output; called once here, before any of the show() calls below.
output_notebook()

In [11]:
# Reduce every document to two SVD components so the corpus can be drawn in 2D.
# The seed keeps the layout stable between runs.
svd = TruncatedSVD(n_components=2, random_state=42)
documents_2d = svd.fit_transform(data_vectorized)

# One row per document: its 2D coordinates plus its index in "data", used as the label.
df = pd.DataFrame({
    'x': documents_2d[:, 0],
    'y': documents_2d[:, 1],
    'document': range(len(data)),
})

source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

# width/height instead of plot_width/plot_height: the plot_* spellings were removed in Bokeh 3.0.
plot = figure(width=600, height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [12]:
# Closer documents should be similar.

# Preview the first 1000 characters of documents 9 and 205 for a side-by-side read.
for doc_idx in (9, 205):
    print(data[doc_idx][:1000])

Vincent G. Ierulli has been appointed temporary assistant district attorney , it was announced Monday by Charles E. Raymond , District Attorney . Ierulli will replace Desmond D. Connall who has been called to active military service but is expected back on the job by March 31 . Ierulli , 29 , has been practicing in Portland since November , 1959 . He is a graduate of Portland University and the Northwestern College of Law . He is married and the father of three children . Helping foreign countries to build a sound political structure is more important than aiding them economically , E. M. Martin , assistant secretary of state for economic affairs told members of the World Affairs Council Monday night . Martin , who has been in office in Washington , D. C. , for 13 months spoke at the council's annual meeting at the Multnomah Hotel . He told some 350 persons that the United States' challenge was to help countries build their own societies their own ways , following their own paths . `` 

In [13]:
# Same 2D reduction, applied to the transposed matrix so that each row is a vocabulary term.
# The seed keeps the layout stable between runs.
svd = TruncatedSVD(n_components=2, random_state=42)
words_2d = svd.fit_transform(data_vectorized.T)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per term: its 2D coordinates plus the term itself, used as the label.
df = pd.DataFrame({
    'x': words_2d[:, 0],
    'y': words_2d[:, 1],
    'word': [str(name) for name in feature_names],
})

source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

# width/height instead of plot_width/plot_height: the plot_* spellings were removed in Bokeh 3.0.
plot = figure(width=600, height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

# More about Latent Dirichlet Allocation


In [14]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
 
NUM_TOPICS = 10

# Raw string for token_pattern: in a plain literal '\-' is an invalid escape sequence.
vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words='english', lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)

# Build a Latent Dirichlet Allocation Model.
# The seed makes the fitted topics, and so the numbers printed below, repeatable across runs.
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10,
                                      learning_method='online', random_state=42)
lda_Z = lda_model.fit_transform(data_vectorized)

# Topic mixture of an unseen sentence, printed together with the sum of its entries.
text = "The economy is working better than ever"
x = lda_model.transform(vectorizer.transform([text]))[0]
print(x, x.sum())

[0.77497641 0.02500885 0.0250044  0.02500002 0.02500036 0.02500072
 0.02500509 0.02500029 0.02500075 0.02500312] 0.9999999999999999


In [15]:
import pyLDAvis.sklearn
 
# Switch pyLDAvis to notebook mode before building the panel.
# NOTE(review): newer pyLDAvis releases reportedly ship this helper as
# pyLDAvis.lda_model instead of pyLDAvis.sklearn -- confirm against the installed version.
pyLDAvis.enable_notebook()
# Build the visualisation from the fitted sklearn LDA model, the document-term matrix
# and the vectorizer; mds='tsne' selects the layout method passed through to prepare().
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
# Bare last expression so the notebook displays the panel.
panel

  by='saliency', ascending=False).head(R).drop('saliency', 1)
