# Topic Modeling

In a topic model, each word in a document is drawn from one of the topics, and that topic is itself chosen from the document's own distribution over topics.

In [2]:
from gensim import corpora

In [3]:
# Toy corpus: nine short document titles, one string per document.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

## Text preprocessing

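Convert the raw documents into a bag-of-words representation: tokenize each document, drop stop words and words that appear only once, then map every remaining token to an integer id paired with its count.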

In [4]:
# Tokenize the documents and filter out uninformative tokens.
# defaultdict stays imported in case later cells rely on it; Counter does the counting here.
from collections import Counter, defaultdict

# For this toy example the only stop words are 'for a of the and to in'.
stopwords = set('for a of the and to in'.split())

# Lowercase, split on whitespace, and drop the stop words.
tokenized = [
    [word for word in document.lower().split() if word not in stopwords]
    for document in documents
]

# Count how often every token occurs across the whole corpus.
frequency = Counter(token for document in tokenized for token in document)

# Keep only the tokens that appear more than once in the corpus.
texts = [[token for token in document if frequency[token] > 1] for document in tokenized]

texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [5]:
# Build the token -> integer id mapping from the filtered token lists.
dictionary = corpora.Dictionary(texts)
# Persist the mapping to disk as "deerwester.dict" in the current working directory.
dictionary.save("deerwester.dict")
# Printing the dictionary gives a short summary of its unique tokens.
print(dictionary)

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [6]:
# Show the full token -> id mapping the dictionary assigned.
print(dictionary.token2id)

{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


In [7]:
# Vectorize a document the dictionary has not seen before.
new_doc = "Human computer interaction COMPuter"

# Same normalization as the corpus: lowercase, then split on whitespace.
new_tokens = new_doc.lower().split()

# doc2bow yields (token_id, count) pairs; 'interaction' is not in the dictionary,
# so it does not appear in the resulting vector.
new_vec = dictionary.doc2bow(new_tokens)
print(new_vec)

[(0, 2), (1, 1)]


In [8]:
# Turn every filtered document into its sparse (token_id, count) vector.
corpus = list(map(dictionary.doc2bow, texts))

# Store the vectors on disk as 'deerwester.mm' via MmCorpus.
corpora.MmCorpus.serialize('deerwester.mm', corpus)

corpus

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]

# LDA

Latent Dirichlet Allocation Model for topic modeling

In [9]:
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
import numpy as np

In [10]:
# Assume tiny corpus with preprocessed data
texts = [['bank','river','shore','water'],
        ['river','water','flow','fast','tree'],
        ['bank','water','fall','flow'],
        ['bank','bank','water','rain','river'],
        ['river','water','mud','tree'],
        ['money','transaction','bank','finance'],
        ['bank','borrow','money'], 
        ['bank','finance'],
        ['finance','money','sell','bank'],
        ['borrow','sell'],
        ['bank','loan','sell']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(0, 1), (3, 1), (5, 1), (7, 1)],
 [(0, 2), (1, 1), (3, 1), (8, 1)],
 [(1, 1), (3, 1), (6, 1), (9, 1)],
 [(0, 1), (10, 1), (11, 1), (12, 1)],
 [(0, 1), (11, 1), (13, 1)],
 [(0, 1), (10, 1)],
 [(0, 1), (10, 1), (11, 1), (14, 1)],
 [(13, 1), (14, 1)],
 [(0, 1), (14, 1), (15, 1)]]

In [11]:
SEED = 1  # fixed seed so repeated runs produce the same random results

# Seed NumPy's global generator as before, for any code that draws from it directly.
np.random.seed(SEED)

# Also hand the seed to the model itself, so its results do not depend on
# whatever else has consumed the global random state.
model = LdaModel(corpus, id2word=dictionary, num_topics=2, random_state=SEED)

In [12]:
# List each topic's words with their weights. The vocabulary here has only
# 16 tokens, so asking for 20 words shows all of them.
model.show_topics(num_words=20)

[(0,
  '0.207*"bank" + 0.100*"water" + 0.089*"river" + 0.088*"sell" + 0.067*"borrow" + 0.064*"finance" + 0.062*"money" + 0.053*"tree" + 0.045*"flow" + 0.044*"rain" + 0.042*"fast" + 0.038*"loan" + 0.033*"shore" + 0.025*"mud" + 0.022*"fall" + 0.021*"transaction"'),
 (1,
  '0.142*"bank" + 0.116*"water" + 0.090*"river" + 0.084*"money" + 0.081*"finance" + 0.064*"flow" + 0.055*"transaction" + 0.055*"tree" + 0.053*"fall" + 0.050*"mud" + 0.050*"sell" + 0.039*"shore" + 0.036*"borrow" + 0.033*"loan" + 0.028*"fast" + 0.025*"rain"')]

In [13]:
# Look up the per-topic scores for the single word "water".
model.get_term_topics("water")

[(0, 0.086268626), (1, 0.098355845)]

In [14]:
# Two short token lists that both contain the ambiguous word 'bank'.
# Despite the bow_ prefix these are plain token lists; doc2bow does the conversion.
bow_water = ['bank','water','bank']
bow_finance = ['bank','finance','bank']

# Convert the token list to (token_id, count) pairs before querying the model.
bow = model.id2word.doc2bow(bow_water)

# With per_word_topics=True the result unpacks into three parts:
# the document's topics, each word's topics, and each word's phi values.
water_result = model.get_document_topics(bow, per_word_topics=True)
doc_topics, word_topics, phi_values = water_result

word_topics

[(0, [0, 1]), (3, [0, 1])]

Now what does that output mean? Each tuple pairs a word id with its topics, listed from most to least likely. Here both `word_type 0` (the word bank) and `word_type 3` (the word water) are more likely to be in `topic_0` than `topic_1`.

You may have noticed that besides `doc_topics` and `word_topics` we also unpacked a third variable, `phi_values`. As the name suggests, it contains the phi values of each topic for each word in the document, scaled by feature length (the number of times the word occurs in the document). Phi is essentially the probability of that word in that document belonging to a particular topic. The next cell illustrates this.

In [15]:
# Inspect the per-word phi values unpacked in the previous cell.
phi_values

[(0, [(0, 1.8300905), (1, 0.16990817)]),
 (3, [(0, 0.8581231), (1, 0.14187525)])]

This shows the `phi_values` of each word for each of the topics. What is interesting to note is `word_type 0` - because it has 2 occurrences (i.e., the word bank appears twice in the bow), the scaling by feature length is very evident: its phi_values sum to 2, and not 1.

In [16]:
# Repeat the same query for the finance-flavoured token list.
# This rebinds bow, doc_topics, word_topics and phi_values from the water example.
bow = model.id2word.doc2bow(bow_finance) # convert the token list to (token_id, count) pairs first
doc_topics, word_topics, phi_values = model.get_document_topics(bow, per_word_topics=True)

# Word ids here: 0 is 'bank' and 10 is 'finance'.
word_topics

[(0, [0, 1]), (10, [0, 1])]

In [17]:
# Query the whole corpus at once; iterating gives one result per document.
all_topics = model.get_document_topics(corpus, per_word_topics=True)

for per_document in all_topics:
    # Each result unpacks into document topics, per-word topics and phi values.
    doc_topics, word_topics, phi_values = per_document

    print('New Document \n')
    report = (
        ('Document topics:', doc_topics),
        ('Word topics:', word_topics),
        ('Phi values:', phi_values),
    )
    for label, value in report:
        print(label, value)
    print(" ")
    print('-------------- \n')

New Document 

Document topics: [(0, 0.73633283), (1, 0.26366717)]
Word topics: [(0, [0, 1]), (1, [0, 1]), (2, [0, 1]), (3, [0, 1])]
Phi values: [(0, [(0, 0.8527052), (1, 0.14729404)]), (1, [(0, 0.7954733), (1, 0.20452493)]), (2, [(0, 0.77095807), (1, 0.22903538)]), (3, [(0, 0.7647523), (1, 0.23524614)])]
 
-------------- 

New Document 

Document topics: [(0, 0.75393915), (1, 0.24606086)]
Word topics: [(1, [0, 1]), (3, [0, 1]), (4, [0, 1]), (5, [0, 1]), (6, [0, 1])]
Phi values: [(1, [(0, 0.8067087), (1, 0.1932895)]), (3, [(0, 0.77720284), (1, 0.22279577)]), (4, [(0, 0.9071241), (1, 0.09287066)]), (5, [(0, 0.72941846), (1, 0.2705778)]), (6, [(0, 0.80554974), (1, 0.19444677)])]
 
-------------- 

New Document 

Document topics: [(0, 0.1703983), (1, 0.8296017)]
Word topics: [(0, [1, 0]), (3, [1, 0]), (5, [1, 0]), (7, [1, 0])]
Phi values: [(0, [(0, 0.15414658), (1, 0.84585243)]), (3, [(0, 0.09283457), (1, 0.90716404)]), (5, [(0, 0.07328669), (1, 0.9267104)]), (7, [(0, 0.031027097), (1, 0.