# LDA for Dummies

Import numpy to handle the matrices:

In [1]:
import numpy as np 

Generate a toy corpus. Each document is a string:

In [10]:
docs = [  'europe brussels brexit britain great again ukip',
          'brussels belgium leave europe',
          'europe eu britain brexit transport',
          'farage eu ukip brussels',
          'europe brexit great',
          'transport infrastructure cars pollution',
          'cars bikes cars',
          'cars pollution transport minister bikes',
          'bikes infrastructure transport europe',
          'pollution cars minister']
docs = [doc.split(' ') for doc in docs]

Set some parameters:

In [11]:
K = 2 # number of topics
alpha = 1 # hyperparameter. single value indicates symmetric dirichlet prior. higher=>scatters document clusters
beta = 0.001 # hyperparameter
iterations = 100 # iterations for collapsed gibbs sampling.  This should be a lot higher than 3 in practice.

Assign a word ID to each unique word:

In [12]:
wordIDs = {}
currentID = 0
for doc in docs:
	for i in range(len(doc)):
		if doc[i] not in wordIDs:
			wordIDs[doc[i]] = currentID
			currentID +=1
vocab = list(range(len(wordIDs)))

Randomly assign topics to words in each document:

In [13]:
Cwt = np.zeros((K, len(vocab))) # initialize word-topic count matrix. wt refers to dimensions W * T
ta = [np.zeros((len(doc))) for doc in docs] # initialize topic assignment list

In [14]:
for d in range(len(docs)): # for each document
	for w in range(len(docs[d])): # for each token in document d 
		ta[d][w] = np.random.randint(K)
		ti = int(ta[d][w]) # topic index
		wi = wordIDs[docs[d][w]] # wordID for token w
		Cwt[ti,wi] +=1 # update word-topic matrix

Generate word-topic count matrix.

In [15]:
Cdt = np.zeros((len(docs), K)) # Document-topic matrix. dt refers to dimensions D * T
for d in range(len(docs)): # for each document d
	for t in range(K): # for each topic K
		for thing in ta[d]:
			if t == thing:
				Cdt[d,t] += 1 

Gibbs sampling:

In [18]:
for i in range(iterations): # for each pass through the corpus
	for d in range(len(docs)): # for each document
		for w in range(len(docs[d])): # for each token
			t0 = int(ta[d][w]) # initial topic assignment to token w
			wid = wordIDs[docs[d][w]] # wordID of token w

			Cdt[d][t0] = Cdt[d][t0]-1 # we don't want to include token w in our document-topic count matrix when sampling for token w
			Cwt[t0,wid] = Cwt[t0,wid]-1 # we don't want to include token w in our word-topic count matrix when sampling for token w

            ## UPDATE TOPIC ASSIGNMENT FOR EACH WORD -- COLLAPSED GIBBS SAMPLING MAGIC.  Where the magic happens.
			denom_a = sum(Cdt[d]) + K * alpha # number of tokens in document + number topics * alpha
			denom_b = np.sum(Cwt, axis=1) + len(vocab) * beta # number of tokens in each topic + # of words in vocab * beta

			p_z = (Cwt[:,wid] + beta) / denom_b * (Cdt[d,:] + alpha) / denom_a # calculating probability word belongs to each topic
			t1 = np.random.choice(range(K), size=1, replace=True, p=p_z/sum(p_z)) # draw topic for word n from multinomial using probabilities calculated above


			ta[d][w] = t1 # update topic assignment list with newly sampled topic for token w."""
			Cdt[d,t1] = Cdt[d,t1]+1 # re-increment document-topic matrix with new topic assignment for token w.
			Cwt[t1,wid] = Cwt[t1,wid]+1 # re-increment word-topic matrix with new topic assignment for token w.

Print out matrices:

In [19]:
print(Cwt, '\n\n', ta, '\n\n', Cdt)

[[ 0.  3.  3.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  5.  3.  3.  2.]
 [ 5.  0.  0.  2.  2.  1.  2.  1.  1.  2.  4.  0.  2.  0.  0.  0.  0.]] 

 [array([ 1.,  0.,  0.,  1.,  1.,  1.,  1.]), array([ 0.,  1.,  1.,  1.]), array([ 1.,  1.,  1.,  0.,  1.]), array([ 0.,  1.,  1.,  0.]), array([ 1.,  0.,  1.]), array([ 1.,  1.,  0.,  0.]), array([ 0.,  0.,  0.]), array([ 0.,  0.,  1.,  0.,  0.]), array([ 0.,  1.,  1.,  1.]), array([ 0.,  0.,  0.])] 

 [[ 2.  5.]
 [ 1.  3.]
 [ 1.  4.]
 [ 2.  2.]
 [ 1.  2.]
 [ 2.  2.]
 [ 3.  0.]
 [ 4.  1.]
 [ 1.  3.]
 [ 3.  0.]]
