In [331]:
import pymc as pm
import numpy as np
import copy 

import os
import wikipedia
import nltk


ModuleNotFoundError: No module named 'wikipedia'

In [326]:
# Toy corpus with two obvious themes: documents 0-2 only use "aaa"/"bbb",
# documents 3-5 only use "uuu"/"vvv".
docs = [
    "aaa bbb aaa".split(),
    "bbb aaa bbb".split(),
    "aaa bbb bbb aaa".split(),
    "uuu vvv".split(),
    "uuu vvv vvv".split(),
    "uuu vvv vvv uuu".split(),
]

# Extra document that mixes words from both themes.
new_doc = "aaa bbb aaa bbb uuu".split()

# Raw, untokenized sentence.
doc_test = "What is the meaning of all this?"

**NLP MAGIC**

We are going to apply some NLP magic, such as stemming, lemmatization, and stop-word removal. This will help us identify the underlying topic structure more reliably.

In [324]:
def clear_data(doc):
    """Split a raw text string into its word tokens.

    Punctuation and whitespace are dropped: only runs matching ``\\w+``
    are kept, in their original order and casing.

    Parameters
    ----------
    doc : str
        The raw text to tokenize.

    Returns
    -------
    list of str
        The word tokens found in ``doc`` (empty if there are none).
    """
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    # The method is `tokenize`; `RegexpTokenizer` has no `tokenizer` attribute.
    only_words = tokenizer.tokenize(doc)
    # Return instead of printing so the tokens can feed later processing;
    # a notebook cell still displays the value when the call is its last line.
    return only_words

In [327]:
# Try the tokenizer on the sample sentence.
clear_data(doc_test)

NameError: name 'nltk' is not defined

In [311]:
class LDA:
    """Latent Dirichlet Allocation expressed as a PyMC model (``pm.MCMC`` API).

    Typical use::

        model = LDA(docs, K, alpha_scalar, beta_scalar)
        model.initialize_distributions()
        model.mcmc.sample(iterations, burn_in)
        theta_mean, phi_mean = model.get_params()

    Attributes set by the constructor / ``initialize_vocabulary``:
        K           number of topics
        M           number of documents
        V           number of distinct words
        vocabulary  dict mapping word -> integer index
        index_data  the documents with every word replaced by its index
        alpha       length-K Dirichlet parameter for the document-topic mixtures
        beta        length-V Dirichlet parameter for the topic-word distributions
    """

    def __init__(self, docs, K, alpha_scalar, beta_scalar):
        """Index the corpus and set up symmetric Dirichlet hyper-parameters.

        Parameters
        ----------
        docs : sequence of sequences of hashable
            Tokenized documents.
        K : int
            Number of topics.
        alpha_scalar : float
            Value used for every component of ``alpha``.
        beta_scalar : float
            Value used for every component of ``beta``.
        """
        # K is the number of topics
        self.K = K

        # Kept so that beta can be re-sized whenever the vocabulary changes.
        self.beta_scalar = beta_scalar

        self.alpha = np.ones(self.K) * alpha_scalar

        # Sets vocabulary, index_data, M, V and beta.
        self.initialize_vocabulary(docs)

    def initialize_vocabulary(self, docs):
        """(Re)index ``docs`` and refresh every corpus-dependent attribute.

        This does not touch ``self.model`` / ``self.mcmc``; call
        ``initialize_distributions()`` afterwards to build a model for the
        new corpus.
        """
        self.vocabulary = self.create_vocabulary(docs)
        self.index_data = self.replace_words_with_index(docs, self.vocabulary)

        # M is the number of docs
        self.M = len(docs)

        # number of unique words
        self.V = len(self.vocabulary)

        # beta has one entry per vocabulary word, so it must follow V.
        self.beta = np.ones(self.V) * self.beta_scalar

    def initialize_distributions(self):
        """Create all PyMC nodes and store the model and sampler.

        Sets ``theta_i``/``theta``, ``phi_i``/``phi``, ``zeeta``, ``w``,
        ``model`` and ``mcmc`` on the instance, replacing any previous ones.
        """
        # Theta is the document distribution over the topic space
        self.theta_i = np.empty(self.M, dtype=object)
        self.theta = np.empty(self.M, dtype=object)
        for i in range(self.M):
            self.theta_i[i] = pm.Dirichlet("theta_i_%i" % i, theta=self.alpha)

        self.theta_i = pm.Container(self.theta_i)

        # theta_<i> wraps theta_i_<i>; it is the node traced by get_params().
        for i in range(self.M):
            self.theta[i] = pm.CompletedDirichlet("theta_%i" % i, self.theta_i[i])

        self.theta = pm.Container(self.theta)

        # Phi is the word distribution for the topic space
        self.phi_i = np.empty(self.K, dtype=object)
        self.phi = np.empty(self.K, dtype=object)

        for i in range(self.K):
            self.phi_i[i] = pm.Dirichlet("phi_i_%i" % i, theta=self.beta)

        self.phi_i = pm.Container(self.phi_i)

        for i in range(self.K):
            self.phi[i] = pm.CompletedDirichlet("phi_%i" % i, self.phi_i[i])

        self.phi = pm.Container(self.phi)

        # Zeeta is the word-topic mapping: one topic index per word of each document
        self.zeeta = np.empty(self.M, dtype=object)
        for i in range(self.M):
            self.zeeta[i] = pm.Categorical(
                "zeeta_%i" % i, p=self.theta[i], size=len(self.index_data[i])
            )

        self.zeeta = pm.Container(self.zeeta)

        # W holds the observed words: word i of document d is categorical over
        # the vocabulary with probabilities phi[z], z being that word's topic.
        self.w = pm.Container(
            [
                pm.Categorical(
                    "w_%i_%i" % (d, i),
                    p=pm.Lambda(
                        # The "_" between d and i keeps node names unique
                        # (otherwise d=1,i=11 and d=11,i=1 would collide).
                        "phi_z_%i_%i" % (d, i),
                        # Default arguments bind this word's topic node and phi
                        # at definition time.
                        lambda z=self.zeeta[d][i], phi=self.phi: phi[z],
                    ),
                    value=self.index_data[d][i],
                    observed=True,
                )
                for d in range(self.M)
                for i in range(len(self.index_data[d]))
            ]
        )

        self.model = pm.Model(
            [self.theta_i, self.theta, self.phi_i, self.phi, self.zeeta, self.w]
        )
        self.mcmc = pm.MCMC(self.model)

    def get_params(self):
        """Return ``(theta_mean, phi_mean)`` averaged over the sampled trace.

        ``theta_mean`` has one entry per document and ``phi_mean`` one entry
        per topic; each entry is the mean of the corresponding trace along
        its first axis. Requires ``initialize_distributions()`` and sampling
        to have been run first.
        """
        theta_mean = [
            self.mcmc.trace("theta_%i" % i)[:].mean(axis=0) for i in range(self.M)
        ]
        phi_mean = [
            self.mcmc.trace("phi_%i" % i)[:].mean(axis=0) for i in range(self.K)
        ]
        return theta_mean, phi_mean

    def print_zeeta(self, size):
        """Print the first ``size`` trace entries of the topic assignments of every document."""
        for i in range(self.M):
            print("Doc_%i" % i)
            print(self.mcmc.trace("zeeta_%i" % i)[0:size])

    def create_vocabulary(self, data):
        """Map every distinct word in ``data`` to an index, in order of first appearance."""
        vocabulary = {}
        for document in data:
            for word in document:
                if word not in vocabulary:
                    # The next free index always equals the current size.
                    vocabulary[word] = len(vocabulary)
        return vocabulary

    def replace_words_with_index(self, data, vocabulary):
        """Return a new list of lists with each word replaced by ``vocabulary[word]``.

        ``data`` itself is left untouched. Raises ``KeyError`` for a word
        that is missing from ``vocabulary``.
        """
        return [[vocabulary[word] for word in document] for document in data]

**Task 1**
- Build the observed variable
- Infer the hidden topic structure
- Trace also z (this is quite hard to print in a useful manner - what did you have in mind?)

In [312]:
# Two topics, with every component of alpha and of beta set to 0.5.
# NOTE(review): this rebinds the name `LDA` from the class to the instance, so
# re-running this cell without first re-running the class definition fails.
LDA = LDA(docs, 2, 0.5, 0.5)

In [313]:
# Build the PyMC nodes, the model and the sampler (stored as LDA.mcmc).
LDA.initialize_distributions()

In [314]:
# Run 10000 iterations; the second positional argument (2000) is the burn-in
# in pm.MCMC.sample(iter, burn, ...).
LDA.mcmc.sample(10000,2000)

 [-----------------100%-----------------] 10000 of 10000 complete in 13.1 sec

In [315]:
# Trace means: one theta entry per document, one phi entry per topic.
(theta_mean, phi_mean) = LDA.get_params()

In [316]:
# Print the trace means: one theta row per document, one phi row per topic.
for heading, rows in (("Theta mean:", theta_mean), ("Phi mean:", phi_mean)):
    print(heading)
    for row in rows:
        print(row)

# First two trace entries of the topic assignments for every document.
print("Zeeta")
LDA.print_zeeta(2)

Theta mean:
[[0.737883 0.262117]]
[[0.84397158 0.15602842]]
[[0.86136923 0.13863077]]
[[0.19049577 0.80950423]]
[[0.1585645 0.8414355]]
[[0.10338018 0.89661982]]
Phi mean:
[[0.45263899 0.46864792 0.0172616  0.06145149]]
[[0.10632954 0.05417827 0.39990943 0.43958276]]
Zeeta
Doc_0
[[0 0 0]
 [0 0 0]]
Doc_1
[[0 0 0]
 [0 0 0]]
Doc_2
[[0 0 0 0]
 [0 0 0 0]]
Doc_3
[[1 1]
 [1 1]]
Doc_4
[[1 1 1]
 [1 1 1]]
Doc_5
[[1 1 1 1]
 [1 1 1 1]]


**Can the topic model be used to define a topic-based similarity measure between documents?**

In order to measure the topic-based similarity between two documents, we have to measure the difference between their topic distributions (the per-document theta vectors). For this, we can use an f-divergence. Many common divergences, such as the KL divergence, the Hellinger distance and the total variation distance, are special cases of the f-divergence, each corresponding to a particular choice of f.

More info can be found here: https://en.wikipedia.org/wiki/F-divergence

I chose the Hellinger distance, as it seems to be the most popular one. https://en.wikipedia.org/wiki/Hellinger_distance

In [None]:
def calculate_Hellinger_distance(LDA):
    """Return the pairwise Hellinger distances between the documents' topic mixtures.

    The mean topic mixture of every document is taken from
    ``LDA.get_params()`` and compared with every other one using

        H(p, q) = sqrt(sum_k (sqrt(p_k) - sqrt(q_k))**2) / sqrt(2)

    which is 0 for identical mixtures and 1 for mixtures with disjoint support.

    Parameters
    ----------
    LDA : object
        A sampled model exposing ``get_params()`` whose first return value is
        a sequence with one topic-probability vector per document.

    Returns
    -------
    numpy.ndarray
        Symmetric ``(M, M)`` matrix; entry ``[a, b]`` is the distance between
        documents ``a`` and ``b``.
    """
    theta_mean, _ = LDA.get_params()
    # Flatten every per-document entry to 1-D and stack them into an (M, K) matrix.
    theta = np.array([np.ravel(row) for row in theta_mean], dtype=float)
    root = np.sqrt(theta)
    # Broadcast to all (a, b) pairs: diff[a, b, k] = sqrt(p_a[k]) - sqrt(p_b[k]).
    diff = root[:, np.newaxis, :] - root[np.newaxis, :, :]
    return np.sqrt((diff ** 2).sum(axis=-1)) / np.sqrt(2.0)

**What about new documents? How can topics be assigned to it?**

The easiest way to assign topics to a new document is to add it to the document
list and run inference again on the extended corpus.


In [320]:
docs.append(new_doc)
LDA.initialize_vocabulary(docs)
# initialize_vocabulary only re-indexes the corpus. The existing model and
# sampler were built for the previous corpus, so rebuild them; otherwise the
# new document is never part of the model being sampled.
LDA.initialize_distributions()
LDA.mcmc.sample(2000, 1000)

 [-----------------100%-----------------] 2000 of 2000 complete in 2.7 sec