In [None]:
pip install pymc



In [None]:
# Mount the user's Google Drive inside the Colab VM at /content/gdrive.
# NOTE(review): none of the cells visible here read a path under /content/gdrive —
# confirm the mount is still required before keeping this cell.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pymc as pm
import re
import numpy as np
import spacy
from scipy.stats import wasserstein_distance
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#class used for preprocessing the text using spacy library
class PreprocessData:
  """Clean a corpus with spaCy and expose it as tokens and integer word ids.

  Attributes:
    nlp: the spaCy pipeline loaded from 'en_core_web_sm'.
    features: one list of lemmas per input document (see `preprocess`).
    voc: the unique lemmas of the corpus, in order of first appearance.
  """

  def __init__(self, documents):
    """Preprocess `documents` (an iterable of strings) and build the vocabulary."""
    self.nlp = spacy.load('en_core_web_sm')
    self.features = self.build_features(documents)
    # Derive the vocabulary from the tokens we already have instead of running
    # the spaCy pipeline over every document a second time. This also keeps the
    # vocabulary correct when `documents` is a one-shot iterator.
    self.voc = self._unique_in_order(self.features)

  @staticmethod
  def _unique_in_order(token_lists):
    """Return the distinct words of `token_lists`, keeping first-seen order."""
    # dict.fromkeys preserves insertion order and de-duplicates in O(1) per word.
    return list(dict.fromkeys(word for tokens in token_lists for word in tokens))

  def _word_index(self):
    """Return a word -> position lookup for the current `self.voc`."""
    index = {}
    for position, word in enumerate(self.voc):
      # setdefault keeps the first position, matching list.index semantics.
      index.setdefault(word, position)
    return index

  def preprocess(self, document):
    """Lower-case `document` and return the lemmas of its content words.

    Punctuation tokens, stop words and tokens whose text is not purely
    alphabetic are dropped; the remaining tokens contribute their lemma.
    """
    doc = self.nlp(document.lower())
    return [
        token.lemma_
        for token in doc
        if not token.is_punct and not token.is_stop and token.text.isalpha()
    ]

  def build_voc(self, documents):
    """Return the unique preprocessed words of `documents`, first-seen order."""
    return self._unique_in_order(self.preprocess(doc) for doc in documents)

  def build_features(self, documents):
    """Return one list of preprocessed words per document."""
    return [self.preprocess(doc) for doc in documents]

  def build_features_encoded(self):
    """Return `self.features` with every word replaced by its index in `self.voc`.

    Raises:
      ValueError: if a word of `self.features` is missing from `self.voc`.
    """
    index = self._word_index()
    try:
      return [[index[word] for word in doc] for doc in self.features]
    except KeyError as err:
      # Same exception type the previous list.index based lookup raised.
      raise ValueError(f"{err.args[0]!r} is not in the vocabulary") from err

  def build_features_new_topic(self, doc):
    """Encode a new document against the existing vocabulary.

    The document is preprocessed like the training corpus; words that are not
    in `self.voc` are skipped. Returns the list of vocabulary indices.
    """
    index = self._word_index()
    return [index[word] for word in self.preprocess(doc) if word in index]
                

In [None]:
class LDA(object):
    """Latent Dirichlet Allocation sampled with the PyMC 2 MCMC engine.

    The constructor preprocesses the corpus; `compileModel` builds the graph and
    runs the sampler; the remaining methods summarise the stored traces.

    Attributes set by the constructor:
        prep: the PreprocessData instance built from the corpus.
        data: token lists per document; vocabulary: list of unique words;
        documents: the same documents encoded as vocabulary indices.
        K, V, M: number of topics, vocabulary size, number of documents.
        N: list with the length of every document.
        alpha, beta: Dirichlet parameters of length K and V.
        iter, burnin: sampler iterations and burn-in length.
    """

    def __init__(self, data, k, a=1, b=1, iter=4000, burnin=None):
        """Preprocess `data` and store the model dimensions.

        Args:
            data: iterable of raw document strings.
            k: number of topics.
            a: value added to every entry of the document-topic prior (alpha).
            b: value added to every entry of the topic-word prior (beta).
            iter: number of MCMC iterations.
            burnin: iterations to discard; defaults to one fifth of `iter`.
        """
        self.prep = PreprocessData(data)
        self.data = self.prep.features
        self.vocabulary = self.prep.voc
        self.documents = self.prep.build_features_encoded()

        self.K = k
        self.V = len(self.vocabulary)
        self.alpha = np.zeros(self.K) + a
        self.beta = np.zeros(self.V) + b

        self.M = len(self.data)
        self.N = [len(doc) for doc in self.data]

        self.iter = iter
        # Integer division: a burn-in length is a count of iterations, not a float.
        self.burnin = self.iter // 5 if burnin is None else burnin

    def compileModel(self):
        """Build the PyMC 2 graph for LDA and run the sampler.

        Creates, per topic, a Dirichlet over the vocabulary (phi) and, per
        document, a Dirichlet over topics (theta); one categorical topic
        assignment vector z per document; and one observed categorical per word.
        The sampler is then run for `self.iter` iterations with `self.burnin`
        burn-in and no thinning. Results are kept on `self.mcmc`.
        """
        # A PyMC 2 Dirichlet stores only the first n-1 components; the
        # CompletedDirichlet wraps it to expose the full probability vector.
        # Both objects are handed to the model.
        prior_phi, phi = [], []
        for k in range(self.K):
            topic_prior = pm.Dirichlet(f'prior_phi_{k}', self.beta)
            prior_phi.append(topic_prior)
            phi.append(pm.CompletedDirichlet(f'phi_{k}', topic_prior))
        self.prior_phi = pm.Container(prior_phi)
        self.phi = pm.Container(phi)

        prior_theta, theta = [], []
        for m in range(self.M):
            doc_prior = pm.Dirichlet(f'prior_theta_{m}', self.alpha)
            prior_theta.append(doc_prior)
            theta.append(pm.CompletedDirichlet(f'theta_{m}', doc_prior))
        self.prior_theta = pm.Container(prior_theta)
        self.theta = pm.Container(theta)

        # One vector of topic assignments per document, drawn from theta[m] and
        # initialised with uniformly random topics.
        self.z = pm.Container([
            pm.Categorical(f"z_{m}", p=self.theta[m], size=self.N[m],
                           value=np.random.randint(self.K, size=self.N[m]))
            for m in range(self.M)
        ])

        # Every observed word is categorical over the vocabulary, with the
        # distribution of whichever topic its z currently selects. The parents
        # are bound as lambda defaults so each node keeps its own z[m][n].
        w = []
        for m in range(self.M):
            for n in range(self.N[m]):
                word_dist = pm.Lambda(f"phi_{m}_{n}",
                                      lambda z=self.z[m][n], phi=self.phi: phi[z])
                w.append(pm.Categorical(f"w_{m}_{n}", p=word_dist,
                                        value=self.documents[m][n],
                                        observed=True, verbose=False))
        self.w = pm.Container(w)

        self.model = pm.Model([self.prior_phi, self.prior_theta, self.phi,
                               self.theta, self.z, self.w])
        self.mcmc = pm.MCMC(self.model)
        self.mcmc.sample(self.iter, self.burnin, thin=1)

    def _posterior_mean(self, name):
        """Return the mean over samples of the stored trace called `name`."""
        return self.mcmc.trace(name)[:].mean(axis=0)

    def get_theta_trace(self):
        """Return (and cache on `self.theta_trace`) the mean theta per document.

        Each theta trace has a singleton middle axis that is squeezed away, so
        the result has one row per document and one column per topic.
        """
        per_document = [self.mcmc.trace(f'theta_{m}')[:].squeeze(axis=1)
                        for m in range(self.M)]
        self.theta_trace = np.array(per_document).mean(axis=1)
        return self.theta_trace

    def similarity_wasserstein(self):
        """Return a DataFrame of pairwise Wasserstein distances between documents.

        Each document is represented by its posterior-mean topic mixture. The
        mixtures are passed as *weights* on the topic indices 0..K-1, so two
        documents that put the same mass on different topics are far apart.
        Passing the mixtures as sample values (as was done before) compares only
        the sorted proportions and is blind to which topic carries the mass.

        NOTE(review): the ground distance between topics k and l is |k - l|,
        which is arbitrary for K > 2 because topics have no natural order.
        Distances are rounded to 4 decimals.
        """
        theta = self.get_theta_trace()
        topic_ids = np.arange(theta.shape[1])

        distances = [
            [round(wasserstein_distance(topic_ids, topic_ids, theta[i], theta[j]), 4)
             for j in range(self.M)]
            for i in range(self.M)
        ]
        return pd.DataFrame(distances)

    def write_theta(self):
        """Print the posterior-mean topic mixture of every document."""
        print('\nTheta values:')
        for m in range(self.M):
            print(self._posterior_mean(f"theta_{m}"))

    def write_phi(self):
        """Print the posterior-mean word distribution of every topic."""
        print('\nPhi values:')
        for k in range(self.K):
            print(self._posterior_mean(f"phi_{k}"))

    def write_z(self):
        """Print, per document, the most frequently sampled topic of each word.

        Topic labels are categorical, so the per-word summary is the mode of the
        samples. Averaging and rounding labels is only meaningful for K == 2.
        Ties go to the lowest topic index. Values are printed as floats to keep
        the previous output format.
        """
        print('\nZ values')
        for m in range(self.M):
            samples = np.asarray(self.mcmc.trace(f"z_{m}")[:]).astype(int)
            if samples.ndim == 1:
                # Treat a flat trace as a single-word document.
                samples = samples[:, None]
            modes = [np.bincount(column, minlength=self.K).argmax()
                     for column in samples.T]
            print(np.array(modes, dtype=float))

    def importantWords(self):
        """Print a dict mapping 'topic<k>' to the words that topic explains best.

        Every vocabulary word is assigned to exactly one topic: the one whose
        posterior-mean phi gives the word the highest probability.
        """
        print()
        phi = [self._posterior_mean(f"phi_{k}") for k in range(self.K)]

        topics = {'topic{}'.format(k): [] for k in range(self.K)}

        # phi stacks to (K, 1, V); argmax over topics, then drop the singleton axis.
        w_topics = np.argmax(phi, axis=0)[0]
        for word, topic in zip(self.vocabulary, w_topics):
            topics['topic{}'.format(topic)].append(word)

        print(topics)

    def assignNewTopic(self, document):
        """Print the topic that best matches a new raw `document`.

        The document is encoded against the training vocabulary (unseen words
        are skipped). Each topic is scored by the sum of the posterior-mean
        probabilities of the document's words, and the top score wins.
        """
        features = self.prep.build_features_new_topic(document)
        phi = [self._posterior_mean(f"phi_{k}") for k in range(self.K)]

        probs = [sum(phi[k][0][i] for i in features) for k in range(self.K)]

        max_index = np.argsort(probs)[-1]
        print("The topic of the document is {}".format(max_index))

In [None]:
# Sanity-check corpus: the first three documents only use 'aaa'/'bbb', the last
# three only 'uuu'/'vvv', so a 2-topic model has an obvious split to recover.
sanity_set = ["aaa bbb aaa",
       "bbb aaa bbb",
        "aaa bbb bbb aaa",
        "uuu vvv",
        "uuu vvv vvv",
        "uuu vvv vvv uuu"]


# Preprocess the corpus, then build the graph and sample (5000 iterations,
# default burn-in of one fifth of the iterations).
lda1 = LDA(data=sanity_set, k=2, a=0.75, b=0.75, iter=5000)
lda1.compileModel()



 [-----------------100%-----------------] 5000 of 5000 complete in 12.0 sec

In [None]:
# Inspect the vocabulary and the documents encoded as vocabulary indices.
print(lda1.vocabulary)
print(lda1.documents)

['aaa', 'bbb', 'uuu', 'vvv']
[[0, 1, 0], [1, 0, 1], [0, 1, 1, 0], [2, 3], [2, 3, 3], [2, 3, 3, 2]]


In [None]:
# Print the word-to-topic grouping, then the phi, theta and z summaries
# computed from the stored traces.
lda1.importantWords()
lda1.write_phi()
lda1.write_theta()
lda1.write_z()



{'topic0': ['uuu', 'vvv'], 'topic1': ['aaa', 'bbb']}

Phi values:
[[0.16281904 0.14961242 0.29643547 0.39113306]]
[[0.34412789 0.40397325 0.07798848 0.17391037]]

Theta values:
[[0.38182754 0.61817246]]
[[0.29628673 0.70371327]]
[[0.36633143 0.63366857]]
[[0.66934224 0.33065776]]
[[0.67594053 0.32405947]]
[[0.77095658 0.22904342]]

Z values
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1. 1.]
[0. 0.]
[0. 0. 0.]
[0. 0. 0. 0.]


In [None]:
# Classify an unseen document; 'ccc' is not in the fitted vocabulary, so it is
# skipped when the document is encoded.
lda1.assignNewTopic('vvv aaa vvv ccc')

The topic of the document is 0


In [None]:
# Pairwise document distance table; rows and columns are document indices.
df = lda1.similarity_wasserstein()
df

Unnamed: 0,0,1,2,3,4,5
0,0.0,0.0855,0.0155,0.0512,0.0578,0.1528
1,0.0855,0.0,0.07,0.0344,0.0278,0.0672
2,0.0155,0.07,0.0,0.0357,0.0423,0.1373
3,0.0512,0.0344,0.0357,0.0,0.0066,0.1016
4,0.0578,0.0278,0.0423,0.0066,0.0,0.095
5,0.1528,0.0672,0.1373,0.1016,0.095,0.0


In [None]:
# Five short sentences about food and pets, fitted with 2 topics.
doc_example = ["I had a peanuts butter sandwich's for breakfast.",
             "I like to eat almonds, peanuts and walnuts.",
             "My neighbor got a little dog yesterday.",
             "Cats and dogs are mortal enemies.",
             "You mustn’t feed peanuts to your dog."]

# 25000 iterations; burn-in falls back to the default (one fifth of the iterations).
lda2 = LDA(data=doc_example, k=2, a=0.9, b=0.9, iter=25000)
lda2.compileModel()




 [-----------------100%-----------------] 25000 of 25000 complete in 51.6 sec

In [None]:
# Print the word-to-topic grouping, then the phi, theta and z summaries
# computed from the stored traces.
lda2.importantWords()
lda2.write_phi()
lda2.write_theta()
lda2.write_z()


{'topic0': ['walnut', 'get', 'little', 'dog', 'cat', 'mortal', 'enemy', 'feed'], 'topic1': ['peanut', 'butter', 'sandwich', 'breakfast', 'like', 'eat', 'almond', 'neighbor', 'yesterday']}

Phi values:
[[0.0668639  0.05130069 0.01135669 0.05811762 0.04728713 0.06002659
  0.0001884  0.05770363 0.05373482 0.0737371  0.0500709  0.15043482
  0.04290314 0.06772404 0.0974209  0.04807588 0.06305376]]
[[0.12948841 0.06748316 0.06976224 0.05889419 0.06169349 0.06792733
  0.05769963 0.04168542 0.05512199 0.04819847 0.03933749 0.06626592
  0.06449506 0.01091803 0.05551492 0.04567504 0.05983922]]

Theta values:
[[0.33910173 0.66089827]]
[[0.32252454 0.67747546]]
[[0.56903778 0.43096222]]
[[0.69734108 0.30265892]]
[[0.4793777 0.5206223]]

Z values
[1. 1. 1. 1.]
[1. 1. 1. 1. 1.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0.]
[1. 1. 0.]


In [None]:
# Classify an unseen sentence with the model fitted on doc_example.
lda2.assignNewTopic('I eat two sandwiches at breakfast and play with my dog')

The topic of the document is 0


In [None]:
# Nine sentences in three groups of three (art/craft, honey bees, computers),
# fitted with 3 topics.
corpus = ['If the skill is being used in practical way, people will consider it a craft instead of art.',
          'Likewise, if the design skill is being used in a commercial or industrial way, it may be considered commercial art instead of fine art.',
          'It is used to say crafts and design are sometimes considered applied art.',
          'Honey bees are known to fly through many chemicals and odors, as is common in insects.',
          'A honey bee (also spelled honeybee) is known as a common eusocial flying specie insect within the genus Apis of the bee odors clade.',
          'Bees are flying insects closely related to wasps in the case of the best known bee species, the western honey bee, for producing honey.',
          'A computer is a modern machine that can be instructed to carry out sequences of arithmetic or logical operations automatically via computer programming.',
          'These programs enable modern computers to perform an extremely wide range of operations that include arithmetic or logical statements.',
          'Modern computers have the ability to follow generalized sets of logical operations, called programs in order to carry out a wide number of tasks.'
]

# 15000 iterations; burn-in falls back to the default (one fifth of the iterations).
lda3 = LDA(data=corpus, k=3, a=0.85, b=0.85, iter=15000)
lda3.compileModel()



 [-----------------100%-----------------] 15000 of 15000 complete in 132.8 sec

In [None]:
# Show the preprocessed token lists (one list of lemmas per document).
print(lda3.data)

[['skill', 'practical', 'way', 'people', 'consider', 'craft', 'instead', 'art'], ['likewise', 'design', 'skill', 'commercial', 'industrial', 'way', 'consider', 'commercial', 'art', 'instead', 'fine', 'art'], ['craft', 'design', 'consider', 'apply', 'art'], ['honey', 'bee', 'know', 'fly', 'chemical', 'odor', 'common', 'insect'], ['honey', 'bee', 'spell', 'honeybee', 'know', 'common', 'eusocial', 'fly', 'specie', 'insect', 'genus', 'api', 'bee', 'odor', 'clade'], ['bee', 'fly', 'insect', 'closely', 'relate', 'wasp', 'case', 'best', 'know', 'bee', 'specie', 'western', 'honey', 'bee', 'produce', 'honey'], ['computer', 'modern', 'machine', 'instruct', 'carry', 'sequence', 'arithmetic', 'logical', 'operation', 'automatically', 'computer', 'programming'], ['program', 'enable', 'modern', 'computer', 'perform', 'extremely', 'wide', 'range', 'operation', 'include', 'arithmetic', 'logical', 'statement'], ['modern', 'computer', 'ability', 'follow', 'generalized', 'set', 'logical', 'operation', 'ca

In [None]:
# Print the phi, theta and z summaries computed from the stored traces.
lda3.write_phi()
lda3.write_theta()
lda3.write_z()


Phi values:
[[5.90106361e-02 3.93439747e-03 2.11982605e-02 1.19206203e-02
  2.57856536e-03 5.37911582e-03 1.31335028e-03 6.16188457e-04
  6.26791375e-03 1.37607411e-02 3.58481711e-03 2.90827523e-03
  1.22303657e-02 1.26194886e-02 5.12756166e-02 8.89845046e-04
  5.00610228e-02 4.22379632e-02 3.60275933e-02 5.44859983e-02
  3.43764588e-02 3.42361720e-02 4.09555389e-02 5.57503235e-02
  3.91221304e-02 1.12671560e-02 2.03686040e-02 2.25306474e-02
  2.69137996e-02 2.94970562e-02 4.66484749e-04 1.81078183e-02
  7.33684311e-03 3.04223021e-02 8.07091305e-03 6.57618456e-03
  8.56659160e-05 4.71516834e-03 1.92678718e-02 1.07869110e-02
  1.77969350e-02 3.19747279e-03 1.02574841e-02 1.97216471e-02
  4.79551759e-04 2.36380882e-02 1.41061639e-02 2.43696584e-03
  4.46153223e-03 3.51844179e-03 1.03132752e-02 4.93758253e-03
  8.04151950e-03 1.49082097e-03 4.81569379e-03 1.15396742e-02
  7.14733302e-05 9.93514028e-03 1.20090027e-02 1.42952434e-03
  7.33329505e-03 6.96969644e-04 1.46169209e-02]]
[[3.6022

In [None]:
# Classify an unseen sentence with the 3-topic model fitted on `corpus`.
lda3.assignNewTopic('The computer I received from my family is great. Now I can do all the arithmetic operations faster for homework')

The topic of the document is 1
