# **Sanity check**

---





In [1]:
!pip install pymc



In [2]:
# Toy corpus for the sanity check: every inner list is one document and every
# integer is a word id. The first three documents only use ids 0 and 1, the
# last three only ids 2 and 3.
sanity_check = [
    [0, 1, 0],
    [1, 0, 1],
    [0, 1, 1, 0],
    [2, 3],
    [2, 3, 3],
    [2, 3, 3, 2],
]

# `data` is the name the model cell reads the corpus from.
data = sanity_check

In [3]:
import pymc as pm
import numpy as np

# Seed NumPy's global RNG before any random quantity is created in this cell,
# so the random initial topic assignments (np.random.randint below in this
# cell) are the same on every Restart & Run All.
# NOTE(review): assumes PyMC 2 also draws through numpy.random, which would make
# the MAP/MCMC run reproducible as well — confirm against the PyMC 2 docs.
SEED = 0
np.random.seed(SEED)

nr_topics = 2               # number of latent topics
vocab_size = 4              # number of distinct word ids in the corpus
corpus_size = len(data)     # number of documents

# Symmetric concentration parameters: 0.5 for every component.
alpha = np.ones(nr_topics)*0.5     # passed as `theta` to the per-document Dirichlet nodes
beta = np.ones(vocab_size)*0.5     # passed as `theta` to the per-topic Dirichlet nodes
Nm = [len(doc) for doc in data]    # number of words in each document

# Word distribution per topic: one Dirichlet node per topic, parameterised by `beta`.
topic_word_priors = [
    pm.Dirichlet("phi_ %s" % k, theta = beta)
    for k in range(nr_topics)
]
phi_ = pm.Container(topic_word_priors)

# Each phi_ node wrapped in a CompletedDirichlet named "Phi <topic>".
# NOTE(review): presumably this exposes the full-length probability vector — confirm.
topic_word_dists = [
    pm.CompletedDirichlet("Phi %s" % k, phi_[k])
    for k in range(nr_topics)
]
phi = pm.Container(topic_word_dists)

# Topic distribution per document: one Dirichlet node per document, parameterised by `alpha`.
doc_topic_priors = [
    pm.Dirichlet("theta_ %s" % d, theta = alpha)
    for d in range(corpus_size)
]
theta_ = pm.Container(doc_topic_priors)

# Each theta_ node wrapped in a CompletedDirichlet named "Theta <doc>".
doc_topic_dists = [
    pm.CompletedDirichlet("Theta %s" % d, theta_[d])
    for d in range(corpus_size)
]
theta = pm.Container(doc_topic_dists)


def _topic_assignments(doc):
    """Create the Categorical node 'Z <doc>': one topic index per word of document `doc`.

    The node uses theta[doc] as its probabilities, has Nm[doc] entries, and is
    initialised with random topic indices drawn by np.random.randint.
    """
    n_words = Nm[doc]
    return pm.Categorical('Z %i' % doc,
                          p = theta[doc],
                          size = n_words,
                          value = np.random.randint(nr_topics, size = n_words))


# Topic of every word, grouped per document.
z = pm.Container([_topic_assignments(doc) for doc in range(corpus_size)])


def _observed_word(doc, word):
    """Create the observed Categorical node 'W <doc> <word>' for one word position.

    Its probabilities come from a Lambda node 'Phi Z <doc> <word>' that indexes
    `phi` with the topic assigned to this position (z[doc][word]); its observed
    value is data[doc][word].
    """
    # The lambda's default arguments bind the parent nodes at definition time.
    word_dist = pm.Lambda('Phi Z %i %i' % (doc, word),
                          lambda z = z[doc][word],
                          phi = phi: phi[z])
    return pm.Categorical("W %i %i" % (doc, word),
                          p = word_dist,
                          value = data[doc][word],
                          observed = True)


# One observed node per word of the corpus, in document order.
w = pm.Container([_observed_word(doc, word)
                  for doc in range(corpus_size)
                  for word in range(Nm[doc])])


# Gather every node of the graph into a single model.
model = pm.Model([phi_, theta_, theta, phi, z, w])

# MAP fit before sampling, intended to improve convergence of the chain.
map_ = pm.MAP(model)
map_.fit()

# Sampling: 10000 iterations, the first 4000 treated as burn-in.
mcmc = pm.MCMC(model)
tr = mcmc.sample(iter = 10000, burn = 4000)


  import pandas.util.testing as tm






 [-----------------100%-----------------] 10000 of 10000 complete in 23.8 sec

In [4]:
# For every document, show the per-word topic assignment stored at index 0 of
# its 'Z <doc>' trace (a single stored sample, not an average over the trace).
print('Topic distribution for each word:\n')
for doc in range(corpus_size):
    z_trace = mcmc.trace('Z %i' % doc)
    print(z_trace[0])

Topic distribution for each word:

[0 0 0]
[0 0 0]
[0 0 0 0]
[1 1]
[1 1 1]
[1 1 1 1]
