## Standard LDA

Do the imports

In [1]:
!pip install pyro-ppl



In [17]:
import argparse
import functools
import logging

import torch
from torch import nn
from torch.distributions import constraints
import functools

import pyro
import pyro.distributions as dist
from pyro.infer import SVI, JitTraceEnum_ELBO, TraceEnum_ELBO
from pyro.contrib.autoguide import AutoDiagonalNormal, AutoMultivariateNormal, AutoGuideList, AutoDelta
from pyro.optim import ClippedAdam

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Get the modifed data matrix, where the documents are of equal length.

In [18]:
import requests, io
r = requests.get('https://github.com/MikkelGroenning/MBML_project/blob/master/data/processed/upsampled_data.npy?raw=true')

data = np.load(io.BytesIO(r.content)).astype('int32')
data.shape

(32211, 1500)

The data consists of 32.211 speeches, each with a length of 1500 words. We therefore only look at a subset of the data.

In [19]:
data_sub = data[:100]
data_sub = np.vectorize({k:v for (k,v) in zip(np.unique(data_sub), np.arange(len(np.unique(data_sub))))}.get)(data_sub)

In [20]:
num_words = data_sub.max() + 1
num_topics = 25
num_docs = data_sub.shape[0]
num_words_per_doc = data_sub.shape[1]

With these things defined we can now make an LDA model.

In [21]:
def model(data=None, batch_size=None):
    """ Make a plate of size num_topics with name "topics" and define a variable "topic_words".
          This represents the phi above. Use the equivalent of a uniform distribution for it  """
    with pyro.plate("topics", num_topics):
        topic_words = pyro.sample("topic_words", dist.Dirichlet(torch.ones(num_words) / num_words))

    """ Make two (nested) plates in here. One over documents and one over words
          Documents, called "documents":
          The plate over the documents should hold a variable "doc_topics" representing the theta above.
            Use the equivalent of a uniform distribution for it.
          
          Words, called "words":
          The plate over words, should have a topic assignment for each word (z_{i,j} above) which 
            should be enumerated.
          The second variable should be the words themselves which should be drawn from the "topic_words"
            using the assigned z_{i,j} and the observed data.

     """
    with pyro.plate("documents", num_docs) as ind:
        if data is not None:
            with pyro.util.ignore_jit_warnings():
                assert data.shape == (num_words_per_doc, num_docs)
            data = data[:, ind]
        doc_topics = pyro.sample("doc_topics", dist.Dirichlet(torch.ones(num_topics)/ num_topics))
        with pyro.plate("words", num_words_per_doc):
            # The word_topics variable is marginalized out during inference,
            # achieved by specifying infer={"enumerate": "parallel"} and using
            # TraceEnum_ELBO for inference. Thus we can ignore this variable in
            # the guide.
            word_topics = pyro.sample("word_topics", dist.Categorical(doc_topics), infer={"enumerate": "parallel"})
            data = pyro.sample("doc_words", dist.Categorical(topic_words[word_topics]), obs=data)

    return topic_words, data

In [22]:
W_torch = torch.tensor(data_sub.T).long()
W_torch.shape
del data

First we do LAD like we did in the exercises, first we make a gudie and then run the LDA

In [23]:
pyro.clear_param_store()

def my_local_guide(data=None, batch_size=None):
    topic_words_posterior = pyro.param(
            "topic_words_posterior",
            lambda: torch.ones(num_topics, num_words),
            constraint=constraints.positive)
    with pyro.plate("topics", num_topics):
        pyro.sample("topic_words", dist.Dirichlet(topic_words_posterior))
    
    doc_topics_posterior = pyro.param(
            "doc_topics_posterior",
            lambda: torch.ones(num_docs, num_topics),
            constraint=constraints.simplex)
    with pyro.plate("documents", num_docs, batch_size) as ind:
        pyro.sample("doc_topics", dist.Delta(doc_topics_posterior[ind], event_dim=1))
    
guide = AutoGuideList(model)
guide.add(AutoDiagonalNormal(pyro.poutine.block(model, expose=['doc_topics'])))
guide.add(my_local_guide)  # automatically wrapped in an AutoCallable

guide = my_local_guide

elbo = TraceEnum_ELBO(max_plate_nesting=3)

optim = ClippedAdam({'llommer': 0.05})
svi = SVI(model, guide, optim, elbo)

# Define the number of optimization steps
n_steps = 750

# do gradient steps
for step in range(n_steps):
    elbo = svi.step(W_torch, batch_size=16)
    if step % 25 == 0:
        #print('.', end='')
        print("[%d] ELBO: %.1f" % (step, elbo))

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 9806400000 bytes. Buy new RAM!
(no backtrace available)

## Amortized LDA

We use amortized inference of the local variables. This is acheived by using a multi-layer perceptron.

In [24]:
layer_sizes = np.arange(98,103)
layer_sizes = torch.tensor(layer_sizes)
print(layer_sizes.size())

def make_predictor(num_words, layer_sizes):
    layer_sizes = ([num_words] +
                   [int(s) for s in torch.split(layer_sizes,1)] +
                   [num_topics])
    logging.info('Creating MLP with sizes {}'.format(layer_sizes))
    layers = []
    for in_size, out_size in zip(layer_sizes, layer_sizes[1:]):
        layer = nn.Linear(in_size, out_size)
        layer.weight.data.normal_(0, 0.001)
        layer.bias.data.normal_(0, 0.001)
        layers.append(layer)
        layers.append(nn.Sigmoid())
    layers.append(nn.Softmax(dim=-1))
    return nn.Sequential(*layers)

torch.Size([5])


And the guide

In [None]:
pyro.clear_param_store()

def parametrized_guide(predictor, data, batch_size=None):
    # Use a conjugate guide for global variables.
    topic_weights_posterior = pyro.param(
            "topic_weights_posterior",
            lambda: torch.ones(num_topics),
            constraint=constraints.positive)
    topic_words_posterior = pyro.param(
            "topic_words_posterior",
            lambda: torch.ones(num_topics, num_words),
            constraint=constraints.greater_than(0.5))
    with pyro.plate("topics", num_topics):
        pyro.sample("topic_weights", dist.Gamma(topic_weights_posterior, 1.))
        pyro.sample("topic_words", dist.Dirichlet(topic_words_posterior))

    # Use an amortized guide for local variables.
    pyro.module("predictor", predictor)
    with pyro.plate("documents", num_docs, batch_size) as ind:
        data = data[:, ind]
        # The neural network will operate on histograms rather than word
        # index vectors, so we'll convert the raw data to a histogram.
        counts = (torch.zeros(num_words, ind.size(0)).scatter_add(0, data, torch.ones(data.shape)))
        doc_topics = predictor(counts.transpose(0, 1))
        pyro.sample("doc_topics", dist.Delta(doc_topics, event_dim=1))

learning_rate = 0.05

predictor = make_predictor(num_words, layer_sizes)
guide = functools.partial(parametrized_guide, predictor)
# Elbo = JitTraceEnum_ELBO if args.jit else TraceEnum_ELBO
elbo = TraceEnum_ELBO(max_plate_nesting=3)
optim = ClippedAdam({'lr': learning_rate})
svi = SVI(model, guide, optim, loss=elbo)

# Define the number of optimization steps
n_steps = 5

# do gradient steps
for step in range(n_steps):
    elbo = svi.step(W_torch, batch_size=2)
    if step % 1 == 0:
        #print('.', end='')
        print("[%d] ELBO: %.1f" % (step, elbo))

## LDA Gensim

In [25]:
from src.features.build_features import  vocabulary, X, X_tfidf, corpus, corpus_tfidf

In [26]:
id2word = {v:k for v, k in sorted((value, key) for (key,value) in vocabulary.items())}

In [27]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 20
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

model = LdaModel(
    corpus=corpus[:500],
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

In [28]:
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -2.0881.
[([(0.008943842, 'fru'),
   (0.00655985, 'venstr'),
   (0.006472275, 'statsminist'),
   (0.0060022757, 'kr'),
   (0.005951143, 'kris'),
   (0.00567152, 'peng'),
   (0.0056012115, 'sf'),
   (0.0055634277, 'mennesk'),
   (0.0055069444, 'politik'),
   (0.005256012, 'radikal'),
   (0.0051922924, 'land'),
   (0.0051377746, 'økonomisk'),
   (0.0050867614, 'hr'),
   (0.005072784, 'folkeparti'),
   (0.0049760165, 'økonomi'),
   (0.004265044, 'ung'),
   (0.004113977, 'ansvar'),
   (0.003916533, 'samfund'),
   (0.0039004118, 'nye'),
   (0.0037868014, 'konservativ')],
  -1.0390022561287233),
 ([(0.025277242, 'kommun'),
   (0.013765038, 'lovforslag'),
   (0.008171043, 'borg'),
   (0.007006078, 'københavn'),
   (0.00602456, 'hr'),
   (0.005409857, 'land'),
   (0.004537714, 'folkeparti'),
   (0.0043712156, 'enkelt'),
   (0.0043359795, 'stud'),
   (0.004196492, 'netop'),
   (0.004156884, 'forskel'),
   (0.0039674025, 'eu'),
   (0.0039598, 'regl'),
   (0.0038858296, '