## Homework 2: Latent Dirichlet Allocation

**ECE 684:** Natural Language Processing
<br>
**Name:** Guillem Amat Castello

In [158]:
import pandas as pd
import numpy as np

<br>

## Q1: Latent Dirichlet Allocation Implementation

**LDA Implementation**

In [159]:
def LDA(vocabulary: list, beta: np.ndarray, alpha: np.ndarray, xi: float, rng=None) -> list:
    """Sample one synthetic document from the LDA generative process.

    The document length is drawn from a Poisson distribution, a single
    topic mixture is drawn from a Dirichlet distribution, and every word is
    produced by first drawing a topic from that mixture and then drawing a
    word from the chosen topic's row of ``beta``.

    Parameters
    ----------
    vocabulary : list
        Words that can be emitted; ``vocabulary[j]`` is the word associated
        with column ``j`` of ``beta``.
    beta : array-like, 2-D
        Topic-word probabilities. Row ``k`` is the word distribution of
        topic ``k`` and is passed unchanged to ``multinomial``.
    alpha : array-like, 1-D
        Dirichlet concentration parameters, one entry per topic.
    xi : float
        Mean of the Poisson distribution used for the document length.
    rng : optional
        Random source exposing ``poisson``, ``dirichlet`` and
        ``multinomial`` (e.g. ``np.random.default_rng(seed)`` or
        ``np.random.RandomState(seed)``). When omitted, NumPy's global
        random state is used.

    Returns
    -------
    list
        The sampled words in generation order. The list is empty when the
        drawn document length is zero.

    Raises
    ------
    ValueError
        If ``beta`` is not two-dimensional, has fewer rows than ``alpha``
        has entries, or has more columns than ``vocabulary`` has words.
    """
    # Fall back to the module-level RNG so calls without ``rng`` consume the
    # same global random stream they always did.
    rng = np.random if rng is None else rng

    beta = np.asarray(beta, dtype=float)
    n_topics = np.size(alpha)
    # Reject shapes that would otherwise fail with an IndexError only on the
    # draws that happen to select a missing topic row or word column.
    if beta.ndim != 2 or beta.shape[0] < n_topics or beta.shape[1] > len(vocabulary):
        raise ValueError(
            "beta must be 2-D with at least one row per entry of alpha and "
            "no more columns than there are words in vocabulary"
        )

    n_words = rng.poisson(xi)
    theta = rng.dirichlet(alpha)

    word_list = []
    for _ in range(n_words):
        # multinomial(1, p) returns a one-hot vector; argmax is the index of
        # its single 1, i.e. the sampled category.
        topic = int(np.argmax(rng.multinomial(1, theta)))
        word_index = int(np.argmax(rng.multinomial(1, beta[topic])))
        word_list.append(vocabulary[word_index])
    return word_list

<br>

**Algorithm Test**

In [160]:
# Seed NumPy's global random state so the sampled documents, and everything
# derived from them, are the same on every Restart & Run All.
SEED = 684
np.random.seed(SEED)

# Words the generator can emit; column j of `beta` corresponds to vocabulary[j].
vocabulary = ['bass', 'pike', 'deep', 'tuba', 'horn', 'catapult']

# Topic-word probabilities: one row per topic, one column per vocabulary word.
beta = np.array([
    [0.4, 0.4, 0.2, 0.0, 0.0, 0.0],
    [0.0, 0.3, 0.1, 0.0, 0.3, 0.3],
    [0.3, 0.0, 0.2, 0.3, 0.2, 0.0],
])

# Dirichlet concentration parameters, one per topic (row of `beta`).
alpha = np.array([1, 3, 8])

# Poisson mean used for the document length.
xi = 50

In [161]:
# Draw a single synthetic document from the generative model.
document = LDA(vocabulary=vocabulary, beta=beta, alpha=alpha, xi=xi)

In [162]:
# Tally how often each distinct word occurs in the sampled document.
unique, counts = np.unique(document, return_counts=True)
{word: count for word, count in zip(unique, counts)}

{'bass': 19, 'catapult': 1, 'deep': 15, 'horn': 14, 'pike': 2, 'tuba': 9}

<br>

## Q2: Parameter Inference

**Creating LDA Model**

In [163]:
import gensim
from gensim import corpora, models

In [164]:
# Number of synthetic documents to sample for the inference experiment.
N_DOCUMENTS = 100

# Each call samples its own length and topic mixture, so the documents are
# separate draws from the same generative model.
documents = [LDA(vocabulary, beta, alpha, xi) for _ in range(N_DOCUMENTS)]

In [165]:
# Build the gensim dictionary from the sampled documents, then convert each
# document with doc2bow to obtain the corpus passed to the LDA model.
word_dictionary = corpora.Dictionary(documents)
corpus = list(map(word_dictionary.doc2bow, documents))

In [166]:
# Fit an LDA model to the synthetic corpus. Three topics are requested to
# match the three rows of `beta` used to generate the data; `random_state`
# is fixed to a constant value.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=word_dictionary,
    num_topics=3,
    passes=10,
    update_every=1,
    alpha='auto',
    per_word_topics=True,
    random_state=100,
)

**Extracted Topics**

In [167]:
# Print the fitted topics: each entry pairs a topic id with a string of
# weighted vocabulary words for that topic.
print(lda_model.print_topics())

[(0, '0.230*"horn" + 0.219*"deep" + 0.200*"pike" + 0.152*"catapult" + 0.118*"tuba" + 0.081*"bass"'), (1, '0.317*"tuba" + 0.254*"deep" + 0.198*"horn" + 0.184*"bass" + 0.028*"catapult" + 0.018*"pike"'), (2, '0.257*"bass" + 0.222*"horn" + 0.208*"tuba" + 0.134*"deep" + 0.104*"pike" + 0.075*"catapult"')]
