<a href="https://colab.research.google.com/github/ISL-0111/GDP_and_CO2/blob/main/Copy_of_Data_Engineering_Project_9_ISL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Class : Natural Language Processing
Assignment : Latent Dirichlet Allocation
Name : Ilseop Lee
"""

import numpy as np
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from typing import List


# LDA Generator Function
def lda_gen(
    vocabulary: List[str], alpha: np.ndarray, beta: np.ndarray, xi: int
) -> List[str]:
    """Sample one synthetic document from the LDA generative process.

    The document length is drawn from a Poisson distribution with rate
    ``xi`` and the document's topic mixture from a Dirichlet distribution
    with parameter ``alpha``. Each word is then produced by first drawing a
    topic from that mixture and then drawing a vocabulary index from the
    corresponding row of ``beta``.

    Args:
        vocabulary: Words; ``vocabulary[j]`` is the word for column ``j``
            of ``beta``.
        alpha: Dirichlet parameter; its length is used as the topic count.
        beta: Indexed as ``beta[topic]`` to obtain the word probabilities
            for that topic.
        xi: Rate of the Poisson distribution for the document length.

    Returns:
        The sampled words, in the order they were drawn.
    """
    num_topics = len(alpha)
    vocab_size = len(vocabulary)

    # Draw order matters for reproducibility under np.random.seed:
    # length first, then the topic mixture, then (topic, word) per token.
    n_words = np.random.poisson(xi)
    topic_mix = np.random.dirichlet(alpha)

    def draw_word() -> str:
        # Pick a topic from the document's mixture, then a word from it.
        chosen_topic = np.random.choice(num_topics, p=topic_mix)
        chosen_word = np.random.choice(vocab_size, p=beta[chosen_topic])
        return vocabulary[chosen_word]

    return [draw_word() for _ in range(n_words)]


# Function to compare true and inferred topic-word distributions
def compare_true_and_inferred_topics(true_beta, inferred_beta, vocabulary):
    """Print the true and the inferred topic-word distributions.

    Both matrices are printed one topic per line, in vocabulary order, using
    the same ``word (prob)`` comma-separated layout so rows can be compared
    by eye. Topic numbers are printed 1-based.

    Args:
        true_beta: Iterable of rows, one per topic; entry ``j`` of a row is
            printed as the probability of ``vocabulary[j]``.
        inferred_beta: Iterable of ``(topic_id, [(word, prob), ...])`` pairs.
        vocabulary: Words, in the column order of ``true_beta``.

    Note:
        Rows are printed in the order given; no attempt is made to match an
        inferred topic to the true topic it most resembles. A vocabulary
        word missing from an inferred topic's list is shown with
        probability 0.
    """
    print("True Beta Matrix (Topic-Word Distribution):")
    for topic_number, topic_dist in enumerate(true_beta, start=1):
        true_row = ", ".join(
            f"{vocabulary[j]} ({prob:.2f})" for j, prob in enumerate(topic_dist)
        )
        print(f"Topic {topic_number}: {true_row}")

    print("\nInferred Beta Matrix (Topic-Word Distribution):")
    for topic_id, topic_words in inferred_beta:
        # Look words up by name so the row follows vocabulary order rather
        # than the order in which the (word, prob) pairs were supplied.
        word_probs = dict(topic_words)
        inferred_row = ", ".join(
            f"{word} ({word_probs.get(word, 0):.2f})" for word in vocabulary
        )
        print(f"Topic {topic_id + 1}: {inferred_row}")


# Test
def test():
    """Generate synthetic LDA documents, fit a Gensim model, and print both.

    Builds a small corpus with ``lda_gen`` from a known topic-word matrix,
    trains a Gensim ``LdaModel`` on it, and prints the model's alpha vector
    followed by the true and inferred topic-word distributions.
    """
    vocabulary = ["bass", "pike", "deep", "tuba", "horn", "catapult"]
    # One row per topic, one column per vocabulary word.
    beta = np.array(
        [
            [0.4, 0.4, 0.2, 0.0, 0.0, 0.0],
            [0.0, 0.3, 0.1, 0.0, 0.3, 0.3],
            [0.3, 0.0, 0.2, 0.3, 0.2, 0.0],
        ]
    )
    alpha = np.array([0.2, 0.2, 0.2])
    xi = 50
    num_documents = 50
    num_topics = len(alpha)  # keep the model's topic count tied to alpha

    # Generate the synthetic corpus: one list of words per document.
    documents = [
        lda_gen(vocabulary, alpha, beta, xi) for _ in range(num_documents)
    ]

    # Map words to ids and convert each document to bag-of-words counts.
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(text) for text in documents]

    # Train LDA model by using Gensim
    model = LdaModel(corpus, id2word=dictionary, num_topics=num_topics)

    # model.alpha is the model's Dirichlet prior over topics (one value per
    # topic), not a per-document topic distribution. No `alpha` argument is
    # passed above, so this is whatever prior the library defaults to.
    print("Model alpha (Dirichlet prior over per-document topic distributions):")
    print(model.alpha)

    # Request every topic and every vocabulary word explicitly instead of
    # relying on the library's default limits.
    inferred_beta = model.show_topics(
        num_topics=num_topics, num_words=len(vocabulary), formatted=False
    )

    # Compare true and inferred beta matrices
    compare_true_and_inferred_topics(beta, inferred_beta, vocabulary)


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    test()


