# ***Engr.Muhammad Javed***

# 1. Latent Dirichlet Allocation (LDA)

Latent Dirichlet Allocation is a generative probabilistic model for collections of discrete data such as text corpora. It discovers the abstract topics that occur across the documents in a collection.

## Key Concepts
- **Document-Topic Distribution:** Each document is a mixture of topics.
- **Topic-Word Distribution:** Each topic is a mixture of words.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Read the semicolon-separated training file, supplying the column names explicitly
df_train = pd.read_csv(
    '../Dataset/train.txt',
    sep=';',
    names=['text', 'emotion'],
)

# Turn the raw text into a bag-of-words count matrix
# (vocabulary capped at 5000 features, English stop words removed)
vectorizer = CountVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df_train['text'])

# Fit a 5-topic LDA model on the counts; random_state is fixed so reruns use the same seed
lda = LatentDirichletAllocation(random_state=42, n_components=5).fit(X)

# Display Topics
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of each topic in ``model``.

    One line is printed per row of ``model.components_`` in the form
    ``Topic #<index>: <word> <word> ...``, with words ordered from the
    largest weight downwards. A single blank line is printed after the
    last topic.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Sequence mapping a column index to its word.
        n_top_words: Number of words to show for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending, so step backwards from the end of the
        # index array to pick the n_top_words largest weights first.
        ranked = weights.argsort()[:-n_top_words - 1:-1]
        top_words = " ".join([feature_names[i] for i in ranked])
        print("Topic #%d: " % topic_idx + top_words)
    print()

# Show the 10 strongest words for every topic of the fitted LDA model
print_top_words(
    model=lda,
    feature_names=vectorizer.get_feature_names_out(),
    n_top_words=10,
)