# Import Required Libraries

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim import corpora, models
import gensim
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Fetch the NLTK 'stopwords' resource; the preprocessing step reads it through
# stopwords.words('english').
nltk.download('stopwords')


#  Load Dataset

In [None]:
# Example input: a CSV file that provides a 'news' column.
# Rows with a missing value in *any* column are discarded before the text
# column is extracted as strings.
# NOTE(review): if only 'news' matters, dropna(subset=['news']) would keep
# more rows — confirm against the actual CSV schema.
df = pd.read_csv(
    '/kaggle/input/topic-modelling-nlp-amazon-reviews-bbc-news/bbc_data.csv'  # change path to your file
).dropna()
texts = df['news'].astype(str)


# Preprocess the Text

In [None]:
ps = PorterStemmer()
stop_words = stopwords.words('english')
custom_stopwords = ['n', 'said', 'also', 'get', 'could', 'e', 'would', 'us', 'b', 'mr']

# Combined lookup set, built once: testing membership against the two lists
# above would be a linear scan for every single token. Re-run this cell after
# editing either list so the set is rebuilt.
_STOPWORD_SET = frozenset(stop_words).union(custom_stopwords)


def process(text):
    """Convert one raw document into a list of stemmed tokens.

    Steps: lower-case the text, drop every character that is neither a
    letter nor whitespace, split on whitespace, remove stop words (both
    ``stop_words`` and ``custom_stopwords``), and stem what is left with
    the module-level Porter stemmer ``ps``.

    Args:
        text: The document as a ``str``.

    Returns:
        A list of stemmed token strings; empty when nothing survives
        the filtering.
    """
    lowered = text.lower()
    # Keep *all* whitespace, not only ' ': deleting newlines or tabs would
    # glue the last word of one line onto the first word of the next.
    cleaned = ''.join(ch for ch in lowered if ch.isalpha() or ch.isspace())
    kept = [word for word in cleaned.split() if word not in _STOPWORD_SET]
    return [ps.stem(word) for word in kept]

# Tokenise, filter and stem every document; one token list per document.
processed_news = list(map(process, texts))


# Create Dictionary and Corpus

In [None]:
# Build the token dictionary from all processed documents, then convert each
# document's token list with dictionary.doc2bow.
dictionary = corpora.Dictionary(documents=processed_news)
corpus = list(map(dictionary.doc2bow, processed_news))


# Train LDA Model

In [None]:
# Train a 5-topic LDA model on the corpus with 10 passes; random_state is
# pinned to a fixed seed.
lda_model = models.LdaModel(
    corpus=corpus,
    num_topics=5,
    id2word=dictionary,
    passes=10,
    random_state=42,
)


# Display Topics

In [None]:
# Print every topic (-1 requests all of them) as "Topic <id>:" followed by
# the topic string returned by the model.
topic_summaries = lda_model.print_topics(num_topics=-1)
for idx, topic in topic_summaries:
    print(f"Topic {idx}:\n{topic}\n")


# Word Cloud per Topic

In [None]:
# One figure per topic: a white-background word cloud built from the topic's
# top 30 (word, weight) pairs.
for t in range(lda_model.num_topics):
    word_weights = dict(lda_model.show_topic(t, topn=30))
    cloud = WordCloud(background_color='white').fit_words(word_weights)

    plt.figure()
    plt.imshow(cloud)
    plt.axis("off")
    plt.title(f"Topic #{t}")
    plt.show()


In [None]:
# Prepare pyLDAvis display data from the trained model, the bag-of-words
# corpus and the dictionary, then hand it to pyLDAvis.display.
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)
# NOTE(review): presumably kept as the cell's final bare expression so the
# notebook renders the returned object — confirm before wrapping or moving it.
pyLDAvis.display(lda_display)