In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from gensim.models import LdaModel, CoherenceModel
from gensim.corpora import Dictionary
from gensim.matutils import Sparse2Corpus
from bertopic import BERTopic
from nltk.corpus import stopwords
import spacy
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import wordcloud
from wordcloud import WordCloud

ModuleNotFoundError: No module named 'bertopic'

In [2]:
pip install bertopic

Collecting bertopic
  Downloading bertopic-0.16.2-py2.py3-none-any.whl (158 kB)
[K     |████████████████████████████████| 158 kB 7.1 MB/s eta 0:00:01
[?25hCollecting hdbscan>=0.8.29
  Downloading hdbscan-0.8.36.tar.gz (6.1 MB)
[K     |████████████████████████████████| 6.1 MB 43.3 MB/s eta 0:00:01
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
[?25hCollecting sentence-transformers>=0.4.1
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[K     |████████████████████████████████| 227 kB 45.6 MB/s eta 0:00:01
[?25hCollecting umap-learn>=0.5.0
  Downloading umap_learn-0.5.6-py3-none-any.whl (85 kB)
[K     |████████████████████████████████| 85 kB 14.4 MB/s eta 0:00:01
Collecting cython<3,>=0.27
  Using cached Cython-0.29.37-py2.py3-none-any.whl (989 kB)
Collecting transformers<5.0.0,>=4.34.0
  Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[K

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Load dataset
# Fetch the training split, asking scikit-learn to strip headers, footers and
# quoted replies from each post.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)
# Unpack the pieces used below: document texts, integer targets, and the
# category names those targets refer to.
documents, labels, label_names = (
    newsgroups_train.data,
    newsgroups_train.target,
    newsgroups_train.target_names,
)

In [None]:
# EDA
# Dataset summary: one "<caption> <value>" line per fact.
for caption, value in (
    ("Number of documents:", len(documents)),
    ("Number of categories:", len(label_names)),
    ("Categories:", label_names),
):
    print(caption, value)

In [None]:
# Distribution of categories
# Count documents per integer label (value_counts sorts by count, descending).
category_counts = pd.Series(labels).value_counts()
# Translate every label id to its category name *before* plotting, so each bar
# carries its own name instead of relying on tick labels being attached by
# position in whatever order the plotting library chose for the bars.
category_names = [label_names[label_id] for label_id in category_counts.index]
plt.figure(figsize=(10, 5))
sns.barplot(x=category_names, y=category_counts.values, palette="viridis")
plt.xlabel('Category')
plt.ylabel('Number of Documents')
plt.title('Distribution of Categories in 20 Newsgroups Dataset')
# Category names are long; rotate them so they do not overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Document length analysis
# Length = number of whitespace-separated chunks in each document.
document_lengths = list(map(lambda text: len(text.split()), documents))
plt.figure(figsize=(10, 5))
plt.hist(
    document_lengths,
    bins=50,
    color='blue',
    alpha=0.7,
)
plt.title('Distribution of Document Lengths')
plt.xlabel('Document Length (words)')
plt.ylabel('Number of Documents')
plt.show()

In [None]:
# Preprocessing
# spaCy English pipeline with the "parser" and "ner" components switched off.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"],
)
# NLTK's English stop-word list.
stop_words = stopwords.words('english')

In [None]:
def preprocess(text):
    """Return the lemmas of `text`, space-joined, minus stop words and punctuation.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens that
    spaCy flags as a stop word (`is_stop`) or as punctuation (`is_punct`) are
    skipped; the `lemma_` of every remaining token is kept in order.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [None]:
# Replace every raw document with its preprocessed form.
# NOTE: this rebinds `documents`, so re-running the cell preprocesses the
# already-preprocessed text again.
documents = list(map(preprocess, documents))

In [None]:
# Vectorization
# Bag-of-words counts with English stop words removed and document-frequency
# bounds of min_df=2 and max_df=0.95.
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
X = vectorizer.fit_transform(documents)

In [None]:
# LDA (Gensim)
# Explanation: LDA (Latent Dirichlet Allocation) is a generative probabilistic model that assumes each document is a mixture of topics and each topic is a mixture of words.
# Wrap the scikit-learn count matrix as a gensim corpus; documents_columns=False
# tells gensim that documents are the rows of X, not the columns.
corpus = Sparse2Corpus(X, documents_columns=False)
# Invert the vectorizer's term -> column-id vocabulary into the id -> term
# mapping gensim expects.
id2word = Dictionary.from_corpus(
    corpus,
    id2word={term_id: term for term, term_id in vectorizer.vocabulary_.items()},
)
# random_state is fixed (matching the random_state=0 used by the other models in
# this notebook) so that the fitted topics and their coherence score are
# reproducible across runs.
lda_model_gensim = LdaModel(
    corpus,
    num_topics=10,
    id2word=id2word,
    passes=10,
    random_state=0,
)

In [None]:
# LDA (scikit-learn)
# Explanation: the same probabilistic LDA approach, this time through
# scikit-learn's implementation, fitted on the count matrix X.
lda_model_sklearn = LatentDirichletAllocation(
    random_state=0,
    n_components=10,
)
lda_topics_sklearn = lda_model_sklearn.fit_transform(X)

In [None]:
# NMF
# Explanation: NMF (Non-negative Matrix Factorization) is a linear algebra
# technique that factorizes the document-term matrix into non-negative
# matrices. It's useful for parts-based representation.
nmf_model = NMF(
    random_state=0,
    n_components=10,
)
nmf_topics = nmf_model.fit_transform(X)

In [None]:
# LSA (TruncatedSVD)
# Explanation: LSA (Latent Semantic Analysis) applies SVD (Singular Value
# Decomposition) to reduce the dimensionality of the document-term matrix,
# capturing the underlying structure in the data.
# Unlike the models above, LSA is fitted on TF-IDF weights, built here with the
# same stop-word and document-frequency settings as the count vectorizer.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
X_tfidf = tfidf_vectorizer.fit_transform(documents)
lsa_model = TruncatedSVD(
    random_state=0,
    n_components=10,
)
lsa_topics = lsa_model.fit_transform(X_tfidf)

In [None]:
# BERTopic
# Explanation: BERTopic leverages BERT embeddings and clustering algorithms to identify topics in text data. It captures nuanced meanings in documents due to advanced contextual embeddings.
# NOTE(review): `documents` at this point holds the output of `preprocess`
# (lemmatised, stop words and punctuation removed), not the raw posts —
# confirm that feeding preprocessed text to an embedding model is intended.
# NOTE(review): no seed is set here, unlike the random_state=0 models above.
bertopic_model = BERTopic(language="english")
# fit_transform's two return values are kept as the topic assignments and `probs`.
bertopic_topics, probs = bertopic_model.fit_transform(documents)

In [None]:
# Evaluation (Coherence Score for LDA)
# Explanation: Coherence score measures the quality of the topics. Higher coherence scores indicate more interpretable topics.
# Tokenise the reference texts with the CountVectorizer's own analyzer rather
# than str.split(): the LDA dictionary was built from that vectorizer's
# vocabulary, so the texts must be normalised the same way for their tokens to
# match dictionary entries and be counted when co-occurrences are tallied.
lda_analyzer = vectorizer.build_analyzer()
coherence_model_lda = CoherenceModel(
    model=lda_model_gensim,
    texts=[lda_analyzer(doc) for doc in documents],
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()

In [None]:
# Evaluation (Coherence Score for BERTopic)
# Tokenise the documents with the analyzer of BERTopic's own vectorizer so the
# reference texts share the topic words' normalisation, and build the gensim
# dictionary that CoherenceModel requires when it is given raw `topics`.
bertopic_analyzer = bertopic_model.vectorizer_model.build_analyzer()
bertopic_texts = [bertopic_analyzer(doc) for doc in documents]
bertopic_dictionary = Dictionary(bertopic_texts)

# get_topics() maps topic id -> list of (word, score) pairs; keep the words only.
topic_words = []
for topic_id, word_scores in bertopic_model.get_topics().items():
    if topic_id == -1:
        # -1 is BERTopic's outlier bucket, not a topic to be scored.
        continue
    # Drop padding / words absent from the dictionary: CoherenceModel cannot
    # look up tokens it has no id for.
    words = [word for word, _ in word_scores if word in bertopic_dictionary.token2id]
    if words:
        topic_words.append(words)

coherence_model_bertopic = CoherenceModel(
    topics=topic_words,
    texts=bertopic_texts,
    dictionary=bertopic_dictionary,
    coherence='c_v',
)
coherence_bertopic = coherence_model_bertopic.get_coherence()

In [None]:
# Print top words for each topic for each model
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every row in `model.components_`.

    For each row (topic) a "Topic #<i>:" header is printed, followed by one
    line holding the `n_top_words` terms with the largest weights, ordered
    from largest to smallest and separated by spaces. A single blank line is
    printed after the last topic.

    Parameters:
        model: object exposing `components_`, iterable of per-topic weight
            arrays that support `.argsort()`.
        feature_names: sequence indexed by column position, giving the term
            for each weight.
        n_top_words: how many terms to show per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading positions.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[pos] for pos in ranked]
        print(f"Topic #{topic_idx}:")
        print(" ".join(top_terms))
    print()


In [None]:
n_top_words = 10
# Each components_-based model is paired with the vectorizer whose vocabulary
# it was fitted on, so the column indices resolve to the right terms.
for heading, fitted_model, fitted_vectorizer in (
    ("LDA Topics (scikit-learn):", lda_model_sklearn, vectorizer),
    ("NMF Topics:", nmf_model, vectorizer),
    ("LSA Topics:", lsa_model, tfidf_vectorizer),
):
    print(heading)
    print_top_words(fitted_model, fitted_vectorizer.get_feature_names_out(), n_top_words)

    print()

print("BERTopic Topics:")
print(bertopic_model.get_topic_info())

print()
print(f"LDA Coherence Score (Gensim): {coherence_lda}")
print(f"BERTopic Coherence Score: {coherence_bertopic}")

In [None]:
# Visualize LDA topics using pyLDAvis
# Build the visualisation data from the gensim LDA model, the corpus it was
# trained on, and the matching id -> word dictionary.
lda_vis = gensimvis.prepare(lda_model_gensim, corpus, id2word)
# Write the visualisation to an HTML file; the relative path means it is
# created in the kernel's current working directory.
pyLDAvis.save_html(lda_vis, 'lda_gensim.html')

In [None]:
# Wordclouds for top words in each model
def plot_wordcloud(model, feature_names, title, topic_idx=None):
    """Draw a word cloud of the terms a fitted topic model weights most heavily.

    Parameters:
        model: fitted object exposing `components_`; assumed to be a 2-D
            (topics x terms) array whose columns line up with `feature_names`.
        feature_names: sequence of terms, one per column of `components_`.
        title: title shown above the figure.
        topic_idx: optional row of `components_` to plot. When omitted, the
            weights are aggregated over all topics so the cloud represents the
            whole model rather than a single topic.

    Raises:
        ValueError: if no term ends up with a positive weight.
    """
    components = np.asarray(model.components_)
    # Magnitudes are used because some decompositions (e.g. SVD-based ones)
    # produce signed loadings, while a word cloud needs non-negative sizes.
    if topic_idx is None:
        weights = np.abs(components).sum(axis=0)
    else:
        weights = np.abs(components[topic_idx])

    # One weight per term; zero-weight terms carry no information for the cloud.
    frequencies = {
        term: float(weight)
        for term, weight in zip(feature_names, weights)
        if weight > 0
    }
    if not frequencies:
        raise ValueError("plot_wordcloud: no term has a positive weight")

    # Named `cloud` so the imported `wordcloud` module is not shadowed.
    cloud = WordCloud(
        width=800,
        height=400,
        max_words=50,
        colormap='viridis',
    ).generate_from_frequencies(frequencies)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()

In [None]:
# One word cloud per components_-based model, each paired with the vectorizer
# whose vocabulary it was fitted on.
for fitted_model, fitted_vectorizer, cloud_title in (
    (lda_model_sklearn, vectorizer, "Wordcloud for LDA (scikit-learn)"),
    (nmf_model, vectorizer, "Wordcloud for NMF"),
    (lsa_model, tfidf_vectorizer, "Wordcloud for LSA"),
):
    plot_wordcloud(fitted_model, fitted_vectorizer.get_feature_names_out(), cloud_title)

In [None]:
# BERTopic visualization
# Ask the fitted BERTopic model for its topic overview figure and render it.
fig = bertopic_model.visualize_topics()
fig.show()

In [None]:
# Comparison of coherence scores
# Two parallel lists: position i of `coherence_scores` belongs to the model
# named at position i of `model_names`.
coherence_scores = [
    coherence_lda,
    coherence_bertopic,
]
model_names = [
    'LDA (Gensim)',
    'BERTopic',
]

In [None]:
# Bar chart with one bar per model, bar height = coherence score.
plt.figure(figsize=(10, 5))
sns.barplot(
    x=model_names,
    y=coherence_scores,
    palette='viridis',
)
plt.title('Coherence Scores of Topic Models')
plt.xlabel('Model')
plt.ylabel('Coherence Score')
plt.show()