In [8]:
import pandas as pd
import numpy as np
import re
import nltk
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import ollama
import torch
import hdbscan
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from tqdm import tqdm

In [2]:
# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /home/himel/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/himel/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/himel/nltk_data...


True

In [5]:
# Load dataset
df = pd.read_csv("bbc_text_cls.csv")

# Extend stopwords
extra_stopwords = {"said", "year", "people", "new", "time", "play", "told"}
stop_words = set(stopwords.words('english')).union(extra_stopwords)

# Initialize NLP tools
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Function to clean and preprocess text."""
    text = text.lower()  # Lowercase
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    words = word_tokenize(text)  # Tokenization
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and len(word) > 2]  # Lemmatization
    return " ".join(words)  # Return string format for BERTopic

# Apply preprocessing
df["clean_text"] = df["text"].apply(preprocess_text)

In [7]:
# Load DeepSeek Model
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [9]:
def get_ollama_embeddings(texts):
    """Encodes text using Ollama Mistral embeddings."""
    embeddings = []
    for text in tqdm(texts, desc="Generating embeddings"):
        response = ollama.embeddings(model="mistral", prompt=text)
        embeddings.append(response["embedding"])
    return np.array(embeddings)

In [11]:
# Generate embeddings
embeddings = get_ollama_embeddings(df["clean_text"].tolist())

Generating embeddings: 100%|██████████| 2225/2225 [02:59<00:00, 12.40it/s]


In [21]:
# Save embeddings as a .npy file (Best for reloading in Python)
np.save("bbc_mistral_embeddings.npy", embeddings)

In [23]:
# Apply HDBSCAN clustering to discover topics
clusterer = hdbscan.HDBSCAN(min_cluster_size=30, min_samples=5, metric="euclidean", cluster_selection_method="eom")
clusters = clusterer.fit_predict(embeddings)

In [30]:
df["topic"] = clusters

In [32]:
# Extract top words per topic using TF-IDF
vectorizer = TfidfVectorizer(max_df=0.9, min_df=5, stop_words="english")
X = vectorizer.fit_transform(df["clean_text"])
feature_names = vectorizer.get_feature_names_out()

In [33]:
# Get top words for each cluster
num_words = 10
topics_words = []
for cluster in np.unique(clusters):
    if cluster == -1:  # Ignore noise points
        continue
    cluster_docs = df[df["topic"] == cluster]["clean_text"]
    cluster_vectorized = vectorizer.transform(cluster_docs)
    top_word_indices = np.argsort(cluster_vectorized.toarray().sum(axis=0))[-num_words:]
    top_words = [feature_names[i] for i in top_word_indices]
    topics_words.append(top_words)

In [34]:
print("Extracted Topics:", topics_words)
if len(topics_words) == 0:
    print("⚠️ No valid topics were found! Check HDBSCAN clustering.")


Extracted Topics: [['tax', 'tory', 'government', 'film', 'party', 'game', 'blair', 'brown', 'election', 'labour'], ['wale', 'try', 'england', 'italy', 'goal', 'penalty', 'ireland', 'half', 'ball', 'minute']]


In [35]:
# Compute NPMI Coherence Score
id2word = Dictionary(df["clean_text"].apply(str.split))  # Gensim dictionary
corpus = [id2word.doc2bow(text.split()) for text in df["clean_text"]]  # Convert text to BoW

In [36]:
coherence_model = CoherenceModel(topics=topics_words, texts=df["clean_text"].apply(str.split), dictionary=id2word, coherence='c_npmi')

In [37]:
npmi_score = coherence_model.get_coherence()

In [39]:
# Compute Topic Diversity (TD) Score
unique_words = set(word for topic in topics_words for word in topic)  # Unique words across topics
total_words = len(topics_words) * num_words  # Total top words
td_score = len(unique_words) / total_words  # TD = Unique words / Total words

In [40]:
# Display Results
print(f"🔹 NPMI Coherence Score: {npmi_score:.4f}")
print(f"🔹 Topic Diversity (TD) Score: {td_score:.4f}")

🔹 NPMI Coherence Score: 0.1000
🔹 Topic Diversity (TD) Score: 1.0000


In [41]:
# Save results
results = {
    "NPMI Score": npmi_score,
    "Topic Diversity": td_score
}
pd.DataFrame([results]).to_csv("bbc_ollama_mistral_topic_evaluation.csv", index=False)