# LDA Topic Modeling – Cybersecurity News Articles

## Objective
Discover underlying themes in cybersecurity news articles using Latent Dirichlet Allocation (LDA) and interpret how topics align with threat categories.

## What this notebook contains
- Text preprocessing (cleaning, tokenization, lemmatization)
- Bigram/trigram phrase detection
- Dictionary + corpus construction
- LDA model training and topic interpretation

## Output
Interpretable topic clusters that help explain label overlap and inform downstream classification.

In [None]:
# Setup note:
# If running locally in a fresh environment, install dependencies using:
#   pip install -r requirements.txt
# and download the spaCy model:
#   python -m spacy download en_core_web_sm

In [None]:
# STEP 2 - LOAD THE DATASET

import pandas as pd
df = pd.read_excel("../data/TheHackerNews_Dataset.xlsx")

# Missing articles become empty strings, so every row still yields a document
# and `texts` keeps the same length and order as the rows of `df`.
texts = df["Article"].fillna("").astype(str).tolist()

# Show a short, truncated preview rather than printing 20 complete articles,
# which floods the cell output and bloats the saved notebook.
print("Number of articles:", len(texts))
for article_preview in texts[:3]:
    print("-", article_preview[:200])

In [None]:
# STEP 3 - PREPROCESS USING SPACY (TOKENIZE + REMOVE STOPWORDS + LEMMATIZE)

import spacy
# Load the small English pipeline with the components named below disabled;
# the preprocessing in this notebook only reads lemma_, is_stop, is_punct
# and is_alpha from each token.
disabled_components = ["parser", "ner", "textcat"]
nlp = spacy.load("en_core_web_sm", disable=disabled_components)

def preprocess_many(text_list):
    """
    Turn raw texts into lists of lemmas.

    Every text is lower-cased and streamed through the module-level spaCy
    pipeline `nlp` in batches of 20. A token contributes its lemma only if
    it is not a stop word, not punctuation, purely alphabetic, and longer
    than three characters.

    Parameters:
        text_list: iterable of str, the raw documents.

    Returns:
        list of list of str: one list of lemmas per document, in the order
        the pipeline yields the documents.
    """
    def keep(tok):
        # Same four conditions as the filter, evaluated in the same order.
        return (
            not tok.is_stop
            and not tok.is_punct
            and tok.is_alpha
            and len(tok) > 3
        )

    # Lower-casing is done lazily so the texts are never copied as a whole.
    lowered = (raw.lower() for raw in text_list)

    return [
        [tok.lemma_ for tok in parsed if keep(tok)]
        for parsed in nlp.pipe(lowered, batch_size=20)
    ]

# Run the preprocessing over every article.
processed_docs = preprocess_many(texts)

# Quick sanity check: how many documents came out, and the first 40 tokens
# of the first one.
first_doc_preview = processed_docs[0][:40]
print("Number of documents:", len(processed_docs))
print("First doc tokens:", first_doc_preview)

In [None]:
# STEP 4  - CREATE DICTIONARY & CORPUS
from gensim import corpora

# Build the token <-> id mapping from the preprocessed documents.
dictionary = corpora.Dictionary(processed_docs)

# Prune the vocabulary: keep tokens found in at least 5 documents and in no
# more than 50% of all documents (gensim's default keep_n cap also applies).
rare_cutoff, common_cutoff = 5, 0.5
dictionary.filter_extremes(no_below=rare_cutoff, no_above=common_cutoff)

# Convert each document to its bag-of-words form using the pruned dictionary.
corpus = list(map(dictionary.doc2bow, processed_docs))

print("Vocabulary size:", len(dictionary))
print("Sample for first doc:", corpus[0][:10])

In [None]:
# STEP 5 - RUN GENSIM LDA MODEL
from gensim.models.ldamodel import LdaModel

# Number of latent topics to fit.
num_topics = 6

# Training settings gathered in one place so they are easy to read and tune.
lda_params = {
    "num_topics": num_topics,
    "random_state": 42,   # fixed seed for the model's random state
    "passes": 10,
    "alpha": "auto",      # "auto": let gensim learn this prior from the corpus
    "eta": "auto",        # "auto": let gensim learn this prior from the corpus
}

lda_model = LdaModel(corpus=corpus, id2word=dictionary, **lda_params)

In [None]:
# STEP 6 - PRINT THE TOPICS

# Ask the model how many topics it holds instead of repeating the literal 6,
# so this cell keeps listing every topic if the topic count is changed.
top_words_per_topic = lda_model.show_topics(
    num_topics=lda_model.num_topics,
    num_words=10,
    formatted=False,
)

# With formatted=False each entry is (topic_id, [(word, probability), ...]);
# only the words are printed.
for topic_id, word_probs in top_words_per_topic:
    print(f"\nTopic {topic_id}:")
    print(", ".join(word for word, _ in word_probs))

In [None]:
# Using Latent Dirichlet Allocation (LDA), six dominant topics emerged from our dataset:
#(1) software vulnerabilities and patch releases,
#(2) server-level threats and malicious campaigns,
#(3) Android/mobile application security,
#(4) ransomware and large-scale cyberattacks,
#(5) corporate data breaches affecting customer information, and
#(6) website/social media account compromises.
# These themes align closely with real-world cybersecurity incidents commonly reported in technology news.

In [None]:
# STEP 7 - COMPARE LABELS & GET DOMINANT TOPIC PER DOCUMENT

def get_dominant_topic(model, bow_doc):
    """
    Pick the most probable topic for a single bag-of-words document.

    Asks `model.get_document_topics(bow_doc)` for (topic_id, probability)
    pairs and returns the topic_id of the pair with the largest probability.
    When several pairs share the largest probability, the first one wins.
    Returns None when the model reports no topics for the document.
    """
    distribution = model.get_document_topics(bow_doc)
    if not distribution:
        return None

    # Linear scan; the strict '>' keeps the earliest pair on ties.
    best = None
    for candidate in distribution:
        if best is None or candidate[1] > best[1]:
            best = candidate

    dominant_topic_id, _ = best
    return dominant_topic_id


# Assign every bag-of-words document to its single most probable topic.
dominant_topics = list(
    map(lambda bow_doc: get_dominant_topic(lda_model, bow_doc), corpus)
)

# Keep the assignment next to the original labels so both live in one frame.
df["Dominant_Topic"] = dominant_topics

# Preview the first rows with title, true label and assigned topic.
preview_columns = ["Title", "Label", "Dominant_Topic"]
df[preview_columns].head()


In [None]:
# STEP 8 - TABLE: TRUE LABEL vs DOMINANT LDA TOPIC

label_col = df["Label"]
topic_col = df["Dominant_Topic"]

# Raw counts: rows are true labels, columns are dominant topics.
topic_label_counts = pd.crosstab(label_col, topic_col)
display(topic_label_counts)

# Row-wise percentages: how each label's documents spread across the topics,
# rounded to one decimal place.
topic_label_pct = (
    pd.crosstab(label_col, topic_col, normalize="index") * 100
).round(1)
display(topic_label_pct)


In [None]:
# Data_Breaches has some overlap with topics 3 and 5, so the classifier later might confuse these.
# Malware spans across four topics, so ML models will struggle the most here.

#Most Confusing / Overlapping Labels
# - Cyber_Attack - Malware (topics 1 & 3)
# - Data_Breaches - Cyber_Attack (topic 3)
#- Vulnerability - Malware (topic 5)

# Cleanest label
# - Vulnerability (strong Topic 0 dominance)

#Hardest label for modeling
# - Malware (very spread out - high overlap)

In [None]:
# STEP 9 - VISUALIZATIONS

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

def topic_axis_label(topic_id):
    """
    Return the axis label for a topic id.

    The id is shown unchanged (0-based), matching the printed topic list and
    the label-vs-topic tables, so one topic has one number everywhere.
    int() keeps the label clean if the column was promoted to float.
    """
    return f"Topic {int(topic_id)}"


def plot_label_topic_heatmap(table, title, cmap, fmt):
    """
    Draw an annotated heatmap of a label-by-topic table.

    Parameters:
        table: DataFrame with true labels as rows and topic ids as columns.
        title: figure title.
        cmap:  colour map name passed to seaborn.
        fmt:   annotation format string (e.g. "d" for counts, ".1f" for %).
    """
    labelled = table.rename(columns=topic_axis_label)
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(labelled, annot=True, cmap=cmap, fmt=fmt, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("LDA Topic")
    ax.set_ylabel("True Label")
    plt.show()


# VISUAL 1: Heatmap (raw counts)
ct = pd.crosstab(df["Label"], df["Dominant_Topic"])
plot_label_topic_heatmap(
    ct,
    "Heatmap: True Label vs Dominant LDA Topic",
    cmap="Blues",
    fmt="d",
)

# VISUAL 2: Percentage heatmap (each row sums to 100)
ct_pct = pd.crosstab(
    df["Label"],
    df["Dominant_Topic"],
    normalize="index"
) * 100
plot_label_topic_heatmap(
    ct_pct.round(1),
    "Percentage Distribution of LDA Topics per True Label",
    cmap="Greens",
    fmt=".1f",
)

# VISUAL 3: Bar plot of topic frequencies
topic_counts = df["Dominant_Topic"].value_counts().sort_index()
topic_labels = [topic_axis_label(i) for i in topic_counts.index]

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=topic_labels, y=topic_counts.values, ax=ax)
ax.set_title("Overall Distribution of LDA Topics")
ax.set_xlabel("LDA Topic")
ax.set_ylabel("Document Count")
plt.show()