Cybersecurity - Understanding the dataset

# **Load dataset and libraries**

# Exploratory Data Analysis (EDA) – Cybersecurity News Articles

## Objective
Understand the dataset structure, label distribution, and text characteristics to guide preprocessing and modeling decisions.

## What this notebook covers
- Dataset overview (columns, missingness, duplicates)
- Label distribution and class imbalance
- Text length analysis (tokens/characters)
- Common terms and n-grams (high-level signal checks)

## Output
EDA findings used to inform preprocessing choices and downstream model evaluation.

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
import re

In [None]:
# Fetch every NLTK resource this notebook uses in one place, so a fresh
# "Restart & Run All" does not stop part-way through on a missing resource.
NLTK_RESOURCES = [
    'punkt',
    'punkt_tab',                        # requested again in the sentence-count section
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',                        # used by the stopword-removal section
    'vader_lexicon',                    # used by the sentiment section
]
for resource in NLTK_RESOURCES:
    nltk.download(resource)

In [None]:
# Two stemmers and a WordNet lemmatizer, built from the classes imported in
# the first cell.
porter, lancaster = PorterStemmer(), LancasterStemmer()
wnl = WordNetLemmatizer()

In [None]:
from urllib import request
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

In [None]:
# Mount Google Drive only when running inside Colab. Elsewhere the
# google.colab package is not installed and an unconditional import would
# abort the whole run.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not on Colab: nothing to mount
else:
    drive.mount('/content/drive')

# Load the data to review and understand its structure

In [None]:
# Path is relative to the notebook's working directory.
DATA_PATH = "../data/TheHackerNews_Dataset.xlsx"
cyber = pd.read_excel(DATA_PATH)

# Fail fast with a clear message if the columns used by later cells are absent.
REQUIRED_COLUMNS = {"Title", "Article", "Label"}
missing_columns = REQUIRED_COLUMNS - set(cyber.columns)
assert not missing_columns, f"Dataset is missing expected columns: {sorted(missing_columns)}"

print(cyber.head())
print(cyber.shape)
print(cyber.columns)

In [None]:
# Column names as a plain Python list.
list(cyber.columns)

In [None]:
# Preview the first five rows.
cyber.head(5)

In [None]:
# (rows, columns) of the loaded data.
cyber.shape

In [None]:
# Number of rows.
len(cyber)

In [None]:
# Number of non-missing entries in the Article column.
cyber["Article"].count()

In [None]:
# Number of missing entries in the Article column.
cyber["Article"].isna().sum()

In [None]:
# Show the DataFrame itself as the cell output.
cyber

# **Sentence and Word counts**

In [None]:
# Take the text of the first article (by position) and print it in full.
raw_text = cyber["Article"].iat[0]
print(raw_text)

In [None]:
import nltk
nltk.download('punkt_tab')

# Split the first article into sentences; the list is kept in `sentences`.
sentences = nltk.sent_tokenize(cyber["Article"].iloc[0])
print(sentences[:5])  # first 5 sentences

In [None]:
# Word-tokenize the first article; the token list is kept in `words`.
words = nltk.word_tokenize(cyber["Article"].iloc[0])
print(words[:20])  # first 20 tokens

In [None]:
# Keep only purely alphabetic tokens from `words`, lower-cased.
words_clean = [w.lower() for w in words if w.isalpha()]
print(words_clean[:20])

In [None]:
# Sentence count per article. Missing articles are treated as empty text so
# that sent_tokenize, which needs a string, does not raise on them (they get 0).
cyber["sent_count"] = (
    cyber["Article"]
    .fillna("")
    .astype(str)
    .apply(lambda text: len(nltk.sent_tokenize(text)))
)
print(cyber["sent_count"].head())

In [None]:
# The five articles with the highest sentence counts.
cyber.nlargest(5, "sent_count")[["Title", "sent_count"]]

In [None]:
# Token count per article. Missing articles are treated as empty text so that
# word_tokenize, which needs a string, does not raise on them (they get 0).
cyber["word_count"] = (
    cyber["Article"]
    .fillna("")
    .astype(str)
    .apply(lambda text: len(nltk.word_tokenize(text)))
)
cyber[["sent_count", "word_count"]].describe()

In [None]:
def classify_article_length(word_count, short_max=100, long_min=800):
    """Bucket an article by its word count.

    Parameters
    ----------
    word_count : number
        Number of words in the article.
    short_max : number, default 100
        Counts strictly below this value are labelled ``"short_alert"``.
    long_min : number, default 800
        Counts at or above this value are labelled ``"long_analysis"``.

    Returns
    -------
    str
        ``"short_alert"``, ``"medium_report"`` or ``"long_analysis"``.
    """
    if word_count < short_max:
        return "short_alert"
    if word_count < long_min:
        return "medium_report"
    return "long_analysis"

# Label every article with its length bucket, then count articles per bucket.
cyber["article_type"] = cyber["word_count"].apply(classify_article_length)
cyber["article_type"].value_counts()

In [None]:
# Combine title and body. Missing values are replaced with "" first, because
# concatenating with a missing value yields a missing value and word_tokenize
# would then fail on it.
cyber["full_text"] = (
    cyber["Title"].fillna("").astype(str)
    + " "
    + cyber["Article"].fillna("").astype(str)
)

cyber["full_word_count"] = cyber["full_text"].apply(
    lambda x: len(nltk.word_tokenize(x))
)

In [None]:
import matplotlib.pyplot as plt

# Histogram of title+article word counts, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(cyber["full_word_count"], bins=40, color='steelblue', edgecolor='black')
ax.set_title("Distribution of Word Counts (Title + Article)")
ax.set_xlabel("Word Count")
ax.set_ylabel("Frequency")
fig.tight_layout()
plt.show()

In [None]:
import numpy as np

# Only the numbers are needed here, so compute the histogram directly instead
# of drawing a figure and closing it again.
counts, bins = np.histogram(cyber["full_word_count"], bins=40)

print("Bin edges:", bins)
print("Counts per bin:", counts)

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
counts, bins, patches = ax.hist(cyber["full_word_count"], bins=40, edgecolor='black')

# Write each bar's count at the left edge of its bin, at bar height.
for left_edge, bar_count in zip(bins[:-1], counts):
    ax.text(left_edge, bar_count, str(int(bar_count)), fontsize=8, rotation=90)

ax.set_title("Word Count Distribution")
ax.set_xlabel("Word Count")
ax.set_ylabel("Frequency")
fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Share of each label, in percent of all rows.
label_percent = cyber["Label"].value_counts(normalize=True).mul(100)

# Bar chart of the shares.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(label_percent.index, label_percent.values, color='steelblue')
ax.set_title("Percentage Distribution of Labels")
ax.set_ylabel("Percentage (%)")
ax.set_xlabel("Label")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Share of each label, in percent of all rows.
label_percent = cyber["Label"].value_counts(normalize=True).mul(100)

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(label_percent.index, label_percent.values, color='steelblue')

# Print the percentage just above each bar, centred horizontally on it.
for rect in bars:
    pct = rect.get_height()
    x_center = rect.get_x() + rect.get_width() / 2
    ax.text(x_center, pct, f"{pct:.2f}%", ha='center', va='bottom', fontsize=10)

ax.set_title("Distribution of Labels")
ax.set_ylabel("Percentage (%)")
ax.set_xlabel("Label")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Larger canvas for presentation use.
fig, ax = plt.subplots(figsize=(14, 8))

counts, bins, patches = ax.hist(cyber["full_word_count"], bins=40, edgecolor='black')

# Write each bar's count at the left edge of its bin, at bar height.
for left_edge, bar_count in zip(bins[:-1], counts):
    ax.text(left_edge, bar_count, str(int(bar_count)), fontsize=10, rotation=90)

ax.set_title("Word Count Distribution", fontsize=18)
ax.set_xlabel("Word Count", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# Summary statistics of the title+article word counts.
cyber["full_word_count"].describe()

In [None]:
import numpy as np

data = cyber["full_word_count"]

# The three quartiles come from one percentile call; the extremes from
# np.min / np.max. Keys are inserted in ascending order of the statistic.
quartiles = np.percentile(data, [25, 50, 75])
five_num = {"min": np.min(data)}
five_num.update(zip(("q1", "median", "q3"), quartiles))
five_num["max"] = np.max(data)

five_num

In [None]:
# Horizontal box plot of the title+article word counts.
fig, ax = plt.subplots(figsize=(8, 4))
ax.boxplot(cyber["full_word_count"], vert=False)
ax.set_title("Boxplot of Word Counts (Title + Article)")
ax.set_xlabel("Word Count")
plt.show()

# **Start Cleaning Text - Chapter 3 (normalize, regex and remove non-alpha)**

In [None]:
TEXT_COL = "Article"

def clean_text_basic(text):
    """Normalise raw article text to lower-case alphanumeric words.

    Steps, in order: lower-case, drop ``<...>`` markup, drop URLs, replace
    every character that is not ``a-z``, ``0-9`` or whitespace with a space,
    then collapse runs of whitespace and trim the ends.

    Parameters
    ----------
    text : object
        Raw text. Missing values (``None`` / NaN) yield an empty string; any
        other value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text, possibly empty.
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()

    # Markup is removed BEFORE URLs. The URL pattern consumes everything up to
    # the next whitespace, so on '<a href="http://x">word</a>' it would swallow
    # the closing '">', the link text and '</a>', leaving 'a href' behind.
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r'http\S+|www\.\S+', " ", text)
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

In [None]:
# Apply the basic cleaner to every article; the result goes in a new column.
cyber["Article_clean"] = cyber[TEXT_COL].apply(clean_text_basic)

In [None]:
# Raw vs. cleaned text, side by side, for the first five rows.
cyber[[TEXT_COL, "Article_clean"]].head(5)

In [None]:
# Keep only the rows whose cleaned text is non-empty, as an independent copy.
cyber = cyber.loc[cyber["Article_clean"].str.len().gt(0)].copy()
cyber.shape

# **Tokenize and Remove Stopwords - Chapter 3**

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

# English stopwords as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# A set has no stable iteration order, so sort before previewing; otherwise
# the 20 words shown can differ from one run to the next.
len(stop_words), sorted(stop_words)[:20]

The function below takes a cleaned string, tokenizes it, removes stopwords, and keeps only alphabetic tokens (as in Chapter 3).

In [None]:
def tokenize_and_remove_stopwords(text):
    """Tokenize ``text`` and return its lower-cased content words.

    A token is kept only if it is purely alphabetic and its lower-cased form
    is not in ``stop_words``. Anything that is not a string, and any blank
    string, gives an empty list.
    """
    if not (isinstance(text, str) and text.strip()):
        return []

    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if token.isalpha() and lowered not in stop_words:
            kept.append(lowered)
    return kept

In [None]:
# Token list per article (stopwords and non-alphabetic tokens removed).
cyber["Article_tokens"] = cyber["Article_clean"].map(tokenize_and_remove_stopwords)

# NOTE: the tokens are also stored as one space-separated string per article,
# which is the form used for later vectorization in the notebook.
cyber["Article_no_stop"] = cyber["Article_tokens"].map(" ".join)

In [None]:
# Cleaned text next to its token list, first three rows.
cyber[["Article_clean", "Article_tokens"]].head(3)

In [None]:
# Cleaned text next to its stopword-free string, first three rows.
cyber[["Article_clean", "Article_no_stop"]].head(3)

# **Lemmatization with part of speech (POS) - Chapter 3 ARI/stemming exercise & Chapter 7**

In [None]:
from nltk import pos_tag

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching ``wordnet`` POS constant.

    The decision is made on the tag's first letter: J -> ADJ, V -> VERB,
    N -> NOUN, R -> ADV. Every other tag falls back to NOUN.
    """
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr_name in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return wordnet.NOUN

Function to lemmatize a list of tokens

In [None]:
def lemmatize_tokens(tokens):
    """Lemmatize a list of tokens using their POS tags.

    The tokens are POS-tagged together, each tag is converted with
    ``get_wordnet_pos`` and passed to the lemmatizer along with its word.
    Anything that is not a non-empty list gives an empty list.
    """
    if not (isinstance(tokens, list) and tokens):
        return []

    lemmas = []
    for word, treebank_tag in pos_tag(tokens):
        lemmas.append(lemmatizer.lemmatize(word, get_wordnet_pos(treebank_tag)))
    return lemmas

Article_lemma_str → space-joined string (for TF-IDF / models)

In [None]:

# Lemma list per article, plus the same lemmas joined into one string.
cyber["Article_lemmas"] = cyber["Article_tokens"].map(lemmatize_tokens)
cyber["Article_lemma_str"] = cyber["Article_lemmas"].map(" ".join)

# Frequency, ngrams, pos counts, np chunks from Chap 3 and 7 notes

In [None]:
from nltk import FreqDist

# Flatten every article's lemma string into one corpus-wide list of lemmas;
# entries that are not strings are skipped.
all_lemmas = [
    lemma
    for text in cyber["Article_lemma_str"]
    if isinstance(text, str)
    for lemma in text.split()
]

freq_dist = FreqDist(all_lemmas)
freq_dist.most_common(30)

In [None]:
import matplotlib.pyplot as plt

# The 30 most frequent lemmas as a two-column table.
freq_df = pd.DataFrame(freq_dist.most_common(30), columns=["word", "count"])
freq_df

In [None]:
import matplotlib.pyplot as plt

# Chart-specific names are used here: plain `words` and `counts` are already
# bound by earlier cells (first-article tokens, histogram counts), and
# rebinding them would silently change what those cells see when re-run.
top_words = freq_dist.most_common(30)
top_lemma_labels = [w for w, c in top_words]
top_lemma_counts = [c for w, c in top_words]

plt.figure(figsize=(12, 6))
plt.bar(top_lemma_labels, top_lemma_counts)
plt.xticks(rotation=75)
plt.title('Top 30 Most Frequent Lemmas')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

Bigram & Trigram Frequencies

In [None]:
from nltk import ngrams


# Build n-grams one article at a time. Running ngrams() over the flat
# all_lemmas list would also pair the last lemmas of one article with the
# first lemmas of the next, counting sequences that occur in no article.
bigrams = [gram for lemmas in cyber["Article_lemmas"] for gram in ngrams(lemmas, 2)]
trigrams = [gram for lemmas in cyber["Article_lemmas"] for gram in ngrams(lemmas, 3)]

bigram_freq = FreqDist(bigrams).most_common(20)
trigram_freq = FreqDist(trigrams).most_common(20)

bigram_freq, trigram_freq

In [None]:
# Separate items into text and counts for chart
# Split the (bigram, count) pairs into bar labels and bar heights.
bigrams_text = [' '.join(pair) for pair, _ in bigram_freq]
bigrams_count = [freq for _, freq in bigram_freq]

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(bigrams_text, bigrams_count)
plt.xticks(rotation=75)
ax.set_title('Top 20 Bigrams in Cybersecurity Articles')
ax.set_xlabel('Bigrams')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

In [None]:
# Split the (trigram, count) pairs into bar labels and bar heights.
trigrams_text = [' '.join(triple) for triple, _ in trigram_freq]
trigrams_count = [freq for _, freq in trigram_freq]

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(trigrams_text, trigrams_count)
plt.xticks(rotation=75)
ax.set_title('Top 20 Trigrams in Cybersecurity Articles')
ax.set_xlabel('Trigrams')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

POS tagging

In [None]:
# POS-tag the first cleaned article. `pos_tags` is reused by the chunking
# cell further down.
sample = cyber["Article_clean"].iloc[0]
tokens = nltk.word_tokenize(sample)
pos_tags = nltk.pos_tag(tokens)

# Preview the first 50 (token, tag) pairs.
pos_tags[:50]

Chunking


In [None]:
!pip install svgling

In [None]:
# Chunk grammar for noun phrases (NP). It is compiled into a RegexpParser
# (`cp`) and applied to the POS tags of the sample article.
# NOTE(review): NLTK chunk rules run in the order listed and only match tokens
# that are not already inside a chunk. The first rule accepts any run of
# nouns, so it looks like the three rules after it never get a chance to
# fire. Confirm, and reorder (most specific first) if those patterns are wanted.
grammar = r"""
    NP:
        {<DT>?<JJ.*>*<NN.*>+}          # basic pattern
        {<NNP.*>+}                     # proper noun groups (Microsoft, Azure Cosmos DB)
        {<NN.*><IN><NN.*>}             # "access to data"
        {<NN.*><CC><NN.*>}             # "malware and ransomware"
"""
cp = nltk.RegexpParser(grammar)
tree = cp.parse(pos_tags)


In [None]:
def extract_nps(text):
    """Return the noun phrases found in ``text`` as a list of strings.

    The text is word-tokenized, POS-tagged and parsed with the chunk parser
    ``cp``. The words of every subtree labelled ``"NP"`` are joined with single
    spaces. Anything that is not a string, and any blank string, gives an
    empty list.
    """
    if not (isinstance(text, str) and text.strip()):
        return []

    tagged_tokens = pos_tag(word_tokenize(text))
    chunk_tree = cp.parse(tagged_tokens)

    phrases = []
    for node in chunk_tree.subtrees():
        if node.label() != "NP":
            continue
        phrases.append(" ".join(token for token, _tag in node.leaves()))
    return phrases

In [None]:
# Noun phrases of each cleaned article, flattened into one space-separated string.
cyber["NP_chunks"] = cyber["Article_clean"].map(extract_nps).map(" ".join)

# Lemma string followed by the noun-phrase string, separated by one space.
cyber["enhanced_text"] = cyber["Article_lemma_str"] + " " + cyber["NP_chunks"]

preview_columns = ["Title", "Article_clean", "Article_lemma_str", "NP_chunks", "enhanced_text"]
cyber[preview_columns].head(3)

In [None]:
# Show the chunk tree parsed from the sample article's POS tags.
tree

In [None]:

# NOTE(review): these two assignments repeat, with the same inputs, the ones
# made in the cell that first creates NP_chunks; re-running costs a second
# tagging and chunking pass over every article. Confirm whether this cell is
# still needed.
cyber["NP_chunks"] = cyber["Article_clean"].apply(lambda x: " ".join(extract_nps(x)))

cyber["enhanced_text"] = cyber["Article_lemma_str"] + " " + cyber["NP_chunks"]


In [None]:
# Title, cleaned text, lemma string, noun-phrase string and their combination
# for the first three rows.
cyber[["Title", "Article_clean", "Article_lemma_str", "NP_chunks", "enhanced_text"]].head(3)

Sentiment Analysis

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

# Compound polarity score of each enhanced text; entries that are not strings
# get a score of 0.
cyber["sentiment"] = cyber["enhanced_text"].map(
    lambda text: 0 if not isinstance(text, str) else sia.polarity_scores(text)["compound"]
)

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

# Compound polarity score of each cleaned article, followed by its summary
# statistics.
cyber["sentiment"] = cyber["Article_clean"].map(
    lambda text: sia.polarity_scores(text)["compound"]
)
cyber["sentiment"].describe()

In [None]:
# Mean sentiment score per label, sorted from lowest to highest.
cyber.groupby("Label")["sentiment"].mean().sort_values()

Step 3.1 — Most Frequent Words

1. Import Counter

Bar Chart

# Recap

In [None]:
# The string below is a bare expression used as a note on how to read the
# score; it has no effect on the computation.
'''
+1.0 → Very positive
-1.0 → Very negative
~0.0 → Neutral'''

# NOTE(review): this is the same expression already used to fill `sentiment`
# from Article_clean in an earlier cell; recomputing is only needed if
# Article_clean changed in between. Confirm.
cyber["sentiment"] = cyber["Article_clean"].apply(lambda x: sia.polarity_scores(x)["compound"])

In [None]:
# Summary statistics of the sentiment scores.
cyber["sentiment"].describe()

In [None]:
# A cell only renders its LAST expression, so the first table used to be
# computed and thrown away. display() shows both.
display(cyber.nlargest(10, "sentiment")[["Title", "sentiment"]])   # most positive
display(cyber.nsmallest(10, "sentiment")[["Title", "sentiment"]])  # most negative

END OF EDA - JUDY