In [None]:
# Import core libraries for data handling and NLP
import re

import nltk
import pandas as pd
import spacy

# Import tools for tokenization, stopwords, stemming, lemmatization and n-grams
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

In [None]:
# Load the BBC news dataset from a CSV file
bbc_data = pd.read_csv("bbc_news.csv")

# Display the first few rows of the dataset.
# Only the final expression of a cell is rendered automatically, so a bare
# `bbc_data.head()` in the middle of the cell would be silently discarded.
# display() (available in an IPython kernel without an import) renders it
# explicitly.
display(bbc_data.head())

# Display dataset structure and data types (info() prints its report itself)
bbc_data.info()

In [None]:
# Build a standalone one-column DataFrame holding only the article titles.
# The explicit copy keeps later column assignments on `titles` independent
# of `bbc_data`.
titles = bbc_data[["title"]].copy()

# Quick look at the first rows
titles.head()

In [None]:
# ---------------------------------------------
# Convert all titles to lowercase
# This ensures consistency (e.g., "War" and "war" are treated the same)
# Missing titles (NaN) are replaced by "" first; otherwise .str.lower()
# would keep them as NaN and the .split() call below would fail on a float
# ---------------------------------------------
titles["titles_lowercase"] = titles["title"].fillna("").str.lower()

# ---------------------------------------------
# Load English stopwords from NLTK
# - The corpus is fetched first so the cell also works in an environment
#   where it has not been downloaded yet
# - `eng_stopwords` stays a list; the set copy is only used for fast
#   membership tests
# ---------------------------------------------
nltk.download("stopwords", quiet=True)
eng_stopwords = stopwords.words("english")
eng_stopwords_set = set(eng_stopwords)

# ---------------------------------------------
# Remove stopwords from the lowercase titles
# - Split each title into words
# - Keep only words NOT in the stopword set
# - Join them back into a single string
# ---------------------------------------------
titles["review_no_stopwords"] = titles["titles_lowercase"].apply(
    lambda x: " ".join(
        word for word in x.split() if word not in eng_stopwords_set
    )
)

# ---------------------------------------------
# Remove punctuation using regular expressions
# [^\w\s] matches anything that is NOT a word character or whitespace
# (the underscore counts as a word character, so it is kept)
# The pattern is compiled once and reused for every title
# ---------------------------------------------
punct_pattern = re.compile(r"[^\w\s]")
titles["review_no_stopwords_no_punct"] = titles["review_no_stopwords"].apply(
    lambda text: punct_pattern.sub("", text)
)

# ---------------------------------------------
# Tokenize the cleaned text
# This converts each title into a list of words
# word_tokenize needs NLTK's Punkt tokenizer data. Depending on the NLTK
# release the resource is named "punkt" or "punkt_tab", so both are
# requested; an id unknown to the installed release only logs an error.
# ---------------------------------------------
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
titles["tokenized"] = titles["review_no_stopwords_no_punct"].apply(word_tokenize)


# ---------------------------------------------
# Initialize the Porter Stemmer
# (PorterStemmer, WordNetLemmatizer and nltk come from the import cell at
# the top of the notebook)
# ---------------------------------------------
ps = PorterStemmer()

# ---------------------------------------------
# Apply stemming to each token in every title
# ---------------------------------------------
titles["stemmed"] = titles["tokenized"].apply(
    lambda tokens: [ps.stem(word) for word in tokens]
)

# ---------------------------------------------
# Download WordNet data (required for lemmatization)
# Some NLTK releases reportedly also need the "omw-1.4" resource when
# WordNet is first loaded, so it is requested too - TODO confirm for the
# pinned NLTK version
# ---------------------------------------------
nltk.download("wordnet")
nltk.download("omw-1.4", quiet=True)

# ---------------------------------------------
# Initialize the WordNet Lemmatizer
# ---------------------------------------------
lemmatizer = WordNetLemmatizer()

# ---------------------------------------------
# Apply lemmatization to the stemmed tokens
# Note: Lemmatization is applied word by word
# NOTE(review): the input here is the *stemmed* column. Porter stems are
# often not dictionary words, so the lemmatizer will presumably return many
# of them unchanged. Consider lemmatizing titles["tokenized"] instead;
# left as-is because it would change the downstream counts.
# ---------------------------------------------
titles["lemmatized"] = titles["stemmed"].apply(
    lambda tokens: [lemmatizer.lemmatize(word) for word in tokens]
)

# ---------------------------------------------
# The lemmatized column already holds token lists, so it is reused
# directly under the "clean" name (no second tokenization pass)
# ---------------------------------------------
titles["lemmatized_clean"] = titles["lemmatized"]

# ---------------------------------------------
# Combine all raw tokens into one list
# A nested comprehension visits every token once; sum(lists, []) would
# rebuild the growing list for each title (quadratic in the corpus size)
# ---------------------------------------------
tokens_raw_list = [token for tokens in titles["tokenized"] for token in tokens]
print("First 10 raw tokens:")
print(tokens_raw_list[:10])

# ---------------------------------------------
# Combine all cleaned + lemmatized tokens into one list
# ---------------------------------------------
tokens_clean_list = [
    token for tokens in titles["lemmatized_clean"] for token in tokens
]
print("\nFirst 10 cleaned & lemmatized tokens:")
print(tokens_clean_list[:10])

# ---------------------------------------------
# View the processed DataFrame
# ---------------------------------------------
titles.head()


In [None]:
# ---------------------------------------------
# N-gram frequencies over the lemmatized titles
# (ngrams and pd come from the import cell at the top of the notebook)
# ---------------------------------------------

# ---------------------------------------------
# Flatten all lemmatized tokens into one list
# This combines tokens from every title in a single linear pass
# ---------------------------------------------
lemmatized_clean = [
    token for tokens in titles["lemmatized_clean"] for token in tokens
]

# ---------------------------------------------
# Generate unigrams (n = 1)
# ---------------------------------------------
unigrams = list(ngrams(lemmatized_clean, 1))
unigram_series = pd.Series(unigrams).value_counts()

# ---------------------------------------------
# Generate bigrams (n = 2)
# Bigrams are built title by title. Building them from the flattened list
# would also pair the last token of one title with the first token of the
# next, counting word pairs that never occur together in any headline.
# ---------------------------------------------
bigrams = [
    bigram
    for tokens in titles["lemmatized_clean"]
    for bigram in ngrams(tokens, 2)
]
bigram_series = pd.Series(bigrams).value_counts()

# ---------------------------------------------
# Display the results
# ---------------------------------------------
print("Top Unigrams:")
print(unigram_series.head(10))

print("\nTop Bigrams:")
print(bigram_series.head(10))


In [None]:
# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")

# Convert token list into a single string
# NOTE(review): the whole corpus is parsed as one document. spaCy enforces a
# character limit per text (nlp.max_length), so a large dataset may need to
# be processed per title instead - confirm against the dataset size.
text_for_pos = " ".join(tokens_raw_list)

# Create a spaCy document
doc = nlp(text_for_pos)

# Collect one record per token, then build the DataFrame in a single step.
# Appending one-row frames with pd.concat inside a loop copies the whole
# frame on every iteration and starts from an empty frame.
pos_records = [{"token": token.text, "pos_tag": token.pos_} for token in doc]
pos_df = pd.DataFrame(pos_records, columns=["token", "pos_tag"])

# Display the first 15 tokens and their POS tags
pos_df.head(15)

In [None]:
# --------------------------------
# Frequency table of (token, POS tag) pairs, most frequent first
# --------------------------------
token_pos_groups = pos_df.groupby(["token", "pos_tag"])
pair_frequencies = token_pos_groups.size().reset_index(name="counts")
pos_df_counts = pair_frequencies.sort_values("counts", ascending=False)

# Preview the ten highest-count pairs
pos_df_counts.head(10)

In [None]:
def _first_ten_for_tag(tag):
    """Return the first 10 rows of ``pos_df_counts`` whose ``pos_tag`` equals ``tag``.

    ``pos_df_counts`` is sorted by ``counts`` in descending order when it is
    built, so these are the most frequent tokens carrying that tag.
    """
    tag_mask = pos_df_counts["pos_tag"] == tag
    return pos_df_counts[tag_mask].head(10)


# --------------------------------
# Top 10 most frequent nouns
# --------------------------------
top_nouns = _first_ten_for_tag("NOUN")
print(top_nouns)

# --------------------------------
# Top 10 most frequent verbs
# --------------------------------
top_verbs = _first_ten_for_tag("VERB")
print(top_verbs)

# --------------------------------
# Top 10 most frequent adjectives
# --------------------------------
top_adj = _first_ten_for_tag("ADJ")
print(top_adj)

In [None]:
# Extract named entities from the spaCy document.
# One record is collected per entity that has a non-missing label, and the
# DataFrame is built once at the end. Growing it with pd.concat inside the
# loop would copy the whole frame for every entity.
ner_records = [
    {"token": ent.text, "ner_tag": ent.label_}
    for ent in doc.ents
    if not pd.isna(ent.label_)
]

# Passing the columns explicitly keeps both columns present even when no
# entity was found
ner_df = pd.DataFrame(ner_records, columns=["token", "ner_tag"])

ner_df.head()

In [None]:
# Frequency table of (entity text, entity label) pairs, most frequent first
entity_groups = ner_df.groupby(["token", "ner_tag"])
entity_frequencies = entity_groups.size().reset_index(name="counts")
ner_counts = entity_frequencies.sort_values("counts", ascending=False)

# Preview the ten highest-count named entities
ner_counts.head(10)