In [1]:
%load_ext autoreload
%autoreload 2

# Load Dataset

In [2]:
import json
import nltk
from sklearn.model_selection import train_test_split

# load dataset & get all unique words in the category
total_dataset = []
with open("News_Category_Dataset_v3.json") as f:
  for data in f:
    total_dataset.append(json.loads(data))
      
stemmer = nltk.SnowballStemmer("english")
x = []
for d in total_dataset:
    text = f"{d['headline']} {d['short_description']}".lower()
    text = " ".join(stemmer.stem(word) for word in text.split(" "))
    x.append(text)
y = [d['category'] for d in total_dataset]

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)
print(f"Training Samples: {len(x_train)}")
print(f"Validation Samples: {len(x_val)}")
print(f"Categories: {len(set(y_train))}")

Training Samples: 167621
Validation Samples: 41906
Categories: 42


# Naive Bayes Classifier

### Training

In [3]:
from collections import defaultdict

# Training priors: log P(c)
all_cats = list(set(y_train))
prior = {cat: 0 for cat in all_cats}
for cat in y_train:
    prior[cat] += 1
# Convert to log space
for cat in prior:
    prior[cat] = prior[cat] / len(y_train)

# Create vocabulary and initialize likelihoods with Laplace smoothing
vocabulary = set()
for text in x_train:
    vocabulary.update(text.split())
print(len(vocabulary))

# Training likelihoods: log P(d|c)
likelihood = {cat: defaultdict(lambda: 1) for cat in all_cats}  # Laplace smoothing
word_counts = {cat: len(vocabulary) for cat in all_cats}  # Initialize counts with smoothing

# Count occurrences
for text, cat in zip(x_train, y_train):
    for word in text.split():
        likelihood[cat][word] += 1
        word_counts[cat] += 1

# Convert to log probabilities
for cat in likelihood:
    for word in likelihood[cat]:
        likelihood[cat][word] = likelihood[cat][word] / word_counts[cat]

171330


In [5]:
params = len(prior) + len(vocabulary) + len(likelihood)

print(f"Number of Parameters: {params}")

Number of Parameters: 171414


### Testing

In [4]:
import math

pred = []
for val_text in x_val:
    # Initialize log probabilities with priors for each category
    log_probs = {cat: math.log(prior[cat]) for cat in all_cats}
    
    # Add log likelihoods for each word
    for word in val_text.split():
        for cat in all_cats:
            # Skip if word not in vocabulary (handled by smoothing during training)
            if word in likelihood[cat]:
                log_probs[cat] += math.log(likelihood[cat][word]) # Already in log space
            else:
                log_probs[cat] += math.log(1.0/len(vocabulary)) # Already in log space 
    
    # Get category with highest probability
    pred.append(max(log_probs, key=log_probs.get))

# Calculate accuracy
accuracy = sum(1 for p, y in zip(pred, y_val) if p == y) / len(y_val)
print(f"Naive Bayes Classifier Accuracy: {accuracy * 100:.2f}%")

Naive Bayes Classifier Accuracy: 43.99%
