# Naive Bayes Classifier for News Article Classification

This notebook uses the Naive Bayes classifier from the scikit-learn library to classify text into news categories.

## Prerequisites: Installing Required Libraries (if needed)

If the libraries are not installed yet, run the following in a terminal:
```
pip install scikit-learn pandas numpy seaborn matplotlib
```

## Setup: Importing Libraries


In [None]:
import json
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.utils.multiclass import unique_labels

# Notebook-wide plot defaults: seaborn's "whitegrid" style plus a default
# figure size and font size for every matplotlib figure created below.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

print("Libraries imported!")


## Step 1: Loading Data


In [None]:
# Path of the labeled dataset, relative to the notebook's working directory.
file_path = 'news/labeled_1000.json'

# The preparation step below iterates over `data` and reads article['text'] and
# article['label'], so the file must hold a JSON array of article objects.
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Fail early with a clear message rather than an IndexError/KeyError on data[0].
if not isinstance(data, list) or not data:
    raise ValueError(f"Expected a non-empty JSON array of articles in {file_path!r}")

print(f"Number of articles: {len(data)}")
print(f"Sample article: {data[0]}")


## Step 2: Preparing Training and Test Data


In [None]:
# Pull the inputs and the targets out of the article records as two parallel lists.
texts = [article['text'] for article in data]
labels = [article['label'] for article in data]

# Hold out 20% of the articles for testing. The split is stratified on the
# labels and uses a fixed random_state so it is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

train_size = len(X_train)
test_size_count = len(X_test)
train_distribution = Counter(y_train)

print(f"Training set: {train_size} articles")
print(f"Test set: {test_size_count} articles")
print(f"Categories in training set: {train_distribution}")


## Step 3: Creating and Training the Naive Bayes Classifier


In [None]:
# Bag-of-words counts followed by multinomial Naive Bayes, wrapped in a single
# Pipeline so that fit() and predict() take raw text directly.
classifier = Pipeline([
    ('vectorizer', CountVectorizer(
        lowercase=True,
        # NOTE(review): only tokens made purely of ASCII letters a-z are kept, so
        # words containing digits or accented/non-Latin letters are discarded.
        # Confirm this matches the language of the dataset.
        token_pattern=r'\b[a-z]+\b',
        min_df=2,            # drop words that appear in fewer than 2 training documents
        max_features=10000   # cap the vocabulary at the 10,000 most frequent words
    )),
    ('classifier', MultinomialNB(alpha=1.0))  # alpha=1.0 is add-one (Laplace) smoothing
])

print("="*80)
print("NAIVE BAYES CLASSIFIER FROM SCIKIT-LEARN")
print("="*80)
print("\nTraining classifier...")

classifier.fit(X_train, y_train)

# The check mark was stored as mis-decoded UTF-8 ("âœ“"); restored to the intended character.
print("✓ Trained!")
print(f"Number of features (words): {len(classifier.named_steps['vectorizer'].vocabulary_)}")


## Step 4: Prediction and Evaluation


In [None]:
print("Predicting categories for test set...")
predictions = classifier.predict(X_test)

# accuracy_score returns a fraction; it is kept as a percentage for display.
accuracy = accuracy_score(y_test, predictions) * 100

print(f"\n{'='*80}")
print(f"ACCURACY: {accuracy:.2f}%")
print(f"{'='*80}")

print("\nDetailed classification report:")
print("="*80)
report = classification_report(y_test, predictions, output_dict=True, zero_division=0)

# The report dict holds one entry per category plus these aggregate entries,
# which are left out of the per-category table.
summary_keys = {'accuracy', 'macro avg', 'weighted avg'}
metric_columns = ['Precision', 'Recall', 'F1-Score']

# Store the metrics as floats. Rounding to 4 decimals is purely a display
# concern and is handled by the Styler format below, so no string round trip
# (format -> pd.to_numeric) is needed and no precision is lost in `df`.
results = [
    {
        'Category': label,
        'Precision': float(report[label]['precision']),
        'Recall': float(report[label]['recall']),
        'F1-Score': float(report[label]['f1-score']),
    }
    for label in sorted(report)
    if label not in summary_keys
]

df = pd.DataFrame(results)

styled_df = (
    df.style
    .format({column: '{:.4f}' for column in metric_columns})
    # Fixed vmin/vmax so the colour scale always spans the full 0..1 metric range.
    .background_gradient(subset=metric_columns, cmap='YlGnBu', vmin=0, vmax=1)
    .set_caption(f'Overall Accuracy: {accuracy:.2f}%')
    .set_table_styles([
        {'selector': 'caption', 'props': [('font-size', '14px'), ('font-weight', 'bold'), ('text-align', 'center')]},
        {'selector': 'th', 'props': [('background-color', '#4472C4'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]},
        {'selector': 'td', 'props': [('text-align', 'center')]}
    ])
)

display(styled_df)
print("="*80)


## Step 5: Confusion Matrix Visualization


In [None]:
# Categories shown in the matrix: every class that occurs in the test labels or
# in the predictions. Kept under its own name so the `labels` list built for
# the train/test split is not overwritten with a different meaning.
categories = list(unique_labels(y_test, predictions))
cm = confusion_matrix(y_test, predictions, labels=categories)

plt.figure(figsize=(14, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=categories,
    yticklabels=categories,
    cbar_kws={'label': 'Number of samples'},
    linewidths=0.5,
    linecolor='gray'
)
plt.title('Confusion Matrix - News Article Classification', fontsize=16, pad=20)
plt.xlabel('Predicted Category', fontsize=12)
plt.ylabel('Actual Category', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

print("\nDetailed statistics for each category:")
print("="*80)
for i, category in enumerate(categories):
    tp = cm[i, i]               # diagonal: predicted as this category and correct
    fp = cm[:, i].sum() - tp    # rest of column i: predicted as this category, actually another
    fn = cm[i, :].sum() - tp    # rest of row i: actually this category, predicted as another
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    # TP / (TP + FP + FN) is the Jaccard index of the category, not its accuracy
    # (true negatives are not part of the ratio), so it is reported under that name.
    jaccard = tp / (tp + fp + fn) if (tp + fp + fn) > 0 else 0

    print(f"\n{category}:")
    print(f"  Correct predictions: {tp}")
    print(f"  Incorrect predictions: {fp + fn}")
    print(f"  Precision: {precision*100:.2f}%")
    print(f"  Recall: {recall*100:.2f}%")
    print(f"  Jaccard index (TP / (TP + FP + FN)): {jaccard*100:.2f}%")
print("="*80)
