
# BBC News Classification — End-to-End ML & NLP Project

**Deliverables included in this single notebook:**
- Data loading & preprocessing (cleaning, stopwords, lemmatization)
- Baseline model (TF-IDF + Logistic Regression / Naive Bayes) with full evaluation
- NLP upgrade (DistilBERT fine-tuning with Hugging Face)
- Results comparison (metrics & confusion matrices)
- Ethics check (bias, risks, mitigations)
- Mini deployment: writes a `streamlit_app.py` you can run with `streamlit run streamlit_app.py`

> Tip: Use a GPU runtime for faster DistilBERT training. CPU works but is slower.


## 0) Setup & Installs

In [None]:

# Uncomment if needed (recommended in a fresh environment):
# !pip install --quiet pandas numpy scikit-learn matplotlib nltk tqdm seaborn
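# NOTE: the index URL below installs the CPU-only PyTorch build; for GPU training use the install command from pytorch.org instead.
# !pip install --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu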
# !pip install --quiet transformers datasets evaluate accelerate
# !pip install --quiet streamlit



## 1) Data Loading & Preprocessing

This notebook supports two dataset formats:
1. **CSV** at `data/bbc.csv` with columns: `text`, `category`  
2. **Folder-of-texts** at `data/bbc/<category>/*.txt`

We will:
- Clean text (lowercase, remove punctuation/stopwords, lemmatize)
- Split into train/test


In [None]:

import os, re, string, json, warnings
from pathlib import Path
import numpy as np
import pandas as pd

# Suppress library warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

# All dataset inputs live under ./data, which is created on first run.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# Two supported layouts: a single CSV file, or one sub-folder per category.
CSV_PATH, FOLDER_PATH = DATA_DIR / "bbc.csv", DATA_DIR / "bbc"
for candidate in (CSV_PATH, FOLDER_PATH):
    print("Looking for:", candidate.resolve())




### (Optional) Download helper
Paste a direct CSV URL if you have one and run the cell.


In [None]:

import urllib.request

# Leave empty to skip the download; otherwise the file is saved to CSV_PATH.
CSV_DIRECT_URL = ""  # e.g. "https://raw.githubusercontent.com/<user>/<repo>/main/bbc.csv"
if not CSV_DIRECT_URL:
    print("No CSV_DIRECT_URL provided.")
else:
    print("Downloading CSV ...")
    urllib.request.urlretrieve(CSV_DIRECT_URL, CSV_PATH)
    print("Saved to", CSV_PATH.resolve())


### Text Cleaning Utilities

In [None]:

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Probe each NLTK corpus first and download it only when the lookup fails.
try:
    _ = stopwords.words("english")
except LookupError:
    nltk.download("stopwords")

try:
    _ = nltk.corpus.wordnet.ensure_loaded()
except LookupError:
    for resource in ("wordnet", "omw-1.4"):
        nltk.download(resource)

# Shared by clean_text below: a set for fast membership tests, one lemmatizer instance.
LEMMATIZER = WordNetLemmatizer()
STOPWORDS = set(stopwords.words("english"))

def clean_text(s: str) -> str:
    """Normalize raw article text into a space-joined string of lemmas.

    Steps, in order: lowercase, blank out URLs (``http...`` / ``www...``),
    delete ASCII punctuation, keep only runs of ``a-z``, drop stopwords and
    single-letter tokens, then lemmatize what is left.

    Args:
        s: Raw text. Any non-string value (e.g. NaN) is treated as empty.

    Returns:
        The cleaned text, or ``""`` when nothing survives the filtering.
    """
    if not isinstance(s, str):
        return ""

    lowered = s.lower()
    without_urls = re.sub(r"http\S+|www\S+", " ", lowered)
    # Punctuation is deleted (not replaced by a space), so "well-known" -> "wellknown".
    punctuation_table = str.maketrans("", "", string.punctuation)
    stripped = without_urls.translate(punctuation_table)

    kept = []
    for word in re.findall(r"[a-z]+", stripped):
        if word in STOPWORDS or len(word) <= 1:
            continue
        kept.append(LEMMATIZER.lemmatize(word))
    return " ".join(kept)


### Load BBC Dataset (CSV or folder-of-texts)

In [None]:

def load_bbc_csv(csv_path: Path) -> pd.DataFrame:
    """Load the BBC dataset from a CSV file.

    Args:
        csv_path: Path to a CSV that contains at least the columns
            ``text`` and ``category``; any other columns are discarded.

    Returns:
        A frame with exactly the columns ``text`` and ``category``, with
        rows that have a missing value in either column removed.

    Raises:
        ValueError: If ``text`` or ``category`` is missing from the CSV.
    """
    df = pd.read_csv(csv_path)
    # Raise explicitly rather than assert: asserts are stripped under `python -O`,
    # which would turn a schema problem into a confusing KeyError below.
    if not {"text", "category"}.issubset(df.columns):
        raise ValueError("CSV must have 'text' and 'category' columns.")
    return df[["text", "category"]].dropna()

def load_bbc_folder(folder_path: Path) -> pd.DataFrame:
    """Load the BBC dataset from a ``<folder>/<category>/*.txt`` layout.

    Each immediate sub-directory name is used as the category label and each
    ``*.txt`` file inside it becomes one row. Non-directory entries at the
    top level and non-``.txt`` files are ignored.

    Args:
        folder_path: Root folder that contains one sub-folder per category.

    Returns:
        A frame with columns ``text`` and ``category``. It is empty, but
        still has both columns, when the folder is missing or holds no
        text files.
    """
    columns = ["text", "category"]
    if not folder_path.is_dir():
        return pd.DataFrame(columns=columns)

    rows = []
    # Sort both levels: directory listing order is filesystem-dependent, and the
    # seeded shuffle applied later is only reproducible if the input order is.
    for cat_dir in sorted(folder_path.iterdir()):
        if not cat_dir.is_dir():
            continue
        label = cat_dir.name
        for txt in sorted(cat_dir.glob("*.txt")):
            try:
                text = txt.read_text(encoding="latin-1")
            except (OSError, UnicodeError):
                # Best-effort retry with the platform default encoding.
                text = txt.read_text(errors="ignore")
            rows.append({"text": text, "category": label})
    # Passing `columns` keeps the schema stable even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

# Prefer the single-file CSV; fall back to the folder-of-texts layout when it is absent.
df = load_bbc_csv(CSV_PATH) if CSV_PATH.exists() else load_bbc_folder(FOLDER_PATH)

# Quick sanity check of what was loaded.
print("Raw dataset shape:", df.shape)
display(df.head())



### Apply Cleaning

In [None]:

# Shuffle once with a fixed seed, then renumber rows 0..n-1.
shuffled = df.sample(frac=1.0, random_state=42)
df = shuffled.reset_index(drop=True)

# Derived column consumed by the TF-IDF baselines; the raw text is kept alongside it.
df["clean_text"] = df["text"].map(clean_text)
print("After cleaning:")
display(df.head())

print("\nCategory counts:")
category_counts = df["category"].value_counts()
display(category_counts)


## 2) Train / Test Split

In [None]:

from sklearn.model_selection import train_test_split

# Features are the cleaned texts; targets are the category names.
X, y = df["clean_text"].values, df["category"].values

# 80/20 split, seeded and stratified so both parts keep the class proportions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
len(X_train), len(X_test)


## 3) Baseline Model — TF‑IDF + (Logistic Regression / Naive Bayes)

In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

def _tfidf_pipeline(classifier):
    """Return a TF-IDF (unigrams + bigrams, 10k features) pipeline ending in *classifier*."""
    return Pipeline([
        ("tfidf", TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
        ("clf", classifier),
    ])


pipe_lr = _tfidf_pipeline(LogisticRegression(max_iter=2000))
pipe_nb = _tfidf_pipeline(MultinomialNB())

models = {"LogReg": pipe_lr, "NaiveBayes": pipe_nb}
metrics_baseline = {}

# Fit every candidate on the training split and score it on the test split.
for name, mdl in models.items():
    preds = mdl.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, preds)
    # Weighted averages, so each class contributes in proportion to its support.
    p, r, f1, _ = precision_recall_fscore_support(
        y_test, preds, average="weighted", zero_division=0
    )
    metrics_baseline[name] = {"accuracy": acc, "precision": p, "recall": r, "f1": f1}
    print(f"\n=== {name} Classification Report ===")
    print(classification_report(y_test, preds, zero_division=0))

# One row per model, best weighted F1 first.
baseline_df = pd.DataFrame(metrics_baseline).T.sort_values("f1", ascending=False)
display(baseline_df)

best_baseline_name = baseline_df.index[0]
best_baseline = models[best_baseline_name]
print("Best baseline:", best_baseline_name)

# Persist the winning pipeline (vectorizer + classifier) for the Streamlit app.
MODEL_DIR = Path("models")
MODEL_DIR.mkdir(exist_ok=True, parents=True)
joblib.dump(best_baseline, MODEL_DIR / "baseline_pipeline.joblib")
print("Saved baseline to models/baseline_pipeline.joblib")


### Confusion Matrix (Best Baseline)

In [None]:

# Predictions of the selected baseline on the held-out split.
y_pred_base = best_baseline.predict(X_test)

# Fix the class order so both heatmap axes share the same sorted labels.
labels = sorted(pd.unique(y_test))
cm = confusion_matrix(y_test, y_pred_base, labels=labels)

plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt="d", xticklabels=labels, yticklabels=labels)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title(f"Confusion Matrix — {best_baseline_name}")
plt.show()


## 4) NLP Upgrade — DistilBERT Fine‑Tuning (Hugging Face)

In [None]:

import inspect
import os

import evaluate
import joblib as jl
import numpy as np
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Prepare raw texts/labels (use the original 'text' to keep full context)
le = LabelEncoder()
labels_all = le.fit_transform(df["category"].values)
# Plain int/str values so the mappings serialize cleanly into the model config.
id2label = {int(i): str(lab) for i, lab in enumerate(le.classes_)}
label2id = {lab: i for i, lab in id2label.items()}

df_all = pd.DataFrame({"text": df["text"].astype(str), "label": labels_all})

# Hold out the test rows with the same settings as the baseline split in
# Section 2 (20%, seed 42, stratified by class), which should select the same
# rows so both model families are compared on one test set.
row_positions = np.arange(len(df_all))
trainval_pos, test_pos = train_test_split(
    row_positions, test_size=0.2, random_state=42, stratify=labels_all
)
# Carve a validation split out of the remaining rows. Checkpoint selection
# (load_best_model_at_end) must not look at the test rows, otherwise the
# reported test score is optimistically biased.
train_pos, val_pos = train_test_split(
    trainval_pos, test_size=0.1, random_state=42, stratify=labels_all[trainval_pos]
)
split_names = np.full(len(df_all), "train", dtype=object)
split_names[val_pos] = "val"
split_names[test_pos] = "test"
df_all["split"] = split_names


def split_to_dataset(split_name):
    """Return the rows of df_all tagged *split_name* as a Hugging Face Dataset."""
    rows = df_all.loc[df_all["split"] == split_name, ["text", "label"]]
    return Dataset.from_pandas(rows, preserve_index=False)


hf_train = split_to_dataset("train")
hf_val = split_to_dataset("val")
hf_test = split_to_dataset("test")

checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_fn(batch):
    """Tokenize a batch of texts, truncating to the model's maximum length."""
    return tokenizer(batch["text"], truncation=True)

tok_train = hf_train.map(tokenize_fn, batched=True, remove_columns=["text"])
tok_val   = hf_val.map(tokenize_fn, batched=True, remove_columns=["text"])
tok_test  = hf_test.map(tokenize_fn, batched=True, remove_columns=["text"])

# Pads each batch to its own longest sequence instead of a global maximum.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

metric_acc = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a (logits, labels) evaluation pair."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1": metric_f1.compute(predictions=preds, references=labels, average="weighted")["f1"]
    }

num_labels = len(le.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=num_labels, id2label=id2label, label2id=label2id
)

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
# pick whichever keyword the installed version accepts.
ta_params = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_kw = "eval_strategy" if "eval_strategy" in ta_params else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir="hf_runs",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    **{eval_strategy_kw: "epoch"},
)

# Likewise, `tokenizer=` was superseded by `processing_class=` on Trainer.
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kw = "processing_class" if "processing_class" in trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_train,
    eval_dataset=tok_val,  # per-epoch evaluation / best-checkpoint selection
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{tokenizer_kw: tokenizer},
)

trainer.train()

# Final score on the held-out test split, which played no part in training
# or checkpoint selection. Keys keep the default "eval_" prefix.
eval_metrics = trainer.evaluate(eval_dataset=tok_test)
print("DistilBERT eval:", eval_metrics)

# Save model + label encoder
DISTIL_DIR = Path("models/distilbert")
DISTIL_DIR.mkdir(exist_ok=True, parents=True)
trainer.save_model(DISTIL_DIR.as_posix())
tokenizer.save_pretrained(DISTIL_DIR.as_posix())
jl.dump(le, DISTIL_DIR / "label_encoder.joblib")
print("Saved DistilBERT model to", DISTIL_DIR.resolve())


## 5) Results Comparison

In [None]:

# Build a comparison table using baseline_df (from earlier) and DistilBERT eval
from sklearn.metrics import precision_recall_fscore_support

# Precision/recall are not part of the Trainer's compute_metrics, so derive
# them from test-set predictions, with the same weighted averaging that the
# baseline table uses.
distil_output = trainer.predict(tok_test)
distil_preds = np.argmax(distil_output.predictions, axis=-1)
distil_precision, distil_recall = precision_recall_fscore_support(
    distil_output.label_ids, distil_preds, average="weighted", zero_division=0
)[:2]

distil_metrics = {
    "accuracy": eval_metrics.get("eval_accuracy", np.nan),
    "precision": float(distil_precision),
    "recall": float(distil_recall),
    "f1": eval_metrics.get("eval_f1", np.nan)
}
comparison = baseline_df.copy()
# Assign as a Series so the values are aligned to the columns by name.
comparison.loc["DistilBERT"] = pd.Series(distil_metrics)
display(comparison)


### Confusion Matrix — DistilBERT

In [None]:

# Build predictions with the trained DistilBERT model to calculate confusion matrix
from transformers import TextClassificationPipeline
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Score the whole test split in batches through the Trainer. This runs on the
# device the model was trained on and yields integer class ids directly, so no
# label-string parsing (and no silent fallback to class 0) is needed.
test_output = trainer.predict(tok_test)
pred_labels = np.argmax(test_output.predictions, axis=-1)
# Labels returned alongside the predictions are in the same row order.
true_labels = test_output.label_ids

# Integer ids index into le.classes_, which supplies the axis tick labels.
labels_order = list(range(len(le.classes_)))
cm = confusion_matrix(true_labels, pred_labels, labels=labels_order)

plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt="d", xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel("Predicted"); plt.ylabel("True"); plt.title("Confusion Matrix — DistilBERT")
plt.show()



## 6) Ethics Check

- **Potential bias in training data**
  - Category imbalance can over-represent certain topics, leading to skewed predictions.
  - Language style bias (e.g., formal BBC tone) may not generalize to informal texts or different dialects.
  - Temporal bias: older articles reflect outdated contexts; topics evolve.
- **Risks of misclassification**
  - Misinforming users if a sensitive news topic is mislabeled (e.g., politics vs. entertainment).
  - Downstream systems (recommenders/moderation) may amplify errors.
  - Erosion of trust if categories appear inconsistent.
- **Mitigations**
  - Use **stratified splits** and monitor **per-class metrics** (precision/recall) not just accuracy.
  - Apply **data augmentation** or **class weighting** for minority classes.
  - Perform **human-in-the-loop** review for edge cases; log predictions for auditability.
  - Provide **model cards** and clear **confidence scores**; set thresholds and route low-confidence predictions to an abstain/“uncertain” path.


## 7) Mini Deployment — Streamlit App Writer

In [None]:

from pathlib import Path
import textwrap

# Source of the generated Streamlit app. It is a raw string so the regex
# backslashes inside it reach the written file unchanged.
#
# The baseline pipeline was fit on clean_text() output, so the app carries its
# own copy of that cleaning step and applies it before predicting; feeding raw
# text would not match what the vectorizer saw during training. DistilBERT was
# fine-tuned on raw text and therefore receives the input as typed.
app_code = textwrap.dedent(r'''
import re
import string
from pathlib import Path

import joblib
import nltk
import numpy as np
import streamlit as st
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

st.set_page_config(page_title="BBC News Classifier", page_icon="📰", layout="centered")
st.title("📰 BBC News Classifier")
st.write("Enter a news article and get the predicted category with a confidence score.")

BASELINE_PATH = Path("models/baseline_pipeline.joblib")
DISTIL_DIR = Path("models/distilbert")

# Ensure NLTK resources (same checks as the training notebook)
try:
    stopwords.words("english")
except LookupError:
    nltk.download("stopwords")

try:
    nltk.corpus.wordnet.ensure_loaded()
except LookupError:
    nltk.download("wordnet")
    nltk.download("omw-1.4")

STOPWORDS = set(stopwords.words("english"))
LEMMATIZER = WordNetLemmatizer()


def clean_text(s):
    """Apply the same normalization the baseline model was trained on."""
    if not isinstance(s, str):
        return ""
    s = s.lower()
    s = re.sub(r"http\S+|www\S+", " ", s)
    s = s.translate(str.maketrans("", "", string.punctuation))
    tokens = re.findall(r"[a-z]+", s)
    tokens = [t for t in tokens if t not in STOPWORDS and len(t) > 1]
    return " ".join(LEMMATIZER.lemmatize(t) for t in tokens)


mode = st.radio("Model", ["Baseline (TF-IDF + LR/NB)", "DistilBERT"], horizontal=True)

text = st.text_area("Paste news text here:", height=200)

def softmax(x):
    e = np.exp(x - np.max(x))
    return e / e.sum()

if st.button("Predict") and text.strip():
    if mode == "Baseline (TF-IDF + LR/NB)":
        if BASELINE_PATH.exists():
            pipe = joblib.load(BASELINE_PATH)
            cleaned = clean_text(text)
            pred = pipe.predict([cleaned])[0]
            # Report the top class probability when the classifier exposes one
            conf = None
            if hasattr(pipe[-1], "predict_proba"):
                proba = pipe.predict_proba([cleaned])[0]
                conf = float(np.max(proba))
            st.success(f"**Prediction:** {pred}")
            if conf is not None:
                st.write(f"**Confidence:** {conf:.3f}")
        else:
            st.error("Baseline model not found. Train and save it by running the notebook cells.")
    else:
        try:
            from transformers import AutoTokenizer, AutoModelForSequenceClassification
            import torch

            if DISTIL_DIR.exists():
                tok = AutoTokenizer.from_pretrained(DISTIL_DIR.as_posix())
                mdl = AutoModelForSequenceClassification.from_pretrained(DISTIL_DIR.as_posix())
                enc = tok([text], truncation=True, return_tensors="pt")
                with torch.no_grad():
                    out = mdl(**enc).logits[0].numpy()
                probs = softmax(out)
                idx = int(np.argmax(probs))
                # Load label encoder to map index->class string
                le = joblib.load(DISTIL_DIR / "label_encoder.joblib")
                label = le.classes_[idx]
                st.success(f"**Prediction:** {label}")
                st.write(f"**Confidence:** {probs[idx]:.3f}")
            else:
                st.error("DistilBERT model not found. Train and save it by running the notebook cells.")
        except Exception as e:
            st.exception(e)
''')

Path("streamlit_app.py").write_text(app_code, encoding="utf-8")
print("Wrote streamlit_app.py")
