# 🔬 BBC News Pipeline Comparison

**Compare feature extraction, dimensionality reduction, and classifier combinations**

- **Dataset:** BBC News (2225 articles, 5 categories)
- **Pipeline:** Feature Extractor → Dimensionality Reducer → Classifier
- **Goal:** Find the optimal combination for text classification

---

**Source:** [AI Learning Hub](https://ltsach.github.io/AILearningHub/)


In [None]:
# Install the third-party packages this notebook imports (-q keeps pip's output quiet)
!pip install -q scikit-learn pandas numpy plotly

In [None]:
import time
import urllib.request
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

## 📥 Download Dataset

We'll download the BBC News dataset from GitHub Pages.

In [None]:
def download_bbc_news():
    """Download the BBC News train/test CSV files unless they are already cached.

    Files are stored in a ``bbc_data`` directory relative to the current
    working directory. A file that already exists there is not fetched again.
    Each download is written to a temporary ``<name>.part`` file and renamed
    only after it completes, so an interrupted transfer never leaves a
    truncated CSV that a later run would mistake for a finished download.

    Returns:
        Path: The directory containing ``train.csv`` and ``test.csv``.

    Raises:
        OSError: Propagated from ``urllib.request.urlretrieve`` (for example
            ``urllib.error.URLError``) or from the file system when a
            download or rename fails.
    """
    base_url = 'https://ltsach.github.io/AILearningHub/datasets/bbcnews/data/'
    files = ['train.csv', 'test.csv']

    data_dir = Path('bbc_data')
    data_dir.mkdir(exist_ok=True)

    for filename in files:
        filepath = data_dir / filename
        if filepath.exists():
            print(f"✓ {filename} already exists")
            continue

        print(f"⬇️  Downloading {filename}...")
        partial_path = data_dir / (filename + '.part')
        try:
            urllib.request.urlretrieve(base_url + filename, partial_path)
            # Rename only after the transfer finished without raising.
            partial_path.replace(filepath)
        finally:
            # Still present only if the download or rename failed.
            if partial_path.exists():
                partial_path.unlink()
        print(f"✅ Downloaded {filename}")

    return data_dir

# Make sure the CSV files are available locally
data_dir = download_bbc_news()

# Read both splits into DataFrames
train_df, test_df = (
    pd.read_csv(data_dir / csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Pull the raw text and its label out of each split as NumPy arrays
X_train, y_train = (train_df[column].values for column in ('text', 'category'))
X_test, y_test = (test_df[column].values for column in ('text', 'category'))

# Short summary: split sizes and the distinct training labels
print(
    f"\n📊 Dataset Info:",
    f"   Train: {len(X_train)} samples",
    f"   Test: {len(X_test)} samples",
    f"   Classes: {sorted(set(y_train))}",
    sep="\n",
)

## 🔧 Pipeline Training Function

In [None]:
def _to_dense(matrix):
    """Return *matrix* as a dense array, converting it when it exposes ``toarray``."""
    return matrix.toarray() if hasattr(matrix, 'toarray') else matrix


def train_pipeline(X_train, X_test, y_train, y_test,
                   extractor, extractor_name,
                   reducer, reducer_name,
                   classifier, classifier_name):
    """Fit one extractor → reducer → classifier pipeline and score it on the test set.

    Args:
        X_train, X_test: Raw inputs passed to ``extractor``.
        y_train, y_test: Labels for the two splits.
        extractor: Object with ``fit_transform`` / ``transform``.
        extractor_name: Display name of the extractor.
        reducer: Object with ``fit_transform`` / ``transform``, or ``None``
            to skip dimensionality reduction.
        reducer_name: Display name of the reducer. The name ``'Chi²'``
            selects the supervised path (labels are passed to the reducer).
        classifier: Object with ``fit`` / ``predict``.
        classifier_name: Display name of the classifier. The name
            ``'Naive Bayes'`` triggers conversion to dense, non-negative input.

    Returns:
        dict: On success, a record with ``status == 'success'`` plus the
        pipeline and component names, ``accuracy``, ``train_time`` (seconds
        for extraction + reduction + classifier fit), ``inference_ms``
        (milliseconds per test sample), ``features_before`` /
        ``features_after`` and ``confusion_matrix``. If any step raises, the
        error is printed and a record with ``status == 'failed'``, the
        pipeline name and the error message is returned instead.
    """
    pipeline_name = f"{extractor_name} → {reducer_name} → {classifier_name}"
    print(f"🔬 {pipeline_name}", end=" ")

    try:
        # Feature extraction
        start = time.perf_counter()
        X_train_vec = extractor.fit_transform(X_train)
        X_test_vec = extractor.transform(X_test)
        extract_time = time.perf_counter() - start

        features_before = X_train_vec.shape[1]

        # Dimensionality reduction
        reduce_time = 0
        if reducer is not None:
            start = time.perf_counter()
            # Text vectorizers return sparse matrices. Older scikit-learn
            # releases reject sparse input to PCA, so hand every reducer a
            # dense array instead of letting those pipelines fail.
            X_train_vec = _to_dense(X_train_vec)
            X_test_vec = _to_dense(X_test_vec)
            if reducer_name == 'Chi²':
                # The chi2 score function requires non-negative features
                # and needs the labels to rank them.
                X_train_vec = np.abs(X_train_vec)
                X_test_vec = np.abs(X_test_vec)
                X_train_vec = reducer.fit_transform(X_train_vec, y_train)
            else:
                X_train_vec = reducer.fit_transform(X_train_vec)
            X_test_vec = reducer.transform(X_test_vec)
            reduce_time = time.perf_counter() - start

        features_after = X_train_vec.shape[1]

        # Classification
        start = time.perf_counter()

        if classifier_name == 'Naive Bayes':
            # Multinomial Naive Bayes cannot be fitted on negative values
            # (e.g. PCA output), so feed it the absolute values. This keeps
            # the pipeline runnable but distorts the reduced features.
            X_train_vec = np.abs(_to_dense(X_train_vec))
            X_test_vec = np.abs(_to_dense(X_test_vec))

        classifier.fit(X_train_vec, y_train)
        train_time = time.perf_counter() - start

        # Prediction: average latency per test sample, in milliseconds
        start = time.perf_counter()
        y_pred = classifier.predict(X_test_vec)
        infer_time = (time.perf_counter() - start) / len(y_test) * 1000

        # Metrics
        accuracy = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)

        print(f"→ ✓ Accuracy: {accuracy*100:.2f}%")

        return {
            'pipeline': pipeline_name,
            'extractor': extractor_name,
            'reducer': reducer_name,
            'classifier': classifier_name,
            'accuracy': accuracy,
            'train_time': extract_time + reduce_time + train_time,
            'inference_ms': infer_time,
            'features_before': features_before,
            'features_after': features_after,
            'confusion_matrix': cm,
            'status': 'success'
        }

    except Exception as e:
        # Deliberate best-effort boundary: one broken combination must not
        # abort the whole comparison. Keep the reason for later inspection.
        print(f"→ ❌ Error: {e}")
        return {'status': 'failed', 'pipeline': pipeline_name, 'error': str(e)}

## 🚀 Compare All Pipeline Combinations

We'll test:
- **2 Feature Extractors:** TF-IDF, Bag of Words
- **3 Reducers:** None, PCA, Chi²
- **4 Classifiers:** Logistic, SVM, Naive Bayes, Random Forest

**Total: 2 × 3 × 4 = 24 pipelines**

In [None]:
# Imported once for the whole cell; every run gets fresh, unfitted copies
# of the component templates below.
from sklearn.base import clone

# Define components as (estimator, display name) pairs
extractors = [
    (TfidfVectorizer(max_features=5000, ngram_range=(1,2), min_df=2, max_df=0.8), 'TF-IDF'),
    (CountVectorizer(max_features=5000, ngram_range=(1,2), min_df=2), 'BoW'),
]

reducers = [
    (None, 'None'),
    # Seeded like the classifiers so repeated runs give the same projection
    (PCA(n_components=300, random_state=42), 'PCA'),
    (SelectKBest(score_func=chi2, k=500), 'Chi²'),
]

classifiers = [
    (LogisticRegression(max_iter=1000, random_state=42), 'Logistic'),
    (LinearSVC(max_iter=1000, random_state=42), 'SVM'),
    (MultinomialNB(), 'Naive Bayes'),
    (RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1), 'Random Forest'),
]

# Train all combinations
print(f"Training {len(extractors) * len(reducers) * len(classifiers)} pipelines...\n")

all_results = []

for extractor, ext_name in extractors:
    for reducer, red_name in reducers:
        for classifier, clf_name in classifiers:
            result = train_pipeline(
                X_train, X_test, y_train, y_test,
                clone(extractor), ext_name,
                clone(reducer) if reducer is not None else None, red_name,
                clone(classifier), clf_name
            )
            all_results.append(result)

# Keep only the runs that completed; failed runs carry no metrics
results = [r for r in all_results if r.get('status') == 'success']
print(f"\n✅ Successfully trained {len(results)} pipelines")

## 📊 Visualization 1: Accuracy Comparison

In [None]:
# Rank pipelines from most to least accurate (ties keep their training order)
sorted_results = sorted(results, key=lambda r: r['accuracy'], reverse=True)
accuracy_pct = [r['accuracy'] * 100 for r in sorted_results]

# Zoom the y-axis in to 75–100% when every pipeline clears 75%. Otherwise
# lower the floor below the weakest pipeline so that no bar is hidden.
lowest_pct = min(accuracy_pct, default=100)
y_floor = 75 if lowest_pct > 75 else max(0, lowest_pct - 5)

fig = go.Figure()
fig.add_trace(go.Bar(
    x=[r['pipeline'] for r in sorted_results],
    y=accuracy_pct,
    marker_color='#667eea',
    text=[f"{pct:.2f}%" for pct in accuracy_pct],
    textposition='outside'
))

fig.update_layout(
    title='Pipeline Accuracy Comparison',
    xaxis_title='Pipeline',
    yaxis_title='Accuracy (%)',
    xaxis_tickangle=-45,
    height=700,
    yaxis_range=[y_floor, 100],
    margin=dict(b=200)  # room for the long, rotated pipeline names
)

fig.show()

# Show top 5
print("\n🏆 Top 5 Pipelines:")
for i, r in enumerate(sorted_results[:5], 1):
    print(f"{i}. {r['pipeline']}: {r['accuracy']*100:.2f}% (Train: {r['train_time']:.2f}s)")

## ⚖️ Visualization 2: Accuracy vs Speed Trade-off

In [None]:
fig = go.Figure()

# One trace per reducer. Sorting fixes the trace order, so legend order and
# colours are the same on every run (plain set iteration order is not).
for reducer_name in sorted({r['reducer'] for r in results}):
    group = [r for r in results if r['reducer'] == reducer_name]
    fig.add_trace(go.Scatter(
        x=[r['train_time'] for r in group],
        y=[r['accuracy']*100 for r in group],
        mode='markers+text',
        name=reducer_name,
        text=[r['classifier'] for r in group],
        # The on-plot label omits the extractor; hovering shows the full
        # pipeline so TF-IDF and BoW points can be told apart.
        hovertext=[r['pipeline'] for r in group],
        textposition='top center',
        marker=dict(size=12)
    ))

fig.update_layout(
    title='Trade-off: Accuracy vs Training Speed',
    xaxis_title='Training Time (seconds, log scale)',
    yaxis_title='Accuracy (%)',
    height=600,
    xaxis_type='log'
)

fig.show()

## 🌡️ Visualization 3: Accuracy Heatmap

In [None]:
# Axis values: one row per classifier, one column per extractor+reducer pair
extractors_list = sorted(set(r['extractor'] for r in results))
reducers_list = sorted(set(r['reducer'] for r in results))
classifiers_list = sorted(set(r['classifier'] for r in results))

x_labels = [f"{e}+{r}" for e in extractors_list for r in reducers_list]

# Index accuracy (in percent) by component names. setdefault keeps the first
# record for a combination, should one ever appear twice.
accuracy_by_combo = {}
for r in results:
    combo = (r['extractor'], r['reducer'], r['classifier'])
    accuracy_by_combo.setdefault(combo, r['accuracy'] * 100)

# Combinations without a successful run stay None and render as gaps
z_data = [
    [accuracy_by_combo.get((ext, red, clf))
     for ext in extractors_list for red in reducers_list]
    for clf in classifiers_list
]

fig = go.Figure(data=go.Heatmap(
    z=z_data,
    x=x_labels,
    y=classifiers_list,
    colorscale='RdYlGn',
    # Compare against None so a genuine 0.0% accuracy is still labelled
    text=[[f"{val:.1f}%" if val is not None else "" for val in row] for row in z_data],
    texttemplate='%{text}',
    colorbar=dict(title='Accuracy (%)')
))

fig.update_layout(
    title='Accuracy Heatmap: Classifier vs Pipeline Configuration',
    xaxis_title='Extractor + Reducer',
    yaxis_title='Classifier',
    height=500,
    xaxis_tickangle=-45
)

fig.show()

## 🎯 Visualization 4: Confusion Matrices (Top 3)

In [None]:
# Number of best pipelines to display, one subplot column each
top_n = 3
top3 = sorted(results, key=lambda r: r['accuracy'], reverse=True)[:top_n]

fig = make_subplots(
    rows=1, cols=top_n,
    subplot_titles=[f"{r['pipeline']}<br>Acc: {r['accuracy']*100:.1f}%" for r in top3],
    specs=[[{'type': 'heatmap'}]*top_n]
)

# NOTE(review): assumes every predicted class also occurs in y_test, so the
# confusion matrix rows/columns line up with these labels — confirm if the
# test split could miss a class.
labels = sorted(set(y_test))

for idx, r in enumerate(top3, 1):
    fig.add_trace(
        go.Heatmap(
            z=r['confusion_matrix'],
            x=labels,
            y=labels,
            # All matrices share one colour axis, so the single colourbar is
            # valid for every subplot and cell counts are comparable.
            coloraxis='coloraxis'
        ),
        row=1, col=idx
    )

fig.update_xaxes(title_text='Predicted')
# Reverse the y-axis so the first class is at the top, which puts the
# diagonal in the usual top-left to bottom-right position.
fig.update_yaxes(title_text='True', autorange='reversed')
fig.update_layout(
    height=400,
    title_text='Top 3 Pipelines - Confusion Matrices',
    coloraxis=dict(colorscale='Blues')
)

fig.show()

## 💡 Key Insights

**General Findings:**
- **TF-IDF** usually outperforms Bag of Words
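- **Dimensionality reduction** usually trades a little accuracy for a smaller model and faster classifier training; note that the reported training time also includes fitting the reducer itself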
- **Logistic Regression**: Best balance of accuracy and speed
- **Naive Bayes**: Fastest but slightly lower accuracy
- **Random Forest**: Slowest with minimal accuracy benefit

**Recommendations:**
- ✅ **Production**: TF-IDF → None → Logistic (best accuracy)
- ⚡ **Fast baseline**: TF-IDF → None → Naive Bayes (fastest)
- 💾 **Small model**: TF-IDF → Chi² → Naive Bayes (compact)
- ⚖️ **Balanced**: TF-IDF → Chi² → Logistic (good compromise)

---

**Source:** [AI Learning Hub - Pipeline Comparison](https://ltsach.github.io/AILearningHub/04_Natural_Language_Processing/text_classification/pages/pipeline_comparison/)
