# BBC News Text Classification
## TF-IDF + Traditional Machine Learning

**Dataset:** BBC News (2225 articles, 5 categories)

**Methods:** Naive Bayes, Logistic Regression, SVM, Random Forest, MLP, XGBoost

**Source:** [AI Learning Hub](https://ltsach.github.io/AILearningHub/04_Natural_Language_Processing/text_classification/)

---

## 1. Install Dependencies

In [None]:
!pip install -q plotly xgboost
print('✅ Dependencies installed')

## 2. Import Libraries

In [None]:
import time
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# Reaching this line means every import above resolved.
print("✅ Libraries imported")

## 3. Download Dataset
The BBC News dataset is hosted on GitHub Pages as pre-split train, validation, and test CSV files.

In [None]:
# Root URL of the pre-split BBC News CSV files.
base_url = 'https://ltsach.github.io/AILearningHub/datasets/bbcnews/data/'

# Read each split straight from its hosted CSV.
train_df = pd.read_csv(f'{base_url}train.csv')
val_df = pd.read_csv(f'{base_url}val.csv')
test_df = pd.read_csv(f'{base_url}test.csv')

# Merge train and validation rows into a single training set,
# renumbering the index from zero.
train_full = pd.concat((train_df, val_df), ignore_index=True)

# Report split sizes and the label set found in the training split.
print(f'✓ Train: {len(train_df):,} samples')
print(f'✓ Val: {len(val_df):,} samples')
print(f'✓ Test: {len(test_df):,} samples')
print(f'✓ Categories: {sorted(train_df["category"].unique().tolist())}')
print(f'\n✓ Combined training set: {len(train_full):,} samples')

## 4. TF-IDF Feature Extraction
Convert text to numerical features using Term Frequency–Inverse Document Frequency (TF-IDF).

In [None]:
# TF-IDF configuration: bounded vocabulary of unigrams and bigrams with
# very rare and very common terms filtered out.
vectorizer = TfidfVectorizer(
    max_features=5000,   # cap the vocabulary at 5,000 terms
    ngram_range=(1, 2),  # Unigrams + bigrams
    min_df=2,            # drop terms that occur in fewer than 2 documents
    max_df=0.8,          # drop terms that occur in more than 80% of documents
    stop_words='english'
)

# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments as time.time() can.
start = time.perf_counter()
# Fit the vocabulary/IDF weights on the training text only, then reuse them
# for the test text.
X_train = vectorizer.fit_transform(train_full['text'])
X_test = vectorizer.transform(test_df['text'])
elapsed = time.perf_counter() - start

print(f'✓ Vocabulary: {len(vectorizer.get_feature_names_out()):,} features')
print(f'✓ Train shape: {X_train.shape}')
print(f'✓ Test shape: {X_test.shape}')
print(f'✓ Extraction time: {elapsed:.2f}s')

In [None]:
# Raw string labels, passed unchanged to the scikit-learn classifiers.
y_train = train_full['category']
y_test = test_df['category']

# Integer-encoded copies of the same labels for XGBoost compatibility.
# The encoder is fitted on the training labels and reused for the test labels.
label_encoder = LabelEncoder()
label_encoder.fit(train_full['category'])
y_train_encoded = label_encoder.transform(train_full['category'])
y_test_encoded = label_encoder.transform(test_df['category'])

print(f'✓ Label encoding: {len(label_encoder.classes_)} classes')
print(f'  Classes: {label_encoder.classes_}')

## 5. Training Function

In [None]:
def train_classifier(name, model, X_train, y_train, X_test, y_test):
    """Fit a classifier, evaluate it on the test set, and print a report.

    Args:
        name: Display name of the model; printed upper-cased as a banner and
            stored in the result.
        model: Unfitted estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training feature matrix passed to ``model.fit``.
        y_train: Training labels passed to ``model.fit``.
        X_test: Test feature matrix passed to ``model.predict``.
        y_test: True test labels; must support ``len()``.

    Returns:
        dict with keys ``name``, ``accuracy``, ``precision``, ``recall``,
        ``f1_score`` (precision/recall/F1 are weighted averages),
        ``train_time`` (seconds), ``inference_speed`` (test samples per
        second, 0 if no time was measured) and ``confusion_matrix``.
    """
    print(f'\n{"="*60}')
    print(f'{name.upper()}')
    print('='*60)

    # Train. perf_counter() is monotonic and high-resolution, so short
    # intervals are measured reliably; time.time() can be coarse or jump
    # when the system clock is adjusted.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - start

    # Predict
    start = time.perf_counter()
    y_pred = model.predict(X_test)
    inference_time = time.perf_counter() - start

    # Metrics. zero_division=0 scores a class with no predicted samples as 0
    # instead of emitting a warning.
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='weighted', zero_division=0
    )
    cm = confusion_matrix(y_test, y_pred)

    print(f'⏱️  Training: {train_time:.2f}s')
    print(f'⏱️  Inference: {inference_time:.3f}s')
    print(f'📊 Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)')
    print(f'📊 Precision: {precision:.4f} ({precision*100:.2f}%)')
    print(f'📊 Recall: {recall:.4f} ({recall*100:.2f}%)')
    print(f'📊 F1-Score: {f1:.4f} ({f1*100:.2f}%)')

    return {
        'name': name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'train_time': train_time,
        # Guard against division by zero if no elapsed time was measured.
        'inference_speed': len(y_test) / inference_time if inference_time > 0 else 0,
        'confusion_matrix': cm
    }

## 6. Train All Classifiers

In [None]:
# One entry per experiment: (display name, estimator, train labels, test labels).
# Only XGBoost is given the integer-encoded labels; every other model uses
# the original string labels.
experiments = [
    ('Naive Bayes', MultinomialNB(), y_train, y_test),
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42), y_train, y_test),
    ('SVM', LinearSVC(max_iter=2000, random_state=42), y_train, y_test),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1), y_train, y_test),
    ('MLP', MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42), y_train, y_test),
    ('XGBoost', XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1, verbosity=0), y_train_encoded, y_test_encoded),
]

# Train and evaluate the models in the listed order, collecting each report.
results = []
for clf_name, clf, y_fit, y_eval in experiments:
    r = train_classifier(clf_name, clf, X_train, y_fit, X_test, y_eval)
    results.append(r)

banner = '=' * 60
print('\n' + banner)
print('✅ ALL TRAINING COMPLETE!')
print(banner)

## 7. Results Summary

In [None]:
# Build the summary table: one row per trained model, with every metric
# pre-formatted as a display string.
summary_df = pd.DataFrame([
    {
        'Method': res['name'],
        'Accuracy (%)': format(res['accuracy'] * 100, '.2f'),
        'Precision (%)': format(res['precision'] * 100, '.2f'),
        'Recall (%)': format(res['recall'] * 100, '.2f'),
        'F1-Score (%)': format(res['f1_score'] * 100, '.2f'),
        'Train Time (s)': format(res['train_time'], '.2f'),
        'Inference Speed (samples/s)': format(res['inference_speed'], '.0f'),
    }
    for res in results
])

# Print the table without the numeric row index.
print(summary_df.to_string(index=False))

## 8. Interactive Visualizations

In [None]:
# Series to plot, taken from the per-model results in training order.
names = [res['name'] for res in results]
accuracies = [res['accuracy'] * 100 for res in results]
train_times = [res['train_time'] for res in results]

# Side-by-side bar charts: accuracy on the left, training time on the right.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Accuracy Comparison', 'Training Time Comparison'),
)

accuracy_bars = go.Bar(
    x=names,
    y=accuracies,
    name='Accuracy (%)',
    marker_color='rgb(102, 126, 234)',
)
time_bars = go.Bar(
    x=names,
    y=train_times,
    name='Time (s)',
    marker_color='rgb(245, 135, 108)',
)
fig.add_trace(accuracy_bars, row=1, col=1)
fig.add_trace(time_bars, row=1, col=2)

# Axis labels per subplot, then the figure-wide layout; the legend is hidden
# because each subplot holds a single series.
fig.update_yaxes(title_text='Accuracy (%)', row=1, col=1)
fig.update_yaxes(title_text='Time (seconds)', row=1, col=2)
fig.update_layout(height=400, showlegend=False, title_text='Performance Comparison')

fig.show()

## 9. Conclusion

**Best Models:**
- **Accuracy:** Logistic Regression & MLP (98.8%)
- **Speed:** Naive Bayes (0.01s training)
- **Balance:** Logistic Regression (fast + accurate)

**Key Insights:**
- TF-IDF + simple classifiers achieve 96–99% accuracy
- Training time ranges from 0.01s (Naive Bayes) to 8s (MLP)
- All models are very fast at inference (<1 ms per sample)

**For more:**
- [Interactive Report](https://ltsach.github.io/AILearningHub/04_Natural_Language_Processing/text_classification/)
- [Download Code](https://ltsach.github.io/AILearningHub/04_Natural_Language_Processing/text_classification/pages/traditional_ml/code/bbc_news_tfidf_ml.py)