# 🏋️‍♂️ **Training DistilBERT on 20 Newsgroups Dataset**
This notebook trains a **DistilBERT model** for text classification using the **20 Newsgroups dataset**.

### **🔹 Training Steps:**
1. **Load & Preprocess Data**: We retrieve the dataset, assign labels, and tokenize the text.
2. **Fine-Tune DistilBERT**: We use a pre-trained **DistilBERT model** and fine-tune it for classification.
3. **Train on GPU**: The model trains for **3 epochs** with batch size **16** (falling back to CPU if no GPU is available).
4. **Save the Model**: The trained model is saved as `distilbert_newsgroup_model`.

### **🚀 Expected Training Time**
- **T4 GPU**: ~**10-15 minutes**  
- **A100 GPU** (if available): ~**5-10 minutes**  

After training, the model will be used for evaluation and deployment in a web application.  

In [3]:

import torch
import os
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset, concatenate_datasets
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
from transformers import DataCollatorWithPadding
from torch.utils.data import Dataset

# Disable Weights & Biases logging
os.environ["WANDB_DISABLED"] = "true"

# Map each per-newsgroup dataset config name (passed to `load_dataset` as
# `name=`) to the integer class id used as the classification label.
categories = {
    "18828_alt.atheism": 0, "18828_comp.graphics": 1, "18828_comp.os.ms-windows.misc": 2,
    "18828_comp.sys.ibm.pc.hardware": 3, "18828_comp.sys.mac.hardware": 4, "18828_comp.windows.x": 5,
    "18828_misc.forsale": 6, "18828_rec.autos": 7, "18828_rec.motorcycles": 8, "18828_rec.sport.baseball": 9,
    "18828_rec.sport.hockey": 10, "18828_sci.crypt": 11, "18828_sci.electronics": 12, "18828_sci.med": 13,
    "18828_sci.space": 14, "18828_soc.religion.christian": 15, "18828_talk.politics.guns": 16,
    "18828_talk.politics.mideast": 17, "18828_talk.politics.misc": 18, "18828_talk.religion.misc": 19
}

# Load every newsgroup config separately and tag each example with its class id.
# trust_remote_code=True: the dataset repository ships a loading script, and
# without this flag `load_dataset` stops and asks for interactive confirmation,
# which blocks unattended runs.
datasets = []
for category, label in categories.items():
    ds = load_dataset("newsgroup", name=category, split="train", trust_remote_code=True)
    ds = ds.add_column("label", [label] * len(ds))  # Assign numeric label
    datasets.append(ds)

# Concatenate all per-category datasets into one
dataset = concatenate_datasets(datasets)

# Convert dataset to DataFrame
df = pd.DataFrame({'text': dataset['text'], 'label': dataset['label']})

# Hold out 20% of the examples as the validation split. The fixed random_state
# keeps the split reproducible so it can be rebuilt identically at evaluation time.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42)

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(texts):
    """Tokenize a list of raw texts with the module-level DistilBERT tokenizer.

    Sequences longer than 512 tokens are truncated. No padding is applied
    here: padding the whole corpus at once would stretch every example to the
    longest sequence in the list, whereas the ``DataCollatorWithPadding``
    handed to the Trainer pads each batch only to that batch's longest
    sequence, which saves memory and compute.

    Args:
        texts: List of strings to encode.

    Returns:
        The tokenizer's batch encoding, with one variable-length entry per
        input text under each field.
    """
    return tokenizer(texts, padding=False, truncation=True, max_length=512)

# Encode the training texts first, then the validation texts, with the same
# tokenizer settings.
train_encodings, val_encodings = (
    tokenize_function(split_texts) for split_texts in (train_texts, val_texts)
)

class NewsDataset(Dataset):
    """Map-style torch dataset pairing tokenizer encodings with labels.

    ``encodings`` must expose ``.items()`` yielding ``(field, column)`` pairs
    where ``column[idx]`` is the value of that field for example ``idx``;
    ``labels`` is an indexable sequence of the same length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field as a tensor, then the label
        # under the "labels" key.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a torch Dataset
train_dataset, val_dataset = (
    NewsDataset(train_encodings, train_labels),
    NewsDataset(val_encodings, val_labels),
)

# Pre-trained DistilBERT with a classification head sized to the label set
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(categories),
)

# Place the model on the GPU when one is present, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Trainer callback that echoes log records to stdout
class LogCallback(TrainerCallback):
    """Print the Trainer's log dict when the global step is a multiple of 10."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Nothing to report for a missing or empty log record.
        if not logs:
            return
        # Skip records emitted on steps that are not a multiple of 10.
        if state.global_step % 10:
            return
        print(f"Step {state.global_step}: {logs}")

# Training arguments: evaluate and checkpoint once per epoch, log every 10 steps
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    report_to="none",
    logging_dir="./logs",
    logging_steps=10,
    disable_tqdm=False,
)

# Data collator: pads each batch to a common length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer setup. The tokenizer is passed as `processing_class`; the older
# `tokenizer=` keyword is deprecated and triggers a warning on construction.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    callbacks=[LogCallback()],
)

# Train the model. perf_counter() is a monotonic clock, so the reported
# duration is not affected by system clock adjustments during the run.
print("Starting training...")
start_time = time.perf_counter()
trainer.train()
print(f"Training complete! Total time: {time.perf_counter() - start_time:.2f} seconds")

# Save the fine-tuned model and its tokenizer side by side so both can be
# reloaded from the same directory.
model.save_pretrained("distilbert_newsgroup_model")
tokenizer.save_pretrained("distilbert_newsgroup_model")

print("Model saved!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

newsgroup.py:   0%|          | 0.00/5.62k [00:00<?, ?B/s]

The repository for newsgroup contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/newsgroup.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/14.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/799 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/973 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/985 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/982 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/961 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/980 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/972 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/990 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/994 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/994 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/999 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/991 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/981 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/990 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/987 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/997 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/910 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/940 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/775 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/628 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss
1,0.5184,0.419261
2,0.1504,0.321393
3,0.1562,0.32108


Step 10: {'loss': 2.9993, 'grad_norm': 1.5778552293777466, 'learning_rate': 4.982307147912243e-05, 'epoch': 0.010615711252653927}
Step 20: {'loss': 2.9732, 'grad_norm': 1.9179433584213257, 'learning_rate': 4.964614295824487e-05, 'epoch': 0.021231422505307854}
Step 30: {'loss': 2.8978, 'grad_norm': 2.688845157623291, 'learning_rate': 4.946921443736731e-05, 'epoch': 0.03184713375796178}
Step 40: {'loss': 2.7232, 'grad_norm': 3.7829861640930176, 'learning_rate': 4.929228591648974e-05, 'epoch': 0.04246284501061571}
Step 50: {'loss': 2.4865, 'grad_norm': 4.337266445159912, 'learning_rate': 4.9115357395612176e-05, 'epoch': 0.05307855626326964}
Step 60: {'loss': 2.2575, 'grad_norm': 3.9524972438812256, 'learning_rate': 4.893842887473461e-05, 'epoch': 0.06369426751592357}
Step 70: {'loss': 1.9895, 'grad_norm': 4.5480732917785645, 'learning_rate': 4.8761500353857044e-05, 'epoch': 0.07430997876857749}
Step 80: {'loss': 1.8792, 'grad_norm': 5.579575061798096, 'learning_rate': 4.858457183297948e-0

# 📊 **Model Evaluation: DistilBERT on 20 Newsgroups**
This section evaluates our **fine-tuned DistilBERT model** on the validation dataset.

### **🔹 Evaluation Metrics:**
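✅ **Accuracy** - The fraction of validation documents assigned to the correct newsgroup.  
✅ **Precision** - Of the documents predicted to belong to a class, the fraction that truly do (averaged over the 20 classes, weighted by class size).  
✅ **Recall** - Of the documents that truly belong to a class, the fraction the model assigns to it (same weighted average).  
✅ **F1 Score** - The harmonic mean of Precision & Recall (same weighted average).  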

### **🚀 Expected Performance**
Based on the training run, we expect:
- **Accuracy**: ~**92.22%**
- **Precision**: ~**92.34%**
- **Recall**: ~**92.22%**
- **F1 Score**: ~**92.25%**

The model is now ready for **real-world text classification tasks**! 🎯  


In [2]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from datasets import load_dataset, concatenate_datasets
from sklearn.model_selection import train_test_split
import pandas as pd

# Map each per-newsgroup dataset config name (passed to `load_dataset` as
# `name=`) to the integer class id used as the classification label.
categories = {
    "18828_alt.atheism": 0, "18828_comp.graphics": 1, "18828_comp.os.ms-windows.misc": 2,
    "18828_comp.sys.ibm.pc.hardware": 3, "18828_comp.sys.mac.hardware": 4, "18828_comp.windows.x": 5,
    "18828_misc.forsale": 6, "18828_rec.autos": 7, "18828_rec.motorcycles": 8, "18828_rec.sport.baseball": 9,
    "18828_rec.sport.hockey": 10, "18828_sci.crypt": 11, "18828_sci.electronics": 12, "18828_sci.med": 13,
    "18828_sci.space": 14, "18828_soc.religion.christian": 15, "18828_talk.politics.guns": 16,
    "18828_talk.politics.mideast": 17, "18828_talk.politics.misc": 18, "18828_talk.religion.misc": 19
}

# Reload every newsgroup config and tag each example with its class id.
# trust_remote_code=True: the dataset repository ships a loading script, and
# without this flag `load_dataset` stops and asks for interactive confirmation.
datasets = []
for category, label in categories.items():
    ds = load_dataset("newsgroup", name=category, split="train", trust_remote_code=True)
    ds = ds.add_column("label", [label] * len(ds))  # Assign numeric label
    datasets.append(ds)

# Concatenate all per-category datasets into one
dataset = concatenate_datasets(datasets)

# Convert dataset to DataFrame
df = pd.DataFrame({'text': dataset['text'], 'label': dataset['label']})

# Rebuild the validation split. The inputs, test_size and random_state must
# stay as they are here: any change yields a different split, and the model
# would then be scored on examples it may have been trained on.
_, val_texts, _, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42)

# Load trained model and tokenizer from the same saved directory
model_path = "distilbert_newsgroup_model"
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Switch to inference mode once, before the loop, rather than on every batch
model.eval()

# Tokenize and classify the validation texts in small batches
batch_size = 8  # Reduce if OOM persists
predictions, true_labels = [], []

for i in range(0, len(val_texts), batch_size):
    batch_texts = val_texts[i : i + batch_size]
    batch_labels = val_labels[i : i + batch_size]

    # Pad only to the longest text in this batch; truncate at 512 tokens
    encodings = tokenizer(batch_texts, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)

    # Run inference without tracking gradients
    with torch.no_grad():
        logits = model(**encodings).logits
        preds = torch.argmax(logits, dim=-1)

    predictions.extend(preds.cpu().tolist())
    # The labels are only compared on the host, so they are collected as-is
    # instead of being copied to the device and back.
    true_labels.extend(batch_labels)

# Compute metrics; precision/recall/F1 are averaged over classes, weighted by
# the number of true examples in each class.
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average="weighted")

# Print evaluation results
print("Evaluation Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Evaluation Results:
Accuracy: 0.9222
Precision: 0.9234
Recall: 0.9222
F1 Score: 0.9225
