In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.0-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [2]:
!pip install torch transformers datasets scikit-learn

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [3]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import random

# Load AG News dataset
dataset = load_dataset("ag_news")

# Reduce dataset size to avoid memory issues
sample_size = 5000
random.seed(42)
train_dataset = dataset["train"].shuffle(seed=42).select(range(sample_size))
test_dataset = dataset["test"].shuffle(seed=42).select(range(sample_size // 10))

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, return_tensors="pt")

# Tokenize dataset
train_encodings = train_dataset.map(tokenize_function, batched=True)
test_encodings = test_dataset.map(tokenize_function, batched=True)

# Convert to PyTorch tensors
train_inputs = torch.tensor(train_encodings["input_ids"])
train_masks = torch.tensor(train_encodings["attention_mask"])  # Attention mask added
train_labels = torch.tensor(train_dataset["label"])

test_inputs = torch.tensor(test_encodings["input_ids"])
test_masks = torch.tensor(test_encodings["attention_mask"])  # Attention mask added
test_labels = torch.tensor(test_dataset["label"])

# Create DataLoader for batch processing
batch_size = 16
train_data = TensorDataset(train_inputs, train_masks, train_labels)
test_data = TensorDataset(test_inputs, test_masks, test_labels)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# Load pre-trained BERT model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)
model.to(device)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop with accuracy calculation
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_train, total_train = 0, 0  # Track training accuracy

    for batch in train_loader:
        input_ids, attention_masks, labels = batch
        input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks).logits  # Attention mask used
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        predictions = torch.argmax(outputs, dim=1)
        correct_train += (predictions == labels).sum().item()
        total_train += labels.size(0)

    train_accuracy = correct_train / total_train * 100
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.4f} - Train Accuracy: {train_accuracy:.2f}%")

    # Evaluate on test set
    model.eval()
    correct_test, total_test = 0, 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_masks, labels = batch
            input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)
            outputs = model(input_ids, attention_mask=attention_masks).logits
            predictions = torch.argmax(outputs, dim=1)
            correct_test += (predictions == labels).sum().item()
            total_test += labels.size(0)

    test_accuracy = correct_test / total_test * 100
    print(f"Epoch {epoch+1}/{epochs} - Test Accuracy: {test_accuracy:.2f}% ✅")

# Extract BERT embeddings for training SVC
def get_bert_embeddings(dataloader):
    model.eval()
    embeddings = []
    labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_masks, batch_labels = batch
            input_ids, attention_masks = input_ids.to(device), attention_masks.to(device)
            output = model(input_ids, attention_mask=attention_masks).logits.cpu().numpy()
            embeddings.append(output)
            labels.extend(batch_labels.numpy())
    return np.vstack(embeddings), np.array(labels)

# Get embeddings for train and test sets
train_embeddings, train_labels = get_bert_embeddings(train_loader)
test_embeddings, test_labels = get_bert_embeddings(test_loader)

# Train Linear SVC classifier
svc = LinearSVC()
svc.fit(train_embeddings, train_labels)

# Predict on test set
test_predictions = svc.predict(test_embeddings)

# Evaluate Performance
accuracy = accuracy_score(test_labels, test_predictions)
print(f"Final Test Accuracy (SVC): {accuracy * 100:.2f}% ✅")

# Function to predict news category
def predict_news(news_text):
    model.eval()
    inputs = tokenizer(news_text, padding=True, truncation=True, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits.cpu().numpy()

    prediction = svc.predict(logits)[0]

    label_mapping = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

    return label_mapping[prediction]

# Example news articles
news_samples = [
    "NASA is preparing a new space mission to Mars.",
    "The stock market saw a significant rise today in the tech sector.",
    "The local football team won their championship game in overtime.",
    "World leaders are gathering for a crucial climate summit next week.",
    "A new smartphone with AI capabilities was just released by a major company.",
    "Eagles won the Superbowl in 2025",
    "The Federal Reserve announced a change in interest rates to stabilize the economy.",
    "A powerful earthquake struck the coastal region, prompting international aid efforts.",
    "The United Nations held an emergency meeting to discuss global security threats.",
    "The discovery of a new habitable exoplanet raises hopes for future space colonization, say international researchers."
]

# Print predictions
for news in news_samples:
    print(f"News: {news}\nPredicted Category: {predict_news(news)}\n")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - Loss: 132.6496 - Train Accuracy: 85.92%
Epoch 1/3 - Test Accuracy: 88.40% ✅
Epoch 2/3 - Loss: 66.8915 - Train Accuracy: 93.34%
Epoch 2/3 - Test Accuracy: 88.80% ✅
Epoch 3/3 - Loss: 36.7600 - Train Accuracy: 96.40%
Epoch 3/3 - Test Accuracy: 89.40% ✅
Final Test Accuracy (SVC): 89.40% ✅
News: NASA is preparing a new space mission to Mars.
Predicted Category: Sci/Tech

News: The stock market saw a significant rise today in the tech sector.
Predicted Category: Sci/Tech

News: The local football team won their championship game in overtime.
Predicted Category: Sports

News: World leaders are gathering for a crucial climate summit next week.
Predicted Category: Sci/Tech

News: A new smartphone with AI capabilities was just released by a major company.
Predicted Category: Sci/Tech

News: Eagles won the Superbowl in 2025
Predicted Category: Sports

News: The Federal Reserve announced a change in interest rates to stabilize the economy.
Predicted Category: Business

News: A powerful