In [1]:
!pip install transformers datasets scikit-learn torch gradio --quiet
from datasets import load_dataset
from transformers import AutoTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from tqdm import tqdm
import torch
import gradio as gr

# Load AG News eagerly (no streaming). Both subsets are carved out of the
# "train" split: the first 80% is used for training and the remaining 20% is
# held out as the test set.
DATASET_NAME = "ag_news"
train_dataset = load_dataset(DATASET_NAME, split="train[:80%]")
test_dataset = load_dataset(DATASET_NAME, split="train[80%:]")

# Tokenizer belonging to the pretrained checkpoint used throughout this script.
TOKENIZER_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# Tokenize function
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens so that all
    rows have the same length.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw text(s).

    Returns:
        The output of the module-level ``tokenizer`` for those texts.
    """
    texts = examples['text']
    return tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=128,
    )

# Tokenize both subsets and expose them as PyTorch tensors.
def _to_torch_dataset(dataset):
    """Tokenize ``dataset``, rename its label column and set torch formatting."""
    encoded = dataset.map(tokenize_function, batched=True)
    # The training loop reads the targets from batch['labels'].
    encoded = encoded.rename_column("label", "labels")
    # Only these three columns are returned, as torch tensors.
    encoded.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    return encoded


tokenized_train = _to_torch_dataset(train_dataset)
tokenized_test = _to_torch_dataset(test_dataset)

# DataLoaders: training batches are shuffled, test batches keep dataset order.
train_loader = DataLoader(tokenized_train, batch_size=8, shuffle=True)
test_loader = DataLoader(tokenized_test, batch_size=8)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Pretrained BERT with a 4-way sequence-classification head, moved to `device`.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=4,
).to(device)

# Optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=2e-5)

# NOTE(review): the visible training loop uses the loss returned by the model
# (outputs.loss) rather than this object — confirm whether it is needed.
loss_fn = CrossEntropyLoss()

# Training loop (first MAX_TRAIN_STEPS batches only, for demo purposes).
# The step budget lives in one constant so the progress-bar total and the
# stopping condition cannot drift apart.
MAX_TRAIN_STEPS = 100

model.train()
# zip() against a bounded range stops after MAX_TRAIN_STEPS batches without
# pulling an extra batch from the loader, and lets tqdm finish (and close) its
# bar normally instead of being abandoned by a `break`.
progress = tqdm(
    zip(range(MAX_TRAIN_STEPS), train_loader),
    total=min(MAX_TRAIN_STEPS, len(train_loader)),
)
for _step, batch in progress:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    # Passing `labels` makes the model compute the loss itself.
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Show the current batch loss on the progress bar.
    progress.set_postfix(loss=loss.item())
# Prediction function
def predict_topic(text):
    """Classify a piece of news text into one of four topics.

    Args:
        text: Raw news headline or article text.

    Returns:
        One of ``"World"``, ``"Sports"``, ``"Business"`` or ``"Sci/Tech"``.
    """
    # Position in this list corresponds to the class index predicted by the model.
    labels = ["World", "Sports", "Business", "Sci/Tech"]

    # The training loop leaves the model in train mode; switch to eval mode so
    # dropout is disabled and the same text always yields the same prediction.
    model.eval()

    tokens = tokenizer(text, return_tensors="pt", padding='max_length', truncation=True, max_length=128)
    tokens = {k: v.to(device) for k, v in tokens.items()}

    # No gradients are needed for inference.
    with torch.no_grad():
        logits = model(**tokens).logits

    prediction = torch.argmax(logits, dim=-1).item()
    return labels[prediction]

# Gradio interface: a two-line text box in, the predicted topic name out.
headline_box = gr.Textbox(lines=2, placeholder="Enter news headline...")
demo = gr.Interface(
    fn=predict_topic,
    inputs=headline_box,
    outputs="text",
    title="News Topic Classifier (BERT + AG News)",
)
demo.launch()


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme: 0.00B [00:00, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]


Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s][A
Downloading data: 100%|██████████| 1.23M/1.23M [00:00<00:00, 2.67MB/s]

Downloading data:   0%|          | 0.00/18.6M [00:00<?, ?B/s][A
Downloading data: 100%|██████████| 18.6M/18.6M [00:00<00:00, 72.8MB/s]


Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

NotImplementedError: Loading a dataset cached in a LocalFileSystem is not supported.