In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score

In [4]:
dataset = load_dataset("ag_news")
dataset

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [6]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [7]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7600
    })
})

In [8]:
import os

model_path = "./news_classifier_model"
if os.path.exists(model_path):
    print("Loading existing model...")
    model = BertForSequenceClassification.from_pretrained(model_path)
else:
    print("Training new model...")
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)
model.to(device)

Training new model...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [28]:
import os
os.environ["WANDB_DISABLED"] = "true"  # disables wandb

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
    logging_steps=10,
    save_total_limit=1,
    dataloader_num_workers=4,
    fp16=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [33]:
!ls

results  sample_data


In [10]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")
    }


In [29]:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)


In [30]:
trainer.train()



Step,Training Loss
10,0.003
20,0.0421
30,0.0003
40,0.0002
50,0.0161
60,0.0552
70,0.0002
80,0.0003
90,0.0508
100,0.0074


TrainOutput(global_step=7500, training_loss=0.1970801444729169, metrics={'train_runtime': 932.3973, 'train_samples_per_second': 128.7, 'train_steps_per_second': 8.044, 'total_flos': 7893473402880000.0, 'train_loss': 0.1970801444729169, 'epoch': 1.0})

In [35]:
# Save model & tokenizer
model.save_pretrained("./results")
tokenizer.save_pretrained("./results")

# Evaluate
metrics = trainer.evaluate()
print(f"Accuracy: {metrics['eval_accuracy']:.4f}")
print(f"F1 Score: {metrics['eval_f1']:.4f}")



Accuracy: 0.9462
F1 Score: 0.9462


In [38]:
label_map = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

headline = "NASA launches new satellite to monitor climate change"

# Move inputs to same device as model
inputs = tokenizer(headline, return_tensors="pt", padding=True, truncation=True, max_length=128)
inputs = {k: v.to(model.device) for k, v in inputs.items()}  #  Move to GPU if model is on CUDA

with torch.no_grad():
    outputs = model(**inputs)

predicted_class = outputs.logits.argmax().item()
print("Predicted Topic:", label_map[predicted_class])


Predicted Topic: Sci/Tech


In [41]:
%%writefile app.py
import streamlit as st
import torch
from transformers import BertTokenizer, BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("news_classifier_model")
tokenizer = BertTokenizer.from_pretrained("news_classifier_model")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

label_map = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

st.title("📰 News Classifier")

headline = st.text_input("Enter a news headline")

if st.button("Classify"):
    if not headline.strip():
        st.warning("Please enter something.")
    else:
        inputs = tokenizer(headline, return_tensors="pt", padding=True, truncation=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        pred = outputs.logits.argmax().item()
        st.success(f"Predicted Topic: {label_map[pred]}")


Writing app.py


In [None]:
from pyngrok import ngrok

# Open a public URL for the app
public_url = ngrok.connect(port=8501)
print("Open this Streamlit app:", public_url)

# Run the app
!streamlit run app.py &> /dev/null &
