# Installing Data set library

In [1]:
!pip install datasets



In [2]:
!pip install transformers



# Loading AG news Dataset From Hugging Face

In [3]:
from datasets import load_dataset

dataset = load_dataset("ag_news")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


# Tokenizer & Preprocessing


In [4]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_fn(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_fn, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

# Model Defination

In [5]:
from transformers import AutoModelForSequenceClassification

num_labels = 4  # AG News has 4 classes
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training setup

In [12]:
from transformers import TrainingArguments, Trainer
import evaluate

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels),
        "f1": f1.compute(predictions=preds, references=labels, average="weighted"),
    }
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,  # You can increase to 3-4 if GPU allows
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    push_to_hub=False,
    report_to="none", # Disable wandb logging
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [7]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


Performing Fine Tuning

In [13]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1752,0.174511,{'accuracy': 0.9472368421052632},{'f1': 0.9472337642069727}
2,0.1131,0.188218,{'accuracy': 0.95},{'f1': 0.9500182486382102}


TrainOutput(global_step=15000, training_loss=0.17536216071446736, metrics={'train_runtime': 5159.328, 'train_samples_per_second': 46.518, 'train_steps_per_second': 2.907, 'total_flos': 1.578694680576e+16, 'train_loss': 0.17536216071446736, 'epoch': 2.0})

# Evaluate Model

In [14]:
results = trainer.evaluate()
print("Final Results:", results)

Final Results: {'eval_loss': 0.1882183700799942, 'eval_accuracy': {'accuracy': 0.95}, 'eval_f1': {'f1': 0.9500182486382102}, 'eval_runtime': 54.2402, 'eval_samples_per_second': 140.117, 'eval_steps_per_second': 8.757, 'epoch': 2.0}


# Saving Model

In [17]:
trainer.save_model("bert-agnews")
tokenizer.save_pretrained("bert-agnews")

('bert-agnews/tokenizer_config.json',
 'bert-agnews/special_tokens_map.json',
 'bert-agnews/vocab.txt',
 'bert-agnews/added_tokens.json',
 'bert-agnews/tokenizer.json')

> Creating zip file

In [18]:
!zip -r bert-agnews.zip bert-agnews


  adding: bert-agnews/ (stored 0%)
  adding: bert-agnews/tokenizer_config.json (deflated 75%)
  adding: bert-agnews/special_tokens_map.json (deflated 42%)
  adding: bert-agnews/vocab.txt (deflated 53%)
  adding: bert-agnews/training_args.bin (deflated 54%)
  adding: bert-agnews/config.json (deflated 52%)
  adding: bert-agnews/tokenizer.json (deflated 71%)
  adding: bert-agnews/model.safetensors (deflated 7%)


In [19]:
from google.colab import files
files.download("bert-agnews.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [22]:
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# -------------------------------
# Load Model
# -------------------------------
MODEL_PATH = "bert-agnews"  # your fine-tuned model folder

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# -------------------------------
# Label Mapping
# -------------------------------
label_map = {
    "LABEL_0": "World üåç",
    "LABEL_1": "Sports üèÜ",
    "LABEL_2": "Business üíº",
    "LABEL_3": "Sci/Tech üî¨"
}

# -------------------------------
# Prediction Function
# -------------------------------
def predict_news(text):
    result = classifier(text)[0]
    label = label_map.get(result["label"], result["label"])
    return f"{label}  (Confidence: {result['score']:.2f})"

# -------------------------------
# Gradio UI
# -------------------------------
demo = gr.Interface(
    fn=predict_news,
    inputs=gr.Textbox(lines=2, placeholder="Enter a news headline here..."),
    outputs="text",
    title="üì∞ News Topic Classifier",
    description="Classify news headlines into categories: World, Sports, Business, Sci/Tech."
)

demo.launch()


Device set to use cuda:0


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://45ae49fc3a3ff42317.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


