In [None]:

!pip install transformers datasets

import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import transformers
from google.colab import drive

# Mount Google Drive at /content/drive; the fine-tuned model is written there at the end of this cell.
drive.mount('/content/drive')

# Load the "ag_news" dataset and print its split summary (split names, features, row counts).
dataset = load_dataset("ag_news")
print(dataset)


# Tokenizer belonging to the same "bert-base-uncased" checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    """Tokenize the ``"text"`` field of ``example`` with the module-level ``tokenizer``.

    The tokenizer is called with ``padding="max_length"``, ``truncation=True`` and
    ``max_length=128``; whatever it returns is passed straight back to the caller.
    """
    tokenizer_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    texts = example["text"]
    return tokenizer(texts, **tokenizer_options)

# Run the tokenizer over the dataset in batches, then switch the
# output format of the result to "torch".
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)
tokenized_dataset.set_format(type="torch")

# Class names in label-id order (index == class id), matching the list the
# inference cell of this notebook uses to decode predictions.
# Storing them in the model config makes the saved checkpoint self-describing:
# consumers can read model.config.id2label instead of re-hardcoding the list.
label_names = ["World", "Sports", "Business", "Sci/Tech"]

# defining model: pretrained "bert-base-uncased" weights plus a classification head
# whose size is derived from the label list rather than a separate magic number.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_names),
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

# Hyper-parameters for the fine-tuning run
training_args = TrainingArguments(
    output_dir="/content/results",
    # optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batch sizes (per device)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # step-based logging every 100 steps; checkpoint saving is switched off
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="no",
)

def compute_metrics(eval_pred):
    """Score predictions with accuracy and weighted F1.

    Args:
        eval_pred: A two-item ``(logits, labels)`` pair; it is unpacked directly.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    raw_scores, gold = eval_pred
    # The predicted class is the index of the largest score along the last axis.
    predicted = np.argmax(raw_scores, axis=-1)
    acc = accuracy_score(gold, predicted)
    weighted_f1 = f1_score(gold, predicted, average="weighted")
    return {"accuracy": acc, "f1": weighted_f1}

# Wire model, hyper-parameters, data splits and metric callback into one Trainer.
# The "test" split is what a later argument-less trainer.evaluate() will score.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# fine tuning start
trainer.train()

# evaluating on the eval_dataset handed to the Trainer (the "test" split).
# Keep and print the metrics: a bare call in the middle of a cell is not displayed,
# so the result would otherwise be computed and silently thrown away.
eval_metrics = trainer.evaluate()
print("Evaluation Metrics:", eval_metrics)
# Persist the fine-tuned model and then the tokenizer into the same Drive folder.
drive_save_path = "/content/drive/MyDrive/fine_tuned_model"
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(drive_save_path)

print(f"Model and tokenizer successfully saved to: {drive_save_path}")

Transformers version: 5.0.0
Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]



data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Step,Training Loss
100,0.795013
200,0.421669
300,0.417306
400,0.374497
500,0.401591
600,0.40099
700,0.283885
800,0.376141
900,0.372
1000,0.30468


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model and tokenizer successfully saved to: /content/drive/MyDrive/fine_tuned_model


In [4]:
# loading from drive
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
import numpy as np
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from google.colab import drive
# Mount Google Drive so the checkpoint directory written by the training cell is reachable.
drive.mount('/content/drive')
# Same directory the training cell saved the model and tokenizer to.
drive_save_path = "/content/drive/MyDrive/fine_tuned_model"

# Restore the tokenizer and the sequence-classification model from that directory.
tokenizer = AutoTokenizer.from_pretrained(drive_save_path)
model = AutoModelForSequenceClassification.from_pretrained(drive_save_path)

Mounted at /content/drive


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

In [5]:
# loading dataset (the tokenizer itself was already restored from Drive in an earlier cell)
dataset = load_dataset("ag_news")

def tokenize_function(example):
    """Tokenize ``example["text"]`` with the module-level ``tokenizer``.

    Uses ``padding="max_length"``, ``truncation=True`` and ``max_length=128``.
    """
    return tokenizer(
        example["text"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

# Tokenize the dataset in batches and switch the result's output format to "torch".
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)
tokenized_dataset.set_format(type="torch")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]



data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [6]:
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a ``(logits, labels)`` pair.

    The predicted class is the argmax of the logits over the last axis.
    The result is a dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, targets = eval_pred
    guesses = np.argmax(scores, axis=-1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(targets, guesses)
    metrics["f1"] = f1_score(targets, guesses, average="weighted")
    return metrics

In [7]:
# Evaluation-only Trainer: no TrainingArguments and no datasets are passed here,
# so each split has to be handed to trainer.evaluate() explicitly.
trainer = Trainer(compute_metrics=compute_metrics, model=model)

In [None]:
# NOTE(review): this only rebinds the name `model`. The `trainer` built in the previous cell
# was constructed with the model object loaded earlier and keeps that reference, so this
# reload does not change what trainer.evaluate() scores.
model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/fine_tuned_model")

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

In [9]:

# Score the model on the data it was trained on ...
train_metrics = trainer.evaluate(eval_dataset=tokenized_dataset["train"])
print("Training Metrics:", train_metrics)

# ... and on the "test" split, which this notebook reports as its validation set.
val_metrics = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print("Validation Metrics:", val_metrics)

Training Metrics: {'eval_loss': 0.09239621460437775, 'eval_model_preparation_time': 0.0085, 'eval_accuracy': 0.9757583333333333, 'eval_f1': 0.9757691900929406, 'eval_runtime': 913.9438, 'eval_samples_per_second': 131.299, 'eval_steps_per_second': 16.412}
Validation Metrics: {'eval_loss': 0.21950924396514893, 'eval_model_preparation_time': 0.0085, 'eval_accuracy': 0.9492105263157895, 'eval_f1': 0.9492466646998725, 'eval_runtime': 62.289, 'eval_samples_per_second': 122.012, 'eval_steps_per_second': 15.251}


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Reload the fine-tuned checkpoint from Drive and put the model in eval mode.
model_path = "/content/drive/MyDrive/fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path).eval()

# Single example sentence to classify.
text = "Apple is releasing new products this fall."
inputs = tokenizer(
    text,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
predicted_class_id = logits.argmax(dim=1).item()

# Position in this list == class id predicted by the model.
labels = ["World", "Sports", "Business", "Sci/Tech"]
predicted_label = labels[predicted_class_id]

print(f"Sentence: {text}")
print(f"Predicted label: {predicted_label}")

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Sentence: Apple is releasing new products this fall.
Predicted label: Sci/Tech
