## Model Training with TinyBERT

In [1]:
!pip install transformers==4.40.1 torch==2.7.1 peft==0.4.0 optimum==1.26.1 onnxruntime==1.22.0

Collecting transformers==4.40.1
  Downloading transformers-4.40.1-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch==2.7.1
  Downloading torch-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting peft==0.4.0
  Downloading peft-0.4.0-py3-none-any.whl.metadata (21 kB)
Collecting optimum==1.26.1
  Downloading optimum-1.26.1-py3-none-any.whl.metadata (16 kB)
Collecting onnxruntime==1.22.0
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.40.1)
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting sympy>=1.13.3 (from torch==2.7.1)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch==2.7.

In [2]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
import torch
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

2025-07-01 13:39:10.294684: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751377150.542760      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751377150.601660      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Load tokenizer (the classification model itself is instantiated in the
# "Model setup" cell from the same checkpoint name).
# Hub identifier of the pretrained TinyBERT checkpoint; reused when the model is created.
model_name = "huawei-noah/TinyBERT_General_4L_312D"
# Tokenizer that belongs to that checkpoint; used by tokenize_function and saved at the end.
tokenizer = AutoTokenizer.from_pretrained(model_name)



config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

In [4]:
# Tokenization function
def tokenize_function(examples, max_length=128):
    """Tokenize one batch of rows for the sequence classifier.

    Args:
        examples: Batch mapping with a "text" entry that holds the raw value
            of every row in the batch (the shape ``Dataset.map`` passes when
            called with ``batched=True``).
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128, the value that used to be hard-coded here.

    Returns:
        The output of the module-level ``tokenizer`` for the whole batch.
    """
    # Coerce every value to a string so non-string cells (e.g. numbers) do not
    # break the tokenizer. Missing cells become "" rather than the literal
    # word "None", which would otherwise be tokenized as if it were real text.
    texts = ["" if value is None else str(value) for value in examples["text"]]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=max_length)


In [5]:
# Load datasets
# Directory that holds the three CSV splits. Kept in one place so the notebook
# can be pointed at another copy of the data by editing a single line.
DATA_DIR = "/kaggle/input/sentiment-analysis-data"

# One CSV file per split; the keys become the split names of the returned dataset.
dataset = load_dataset('csv', data_files={
    'train': f'{DATA_DIR}/train.csv',
    'validation': f'{DATA_DIR}/val.csv',
    'test': f'{DATA_DIR}/test.csv'
})


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [6]:
# Show the schema of the training split and fail fast if a column the rest of
# the notebook relies on is missing: "text" is read by tokenize_function and
# "label" is the target column.
train_columns = dataset['train'].column_names
print(train_columns)
assert {"text", "label"} <= set(train_columns), f"unexpected columns: {train_columns}"

['text', 'label']


In [7]:
# Tokenize every split of `dataset`. batched=True hands tokenize_function a
# batch of rows per call, which is why it reads examples["text"] as a list.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/21001 [00:00<?, ? examples/s]

Map:   0%|          | 0/2334 [00:00<?, ? examples/s]

Map:   0%|          | 0/5834 [00:00<?, ? examples/s]

In [8]:
# Model setup
# Options for the classification head that sits on top of the pretrained
# encoder: three output classes, and tolerate checkpoint weights whose shapes
# do not match the requested architecture.
head_options = {
    "num_labels": 3,
    "ignore_mismatched_sizes": True,
}
model = AutoModelForSequenceClassification.from_pretrained(model_name, **head_options)



pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Metrics function
def compute_metrics(eval_pred):
    """Turn raw evaluation output into the metrics reported by the Trainer.

    Args:
        eval_pred: Pair of (logits, labels) for the evaluated examples.

    Returns:
        Dict with "accuracy" and the weighted-average "f1" of the predictions,
        where each prediction is the index of the largest logit.
    """
    raw_scores, gold = eval_pred
    # Highest-scoring class along the last axis is the predicted label.
    predicted = np.argmax(raw_scores, axis=-1)
    accuracy = accuracy_score(gold, predicted)
    weighted_f1 = f1_score(gold, predicted, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}


In [10]:
# Training arguments
# Mixed-precision (fp16) training needs a CUDA device. Detect it instead of
# hard-coding True so the notebook also runs on a CPU-only machine without
# having to edit this cell by hand.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./results",            # checkpoints are written here
    evaluation_strategy="epoch",       # evaluate on the validation split after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",             # same cadence as evaluation, needed to pick the best checkpoint
    load_best_model_at_end=True,       # restore the best checkpoint once training finishes
    metric_for_best_model="f1",        # the "f1" value returned by compute_metrics
    greater_is_better=True,            # a higher F1 is better (made explicit)
    fp16=use_fp16,
    seed=42,                           # explicit for reproducibility (same as the library default)
    logging_steps=50,
    report_to="none"                   # no external experiment tracker
)


In [11]:
# Trainer
# Training runs on the "train" split; the per-epoch evaluation uses "validation".
train_split = tokenized_datasets["train"]
validation_split = tokenized_datasets["validation"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=validation_split,
    compute_metrics=compute_metrics,
)

In [12]:
# Train
# Runs the fine-tuning loop with the arguments configured above. Deliberately
# left as the last expression of the cell so the returned TrainOutput summary
# (global step, training loss, runtime metrics) is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4482,0.446175,0.837189,0.838502
2,0.3322,0.400277,0.846187,0.849955
3,0.2833,0.322587,0.885176,0.886376
4,0.244,0.309457,0.892031,0.893131
5,0.2335,0.299599,0.898029,0.898824


TrainOutput(global_step=3285, training_loss=0.35635458192868863, metrics={'train_runtime': 161.8189, 'train_samples_per_second': 648.904, 'train_steps_per_second': 20.3, 'total_flos': 376441816147200.0, 'train_loss': 0.35635458192868863, 'epoch': 5.0})

In [13]:
# Evaluate
# Score the held-out test split. The metric keys in the returned dict carry
# the "eval_" prefix (e.g. eval_accuracy, eval_f1).
test_split = tokenized_datasets["test"]
eval_results = trainer.evaluate(eval_dataset=test_split)
print(f"Test results: {eval_results}")

Test results: {'eval_loss': 0.2943633198738098, 'eval_accuracy': 0.8959547480287967, 'eval_f1': 0.8963773876807698, 'eval_runtime': 3.5219, 'eval_samples_per_second': 1656.48, 'eval_steps_per_second': 51.96, 'epoch': 5.0}


In [14]:

# Save model
# Model weights/config and tokenizer files go into the same directory so the
# folder can be loaded back as one unit.
export_dir = "/kaggle/working/tinybert-sentiment"
model.save_pretrained(export_dir)
# Kept as the last expression so the list of written tokenizer files is displayed.
tokenizer.save_pretrained(export_dir)

('/kaggle/working/tinybert-sentiment/tokenizer_config.json',
 '/kaggle/working/tinybert-sentiment/special_tokens_map.json',
 '/kaggle/working/tinybert-sentiment/vocab.txt',
 '/kaggle/working/tinybert-sentiment/added_tokens.json',
 '/kaggle/working/tinybert-sentiment/tokenizer.json')