## Model Training with TinyBERT

In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
import torch
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization are unavailable.


In [2]:
# Load the tokenizer for the chosen checkpoint (the model itself is loaded in a later cell).
model_name = "huawei-noah/TinyBERT_General_4L_312D"
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)



In [10]:
# Tokenization function
def tokenize_function(examples):
    """Tokenize the "text" column of a batch of examples.

    Every entry of ``examples["text"]`` is passed through ``str()`` first, so
    non-string values are tokenized by their string form (note that ``None``
    therefore becomes the text "None"). The resulting list is handed to the
    module-level ``tokenizer`` with truncation enabled and padding to a fixed
    length of 128.

    Args:
        examples: Mapping with a "text" key holding an iterable of values.

    Returns:
        Whatever the ``tokenizer`` call returns for the batch.
    """
    raw_texts = examples["text"]
    as_strings = list(map(str, raw_texts))
    return tokenizer(
        as_strings,
        truncation=True,
        padding="max_length",
        max_length=128,
    )


In [5]:
# Read the pre-split CSV files into one dataset object exposing the
# "train", "validation" and "test" splits.
dataset = load_dataset(
    path='csv',
    data_files=dict(
        train='../processed-datasets/train.csv',
        validation='../processed-datasets/val.csv',
        test='../processed-datasets/test.csv',
    ),
)


Generating train split: 21001 examples [00:00, 643577.91 examples/s]
Generating validation split: 2334 examples [00:00, 367625.73 examples/s]
Generating test split: 5834 examples [00:00, 621828.40 examples/s]


In [7]:
# Sanity check: show which columns the CSV loader produced for the training split.
print(dataset['train'].column_names)

['text', 'label']


In [9]:
# Apply tokenize_function to the loaded dataset, handing it rows in batches
# rather than one example at a time.
tokenized_datasets = dataset.map(
    function=tokenize_function,
    batched=True,
)

Map: 100%|██████████| 21001/21001 [00:01<00:00, 16841.89 examples/s]
Map: 100%|██████████| 2334/2334 [00:00<00:00, 20185.42 examples/s]
Map: 100%|██████████| 5834/5834 [00:00<00:00, 20552.35 examples/s]


In [11]:
# Build the sequence-classification model from the same checkpoint as the
# tokenizer, configured for 3 output labels.
# NOTE(review): ignore_mismatched_sizes=True presumably only matters when the
# checkpoint ships weights whose shapes differ from this configuration —
# confirm whether it is actually needed here.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name,
    ignore_mismatched_sizes=True,
    num_labels=3,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Metrics function
def compute_metrics(eval_pred):
    """Score a batch of predictions with accuracy and weighted F1.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class for each
            row is the index of the largest value along the last axis of
            ``logits``.

    Returns:
        A dict with two entries: "accuracy" (from ``accuracy_score``) and
        "f1" (from ``f1_score`` with ``average="weighted"``).
    """
    scores, references = eval_pred
    # Highest-scoring class index per row.
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted)
    weighted_f1 = f1_score(references, predicted, average="weighted")
    return dict(accuracy=accuracy, f1=weighted_f1)


In [None]:
# Fine-tuning configuration, grouped by concern.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before running.
training_args = TrainingArguments(
    # Output location and logging; no external experiment tracker.
    output_dir="./results",
    report_to="none",
    logging_steps=50,
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    fp16=False,  # keep False unless running on a GPU that supports fp16
    # Evaluate and save on the same per-epoch schedule so the best
    # checkpoint (ranked by the "f1" metric) can be reloaded at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)


In [15]:
# Wire the training configuration, model, metric function and the
# train/validation splits into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_datasets["validation"],
    train_dataset=tokenized_datasets["train"],
)

In [16]:
# Start fine-tuning with the Trainer built in the previous cell. The
# training arguments in this notebook request evaluation and a checkpoint
# save once per epoch.
trainer.train()

  0%|          | 12/3285 [00:10<49:08,  1.11it/s] 

KeyboardInterrupt: 