# Fine-tuning a model with the Trainer API

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



Map:   0%|          | 0/408 [00:00<?, ? examples/s]

In [2]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

In [3]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
)

In [5]:
trainer.train()



Step,Training Loss
500,0.5128
1000,0.2618




TrainOutput(global_step=1377, training_loss=0.3130852000472679, metrics={'train_runtime': 244.4419, 'train_samples_per_second': 45.017, 'train_steps_per_second': 5.633, 'total_flos': 405114969714960.0, 'train_loss': 0.3130852000472679, 'epoch': 3.0})

In [None]:
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)



(408, 2) (408,)


In [9]:
print(predictions[0])
import numpy as np

# Convert predictions to text labels
predicted_labels = np.argmax(predictions.predictions, axis=-1)

# Define label mapping (assuming 0=negative, 1=positive for binary classification)
label_map = {0: "negative", 1: "positive"}

# Convert numeric predictions to text
predicted_text = [label_map[label] for label in predicted_labels]

print("Predicted labels (text):", predicted_text[:10])  # Show first 10 predictions
print("Actual labels:", predictions.label_ids[:10])     # Show first 10 actual labels


[[-4.3125167   3.397542  ]
 [ 3.434176   -3.1164083 ]
 [ 2.549814   -2.249127  ]
 [-4.10897     3.1781025 ]
 [ 3.175527   -2.9371104 ]
 [-4.0245776   3.133544  ]
 [-3.0331767   2.2694478 ]
 [-4.1991687   3.283338  ]
 [-4.0933666   3.174333  ]
 [-4.280208    3.3939693 ]
 [-4.187856    3.2638555 ]
 [ 3.1788187  -2.947845  ]
 [ 2.97925    -2.523234  ]
 [-4.248465    3.332658  ]
 [-4.310426    3.3955853 ]
 [ 2.1280303  -2.1245897 ]
 [-4.3525424   3.4757295 ]
 [ 3.018062   -2.5658617 ]
 [-4.319159    3.383936  ]
 [-1.4124635   0.8516994 ]
 [ 3.238174   -2.9236133 ]
 [-3.6577978   2.8079352 ]
 [-1.6399399   1.138107  ]
 [-4.293837    3.4266586 ]
 [-4.0503545   3.1629257 ]
 [-3.3685963   2.757372  ]
 [ 1.0169362  -1.0440799 ]
 [-4.3235393   3.4449766 ]
 [-4.2300477   3.3028662 ]
 [-4.322883    3.4603233 ]
 [-1.3997769   0.8140237 ]
 [-4.3030267   3.3750987 ]
 [-3.8932664   3.0557518 ]
 [-4.051108    3.1978698 ]
 [-4.211636    3.2865152 ]
 [-4.019048    3.0803156 ]
 [ 3.23152    -2.99284   ]
 

In [13]:
import numpy as np

preds = np.argmax(predictions.predictions, axis=-1)

import evaluate

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.8602941176470589, 'f1': 0.9018932874354562}

In [14]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [15]:
training_args = TrainingArguments("test-trainer", eval_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.468281,0.762255,0.8438
2,0.591300,0.435301,0.835784,0.883072
3,0.371600,0.692246,0.845588,0.894118




TrainOutput(global_step=1377, training_loss=0.40628136528862846, metrics={'train_runtime': 253.7216, 'train_samples_per_second': 43.37, 'train_steps_per_second': 5.427, 'total_flos': 405114969714960.0, 'train_loss': 0.40628136528862846, 'epoch': 3.0})