In [1]:
# Use the %pip magic rather than a `!` shell escape: %pip installs into the
# environment of the kernel running this notebook, whereas `! pip` runs
# whichever `pip` executable happens to be first on the shell PATH.
%pip install datasets transformers[sentencepiece]


[notice] A new release of pip available: 22.1.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Pretrained checkpoint name; reused later when the classification model is loaded.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# The "mrpc" configuration of the "glue" dataset.
raw_datasets = load_dataset("glue", "mrpc")

def tokenize_function(examples):
    """Tokenize the two sentence columns of a batch as sentence pairs.

    Args:
        examples: Mapping that provides the "sentence1" and "sentence2" columns.

    Returns:
        Whatever the module-level `tokenizer` returns when called with both
        columns and truncation enabled.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

# Collator built from the tokenizer; it is handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Run tokenize_function over the loaded dataset, feeding it examples in batches.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [3]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a sequence-classification head for two
# labels. The warning printed below this cell lists classifier.weight and
# classifier.bias as newly initialized: they are not read from the checkpoint,
# so the model has to be fine-tuned before its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# print(model)  (uncomment to inspect the model's layers)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from transformers import TrainingArguments # Used to fine tune the training for a model

# Minimal configuration: only the positional "test-trainer" argument is given
# (presumably the output directory — confirm against the TrainingArguments docs).
# NOTE(review): this value is never used; the next cell rebinds `training_args`
# before any Trainer is built, so this cell looks redundant.
training_args = TrainingArguments("test-trainer")

In [5]:
from transformers import TrainingArguments

# Training configuration consumed by the Trainer built in the next cell.
training_args = TrainingArguments(
    "test-trainer",  # presumably the output directory — confirm against the TrainingArguments docs
    per_device_train_batch_size=16,  # training batch size on each device
    per_device_eval_batch_size=16,  # evaluation batch size on each device
    num_train_epochs=5,  # the training log below ends at epoch 5.0
    learning_rate=2e-5,
    weight_decay=0.01, # regularization: penalizes large weights to help limit overfitting
)

In [6]:
from transformers import Trainer

# Bundle the model, training arguments, data splits, collator and tokenizer
# into a Trainer.
trainer = Trainer(
    model,
    training_args, # Training parameters used to update the neural network
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
# Only the training loss is reported for this run. A validation split IS passed
# above, but accuracy/F1 do not appear because no `compute_metrics` function is
# given and `training_args` sets no evaluation strategy. The later cell that
# adds both (evaluation_strategy="epoch" and compute_metrics=...) reports
# validation loss, accuracy and F1 once per epoch.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.3982
1000,0.098


TrainOutput(global_step=1150, training_loss=0.22155779465385106, metrics={'train_runtime': 94.2035, 'train_samples_per_second': 194.685, 'train_steps_per_second': 12.208, 'total_flos': 743901286817760.0, 'train_loss': 0.22155779465385106, 'epoch': 5.0})

In [7]:
# Run the fine-tuned model over the validation split only (not the whole
# dataset). Per the printed shapes, `.predictions` is (408, 2) — one row of raw
# scores per example, one column per label — and `.label_ids` is (408,).
predictions = trainer.predict(tokenized_datasets["validation"]) # predict() returns the model outputs for the dataset it is given
print(predictions.predictions.shape, predictions.label_ids.shape)
# NOTE(review): the two prints below dump the full arrays into the cell output;
# a short slice would keep the notebook easier to read.
print(predictions.predictions)
print(predictions.label_ids)

(408, 2) (408,)
[[-2.677076    3.0280995 ]
 [ 2.8259056  -3.0882368 ]
 [ 2.4144921  -2.9509084 ]
 [-2.6257489   3.0335543 ]
 [ 2.3152938  -2.938144  ]
 [-2.660738    3.0578988 ]
 [-2.4568846   2.9711285 ]
 [-2.6893346   2.948603  ]
 [-2.5630639   2.9925158 ]
 [-2.6733344   3.0812387 ]
 [-2.6952991   3.0866091 ]
 [ 3.0237331  -2.9747393 ]
 [ 2.1961634  -2.953766  ]
 [-0.263542    0.34998614]
 [-2.678567    3.0798464 ]
 [ 2.288944   -2.3680923 ]
 [-2.6846983   3.091218  ]
 [ 2.8327127  -3.1319213 ]
 [-2.6582778   3.0509624 ]
 [-0.502315    0.7085883 ]
 [ 2.4029336  -2.9372969 ]
 [-2.523209    3.0015824 ]
 [ 2.147504   -2.1778803 ]
 [-2.6857212   3.0682604 ]
 [-2.6612973   3.049786  ]
 [ 1.6274558  -1.9360296 ]
 [-2.6061559   2.78589   ]
 [-2.6707876   3.093423  ]
 [-2.6802852   2.9937575 ]
 [-2.6880395   3.032159  ]
 [-0.84358394  0.707124  ]
 [-2.6961794   3.0562732 ]
 [-2.6191542   2.9624639 ]
 [-2.5617514   3.0120604 ]
 [-2.3586142   2.5132258 ]
 [-2.6940022   3.0646696 ]
 [ 2.2443101

In [8]:
import numpy as np
from datasets import load_metric

# NOTE(review): the cell output echoes the line below the way a Python warning
# does; `load_metric` appears to be deprecated in `datasets` in favor of the
# separate `evaluate` package — confirm, and migrate when the environment allows.
metric = load_metric("glue", "mrpc")
preds = np.argmax(predictions.predictions, axis=-1) # Index of the largest logit in each row is the predicted class
print(preds)
# Last expression of the cell: displayed below as the accuracy/F1 dictionary.
metric.compute(predictions=preds, references=predictions.label_ids)

  metric = load_metric("glue", "mrpc")


[1 0 0 1 0 1 1 1 1 1 1 0 0 1 1 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 0
 0 1 1 0 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 0 1 1
 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 0 1 1 0 0 1 1 1 1 0 1 0 1 1 1
 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 0 1 1 1 1 1 0 1 1 0 0 1 1 1
 1 0 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 0 1 1 0 1 1 1 1 0 1 1 1
 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1
 0 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 0 1 1 0 1 1 1 0 0 1 1 1 1 1 0 0 1 1 1 0
 0 1 1 0 1 1 1 0 1 1 0 1 0 1 0 1 1 0 1 1 1 1 0 0 1 0 0 1 0 1 1 1 1 1 1 1 1
 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 0
 0 1 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0
 1]


{'accuracy': 0.8578431372549019, 'f1': 0.901023890784983}

In [9]:
# Rebinds `metric` to the same GLUE/MRPC metric the previous cell already
# loaded; compute_metrics below reads this module-level name.
metric = load_metric("glue", "mrpc")

def compute_metrics(eval_preds):
    """Score evaluation output with the module-level `metric`.

    Args:
        eval_preds: Object that unpacks into a (logits, labels) pair.

    Returns:
        The result of `metric.compute` for the arg-max class of each row of
        logits, compared against the labels.
    """
    raw_scores, gold_labels = eval_preds
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

In [10]:
# Second training run, this time evaluating once per epoch.
# - `training_args` is rebuilt with only evaluation_strategy="epoch"; the batch
#   size, epoch count, learning rate and weight decay set in the earlier
#   configuration are not repeated here (the log below ends at epoch 3.0).
# - `model` is re-loaded from `checkpoint`, so training starts again from the
#   pretrained weights rather than continuing from the model fine-tuned above.
# NOTE(review): newer transformers releases may spell this option
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # adds the Accuracy and F1 columns to the per-epoch table
)
# NOTE(review): in the table below the validation loss rises from epoch 1 to
# epoch 3 while the training loss falls — possibly overfitting; worth checking.
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.429624,0.789216,0.828685
2,0.545400,0.460905,0.833333,0.887789
3,0.337800,0.603157,0.865196,0.904014


TrainOutput(global_step=1377, training_loss=0.3668887457643286, metrics={'train_runtime': 80.4562, 'train_samples_per_second': 136.77, 'train_steps_per_second': 17.115, 'total_flos': 419378466692640.0, 'train_loss': 0.3668887457643286, 'epoch': 3.0})