In [2]:
! pip install datasets transformers[sentencepiece]


[notice] A new release of pip available: 22.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint name shared by the tokenizer here and the model loaded in a later cell.
checkpoint = "bert-base-cased"
# "mrpc" configuration of the "glue" dataset.
raw_datasets = load_dataset(path="glue", name="mrpc")
# Tokenizer loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples):
    """Tokenize the sentence pairs held in ``examples``.

    Reads the "sentence1" and "sentence2" entries of ``examples`` and passes
    them as a pair to the module-level ``tokenizer`` with truncation enabled.
    No padding is requested here; this notebook builds a
    ``DataCollatorWithPadding`` separately. Returns whatever the tokenizer
    returns.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

# batched=True hands tokenize_function a batch of examples per call rather than one at a time.
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)
# Collator built from the same tokenizer; it is passed to the Trainer as data_collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [4]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a two-label sequence-classification head.
# The warning printed by this call reports that 'classifier.weight' and
# 'classifier.bias' are newly initialized, so the model needs fine-tuning before use.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# Uncomment to inspect the architecture:
# print(model)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from transformers import TrainingArguments # Holds the training settings that are passed to the Trainer

# Only the first positional argument ("test-trainer") is supplied; everything else is left at its default.
# NOTE: the next cell reassigns training_args with explicit settings, so this value is superseded.
training_args = TrainingArguments("test-trainer")

In [6]:
from transformers import TrainingArguments

# Explicit hyperparameters for the first fine-tuning run.
training_args = TrainingArguments(
    output_dir="test-trainer",
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,  # regularization: penalizes large weights to reduce overfitting
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

In [7]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,  # training settings defined in the previous cell
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)
# A validation split IS passed above, yet the log below shows only the training loss:
# no compute_metrics function is given and training_args does not set evaluation_strategy,
# so no accuracy/F1 is reported during this run.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.472
1000,0.1887


TrainOutput(global_step=1150, training_loss=0.3009665580417799, metrics={'train_runtime': 92.7948, 'train_samples_per_second': 197.64, 'train_steps_per_second': 12.393, 'total_flos': 743901286817760.0, 'train_loss': 0.3009665580417799, 'epoch': 5.0})

In [12]:
# Run the fine-tuned model over the validation split. The result exposes the raw
# model outputs as .predictions and the reference labels as .label_ids.
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)
# A single print with a newline separator emits the same text as two separate prints.
print(predictions.predictions, predictions.label_ids, sep="\n")

(408, 2) (408,)
[[-3.7294462   3.5800664 ]
 [ 2.2895823  -2.762918  ]
 [ 2.3971639  -2.7603602 ]
 [-3.4544723   3.3419147 ]
 [ 2.3377914  -2.8346415 ]
 [-3.6039479   3.4062026 ]
 [-3.4565754   3.239911  ]
 [-3.5567927   3.4100459 ]
 [-3.5135212   3.4019623 ]
 [-3.7371356   3.6474023 ]
 [-3.7279482   3.6426263 ]
 [ 2.3157458  -2.623676  ]
 [ 1.1354722  -1.4750443 ]
 [-3.2140887   3.0884614 ]
 [-3.6602576   3.4905145 ]
 [ 1.3785053  -1.7331703 ]
 [-3.5494585   3.4154987 ]
 [ 1.0237976  -1.3907193 ]
 [-3.7088525   3.55927   ]
 [ 2.4243627  -2.7525692 ]
 [ 1.9734621  -2.2701147 ]
 [-3.3525794   3.202755  ]
 [ 2.067732   -2.6602693 ]
 [-3.7118523   3.521712  ]
 [-3.3663173   3.1977322 ]
 [-2.5777876   2.4168923 ]
 [-2.499741    2.2923636 ]
 [-3.7272248   3.5977354 ]
 [-3.128612    2.8779225 ]
 [-3.5742106   3.4678948 ]
 [ 1.72036    -2.2012115 ]
 [-3.649586    3.4632852 ]
 [-3.4231124   3.2625098 ]
 [-3.3250952   3.1777716 ]
 [-3.6424305   3.5893795 ]
 [-3.3338046   3.2462592 ]
 [ 1.516792 

In [13]:
import numpy as np
from datasets import load_metric

# Metric for the "mrpc" configuration of "glue"; the result below reports accuracy and F1.
metric = load_metric("glue", "mrpc")
# The index of the largest logit in each row is taken as the predicted class.
preds = predictions.predictions.argmax(axis=-1)
print(preds)
metric.compute(references=predictions.label_ids, predictions=preds)

[1 0 0 1 0 1 1 1 1 1 1 0 0 1 1 0 1 0 1 0 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 0
 0 0 1 0 1 0 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 0 0 1 1
 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 0 1 1 0 0 1 1 1 1 0 1 0 1 1 1
 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 0 1 1 0 1 1 0 0 1 1 1
 1 0 1 0 1 1 0 0 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 0 1 1 1 1 0 1 1 1 1 0 1 0 1
 1 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1
 0 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 1 1 0
 0 1 1 1 1 1 1 0 1 1 0 1 0 0 1 1 1 0 1 0 1 1 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1
 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
 1 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 0
 1]


{'accuracy': 0.8553921568627451, 'f1': 0.8987993138936535}

In [14]:
# Same load_metric("glue", "mrpc") call as in the previous cell; compute_metrics below reads this module-level name.
metric = load_metric("glue", "mrpc")

def compute_metrics(eval_preds):
    """Score evaluation outputs with the module-level ``metric``.

    ``eval_preds`` is unpacked into two values: the model's logits and the
    reference labels. The predicted class is the argmax over the last axis of
    the logits. Returns the result of ``metric.compute``.
    """
    scores, references = eval_preds
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [15]:
# Fresh arguments: only evaluation_strategy is set, so none of the explicit
# hyperparameters from the earlier TrainingArguments cell carry over to this run.
training_args = TrainingArguments(output_dir="test-trainer", evaluation_strategy="epoch")
# Load the checkpoint again so this run does not continue from the model trained above.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # supplies the Accuracy and F1 columns in the per-epoch log
)
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.469034,0.808824,0.86121
2,0.558100,0.62308,0.794118,0.867089
3,0.360600,0.617194,0.857843,0.900685


TrainOutput(global_step=1377, training_loss=0.39661268401855826, metrics={'train_runtime': 75.3841, 'train_samples_per_second': 145.972, 'train_steps_per_second': 18.266, 'total_flos': 419378466692640.0, 'train_loss': 0.39661268401855826, 'epoch': 3.0})