## Fine-Tune a Pretrained Model with `Trainer`

In [None]:
# 1) Loading the tokenizer & the model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Pretrained checkpoint shared by the tokenizer and the model.
checkpoint = "bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# IMDB sentiment is a two-class task (negative / positive), so the
# classification head needs two output logits. With num_labels=1 the head is
# treated as a regression output and an argmax over it is always 0.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# 2) Loading the dataset
from datasets import load_dataset

# Load the IMDB reviews dataset; the result is indexed by split name in the
# later cells (e.g. "train").
dataset = load_dataset(path="stanfordnlp/imdb")

In [None]:
# 3) Tokenize the dataset
def tokenize(example):
    """Tokenize the "text" field of `example`, truncating over-long inputs."""
    texts = example["text"]
    return tokenizer(texts, truncation=True)


# batched=True passes many rows per call, so `example["text"]` is a list of
# strings rather than a single string.
tokenized_dataset = dataset.map(tokenize, batched=True)

In [None]:
# 4) Prepare padding for the sequences
from transformers import DataCollatorWithPadding

# Collator that pads the tokenized examples when they are assembled into a
# batch; tokenization above only truncates and leaves lengths uneven.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# 5) Prepare the Hyperparameters for training & evaluation
from transformers import TrainingArguments

# "checkpoint" here is the literal output directory name, not the `checkpoint`
# variable holding the model id. Evaluation runs once per epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="checkpoint",
    evaluation_strategy="epoch",
)

In [None]:
# 6) Setup the Trainer, and start training
from transformers import Trainer

train_dataset = tokenized_dataset["train"]
# Evaluate on the labelled "test" split. The "unsupervised" split has no
# sentiment annotations (its label column is the placeholder -1), so it cannot
# be used to compute an evaluation loss or metrics.
eval_dataset = tokenized_dataset["test"]

# The collator pads each batch; the tokenizer is passed so it is saved
# alongside the model checkpoints.
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

In [None]:
# 7) Check how good or bad the model’s performance is
import numpy as np
import evaluate

# Run inference over the evaluation set. The result carries the raw model
# outputs in `.predictions` and the reference labels in `.label_ids`.
predictions = trainer.predict(eval_dataset)

# Pick the highest-scoring class index for every example.
preds = np.argmax(predictions.predictions, axis=-1)

# "stanfordnlp/imdb" is a dataset id, not a metric; `evaluate.load` needs a
# metric name. Accuracy is the usual choice for the balanced two-class IMDB task.
metric = evaluate.load("accuracy")

metric.compute(predictions=preds, references=predictions.label_ids)