In [19]:
!pip install -q --upgrade transformers torch torchvision torchaudio
!pip install -q tokenizers==0.13.3 evaluate
!pip install -q bitsandbytes transformers accelerate gradio thread6

[0m

In [3]:
import os

# Set the TOKENIZERS_PARALLELISM environment variable to "true" for this process.
os.environ.update(TOKENIZERS_PARALLELISM="true")

#### Transformers provides a 'Trainer' class to help you fine-tune any of the pretrained models it provides on your dataset.
   * Building on what we've done in the last few chapters, only a few steps are left to define the Trainer.
   * The hardest part is likely to be preparing the environment to run 'Trainer.train()' on a GPU
   

In [1]:
# The code is a culmination of everything we've done so far
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint name used to load the tokenizer below.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# MRPC configuration of the GLUE dataset.
raw_datasets = load_dataset("glue", "mrpc")


def tokenize_function(example):
    """Tokenize the 'sentence1' / 'sentence2' fields of an example as a pair.

    Args:
        example: Mapping with 'sentence1' and 'sentence2' entries.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the two inputs,
        called with ``truncation=True``.
    """
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)


# Collator built from the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)
# Apply the tokenize function to the raw datasets in batched mode.
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

Downloading builder script:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/mrpc (download: 1.43 MiB, generated: 1.43 MiB, post-processed: Unknown size, total: 2.85 MiB) to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

#### The first step is to create a 'TrainingArguments' object that will contain all the hyperparameters the 'Trainer' will use for training and evaluation.
#### The only thing you have to provide is the directory where the model will be saved


In [4]:
# Create the 'TrainingArguments' for the Trainer
from transformers import TrainingArguments

# Only the output directory is given; no other argument is passed.
training_args = TrainingArguments(output_dir="test-trainer")

In [5]:
# Define our model

# Loading prints a warning because this BERT checkpoint was not pretrained on classifying sentences:
# its original head has been discarded and a new head suitable for sequence classification has been added instead.
# The warning lists the weights that were newly (randomly) initialized for the new head ('classifier.weight', 'classifier.bias').

# In short, the warning is telling you to train the model before using it, since the new head is untrained
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Hand every object built so far to a Trainer
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

# NOTE: the 'data_collator=data_collator' line could be skipped.
# Passing 'tokenizer' makes the Trainer use DataCollatorWithPadding by default, so the line is redundant; it is kept here for learning purposes.

In [7]:
# To fine-tune the model we simply call the 'train()' method on our 'Trainer'

# This will fine-tune the model on our dataset but won't tell us how well it is performing, because:
    # We didn't tell the 'Trainer' to evaluate during training by setting evaluation_strategy to either 'steps' (evaluate every eval_steps) or 'epoch' (evaluate at the end of each epoch)
    # We didn't provide the 'Trainer' with a 'compute_metrics()' function to calculate a metric during said evaluation (otherwise the evaluation would just print the loss)
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.5206
1000,0.2829


TrainOutput(global_step=1377, training_loss=0.32630309282910935, metrics={'train_runtime': 137.5855, 'train_samples_per_second': 79.979, 'train_steps_per_second': 10.008, 'total_flos': 405324636337200.0, 'train_loss': 0.32630309282910935, 'epoch': 3.0})

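### The note above is partly OUTDATED

### When the 'wandb' (Weights & Biases) package is installed, HuggingFace's 'train()' method now also uses it to display and store training data automatically, which is why the run above asks for a wandb API key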

### ___________________________________________________________________________________________________________________
### Evaluation:
### Let's build a compute_metrics() function and use it the next time we train.
   * The function must take an 'EvalPrediction' object (which is a named tuple with a predictions field and a label_ids field) and will return a dictionary mapping strings to floats (the strings being the names of the metrics returned, and the floats their values)

In [8]:
# Get the model's outputs for the validation split with Trainer.predict()
predictions = trainer.predict(tokenized_datasets["validation"])
# Print the shape of the predictions, followed by the shape of the label ids
print(*(field.shape for field in (predictions.predictions, predictions.label_ids)))

(408, 2) (408,)


### The output of the predict() method is a named tuple with three fields:
   * predictions
   * label_ids
   * metrics - this will just contain the loss on the dataset passed, plus some time metrics (how long it took to predict, in total and on average)
   
### As you can see 'predictions' is a two-dimensional array with a shape 408 x 2 (408 being the number of elements in the dataset we used)
### To transform them into predictions that we can compare to our labels, we need to take the index with the maximum value on the second axis

In [17]:
import numpy as np

# For each row of the prediction array, keep the index of the largest value
# along the last axis, i.e. the class (0 or 1) the model predicts.
preds = np.argmax(predictions.predictions, axis=-1)

# Sanity check that every element received a class:
# the number of non-zero entries plus the number of zero entries.
np.count_nonzero(preds != 0) + np.count_nonzero(preds == 0)

408

#### We're going to build our 'compute_metrics()' function; we'll rely on the metrics from the 🤗 Evaluate library.
   * We can load the metrics associated with the MRPC dataset as easily as we loaded the dataset, this time with the 'evaluate.load()' function
   * The object returned has a 'compute' method we can use to do the metric calculation

In [20]:
import evaluate

# Load the metric for the GLUE MRPC task
metric = evaluate.load("glue", "mrpc")
# Compare the predicted classes against the reference labels returned by predict()
metric.compute(predictions = preds, references = predictions.label_ids)

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'accuracy': 0.8602941176470589, 'f1': 0.9015544041450777}

#### The above results may vary, as the random initialization of the model head might change the metrics it achieves.
   * Here we can see our model achieved an accuracy score of 86.03% on the validation set
   * It also achieved a 90.16% F1 score 
   * The table in the BERT paper reported an F1 score of 88.9% for the base model
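   * The paper reported that score for the 'uncased' base model, which is also the checkpoint used in this notebook ('bert-base-uncased'), so a cased/uncased difference does not explain the better result; it more likely comes from run-to-run variance and from scoring on the validation split here (TODO: confirm which split the paper's table reports)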

In [21]:
# We wrap everything together to get our 'compute_metrics()' function
def compute_metrics(eval_preds):
    """Compute the GLUE MRPC metric for a set of evaluation predictions.

    Args:
        eval_preds: Object that unpacks into ``(logits, labels)``.

    Returns:
        The result of the metric's ``compute`` call on the argmax of the
        logits (taken over the last axis) and the labels.
    """
    glue_metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    # The predicted class is the index of the largest logit
    predicted_classes = np.argmax(logits, axis=-1)
    return glue_metric.compute(predictions=predicted_classes, references=labels)

In [22]:
# With evaluation_strategy="epoch" the metrics are reported at the end of each epoch
# To use our new 'compute_metrics' function, pass it to a new Trainer
training_args = TrainingArguments(
    output_dir="test-trainer",
    evaluation_strategy="epoch",
)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Note that we created a new 'TrainingArguments' with its 'evaluation_strategy' set to 'epoch', and a new model;
# otherwise, we would just be continuing the training of the model we have already trained.
# We launch a new training run
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.450935,0.772059,0.843697
2,0.590400,0.541379,0.79902,0.867742
3,0.384500,0.587679,0.857843,0.898246


TrainOutput(global_step=1377, training_loss=0.41986747193180657, metrics={'train_runtime': 114.8315, 'train_samples_per_second': 95.827, 'train_steps_per_second': 11.991, 'total_flos': 405540469624800.0, 'train_loss': 0.41986747193180657, 'epoch': 3.0})

In [25]:
# Exercise

# Load and tokenize the SST-2 dataset.
# The tokenizer created in the earlier cells is reused here.

exraw_datasets = load_dataset("glue", "sst2")

# SST-2 examples are tokenized from a single 'sentence' field, so they need their own
# tokenize function. It carries the 'ex' prefix like the other exercise objects so that
# it does not shadow the two-sentence 'tokenize_function' used for MRPC earlier in the
# notebook (re-running that MRPC cell would otherwise pick up this version).
def extokenize_function(example):
    """Tokenize the 'sentence' field of an example.

    Args:
        example: Mapping with a 'sentence' entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that input,
        called with ``truncation=True``.
    """
    return tokenizer(example["sentence"], truncation=True)

extokenized_datasets = exraw_datasets.map(extokenize_function, batched=True)
exdata_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Reusing dataset glue (/root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [27]:
# Exercise-2

# Metrics function handed to the exercise Trainer
def excompute_metrics(eval_preds):
    """Compute the GLUE SST-2 metric for a set of evaluation predictions.

    Args:
        eval_preds: Object that unpacks into ``(logits, labels)``.

    Returns:
        The result of the metric's ``compute`` call on the argmax of the
        logits (taken over the last axis) and the labels.
    """
    sst2_metric = evaluate.load("glue", "sst2")
    logits, labels = eval_preds
    # The predicted class is the index of the largest logit
    predicted_classes = np.argmax(logits, axis=-1)
    return sst2_metric.compute(predictions=predicted_classes, references=labels)

In [28]:
# Exercise-3

# Build the Trainer for the exercise, starting again from a freshly loaded model
# NOTE(review): this reuses the 'test-trainer' output directory of the earlier runs - confirm that is intended
training_args = TrainingArguments(
    output_dir="test-trainer",
    evaluation_strategy="epoch",
)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=exdata_collator,
    train_dataset=extokenized_datasets["train"],
    eval_dataset=extokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=excompute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Exercise-4
# Launch training for the exercise Trainer built in the previous cell
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2239,0.522244,0.87156
2,0.146,0.473235,0.885321
3,0.1102,0.487375,0.893349


TrainOutput(global_step=25257, training_loss=0.1582507510958731, metrics={'train_runtime': 1784.5652, 'train_samples_per_second': 113.219, 'train_steps_per_second': 14.153, 'total_flos': 3103300342435680.0, 'train_loss': 0.1582507510958731, 'epoch': 3.0})

wandb: Waiting for W&B process to finish... (success).
