# PART II exercise

## Fine-tuning BERT

## BERT for sequence classification


In [6]:
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizerFast, \
     DataCollatorWithPadding, pipeline
from datasets import Dataset
import numpy as np
# import evaluate

In [8]:
snips_file = open('./data/snips.train.txt', 'rb')

snips_rows = snips_file.readlines()

snips_rows[:20]

[b'listen O\r\n',
 b'to O\r\n',
 b'westbam B-artist\r\n',
 b'alumb O\r\n',
 b'allergic B-album\r\n',
 b'on O\r\n',
 b'google B-service\r\n',
 b'music I-service\r\n',
 b'PlayMusic\r\n',
 b'\r\n',
 b'add O\r\n',
 b'step B-entity_name\r\n',
 b'to I-entity_name\r\n',
 b'me I-entity_name\r\n',
 b'to O\r\n',
 b'the O\r\n',
 b'50 B-playlist\r\n',
 b'cl\xc3\xa1sicos I-playlist\r\n',
 b'playlist O\r\n',
 b'AddToPlaylist\r\n']

In [12]:
# This code segment parses the snips dataset into a more manageable format

utterances = []
tokenized_utterances = []
labels_for_tokens = []
sequence_labels = []

utterance, tokenized_utterance, label_for_utterances = '', [], []
for snip_row in snips_rows:
    if len(snip_row) == 2:  # skip over rows with no data
        continue
    if ' ' not in snip_row.decode():  # we've hit a sequence label
        sequence_labels.append(snip_row.decode().strip())
        utterances.append(utterance.strip())
        tokenized_utterances.append(tokenized_utterance)
        labels_for_tokens.append(label_for_utterances)
        utterance = ''
        tokenized_utterance = []
        label_for_utterances = []
        continue
    token, token_label = snip_row.decode().split(' ')
    token_label = token_label.strip()
    utterance += f'{token} '
    tokenized_utterance.append(token)
    label_for_utterances.append(token_label)
    

In [13]:
len(labels_for_tokens), len(tokenized_utterances), len(utterances), len(sequence_labels)

(13084, 13084, 13084, 13084)

In [14]:
utterances[0], sequence_labels[0]

('listen to westbam alumb allergic on google music', 'PlayMusic')

In [15]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [16]:
unique_sequence_labels = list(set(sequence_labels))
unique_sequence_labels

['AddToPlaylist',
 'SearchCreativeWork',
 'BookRestaurant',
 'GetWeather',
 'SearchScreeningEvent',
 'RateBook',
 'PlayMusic']

In [17]:
sequence_labels = [unique_sequence_labels.index(l) for l in sequence_labels]

print(f'There are {len(unique_sequence_labels)} unique sequence labels')

There are 7 unique sequence labels


In [18]:
from functools import reduce

unique_token_labels = list(set(reduce(lambda x, y: x + y, labels_for_tokens)))
labels_for_tokens = [[unique_token_labels.index(_) for _ in l] for l in labels_for_tokens]

print(f'There are {len(unique_token_labels)} unique token labels')

There are 72 unique token labels


In [19]:
print(tokenized_utterances[0])
print(labels_for_tokens[0])
print([unique_token_labels[l] for l in labels_for_tokens[0]])
print(utterances[0])
print(sequence_labels[0])
print(unique_sequence_labels[sequence_labels[0]])

['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music']
[67, 67, 25, 67, 28, 67, 20, 9]
['O', 'O', 'B-artist', 'O', 'B-album', 'O', 'B-service', 'I-service']
listen to westbam alumb allergic on google music
6
PlayMusic


In [20]:
Dataset

datasets.arrow_dataset.Dataset

In [21]:
snips_dataset = Dataset.from_dict(
    dict(
        utterance=utterances, 
        label=sequence_labels,
        tokens=tokenized_utterances,
        token_labels=labels_for_tokens
    )
)

snips_dataset = snips_dataset.train_test_split(test_size=0.2)



In [None]:
snips_dataset

DatasetDict({
    train: Dataset({
        features: ['utterance', 'label', 'tokens', 'token_labels'],
        num_rows: 10467
    })
    test: Dataset({
        features: ['utterance', 'label', 'tokens', 'token_labels'],
        num_rows: 2617
    })
})

In [22]:
unique_sequence_labels[6]

'PlayMusic'

In [23]:
snips_dataset['train'][0]

{'utterance': 'what will the weather be like in lambertville india at ten pm',
 'label': 3,
 'tokens': ['what',
  'will',
  'the',
  'weather',
  'be',
  'like',
  'in',
  'lambertville',
  'india',
  'at',
  'ten',
  'pm'],
 'token_labels': [67, 67, 67, 67, 67, 67, 67, 7, 48, 67, 42, 29]}

In [24]:
tokenizer('hi')

{'input_ids': [101, 20844, 102], 'attention_mask': [1, 1, 1]}

In [25]:
tokenizer.decode([101, 2603, 1142, 18977, 126, 2940, 102])

'[CLS] rate this textbook 5 stars [SEP]'

In [26]:
# simple function to batch tokenize utterances with truncation
# TODO
def preprocess_function(examples):
    # return tokenizer(examples["utterance"], truncation=True)

In [27]:
# TODO mapping the preprocessing function
seq_clf_tokenized_snips = snips_dataset.map(preprocess_function, batched=True)

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [28]:
# only input_ids, attention_mask, and label are used. The rest are for show
seq_clf_tokenized_snips['train'][0]

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [101,
  1184,
  1209,
  1103,
  4250,
  1129,
  1176,
  1107,
  2495,
  10615,
  1204,
  2138,
  1107,
  7168,
  1120,
  1995,
  9852,
  102],
 'label': 3,
 'token_labels': [67, 67, 67, 67, 67, 67, 67, 7, 48, 67, 42, 29],
 'tokens': ['what',
  'will',
  'the',
  'weather',
  'be',
  'like',
  'in',
  'lambertville',
  'india',
  'at',
  'ten',
  'pm'],
 'utterance': 'what will the weather be like in lambertville india at ten pm'}

In [29]:
# DataCollatorWithPadding creates batch of data. It also dynamically pads text to the 
#  length of the longest element in the batch, making them all the same length. 
#  It's possible to pad your text in the tokenizer function with padding=True, dynamic padding is more efficient.

# TODO create datacollator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [30]:
# Data Collator will pad data so that all examples are the same input length.
#  Attention mask is how we ignore attention scores for padding tokens

In [32]:
{i: l for i, l in enumerate(unique_sequence_labels)}

{0: 'AddToPlaylist',
 1: 'SearchCreativeWork',
 2: 'BookRestaurant',
 3: 'GetWeather',
 4: 'SearchScreeningEvent',
 5: 'RateBook',
 6: 'PlayMusic'}

In [33]:
# TODO call the pretrained 'distilbert-base-cased' model
sequence_clf_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-cased', 
    num_labels=len(unique_sequence_labels),
)

# set an index -> label dictionary
sequence_clf_model.config.id2label = 
# sequence_clf_model.config.id2label = {i: l for i, l in enumerate(unique_sequence_labels)}

Downloading:   0%|          | 0.00/251M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier

In [34]:
sequence_clf_model.config.id2label[0]

'AddToPlaylist'

In [None]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):  # custom method to take in logits and calculate accuracy of the eval set
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


In [None]:
epochs = 2

training_args = TrainingArguments(
    output_dir="./snips_clf/results",
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
    
    # some deep learning parameters that the Trainer is able to take in
    warmup_steps=len(seq_clf_tokenized_snips['train']) // 5,  # number of warmup steps for learning rate scheduler,
    weight_decay = 0.05,
    
    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    eval_steps=50,
    save_strategy='epoch'
)

# Define the trainer:

trainer = Trainer(
    model=sequence_clf_model,
    args=training_args,
    train_dataset=seq_clf_tokenized_snips['train'],
    eval_dataset=seq_clf_tokenized_snips['test'],
    compute_metrics=compute_metrics,  # optional
    data_collator=data_collator
)

In [None]:
# Get initial metrics
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 1.954935908317566,
 'eval_accuracy': 0.1738632021398548,
 'eval_runtime': 10.1366,
 'eval_samples_per_second': 258.173,
 'eval_steps_per_second': 8.089}

In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4382,0.158161,0.978601
2,0.0169,0.050191,0.987772


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_clf/results/checkpoint-328
Configuration saved in ./snips_clf/results/checkpoint-328/config.json
Model weights saved in ./snips_clf/results/checkpoint-328/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Sa

TrainOutput(global_step=656, training_loss=0.6899719095402738, metrics={'train_runtime': 309.9466, 'train_samples_per_second': 67.541, 'train_steps_per_second': 2.116, 'total_flos': 131497001349432.0, 'train_loss': 0.6899719095402738, 'epoch': 2.0})

In [None]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 0.050191015005111694,
 'eval_accuracy': 0.9877722583110432,
 'eval_runtime': 9.8932,
 'eval_samples_per_second': 264.526,
 'eval_steps_per_second': 8.289,
 'epoch': 2.0}

In [None]:
trainer.save_model()

Saving model checkpoint to ./snips_clf/results
Configuration saved in ./snips_clf/results/config.json
Model weights saved in ./snips_clf/results/pytorch_model.bin


In [None]:
pipeline

<function transformers.pipelines.pipeline(task: str = None, model: Optional = None, config: Union[str, transformers.configuration_utils.PretrainedConfig, NoneType] = None, tokenizer: Union[str, transformers.tokenization_utils.PreTrainedTokenizer, transformers.tokenization_utils_fast.PreTrainedTokenizerFast, NoneType] = None, feature_extractor: Union[str, ForwardRef('SequenceFeatureExtractor'), NoneType] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, use_auth_token: Union[str, bool, NoneType] = None, device: Union[int, str, ForwardRef('torch.device'), NoneType] = None, device_map=None, torch_dtype=None, trust_remote_code: Optional[bool] = None, model_kwargs: Dict[str, Any] = None, pipeline_class: Optional[Any] = None, **kwargs) -> transformers.pipelines.base.Pipeline>

In [None]:
# We can now load our fine-tuned from our directory
pipe = pipeline("text-classification", "./snips_clf/results", tokenizer=tokenizer)

pipe('Please add Here We Go by Dispatch to my road trip playlist')


loading configuration file ./snips_clf/results/config.json
Model config DistilBertConfig {
  "_name_or_path": "./snips_clf/results",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "SearchScreeningEvent",
    "1": "SearchCreativeWork",
    "2": "PlayMusic",
    "3": "BookRestaurant",
    "4": "GetWeather",
    "5": "AddToPlaylist",
    "6": "RateBook"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype":

[{'label': 'AddToPlaylist', 'score': 0.9931938052177429}]

In [None]:
frozen_sequence_clf_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-cased', 
    num_labels=len(unique_sequence_labels),
)

loading configuration file config.json from cache at /Users/sinanozdemir/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.23.1",
  "vocab_size": 28996
}

loading weights file pytorc

In [None]:
# freezes EVERY parameter in our bert model
# does not freeze our classification layer
for param in frozen_sequence_clf_model.distilbert.parameters():
    param.requires_grad = False

In [None]:
epochs = 2

training_args = TrainingArguments(
    output_dir="./snips_clf/results",
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
    
    # some deep learning parameters that the Trainer is able to take in
    warmup_steps=len(seq_clf_tokenized_snips['train']) // 5,  # number of warmup steps for learning rate scheduler,
    weight_decay = 0.05,
    
    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    eval_steps=50,
    save_strategy='epoch'
)

# Define the trainer:

trainer = Trainer(
    model=frozen_sequence_clf_model,
    args=training_args,
    train_dataset=seq_clf_tokenized_snips['train'],
    eval_dataset=seq_clf_tokenized_snips['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 1.9507232904434204,
 'eval_accuracy': 0.15208253725640045,
 'eval_runtime': 9.8112,
 'eval_samples_per_second': 266.736,
 'eval_steps_per_second': 8.358}

In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656


Epoch,Training Loss,Validation Loss,Accuracy
1,1.9553,1.889053,0.439817
2,1.7222,1.675033,0.818877


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_clf/results/checkpoint-328
Configuration saved in ./snips_clf/results/checkpoint-328/config.json
Model weights saved in ./snips_clf/results/checkpoint-328/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Sa

TrainOutput(global_step=656, training_loss=1.8701613674440034, metrics={'train_runtime': 115.0559, 'train_samples_per_second': 181.946, 'train_steps_per_second': 5.702, 'total_flos': 131497001349432.0, 'train_loss': 1.8701613674440034, 'epoch': 2.0})

In [None]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 1.675033450126648,
 'eval_accuracy': 0.8188765762323271,
 'eval_runtime': 9.7586,
 'eval_samples_per_second': 268.174,
 'eval_steps_per_second': 8.403,
 'epoch': 2.0}