# The Seagull Story

In [1]:
import os
import random
import time
from random import shuffle

import numpy as np
import torch
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import DebertaTokenizer, DebertaForSequenceClassification, get_linear_schedule_with_warmup, AutoModel, \
    AutoTokenizer, DebertaV2ForSequenceClassification, DebertaV2Tokenizer

## Fine-tuning DeBERTa

In [2]:
# Label encoding: each class name doubles as the stem of its question file
# (``<name>.txt`` under DATASET_PATH) and maps to the integer id the classifier
# is trained to predict.
CLASSES = {
    'yes': 0,
    'irrelevant': 1,
    'no': 2,
}
STORY_FILE = 'dataset/story.txt'
DATASET_PATH = 'dataset/'
MODEL_NAME = "microsoft/deberta-v3-base"
BATCH_SIZE = 8
EPOCHS = 3
LEARNING_RATE = 5e-5
# Every encoded (story, question) pair is padded/truncated to this many tokens.
MAX_LENGTH = 512

# Seed every RNG the notebook relies on (data shuffling, random sample display,
# DataLoader shuffling, weight initialisation of the new classifier head) so a
# Restart & Run All reproduces the same run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

Here we load the story text and create the tokenizer and the model using the handy `transformers` library from *Hugging Face*.

In [3]:
# Read the story once and flatten it onto a single line: paragraph breaks and
# line breaks both become single spaces. The context manager closes the file
# handle instead of leaving it to the garbage collector.
with open(STORY_FILE) as story_file:
    story = story_file.read().replace("\n\n", "\n").replace("\n", " ")

tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)

# One output logit per entry in CLASSES; attentions and hidden states are not
# needed for fine-tuning, so they are not returned.
model = DebertaV2ForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(CLASSES),
    output_attentions=False,
    output_hidden_states=False,
)
model = model.to(DEVICE)

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
class NLIDataset(Dataset):
    """Dataset that pairs the story with each question and encodes the pair.

    Each element of ``data`` is a dict with a ``"question"`` string and an
    integer ``"answer"`` label. Indexing returns the encoded
    ``(story, question)`` pair together with the label as tensors.

    Args:
        data: Sequence of ``{"question": str, "answer": int}`` records.
        tokenizer: Tokenizer to encode with. Defaults to the notebook-level
            ``tokenizer``.
        story: Text used as the first segment of every pair. Defaults to the
            notebook-level ``story``.
        max_length: Length every encoding is padded/truncated to. Defaults to
            the notebook-level ``MAX_LENGTH``.
    """

    def __init__(self, data, tokenizer=None, story=None, max_length=None):
        self.data = data
        self._tokenizer = tokenizer
        self._story = story
        self._max_length = max_length

    def _context(self):
        """Return (tokenizer, story, max_length), falling back to the notebook globals."""
        active_tokenizer = self._tokenizer if self._tokenizer is not None else tokenizer
        active_story = self._story if self._story is not None else story
        active_length = self._max_length if self._max_length is not None else MAX_LENGTH
        return active_tokenizer, active_story, active_length

    def print(self):
        """Print one randomly chosen record and a table of its tokens, ids and attention mask.

        Raises:
            ValueError: If the dataset is empty.
        """
        if len(self.data) == 0:
            raise ValueError("cannot print a sample from an empty dataset")

        active_tokenizer, _, _ = self._context()
        index = random.randint(0, len(self.data) - 1)
        encoded = self[index]
        token_ids = encoded["input_ids"].tolist()
        # Map ids straight back to tokens. Decoding to text and tokenizing again
        # is not guaranteed to reproduce the same number of tokens, which would
        # misalign the columns of the table.
        tokens = active_tokenizer.convert_ids_to_tokens(token_ids)
        attention = encoded["attention_mask"].tolist()

        rows = list(zip(tokens, token_ids, attention))
        print(self.data[index])
        print(tabulate(rows,
                       headers=['Tokens', 'Token IDs', 'Attention Mask'],
                       tablefmt='fancy_grid'))

    def __len__(self):
        """Number of records in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode record ``idx`` and return a dict of 1-D tensors plus the label.

        Returns:
            dict with ``input_ids``, ``token_type_ids``, ``attention_mask``
            (each with the leading batch dimension removed) and ``label``.
        """
        active_tokenizer, active_story, active_length = self._context()
        item = self.data[idx]

        encoded_dict = active_tokenizer(
            active_story, item["question"],
            truncation=True,
            add_special_tokens=True,
            max_length=active_length,
            padding="max_length",
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt',
        )

        # squeeze(0) drops only the batch dimension added by return_tensors='pt';
        # a bare squeeze() would also collapse any other size-1 dimension.
        return {
            "input_ids": encoded_dict["input_ids"].squeeze(0),
            "token_type_ids": encoded_dict["token_type_ids"].squeeze(0),
            "attention_mask": encoded_dict["attention_mask"].squeeze(0),
            "label": torch.tensor(item["answer"], dtype=torch.long),
        }

Next, we load the data set and split it into a training set (85%) and a held-out test set (15%), which is used for validation after every epoch.

In [5]:
# Fixed seed shared by the shuffle and the split so the same questions end up
# in the same partition on every run.
SPLIT_SEED = 42

dataset: list[dict] = []
for class_name, class_id in CLASSES.items():
    with open(os.path.join(DATASET_PATH, f'{class_name}.txt')) as f:
        # Strip first, then de-duplicate: otherwise a final line without a
        # trailing newline would not match its duplicate. dict.fromkeys keeps
        # file order, whereas a set iterates in an order that changes between
        # interpreter runs and would defeat the fixed seed below.
        questions = list(dict.fromkeys(
            stripped for stripped in (line.strip() for line in f) if stripped
        ))
    print(f'Read {len(questions)} "{class_name}" questions')
    dataset.extend({'question': question, 'answer': class_id} for question in questions)

random.Random(SPLIT_SEED).shuffle(dataset)
train_set, test_set = train_test_split(dataset, test_size=0.15, random_state=SPLIT_SEED)
assert len(train_set) + len(test_set) == len(dataset)

train_loader = DataLoader(NLIDataset(train_set), batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(NLIDataset(test_set), batch_size=BATCH_SIZE, shuffle=False)

Read 205 "yes" questions
Read 223 "irrelevant" questions
Read 243 "no" questions


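The `Ġ` prefix in the token table below marks a token that is preceded by a space (a byte-level BPE convention); see https://discuss.huggingface.co/t/bpe-tokenizers-and-spaces-before-words/475 for details.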

In [6]:
# Sanity check: show one random training record together with a table of its
# tokens, token ids and attention mask.
NLIDataset(train_set).print()

{'question': 'Albert is afraid of heights', 'answer': 1}
╒═══════════════╤═════════════╤══════════════════╕
│ Tokens        │   Token IDs │   Attention Mask │
╞═══════════════╪═════════════╪══════════════════╡
│ [CLS]         │           1 │                1 │
├───────────────┼─────────────┼──────────────────┤
│ While         │        5771 │                1 │
├───────────────┼─────────────┼──────────────────┤
│ Ġenjoying     │        6218 │                1 │
├───────────────┼─────────────┼──────────────────┤
│ Ġa            │          10 │                1 │
├───────────────┼─────────────┼──────────────────┤
│ Ġpeaceful     │        7053 │                1 │
├───────────────┼─────────────┼──────────────────┤
│ Ġboat         │        4293 │                1 │
├───────────────┼─────────────┼──────────────────┤
│ Ġride         │        3068 │                1 │
├───────────────┼─────────────┼──────────────────┤
│ ,             │           6 │                1 │
├───────────────┼────────

In [7]:

# The scheduler is stepped once per batch, so the linear decay has to span
# every batch of every epoch.
num_training_steps = EPOCHS * len(train_loader)

optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=num_training_steps,
)

In [8]:
def _as_numpy(values):
    """Return ``values`` as a NumPy array, detaching and moving torch tensors to the CPU first."""
    if hasattr(values, "detach"):
        values = values.detach().cpu().numpy()
    return np.asarray(values)


def flat_accuracy(preds, labels):
    """Fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D scores of shape (n_examples, n_classes); a NumPy array, a
            torch tensor (on any device, with or without grad) or a nested list.
        labels: Class ids, one per example, in any shape that flattens to
            n_examples.

    Returns:
        The accuracy as a NumPy float.

    Raises:
        ValueError: If the number of predictions and labels differ.
    """
    pred_flat = np.argmax(_as_numpy(preds), axis=1).flatten()
    labels_flat = _as_numpy(labels).flatten()
    # Fail loudly rather than letting a size-1 label array broadcast against
    # every prediction and produce a plausible but wrong number.
    if pred_flat.shape != labels_flat.shape:
        raise ValueError(
            f"got {pred_flat.shape[0]} predictions but {labels_flat.shape[0]} labels"
        )
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


assert (flat_accuracy(np.array([[.7, .2, .1], [.1, .8, .1]]), np.array([[2, 1]])) == 0.5)

In [9]:
def _format_elapsed(seconds):
    """Render a duration given in seconds as ``h:mm:ss``."""
    minutes, secs = divmod(int(round(seconds)), 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours}:{minutes:02d}:{secs:02d}"


def _forward_batch(batch):
    """Move one batch to DEVICE and run the model on it.

    Returns the model output and the labels on DEVICE. Because labels are
    passed in, the output carries both the loss and the logits.
    """
    b_labels = batch["label"].to(DEVICE)
    output = model(batch["input_ids"].to(DEVICE),
                   token_type_ids=batch["token_type_ids"].to(DEVICE),
                   attention_mask=batch["attention_mask"].to(DEVICE),
                   labels=b_labels)
    return output, b_labels


training_stats = []
# Best validation accuracy seen across ALL epochs. It lives outside the epoch
# loop and starts at -inf so the first epoch always produces a checkpoint and
# later epochs only overwrite it when they actually improve.
best_eval_accuracy = -np.inf
best_model_path = f'{MODEL_NAME}_seagull_story'
total_t0 = time.time()

for epoch_i in range(EPOCHS):

    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    total_train_loss = 0

    # Put the model into training mode. Don't be misled: the call to
    # `train` just changes the *mode*, it doesn't *perform* the training.
    model.train()
    progress = tqdm(train_loader)
    for batch in progress:
        # Always clear previously accumulated gradients before a backward
        # pass; PyTorch does not do this automatically.
        model.zero_grad()
        output, b_labels = _forward_batch(batch)

        # `loss` is a tensor holding a single value; accumulate it so the
        # average over all batches can be reported at the end of the epoch.
        loss = output.loss
        total_train_loss += loss.item()

        # Report running loss/accuracy on the progress bar instead of printing
        # a line per batch. The logits are detached and moved to the CPU first
        # because NumPy cannot read device tensors that require grad.
        batch_accuracy = flat_accuracy(output.logits.detach().cpu().numpy(),
                                       b_labels.cpu().numpy())
        progress.set_postfix(loss=loss.item(), acc=batch_accuracy)

        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0 to help prevent the
        # "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters from the computed gradients, then advance the
        # learning-rate schedule by one step.
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_loader)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:} sec".format((time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode: the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables
    total_eval_loss = 0
    correct_predictions = 0
    seen_examples = 0

    # Evaluate data for one epoch
    for batch in tqdm(val_loader):
        # No compute graph is needed for the forward pass here, since it is
        # only required for backprop (training).
        with torch.no_grad():
            output, b_labels = _forward_batch(batch)
        total_eval_loss += output.loss.item()

        # Move logits and labels to the CPU for the NumPy accuracy helper.
        logits = output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Weight each batch by its size so a short final batch does not count
        # as much as a full one.
        correct_predictions += flat_accuracy(logits, label_ids) * len(label_ids)
        seen_examples += len(label_ids)

    # Report the final accuracy for this validation run.
    avg_val_accuracy = correct_predictions / seen_examples
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(val_loader)

    # Save the model only when this epoch beats the best accuracy so far.
    if avg_val_accuracy > best_eval_accuracy:
        print('Saving...', avg_val_accuracy)
        # MODEL_NAME contains a "/", so the checkpoint path has a parent
        # directory that must exist before torch.save can write to it.
        save_dir = os.path.dirname(best_model_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        torch.save(model, best_model_path)
        best_eval_accuracy = avg_val_accuracy

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:} sec".format((time.time() - t0)))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
        }
    )
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(_format_elapsed(time.time() - total_t0)))


Training...


2it [01:21, 40.63s/it]


KeyboardInterrupt: 

Save the model.

In [None]:
# Write the fine-tuned model and the tokenizer to their own directories.
# NOTE(review): this saves the model as it stands after the last epoch, which
# is not necessarily the checkpoint written by torch.save during training;
# confirm which of the two is meant to be used downstream.
model.save_pretrained("deberta_seagull")
tokenizer.save_pretrained("deberta_tokenizer_seagull")