# The Seagull Story

In [53]:
import os
import random
from random import shuffle

import numpy as np
import torch
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import DebertaTokenizer, DebertaForSequenceClassification, AdamW, get_scheduler

## Fine-tuning DeBERTa

In [3]:
CLASSES = {
    'yes': 0,
    'irrelevant': 1,
    'no': 2,
}
STORY_FILE = 'dataset/story.txt'
DATASET_PATH = 'dataset/'
MODEL_NAME = "microsoft/deberta-base"
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 5e-5
MAX_LENGTH = 512
DEVICE = torch.device(
    "cuda") if torch.cuda.is_available() else torch.device("cpu")
DEVICE

device(type='cuda')

Here we define the tokenizer and the model using the handy `transformer` library from *HuggingFace*.

In [26]:
story = open(STORY_FILE).read().replace("\n\n", "\n").replace("\n", " ")
tokenizer = DebertaTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)
model = DebertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
model.to(DEVICE)
pass

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [65]:
class NLIDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def print(self):
        index = random.randint(0, len(self.data) - 1)
        encoded = self[index]
        print(encoded["input_ids"].shape)
        tokens = tokenizer.decode(encoded["input_ids"])
        token_ids = [i.numpy() for i in encoded["input_ids"]]
        attention = [i.numpy() for i in encoded["attention_masks"]]

        table = np.array([tokens, token_ids, attention]).T
        print(self.data[index])
        print(tabulate(table,
                       headers=['Tokens', 'Token IDs', 'Attention Mask'],
                       tablefmt='fancy_grid'))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        question = item["question"]
        label = item["answer"]

        encoded_dict = tokenizer.encode_plus(
            'story', question,
            truncation=True,
            add_special_tokens=True,
            max_length=MAX_LENGTH,
            padding="max_length",
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            "input_ids": encoded_dict["input_ids"],
            "token_type_ids": encoded_dict["token_type_ids"].unsqueeze(0),
            "attention_mask": encoded_dict["attention_mask"].unsqueeze(0),
            "label": torch.tensor(label)
        }

Next, we load the data set and split it into training and test sets.

In [66]:
NLIDataset(train_set).print()

torch.Size([1, 512])


TypeError: int() argument must be a string, a bytes-like object or a real number, not 'list'

In [50]:
dataset: list[dict] = []
for file in CLASSES.keys():
    with open(os.path.join(DATASET_PATH, f'{file}.txt')) as f:
        lines = set(f.readlines())
        print(f'Read {len(lines)} "{file}" questions')
        dataset.extend(map(lambda e: {'question': e.replace(
            '\n', ''), 'answer': CLASSES[file]}, lines))

shuffle(dataset)
train_set, test_set = train_test_split(dataset, test_size=0.15, random_state=42)

train_loader = DataLoader(NLIDataset(train_set), batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(NLIDataset(test_set), batch_size=BATCH_SIZE, shuffle=False)

Read 205 "yes" questions
Read 223 "irrelevant" questions
Read 243 "no" questions


In [None]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
num_training_steps = len(train_loader) * EPOCHS
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

406

In [None]:
def train_one_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    accuracy = correct / total
    avg_loss = total_loss / len(dataloader)
    return avg_loss, accuracy

In [None]:
def test(model, dataloader, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()
            preds = logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    avg_loss = total_loss / len(dataloader)
    return avg_loss, accuracy

Main training loop.

In [None]:
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, scheduler, DEVICE)
    val_loss, val_acc = test(model, val_loader, DEVICE)

    print(f"\nTrain Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

Save the model.

In [None]:
model.save_pretrained("nli-deberta-model")
tokenizer.save_pretrained("nli-deberta-model")