# The Seagull Story

In [1]:
import json
import os
# import torch
# from torch.utils.data import Dataset, DataLoader
from transformers import DebertaTokenizer, DebertaForSequenceClassification, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


## Fine-tuning DeBERTa

In [3]:
CLASSES = {
    'yes': 0,
    'irrelevant': 1,
    'no': 2,
}
STORY_FILE = 'dataset/story.txt'
DATASET_PATH = 'dataset/'
MODEL_NAME = "microsoft/deberta-base"
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5
MAX_LENGTH = 512
# DEVICE = torch.device(
#     "cuda") if torch.cuda.is_available() else torch.device("cpu")

Here we define the tokenizer and the model using the handy `transformer` library from *HuggingFace*.

In [5]:
story = open(STORY_FILE).read().replace("\n\n", "\n").replace("\n", " ")
tokenizer = DebertaTokenizer.from_pretrained(MODEL_NAME)
# model = DebertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
# model.to(DEVICE)

In [None]:
tokenizer('story', 'question', truncation=True, max_length=8, padding="max_length")

{'input_ids': [1, 6462, 2, 40018, 2, 0, 0, 0], 'token_type_ids': [0, 0, 0, 1, 1, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0]}

In [None]:
class NLIDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        passage = story
        question = item["question"]
        label = item["answer"]

        inputs = self.tokenizer(
            story, question, truncation=True, max_length=MAX_LENGTH, padding="max_length", return_tensors="pt"
        )

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)
        }

Next, we load the data set and split it into training and test sets.

In [None]:
dataset: list[dict] = []
for file in CLASSES.keys():
    with open(os.path.join(DATASET_PATH, '{file}.txt')) as f:
        lines = set(f.readlines())
        print(f'Read {len(lines)} "{file}" questions')
        dataset.extend(map(lambda e: {'question': e.replace(
            '\n', ''), 'answer': CLASSES[file]}, lines))
        
trainset, testset = train_test_split(dataset, test_size=0.1, random_state=42)
train_dataset = NLIDataset(trainset, tokenizer)
val_dataset = NLIDataset(testset, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

Read 232 "yes" questions
Read 112 "irrelevant" questions
Read 116 "no" questions


In [None]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
num_training_steps = len(train_loader) * EPOCHS
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

406

In [None]:
def train_one_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    accuracy = correct / total
    avg_loss = total_loss / len(dataloader)
    return avg_loss, accuracy

In [None]:
def test(model, dataloader, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()
            preds = logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    avg_loss = total_loss / len(dataloader)
    return avg_loss, accuracy

Main training loop.

In [None]:
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, scheduler, DEVICE)
    val_loss, val_acc = test(model, val_loader, DEVICE)

    print(f"\nTrain Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

Save the model.

In [None]:
model.save_pretrained("nli-deberta-model")
tokenizer.save_pretrained("nli-deberta-model")