In [11]:
import os
import time
from random import shuffle

import numpy as np
import torch
from torch.optim import AdamW
from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup, AutoModelForSequenceClassification, AutoTokenizer, \
    TrainingArguments, Trainer

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [4]:
# Label name -> integer class id. Each label name is also the stem of the
# `<label>.txt` file that holds its questions under DATASET_PATH.
CLASSES = dict(yes=0, irrelevant=1, no=2)

# Input locations.
DATASET_PATH = 'dataset/'
STORY_FILE = 'dataset/story.txt'

# Pretrained checkpoint to fine-tune.
MODEL_NAME = "microsoft/deberta-v3-base"

# Hyper-parameters.
BATCH_SIZE, EPOCHS = 8, 3
LEARNING_RATE = 5e-5
MAX_LENGTH = 512

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

Here we define the tokenizer and the model using the handy `transformers` library from *Hugging Face*.

In [5]:
# Read the story once and flatten it onto a single line. Double newlines are
# collapsed first so that a paragraph break becomes one space rather than two.
# The context manager closes the file handle as soon as the text is read
# instead of leaving it open until garbage collection.
with open(STORY_FILE) as story_file:
    story = story_file.read().replace("\n\n", "\n").replace("\n", " ").strip()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)

# One output logit per entry in CLASSES; attentions and hidden states are not
# needed for classification, so they are switched off.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(CLASSES),
    output_attentions=False,
    output_hidden_states=False,
)
model = model.to(DEVICE)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Next, we load the data set and split it into training and test sets.

In [9]:
from datasets import Dataset

# Build one {'question', 'answer'} record per distinct question, reading
# `<label>.txt` from DATASET_PATH for every label in CLASSES.
dataset: list[dict] = []
for file, label_id in CLASSES.items():
    with open(os.path.join(DATASET_PATH, f'{file}.txt')) as f:
        # Strip *before* de-duplicating so that lines differing only in
        # trailing whitespace (or a missing final newline) count as the same
        # question, and skip blank lines so they never become empty samples.
        # dict.fromkeys de-duplicates while keeping file order.
        questions = dict.fromkeys(
            stripped for stripped in (line.strip() for line in f) if stripped
        )
    print(f'Read {len(questions)} "{file}" questions')
    dataset.extend({'question': q, 'answer': label_id} for q in questions)

# Mix the classes together; the files were read one label at a time.
shuffle(dataset)


def preprocess(sample: dict):
    """Tokenize one question together with the story and attach its label.

    Args:
        sample: A record with a "question" string and an "answer" class id.

    Returns:
        The tokenizer output for the (question, story) pair, truncated and
        padded to MAX_LENGTH tokens, with the class id stored under "label".
    """
    inputs = tokenizer(
        sample["question"],
        story,
        truncation=True,
        padding="max_length",
        # Use the shared config constant rather than a second hard-coded 512.
        max_length=MAX_LENGTH,
    )
    inputs["label"] = sample["answer"]
    return inputs


# Wrap the records in a Hugging Face Dataset and tokenize each one; the raw
# "question" text is dropped once it has been encoded.
hf_dataset = Dataset.from_list(dataset)
tokenized_dataset = hf_dataset.map(preprocess, remove_columns=["question"])

# Hold out 10% of the examples for evaluation.
split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset, eval_dataset = split["train"], split["test"]

Read 205 "yes" questions
Read 223 "irrelevant" questions
Read 243 "no" questions


Map:   0%|          | 0/671 [00:00<?, ? examples/s]

For an explanation of the `Ġ` character that appears in byte-level BPE tokens (it marks a preceding space), see https://discuss.huggingface.co/t/bpe-tokenizers-and-spaces-before-words/475

In [10]:
# Trainer configuration: evaluate and checkpoint once per epoch, keep only the
# most recent checkpoint on disk, and reload the best checkpoint (by
# accuracy) when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    # NOTE(review): the config cell defines LEARNING_RATE = 5e-5, but training
    # uses 2e-5. The value is kept so results do not change; confirm which one
    # is intended and reference the constant here once settled.
    learning_rate=2e-5,
    # Batch size and epoch count come from the config cell instead of
    # duplicated literals, so there is a single place to tune them.
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    eval_accumulation_steps=10,
    disable_tqdm=False
)

def flat_accuracy(preds, labels=None):
    """Compute classification accuracy from logits.

    Supports two call styles:

    * ``flat_accuracy(logits, labels)`` returns the accuracy as a number
      (the original behaviour).
    * ``flat_accuracy(eval_pred)`` returns ``{"accuracy": float}``. This is
      the form ``Trainer`` uses for ``compute_metrics``: it passes a single
      object carrying ``predictions`` and ``label_ids`` and expects a dict of
      named metrics back. A plain ``(logits, labels)`` pair is accepted too.

    Args:
        preds: Array of shape (n_samples, n_classes) with per-class scores,
            or a single evaluation object / pair as described above.
        labels: Array of integer class ids, or ``None`` when ``preds`` bundles
            both predictions and labels.

    Returns:
        The fraction of samples whose arg-max class equals the label, either
        as a bare number or wrapped in a ``{"accuracy": ...}`` dict.
    """
    as_metrics_dict = labels is None
    if as_metrics_dict:
        if hasattr(preds, "predictions"):
            preds, labels = preds.predictions, preds.label_ids
        else:
            preds, labels = preds
        # Some models return extra outputs next to the logits; the logits
        # come first.
        if isinstance(preds, tuple):
            preds = preds[0]

    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = np.asarray(labels).flatten()
    accuracy = np.sum(pred_flat == labels_flat) / len(labels_flat)

    if as_metrics_dict:
        return {"accuracy": float(accuracy)}
    return accuracy


# Assemble the Trainer from the fine-tuning configuration, the model and
# tokenizer, the two dataset splits, and the accuracy metric.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=flat_accuracy,
)

NameError: name 'TrainingArguments' is not defined

In [7]:
# Run fine-tuning; the value returned by train() is shown as the cell output.
trainer.train()

In [None]:
# Save the model and the tokenizer to two separate local directories.
model.save_pretrained("deberta_seagull")
tokenizer.save_pretrained("deberta_tokenizer_seagull")

In [None]:
# Evaluate on the held-out split and show the resulting metrics.
results = trainer.evaluate()
print(results)

# For prediction
sample_questions = ["Is the protagonist brave?", "Does the story involve a mystery?"]
inputs = tokenizer(
    sample_questions,
    [story] * len(sample_questions),
    truncation=True,
    padding=True,
    # Same length cap as the training preprocessing.
    max_length=MAX_LENGTH,
    return_tensors="pt",
).to(model.device)  # inputs must live on the same device as the model

# Inference only: disable dropout and skip building the autograd graph.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Highest-scoring class id per question (see CLASSES for the id -> label map).
predictions = torch.argmax(outputs.logits, dim=-1)
print(predictions)