In [1]:
import os
from random import shuffle

import evaluate
import torch
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, \
    TrainingArguments, Trainer




In [2]:
# Answer label -> integer class id; the keys double as the dataset file stems.
CLASSES = dict(yes=0, irrelevant=1, no=2)

# Input locations.
STORY_FILE = 'dataset/story.txt'
DATASET_PATH = 'dataset/'

# Pretrained checkpoint and fine-tuning hyper-parameters.
MODEL_NAME = "cross-encoder/nli-deberta-v3-base"
BATCH_SIZE = 8
EPOCHS = 6
LEARNING_RATE = 2e-5
MAX_LENGTH = 512

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

Here we load the story text and define the tokenizer and the model using the handy `transformers` library from *Hugging Face*.

In [3]:
# Read the story through a context manager so the file handle is closed
# deterministically, then flatten it to a single line: blank lines are
# collapsed first, remaining newlines become spaces.
with open(STORY_FILE) as story_file:
    story = story_file.read().replace("\n\n", "\n").replace("\n", " ").strip()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)

# One output logit per entry in CLASSES, so the head size follows the label map.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(CLASSES))
model = model.to(DEVICE)



Next, we load the data set and split it into training and test sets.

In [7]:
# Build one flat list of {'question', 'answer'} records, reading one text file
# per class label (one question per line), then shuffle the records in place.
dataset: list[dict] = []
for label_name, label_id in CLASSES.items():
    questions_path = os.path.join(DATASET_PATH, f'{label_name}.txt')
    with open(questions_path) as handle:
        raw_lines = handle.readlines()
        print(f'Read {len(raw_lines)} "{label_name}" questions')
        dataset.extend(
            {'question': raw_line.replace('\n', '').strip(), 'answer': label_id}
            for raw_line in raw_lines
        )

shuffle(dataset)


def preprocess(sample):
    """Tokenize one record as a (story, question) pair and attach its label.

    Args:
        sample: mapping with a ``"question"`` string and an ``"answer"`` class id.

    Returns:
        The tokenizer output for the pair, truncated/padded to ``MAX_LENGTH``
        tokens, with an extra ``"label"`` entry copied from ``sample["answer"]``.
    """
    inputs = tokenizer(
        story,
        sample["question"],
        truncation=True,
        padding="max_length",
        # Use the shared config constant instead of a second hard-coded 512.
        max_length=MAX_LENGTH
    )
    inputs["label"] = sample["answer"]
    return inputs


# Wrap the records in a Hugging Face Dataset and tokenize every row; the raw
# question text is dropped once it has been encoded.
hf_dataset = Dataset.from_list(dataset)
tokenized_dataset = hf_dataset.map(
    preprocess,
    remove_columns=["question"],
)

# Hold out 10% of the rows for evaluation.
split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset, eval_dataset = split["train"], split["test"]

Read 650 "yes" questions
Read 650 "irrelevant" questions
Read 650 "no" questions


Map:   0%|          | 0/1950 [00:00<?, ? examples/s]

For background on the `Ġ` character that byte-level BPE tokenizers use to mark a leading space, see https://discuss.huggingface.co/t/bpe-tokenizers-and-spaces-before-words/475

In [8]:
training_args = TrainingArguments(
    # Output locations; no external experiment tracker.
    output_dir="./results",
    logging_dir='./logs',
    report_to='none',
    logging_steps=10,
    disable_tqdm=False,
    # Optimisation hyper-parameters (taken from the config cell).
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and checkpoint once per epoch, keep a single checkpoint on disk,
    # and reload the checkpoint with the best accuracy when training finishes.
    eval_strategy="epoch",
    eval_accumulation_steps=10,
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Accuracy metric used by compute_metrics during evaluation.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score evaluation output with the module-level ``metric``.

    ``eval_pred`` is unpacked into ``(logits, labels)``. The predicted class
    is the arg-max over the last axis of the logits, and the result of
    ``metric.compute`` is returned unchanged.
    """
    raw_scores, gold_labels = eval_pred
    score_tensor = torch.tensor(raw_scores)
    predicted_classes = torch.argmax(score_tensor, dim=-1)
    return metric.compute(references=gold_labels, predictions=predicted_classes)

# Bundle the model, data splits and metric callback into a Trainer.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword is
# deprecated on Trainer.__init__ and emits a FutureWarning in recent
# transformers releases (requires transformers >= 4.46).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
# Run fine-tuning with the settings in `training_args`; the cell output is the
# per-epoch training/validation loss table.
trainer.train()

Epoch,Training Loss,Validation Loss


In [21]:
# Persist the fine-tuned weights and the tokenizer files side by side in one
# directory so the pair can later be reloaded together.
export_dir = "deberta_seagull_train_0.1_test_1.5"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

('deberta_seagull_train_0.1_test_1.5\\tokenizer_config.json',
 'deberta_seagull_train_0.1_test_1.5\\special_tokens_map.json',
 'deberta_seagull_train_0.1_test_1.5\\spm.model',
 'deberta_seagull_train_0.1_test_1.5\\added_tokens.json',
 'deberta_seagull_train_0.1_test_1.5\\tokenizer.json')

In [29]:
# results = trainer.evaluate()
# print(results)

# For prediction
sample_questions = ["Albert is a salesman"]

# Switch to inference mode so dropout is disabled and predictions are stable.
model.eval()

# Pair the story with every question, using the same length cap as training.
inputs = tokenizer(
    [story] * len(sample_questions),
    sample_questions,
    truncation=True,
    max_length=MAX_LENGTH,
    padding=True,
    return_tensors="pt",
).to(DEVICE)

# No gradients are needed for prediction; skipping them saves memory and time.
with torch.no_grad():
    outputs = model(**inputs)

# Each entry is a class id as defined in CLASSES.
predictions = torch.argmax(outputs.logits, dim=-1)
print(predictions)

tensor([1], device='cuda:0')
