In [2]:
from transformers import DataCollatorForLanguageModeling, AutoTokenizer, Trainer, TrainingArguments

In [3]:
# --- Data ---------------------------------------------------------------
# Directory of the preprocessed dataset, read with load_from_disk.
DATASET_PROC_PATH = "../../data/finetune/tiny/QS-OCR-small-raw-proc"
# Fraction of the full dataset held out as the test split.
TEST_SIZE = 0.2
# Fraction held out for validation. It is applied to the rows that remain
# after the test split, not to the full dataset.
VAL_SIZE = 0.2
# Seed passed to both train_test_split calls.
SPLIT_SEED = 42069

# NOTE(review): not referenced by any cell visible in this notebook - confirm
# whether it is still needed.
TOKENIZER_NAME = "albert-base-v2"

# --- Training -----------------------------------------------------------
# Checkpoint directory the classification model is loaded from (from_pretrained).
PRETRAINED_OUTPUT = "../../experiments/checkpoints/tiny/pretrain/full"
# Passed to TrainingArguments as num_train_epochs.
EPOCHS = 1
# Passed to TrainingArguments as logging_steps (logging_strategy is "steps").
LOGGING_STEPS = 2
# Passed to TrainingArguments as logging_dir.
LOGGER_OUTPUT = "../../experiments/logs/tiny/finetune"
# Passed to TrainingArguments as save_steps (save_strategy is "steps").
SAVE_STEPS = 2
# Passed to TrainingArguments as save_total_limit.
SAVE_LIMIT = 5
# Passed to TrainingArguments as output_dir; also the parent directory of the
# "/full" export and of the "/checkpoint-2" directory loaded later on.
TRAINER_OUTPUT = "../../experiments/checkpoints/tiny/finetune"

In [4]:
from datasets import load_from_disk

# Read the preprocessed dataset back from its on-disk location.
dataset = load_from_disk(dataset_path=DATASET_PROC_PATH)

In [5]:
from datasets import ClassLabel

# Encode the "label" column as a ClassLabel feature so later cells can read
# `num_classes` from it.
#
# This cell rebinds `dataset`, so it is guarded to stay idempotent: once the
# column has been cast, reading it yields integer ids, and rebuilding the
# feature from those ids would discard the original class names.
label_feature = dataset.features["label"]
if isinstance(label_feature, ClassLabel):
    # Already cast (e.g. the cell was re-run): reuse the existing feature.
    classlabel = label_feature
    unique_labels = list(classlabel.names)
else:
    # Sorted so the name -> id mapping does not depend on row order.
    unique_labels = sorted(set(dataset["label"]))
    classlabel = ClassLabel(num_classes=len(unique_labels), names=unique_labels)
    dataset = dataset.cast_column("label", classlabel)

print("Unique labels:", unique_labels)

Unique labels: ['email', 'form', 'letter', 'memo', 'news', 'note', 'report', 'resume', 'scientific']


In [6]:
# Hold out the test split first, then carve the validation split out of what
# is left, so VAL_SIZE is a fraction of the remaining rows rather than of the
# full dataset.
#
# Both splits are stratified on "label" so each class keeps its proportion in
# train / validation / test. This needs the column to be a ClassLabel, which
# the cast in the previous cell provides.
# NOTE: stratifying changes which rows land in each split compared with the
# earlier unstratified split (the split sizes are still driven by TEST_SIZE
# and VAL_SIZE).
split_train_test = dataset.train_test_split(
    test_size=TEST_SIZE,
    seed=SPLIT_SEED,
    stratify_by_column="label",
)
dataset_test = split_train_test["test"]

split_train_val = split_train_test["train"].train_test_split(
    test_size=VAL_SIZE,
    seed=SPLIT_SEED,
    stratify_by_column="label",
)
dataset_train = split_train_val["train"]
dataset_val = split_train_val["test"]

# The three splits must partition the dataset: no row dropped or duplicated.
assert len(dataset_train) + len(dataset_val) + len(dataset_test) == len(dataset)

# Sizes in the order train, test, val, followed by the number of distinct
# classes present in the training split.
print(len(dataset_train), len(dataset_test), len(dataset_val), len(set(dataset_train["label"])))

2080 651 521 9


In [7]:
from heliumbert import HeliumbertForSequenceClassificatio

# One output class per entry of the dataset's ClassLabel feature.
label_count = dataset.features["label"].num_classes

# Start from the pretraining checkpoint. The captured output below reports
# classifier.weight / classifier.bias as newly initialised.
model = HeliumbertForSequenceClassificatio.from_pretrained(PRETRAINED_OUTPUT, num_labels=label_count)

Some weights of HeliumbertForSequenceClassificatio were not initialized from the model checkpoint at ../../experiments/checkpoints/tiny/pretrain/full and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
import evaluate
import numpy as np

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the loaded accuracy metric.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores, with classes on the last axis; it may also be a
            tuple whose first element is that score array.
        
    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        compared against ``labels``.
    """
    logits, labels = eval_pred
    # If the model emits extra outputs next to the logits, the predictions
    # arrive as a tuple; np.argmax over such a tuple would fail, so keep only
    # the first element.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels)

In [None]:
# Hard cap on the number of training steps. A positive max_steps takes
# precedence over num_train_epochs, so EPOCHS has no effect while this cap is
# set; use -1 (the library default) to train for EPOCHS epochs instead.
MAX_STEPS = 10
# Training batch size per device.
TRAIN_BATCH_SIZE = 16

training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir=TRAINER_OUTPUT,
    overwrite_output_dir=True,

    # Training length: max_steps wins over num_train_epochs when positive.
    num_train_epochs=EPOCHS,
    max_steps=MAX_STEPS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,

    # Keep predictions (not only the loss) during evaluation.
    prediction_loss_only=False,

    # Log every LOGGING_STEPS steps into LOGGER_OUTPUT.
    logging_strategy="steps",
    logging_steps=LOGGING_STEPS,
    logging_dir=LOGGER_OUTPUT,

    # Checkpoint every SAVE_STEPS steps, keeping at most SAVE_LIMIT of them.
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=SAVE_LIMIT,
)

In [12]:
# Wire the model, the training configuration, the train/validation splits and
# the accuracy callback into a single Trainer.
# NOTE(review): no data_collator or tokenizer is passed here; presumably the
# dataset on disk is already tokenized - confirm.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
)

In [13]:
# Fine-tune the model with `training_args`: the loss is logged every
# LOGGING_STEPS steps and a checkpoint is written under TRAINER_OUTPUT every
# SAVE_STEPS steps. The captured output below ends in KeyboardInterrupt.
trainer.train()



Step,Training Loss
2,2.2153
4,2.0041
6,1.9948


KeyboardInterrupt: 

In [None]:
# Evaluate the current model on the trainer's eval_dataset (dataset_val).
# NOTE(review): the captured output below has no accuracy entry and the cell
# has no execution count, so it looks like it predates the current trainer
# setup - re-run to refresh it.
trainer.evaluate()



{'eval_loss': 362.9653625488281,
 'eval_runtime': 7.7303,
 'eval_samples_per_second': 4.14,
 'eval_steps_per_second': 0.517,
 'epoch': 1.0}

In [None]:
# Export the trainer's current model to the "full" directory under TRAINER_OUTPUT.
trainer.save_model(f"{TRAINER_OUTPUT}/full")

In [14]:
# Reload the weights saved in the checkpoint-2 directory under TRAINER_OUTPUT.
resumed_model = HeliumbertForSequenceClassificatio.from_pretrained(f"{TRAINER_OUTPUT}/checkpoint-2")

# Fresh Trainer around the reloaded model; arguments, splits and metric
# callback are the same objects used for the first trainer.
trainer = Trainer(
    args=training_args,
    model=resumed_model,
    compute_metrics=compute_metrics,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
)

In [15]:
# Resume from the same checkpoint the model above was loaded from.
# Passing `True` would make the Trainer pick the *last* checkpoint found in
# output_dir, which is a later one (not checkpoint-2) whenever the earlier run
# got past step 2, so the weights loaded into `resumed_model` and the restored
# optimizer/scheduler state would come from different checkpoints. An explicit
# path removes that ambiguity.
trainer.train(resume_from_checkpoint=TRAINER_OUTPUT + "/checkpoint-2")



Step,Training Loss


KeyboardInterrupt: 