In [1]:
from transformers import DataCollatorForLanguageModeling, AutoTokenizer, Trainer, TrainingArguments

In [2]:
DATASET_PROC_PATH = "../../data/finetune/base/QS-OCR-Large-raw-proc"
TEST_SIZE = 0.2
VAL_SIZE = 0.2
SPLIT_SEED = 42069

TOKENIZER_NAME = "albert-base-v2"

PRETRAINED_OUTPUT = "../../experiments/checkpoints/base/pretrain/full"
EPOCHS = 1
LOGGING_STEPS = 2
LOGGER_OUTPUT = "../../experiments/logs/base/finetune"
SAVE_STEPS = 2
SAVE_LIMIT = 5
TRAINER_OUTPUT = "../../experiments/checkpoints/base/finetune"

In [3]:
from datasets import load_from_disk

dataset = load_from_disk(DATASET_PROC_PATH)

In [5]:
dataset_test = dataset["test"]
dataset_train = dataset["train"]
dataset_val = dataset["validation"]

print(len(dataset_train), len(dataset_test), len(dataset_val))

308026 38520 38498


In [6]:
from heliumbert import HeliumbertForSequenceClassification

model = HeliumbertForSequenceClassification.from_pretrained(
    PRETRAINED_OUTPUT,
    num_labels=dataset["train"].features["label"].num_classes
)

HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '../../experiments/checkpoints/base/pretrain/full'. Use `repo_type` argument if needed.

In [8]:
import evaluate
import numpy as np

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels)

In [None]:
training_args = TrainingArguments(
    output_dir=TRAINER_OUTPUT,
    overwrite_output_dir=True,

    num_train_epochs=EPOCHS,
    prediction_loss_only=False,

    per_device_train_batch_size=16,

    logging_strategy="steps",
    logging_steps=LOGGING_STEPS,
    logging_dir=LOGGER_OUTPUT,

    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=SAVE_LIMIT,
    
    max_steps = 10
)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics
)

In [13]:
trainer.train()



Step,Training Loss
2,2.2153
4,2.0041
6,1.9948


KeyboardInterrupt: 

In [None]:
trainer.evaluate()



{'eval_loss': 362.9653625488281,
 'eval_runtime': 7.7303,
 'eval_samples_per_second': 4.14,
 'eval_steps_per_second': 0.517,
 'epoch': 1.0}

In [None]:
trainer.save_model(TRAINER_OUTPUT + "/full")

In [14]:
resumed_model = HeliumbertForSequenceClassificatio.from_pretrained(TRAINER_OUTPUT + "/checkpoint-2")

trainer = Trainer(
    model=resumed_model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics
)

In [15]:
trainer.train(resume_from_checkpoint=True)



Step,Training Loss


KeyboardInterrupt: 