In [79]:
from transformers import DataCollatorForLanguageModeling, AutoTokenizer, Trainer, TrainingArguments

In [80]:
# --- Data: preprocessed dataset location and hold-out split settings ---
DATASET_PROC_PATH = "../../data/pretrain/tiny/wikitext-2-v1-tiny-proc"
TEST_SIZE = 0.2  # test_size of the first split (taken from the full dataset)
VAL_SIZE = 0.2  # test_size of the second split (taken from the post-test remainder)
SPLIT_SEED = 42069  # seed passed to both train_test_split calls

# --- Tokenizer / masking ---
# NOTE(review): the model cell loads the "bert-base-uncased" checkpoint while
# this names an ALBERT tokenizer -- confirm the pairing is intended.
TOKENIZER_NAME = "albert-base-v2"
MLM_PROBABILITY = 0.15  # forwarded as mlm_probability to the data collator

# --- Trainer: output directory, epochs, logging and checkpoint cadence ---
TRAINER_OUTPUT = "../../experiments/checkpoints/tiny/pretrain"
EPOCHS = 1
LOGGING_STEPS = 2
LOGGER_OUTPUT = "../../experiments/logs/tiny/pretrain"
SAVE_STEPS = 2
SAVE_LIMIT = 5

In [81]:
from transformers import BertForMaskedLM

# Model class used further down as MODEL.from_pretrained(...); presumably
# aliased so a different architecture can be substituted in one place.
MODEL = BertForMaskedLM

In [82]:
from datasets import load_from_disk

# Load the dataset stored at DATASET_PROC_PATH. It is later split with
# train_test_split, so it is expected to be a single (unsplit) dataset.
# NOTE(review): presumably already tokenized ("proc") -- confirm which
# tokenizer produced it.
dataset = load_from_disk(DATASET_PROC_PATH)

In [83]:
# Two-stage hold-out: carve off the test split first, then take the validation
# split from what is left. Because VAL_SIZE is applied to the remainder, the
# validation share of the full dataset is VAL_SIZE * (1 - TEST_SIZE).
split_train_test = dataset.train_test_split(test_size=TEST_SIZE, seed=SPLIT_SEED)
split_train_val = split_train_test["train"].train_test_split(
    test_size=VAL_SIZE, seed=SPLIT_SEED
)

dataset_train, dataset_val = split_train_val["train"], split_train_val["test"]
dataset_test = split_train_test["test"]

# Sizes are reported in train, test, val order.
print(*map(len, (dataset_train, dataset_test, dataset_val)))

128 40 32


In [84]:
# Tokenizer chosen by TOKENIZER_NAME; it is handed to the language-modeling
# collator, which is configured for masked LM with MLM_PROBABILITY.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=MLM_PROBABILITY,
)

In [85]:
import warnings

# Checkpoint whose pretrained weights initialise the model (named instead of
# being an inline literal so it can be reported and changed in one place).
MODEL_CHECKPOINT = "bert-base-uncased"

model = MODEL.from_pretrained(MODEL_CHECKPOINT)

# The data collator builds masked inputs and labels with `tokenizer`, so the
# tokenizer has to share the model's vocabulary. Differing vocabulary sizes
# are a cheap, reliable symptom that the two come from different families
# (e.g. a SentencePiece ALBERT tokenizer with WordPiece BERT weights). The
# model is still returned unchanged; this only makes the mismatch visible
# instead of letting it surface as an unexplained, very high training loss.
if len(tokenizer) != model.config.vocab_size:
    warnings.warn(
        f"Tokenizer {TOKENIZER_NAME!r} has {len(tokenizer)} tokens but checkpoint "
        f"{MODEL_CHECKPOINT!r} has vocab_size={model.config.vocab_size}; token ids "
        "are unlikely to line up with the pretrained embeddings."
    )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [86]:
# Training configuration, grouped by concern. All values come from the
# constants cell except the per-device batch size.
training_args = TrainingArguments(
    # Where checkpoints go; an existing directory is overwritten.
    output_dir=TRAINER_OUTPUT,
    overwrite_output_dir=True,
    # Run length and batch size.
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=16,
    prediction_loss_only=False,
    # Step-based logging.
    logging_dir=LOGGER_OUTPUT,
    logging_strategy="steps",
    logging_steps=LOGGING_STEPS,
    # Step-based checkpointing with a cap on retained checkpoints.
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=SAVE_LIMIT,
)

In [87]:
# Assemble the Trainer from the model, its arguments, the MLM collator, and
# the train / validation splits (the test split is not passed in).
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
)

In [88]:
# Run training with the Trainer configured above; the call's return value
# (a TrainOutput) is shown as this cell's output.
trainer.train()

Step,Training Loss
2,12.0874
4,9.0713
6,8.94
8,8.7123


TrainOutput(global_step=8, training_loss=9.702746391296387, metrics={'train_runtime': 330.8131, 'train_samples_per_second': 0.387, 'train_steps_per_second': 0.024, 'total_flos': 8422554009600.0, 'train_loss': 9.702746391296387, 'epoch': 1.0})

In [89]:
# Evaluate the trained model and display the resulting metrics dict. No
# dataset is passed here, so presumably the Trainer's own eval_dataset
# (dataset_val) is used; dataset_test is not evaluated in this notebook.
trainer.evaluate()



{'eval_loss': 8.667927742004395,
 'eval_runtime': 5.7043,
 'eval_samples_per_second': 5.61,
 'eval_steps_per_second': 0.701,
 'epoch': 1.0}