# Train a Transformer Model from Scratch

source: https://huggingface.co/blog/how-to-train

## Housekeeping

In [1]:
import warnings
warnings.filterwarnings('ignore')
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Deep Space 9 Scripts

In [2]:
from pathlib import Path
PATH_DATA = 'ds9_data/'
paths = [str(x) for x in Path(PATH_DATA).glob("**/*.txt")]

In [3]:
import re
with open(PATH_DATA + "all_scripts.txt", "w") as outfile:
    for filename in paths:
        with open(filename) as infile:
            contents = infile.read()
            contents = re.sub("\t",' ',contents)
            contents = re.sub(' +',' ',contents)
            contents = re.sub('\n \n','\n',contents)
            contents = re.sub('\n+','\n',contents)
            outfile.write(contents)

## Train Tokenizer

In [4]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

In [5]:
tokenizer = ByteLevelBPETokenizer()

tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

PATH_MODEL = 'ds9_scratch'
tokenizer.save_model(PATH_MODEL)






['ds9_scratch/vocab.json', 'ds9_scratch/merges.txt']

## Check GPU

In [6]:
!nvidia-smi

Wed Sep 28 06:26:10 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   75C    P0    32W /  70W |      0MiB / 15360MiB |     10%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [7]:
import torch
torch.cuda.is_available()

True

## Train Transformer

### Setup

In [8]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [9]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained(PATH_MODEL, max_len=512)

In [10]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [11]:
model.num_parameters()

83504416

In [12]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=PATH_DATA + 'all_scripts.txt',
    block_size=128,
)

CPU times: user 22.3 s, sys: 1.21 s, total: 23.5 s
Wall time: 23.1 s


In [13]:
print(dataset[301])
tokenizer.decode(dataset[301]["input_ids"])

{'input_ids': tensor([   0,  912,   16,  315, 7944,  284, 3686,  271, 3907,  225,    2])}


'<s> Rom, you forgot to include the profit </s>'

In [14]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

### Train for one epoch

In [23]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=os.path.join(PATH_MODEL, "transformer"),
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=1000,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

In [24]:
# %%time
# trainer.train()

***** Running training *****
  Num examples = 388834
  Num Epochs = 1
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 6076


Step,Training Loss
500,6.7023
1000,5.6158
1500,5.1802
2000,4.8883
2500,4.6897
3000,4.5567
3500,4.4351
4000,4.3003
4500,4.2566
5000,4.2283


Saving model checkpoint to ds9_scratch/transformer/checkpoint-1000
Configuration saved in ds9_scratch/transformer/checkpoint-1000/config.json
Model weights saved in ds9_scratch/transformer/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ds9_scratch/transformer/checkpoint-2000
Configuration saved in ds9_scratch/transformer/checkpoint-2000/config.json
Model weights saved in ds9_scratch/transformer/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ds9_scratch/transformer/checkpoint-3000
Configuration saved in ds9_scratch/transformer/checkpoint-3000/config.json
Model weights saved in ds9_scratch/transformer/checkpoint-3000/pytorch_model.bin
Saving model checkpoint to ds9_scratch/transformer/checkpoint-4000
Configuration saved in ds9_scratch/transformer/checkpoint-4000/config.json
Model weights saved in ds9_scratch/transformer/checkpoint-4000/pytorch_model.bin
Saving model checkpoint to ds9_scratch/transformer/checkpoint-5000
Configuration saved in ds9_scratch/transf

CPU times: user 18min 40s, sys: 6min 43s, total: 25min 23s
Wall time: 25min 24s


TrainOutput(global_step=6076, training_loss=4.755729022349395, metrics={'train_runtime': 1524.2226, 'train_samples_per_second': 255.103, 'train_steps_per_second': 3.986, 'total_flos': 2276569665816576.0, 'train_loss': 4.755729022349395, 'epoch': 1.0})

In [25]:
trainer.save_model(os.path.join(PATH_MODEL, "epoch_1"))

Saving model checkpoint to ds9_scratch/epoch_1
Configuration saved in ds9_scratch/epoch_1/config.json
Model weights saved in ds9_scratch/epoch_1/pytorch_model.bin


### Test model after one epoch

In [46]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model=os.path.join(PATH_MODEL, "epoch_1"),
    tokenizer=os.path.join(PATH_MODEL, "epoch_1")
)

loading configuration file ds9_scratch/epoch_1/config.json
Model config RobertaConfig {
  "_name_or_path": "ds9_scratch/epoch_1",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.21.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}

loading configuration file ds9_scratch/epoch_1/config.json
Model config RobertaConfig {
  "_name_or_path": "ds9_scratch/epoch_1",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id

In [47]:
fill_mask("He is drinking a cup of <mask>.")

[{'score': 0.022676831111311913,
  'token': 355,
  'token_str': ' it',
  'sequence': 'He is drinking a cup of it.'},
 {'score': 0.017718225717544556,
  'token': 578,
  'token_str': ' them',
  'sequence': 'He is drinking a cup of them.'},
 {'score': 0.013246491551399231,
  'token': 434,
  'token_str': ' him',
  'sequence': 'He is drinking a cup of him.'},
 {'score': 0.008509969338774681,
  'token': 643,
  'token_str': ' us',
  'sequence': 'He is drinking a cup of us.'},
 {'score': 0.0075829545967280865,
  'token': 468,
  'token_str': ' this',
  'sequence': 'He is drinking a cup of this.'}]

In [48]:
fill_mask("Some beeps and the image of Gul <mask> appears on the monitors")

[{'score': 0.18649062514305115,
  'token': 16,
  'token_str': ',',
  'sequence': 'Some beeps and the image of Gul, appears on the monitors'},
 {'score': 0.07210240513086319,
  'token': 321,
  'token_str': ' and',
  'sequence': 'Some beeps and the image of Gul and appears on the monitors'},
 {'score': 0.03821699693799019,
  'token': 349,
  'token_str': ' is',
  'sequence': 'Some beeps and the image of Gul is appears on the monitors'},
 {'score': 0.018567854538559914,
  'token': 318,
  'token_str': "'s",
  'sequence': "Some beeps and the image of Gul's appears on the monitors"},
 {'score': 0.010578799061477184,
  'token': 284,
  'token_str': ' to',
  'sequence': 'Some beeps and the image of Gul to appears on the monitors'}]

In [49]:
fill_mask("Two to <mask> up.")

[{'score': 0.030569445341825485,
  'token': 557,
  'token_str': ' get',
  'sequence': 'Two to get up.'},
 {'score': 0.026128537952899933,
  'token': 329,
  'token_str': ' be',
  'sequence': 'Two to be up.'},
 {'score': 0.02502203918993473,
  'token': 448,
  'token_str': ' go',
  'sequence': 'Two to go up.'},
 {'score': 0.02097230590879917,
  'token': 376,
  'token_str': ' do',
  'sequence': 'Two to do up.'},
 {'score': 0.020337576046586037,
  'token': 591,
  'token_str': ' see',
  'sequence': 'Two to see up.'}]

In [51]:
fill_mask("Commander <mask> is sitting behind his desk.")

[{'score': 0.061355382204055786,
  'token': 473,
  'token_str': ' Sisko',
  'sequence': 'Commander Sisko is sitting behind his desk.'},
 {'score': 0.02937745302915573,
  'token': 588,
  'token_str': ' Bashir',
  'sequence': 'Commander Bashir is sitting behind his desk.'},
 {'score': 0.022974304854869843,
  'token': 355,
  'token_str': ' it',
  'sequence': 'Commander it is sitting behind his desk.'},
 {'score': 0.022798290476202965,
  'token': 564,
  'token_str': ' Odo',
  'sequence': 'Commander Odo is sitting behind his desk.'},
 {'score': 0.022759979590773582,
  'token': 317,
  'token_str': ' he',
  'sequence': 'Commander he is sitting behind his desk.'}]

In [21]:
fill_mask("The Ferengis follow the Rules of <mask>.")

[{'score': 0.03109504096210003,
  'token': 355,
  'token_str': ' it',
  'sequence': 'The Ferengis follow the Rules of it.'},
 {'score': 0.029616249725222588,
  'token': 578,
  'token_str': ' them',
  'sequence': 'The Ferengis follow the Rules of them.'},
 {'score': 0.022989286109805107,
  'token': 434,
  'token_str': ' him',
  'sequence': 'The Ferengis follow the Rules of him.'},
 {'score': 0.014342641457915306,
  'token': 418,
  'token_str': ' me',
  'sequence': 'The Ferengis follow the Rules of me.'},
 {'score': 0.012819110415875912,
  'token': 314,
  'token_str': ' you',
  'sequence': 'The Ferengis follow the Rules of you.'}]

In [22]:
fill_mask("In the Bajoran religion, the <mask> is worshipped as the Celestial Temple of the Prophets")

[{'score': 0.020489269867539406,
  'token': 792,
  'token_str': ' door',
  'sequence': 'In the Bajoran religion, the door is worshipped as the Celestial Temple of the Prophets'},
 {'score': 0.016336755827069283,
  'token': 982,
  'token_str': ' room',
  'sequence': 'In the Bajoran religion, the room is worshipped as the Celestial Temple of the Prophets'},
 {'score': 0.011151541955769062,
  'token': 830,
  'token_str': ' station',
  'sequence': 'In the Bajoran religion, the station is worshipped as the Celestial Temple of the Prophets'},
 {'score': 0.011117598041892052,
  'token': 752,
  'token_str': ' ship',
  'sequence': 'In the Bajoran religion, the ship is worshipped as the Celestial Temple of the Prophets'},
 {'score': 0.0093994140625,
  'token': 1173,
  'token_str': ' bar',
  'sequence': 'In the Bajoran religion, the bar is worshipped as the Celestial Temple of the Prophets'}]

### Train for 4 more epochs

In [41]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=os.path.join(PATH_MODEL, "transformer"),
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=64,
    save_steps=1000,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [42]:
# %%time
# trainer.train(os.path.join(PATH_MODEL, "transformer", "checkpoint-6000"))

Loading model from ds9_scratch/transformer/checkpoint-6000.
***** Running training *****
  Num examples = 388834
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 30380
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 0
  Continuing training from global step 6000
  Will skip the first 0 epochs then the first 6000 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/6000 [00:00<?, ?it/s]

Step,Training Loss
6500,4.1857
7000,4.0597
7500,4.0097
8000,3.9526
8500,3.9137
9000,3.8545
9500,3.7929
10000,3.7932
10500,3.783
11000,3.6872


Saving model checkpoint to ds9_scratch/transformer/checkpoint-7000
Configuration saved in ds9_scratch/transformer/checkpoint-7000/config.json
Model weights saved in ds9_scratch/transformer/checkpoint-7000/pytorch_model.bin
Saving model checkpoint to ds9_scratch/transformer/checkpoint-8000
Configuration saved in ds9_scratch/transformer/checkpoint-8000/config.json
Model weights saved in ds9_scratch/transformer/checkpoint-8000/pytorch_model.bin
Saving model checkpoint to ds9_scratch/transformer/checkpoint-9000
Configuration saved in ds9_scratch/transformer/checkpoint-9000/config.json
Model weights saved in ds9_scratch/transformer/checkpoint-9000/pytorch_model.bin
Saving model checkpoint to ds9_scratch/transformer/checkpoint-10000
Configuration saved in ds9_scratch/transformer/checkpoint-10000/config.json
Model weights saved in ds9_scratch/transformer/checkpoint-10000/pytorch_model.bin
Saving model checkpoint to ds9_scratch/transformer/checkpoint-11000
Configuration saved in ds9_scratch/tr

CPU times: user 1h 14min 28s, sys: 27min 27s, total: 1h 41min 56s
Wall time: 1h 41min 48s


TrainOutput(global_step=30380, training_loss=2.769782796009659, metrics={'train_runtime': 6107.9702, 'train_samples_per_second': 318.301, 'train_steps_per_second': 4.974, 'total_flos': 1.1357226424147968e+16, 'train_loss': 2.769782796009659, 'epoch': 5.0})

In [43]:
trainer.save_model(os.path.join(PATH_MODEL, "epoch_5"))

Saving model checkpoint to ds9_scratch/epoch_5
Configuration saved in ds9_scratch/epoch_5/config.json
Model weights saved in ds9_scratch/epoch_5/pytorch_model.bin


### Test model again after 5 epochs

In [52]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model=os.path.join(PATH_MODEL, "epoch_5"),
    tokenizer=os.path.join(PATH_MODEL, "epoch_5")
)

loading configuration file ds9_scratch/epoch_5/config.json
Model config RobertaConfig {
  "_name_or_path": "ds9_scratch/epoch_5",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.21.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}

loading configuration file ds9_scratch/epoch_5/config.json
Model config RobertaConfig {
  "_name_or_path": "ds9_scratch/epoch_5",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id

In [24]:
fill_mask("He is drinking a cup of <mask>.")

[{'score': 0.019097313284873962,
  'token': 578,
  'token_str': ' them',
  'sequence': 'He is drinking a cup of them.'},
 {'score': 0.017102451995015144,
  'token': 434,
  'token_str': ' him',
  'sequence': 'He is drinking a cup of him.'},
 {'score': 0.013738157227635384,
  'token': 355,
  'token_str': ' it',
  'sequence': 'He is drinking a cup of it.'},
 {'score': 0.010209030471742153,
  'token': 1777,
  'token_str': ' pain',
  'sequence': 'He is drinking a cup of pain.'},
 {'score': 0.007656184956431389,
  'token': 436,
  'token_str': ' her',
  'sequence': 'He is drinking a cup of her.'}]

In [25]:
fill_mask("Some beeps and the image of Gul <mask> appears on the monitors")

[{'score': 0.09750576317310333,
  'token': 473,
  'token_str': ' Sisko',
  'sequence': 'Some beeps and the image of Gul Sisko appears on the monitors'},
 {'score': 0.08002620190382004,
  'token': 564,
  'token_str': ' Odo',
  'sequence': 'Some beeps and the image of Gul Odo appears on the monitors'},
 {'score': 0.07750523090362549,
  'token': 588,
  'token_str': ' Bashir',
  'sequence': 'Some beeps and the image of Gul Bashir appears on the monitors'},
 {'score': 0.06540872901678085,
  'token': 587,
  'token_str': ' Quark',
  'sequence': 'Some beeps and the image of Gul Quark appears on the monitors'},
 {'score': 0.06175588071346283,
  'token': 584,
  'token_str': ' Kira',
  'sequence': 'Some beeps and the image of Gul Kira appears on the monitors'}]

In [26]:
fill_mask("Two to <mask> up.")

[{'score': 0.057491105049848557,
  'token': 557,
  'token_str': ' get',
  'sequence': 'Two to get up.'},
 {'score': 0.03225880116224289,
  'token': 1420,
  'token_str': ' pick',
  'sequence': 'Two to pick up.'},
 {'score': 0.027884643524885178,
  'token': 1009,
  'token_str': ' come',
  'sequence': 'Two to come up.'},
 {'score': 0.025538945570588112,
  'token': 450,
  'token_str': ' look',
  'sequence': 'Two to look up.'},
 {'score': 0.02337310090661049,
  'token': 1116,
  'token_str': ' keep',
  'sequence': 'Two to keep up.'}]

In [53]:
fill_mask("Commander <mask> is sitting behind his desk.")

[{'score': 0.07920032739639282,
  'token': 473,
  'token_str': ' Sisko',
  'sequence': 'Commander Sisko is sitting behind his desk.'},
 {'score': 0.07390900701284409,
  'token': 588,
  'token_str': ' Bashir',
  'sequence': 'Commander Bashir is sitting behind his desk.'},
 {'score': 0.06426364928483963,
  'token': 587,
  'token_str': ' Quark',
  'sequence': 'Commander Quark is sitting behind his desk.'},
 {'score': 0.042869631201028824,
  'token': 564,
  'token_str': ' Odo',
  'sequence': 'Commander Odo is sitting behind his desk.'},
 {'score': 0.03325772285461426,
  'token': 584,
  'token_str': ' Kira',
  'sequence': 'Commander Kira is sitting behind his desk.'}]

In [28]:
fill_mask("The Ferengis follow the Rules of <mask>.")

[{'score': 0.15817765891551971,
  'token': 578,
  'token_str': ' them',
  'sequence': 'The Ferengis follow the Rules of them.'},
 {'score': 0.05988244712352753,
  'token': 314,
  'token_str': ' you',
  'sequence': 'The Ferengis follow the Rules of you.'},
 {'score': 0.05488232523202896,
  'token': 355,
  'token_str': ' it',
  'sequence': 'The Ferengis follow the Rules of it.'},
 {'score': 0.03678121045231819,
  'token': 643,
  'token_str': ' us',
  'sequence': 'The Ferengis follow the Rules of us.'},
 {'score': 0.03402514010667801,
  'token': 434,
  'token_str': ' him',
  'sequence': 'The Ferengis follow the Rules of him.'}]

In [29]:
fill_mask("In the Bajoran religion, the <mask> is worshipped as the Celestial Temple of the Prophets")

[{'score': 0.05264582857489586,
  'token': 752,
  'token_str': ' ship',
  'sequence': 'In the Bajoran religion, the ship is worshipped as the Celestial Temple of the Prophets'},
 {'score': 0.04383635148406029,
  'token': 982,
  'token_str': ' room',
  'sequence': 'In the Bajoran religion, the room is worshipped as the Celestial Temple of the Prophets'},
 {'score': 0.01855940744280815,
  'token': 1173,
  'token_str': ' bar',
  'sequence': 'In the Bajoran religion, the bar is worshipped as the Celestial Temple of the Prophets'},
 {'score': 0.01752207800745964,
  'token': 1505,
  'token_str': ' Defiant',
  'sequence': 'In the Bajoran religion, the Defiant is worshipped as the Celestial Temple of the Prophets'},
 {'score': 0.01479556504637003,
  'token': 792,
  'token_str': ' door',
  'sequence': 'In the Bajoran religion, the door is worshipped as the Celestial Temple of the Prophets'}]

### Train some more

In [15]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=os.path.join(PATH_MODEL, "transformer"),
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=64,
    save_steps=1000,
    prediction_loss_only=True,
    save_total_limit=5
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

In [16]:
%%time
trainer.train(os.path.join(PATH_MODEL, "transformer", "checkpoint-81000"))

Loading model from ds9_scratch/transformer/checkpoint-81000.
***** Running training *****
  Num examples = 522889
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 81710
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 9
  Continuing training from global step 81000
  Will skip the first 9 epochs then the first 7461 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/7461 [00:00<?, ?it/s]

Step,Training Loss
81500,3.2376




Training completed. Do not forget to share your model on huggingface.co/models =)




CPU times: user 2min 32s, sys: 47.7 s, total: 3min 20s
Wall time: 3min 19s


TrainOutput(global_step=81710, training_loss=0.028166226928695998, metrics={'train_runtime': 199.225, 'train_samples_per_second': 26246.147, 'train_steps_per_second': 410.139, 'total_flos': 3.059910036677645e+16, 'train_loss': 0.028166226928695998, 'epoch': 10.0})

In [18]:
trainer.save_model(os.path.join(PATH_MODEL, "epoch_10"))

Saving model checkpoint to ds9_scratch/epoch_10
Configuration saved in ds9_scratch/epoch_10/config.json
Model weights saved in ds9_scratch/epoch_10/pytorch_model.bin


### Test model again after 10 epochs

In [7]:
import os
from transformers import pipeline
PATH_MODEL = 'ds9_scratch'

fill_mask = pipeline(
    "fill-mask",
    model=os.path.join(PATH_MODEL, "epoch_10"),
    tokenizer=os.path.join(PATH_MODEL, "epoch_10")
)

In [8]:
fill_mask("He is drinking a cup of <mask>.")

[{'score': 0.13619180023670197,
  'token': 4305,
  'token_str': ' coffee',
  'sequence': 'He is drinking a cup of coffee.'},
 {'score': 0.03757583349943161,
  'token': 2394,
  'token_str': ' food',
  'sequence': 'He is drinking a cup of food.'},
 {'score': 0.022191287949681282,
  'token': 2319,
  'token_str': ' latinum',
  'sequence': 'He is drinking a cup of latinum.'},
 {'score': 0.018649088218808174,
  'token': 5816,
  'token_str': ' tea',
  'sequence': 'He is drinking a cup of tea.'},
 {'score': 0.014687449671328068,
  'token': 6112,
  'token_str': ' raktajino',
  'sequence': 'He is drinking a cup of raktajino.'}]

In [9]:
fill_mask("Some beeps and the image of Gul <mask> appears on the monitors")

[{'score': 0.6340900659561157,
  'token': 1018,
  'token_str': ' Dukat',
  'sequence': 'Some beeps and the image of Gul Dukat appears on the monitors'},
 {'score': 0.005333753302693367,
  'token': 2378,
  'token_str': ' Winn',
  'sequence': 'Some beeps and the image of Gul Winn appears on the monitors'},
 {'score': 0.004754691384732723,
  'token': 10674,
  'token_str': ' Toran',
  'sequence': 'Some beeps and the image of Gul Toran appears on the monitors'},
 {'score': 0.0038876133039593697,
  'token': 565,
  'token_str': ' Kira',
  'sequence': 'Some beeps and the image of Gul Kira appears on the monitors'},
 {'score': 0.0031886452343314886,
  'token': 9628,
  'token_str': ' Pran',
  'sequence': 'Some beeps and the image of Gul Pran appears on the monitors'}]

In [10]:
fill_mask("Two to <mask> up.")

[{'score': 0.08224309980869293,
  'token': 2170,
  'token_str': ' beam',
  'sequence': 'Two to beam up.'},
 {'score': 0.06752756983041763,
  'token': 1011,
  'token_str': ' come',
  'sequence': 'Two to come up.'},
 {'score': 0.054405760020017624,
  'token': 1296,
  'token_str': ' give',
  'sequence': 'Two to give up.'},
 {'score': 0.05349339172244072,
  'token': 1351,
  'token_str': ' set',
  'sequence': 'Two to set up.'},
 {'score': 0.04312761873006821,
  'token': 1447,
  'token_str': ' pick',
  'sequence': 'Two to pick up.'}]

In [11]:
fill_mask("Commander <mask> is sitting behind his desk.")

[{'score': 0.11994579434394836,
  'token': 465,
  'token_str': ' Sisko',
  'sequence': 'Commander Sisko is sitting behind his desk.'},
 {'score': 0.08897235989570618,
  'token': 595,
  'token_str': ' Bashir',
  'sequence': 'Commander Bashir is sitting behind his desk.'},
 {'score': 0.06675644963979721,
  'token': 572,
  'token_str': ' Quark',
  'sequence': 'Commander Quark is sitting behind his desk.'},
 {'score': 0.06578808277845383,
  'token': 556,
  'token_str': ' Odo',
  'sequence': 'Commander Odo is sitting behind his desk.'},
 {'score': 0.05795513466000557,
  'token': 471,
  'token_str': ' SISKO',
  'sequence': 'Commander SISKO is sitting behind his desk.'}]

In [12]:
fill_mask("The Ferengis follow the Rules of <mask>.")

[{'score': 0.5495705604553223,
  'token': 3570,
  'token_str': ' Acquisition',
  'sequence': 'The Ferengis follow the Rules of Acquisition.'},
 {'score': 0.022009525448083878,
  'token': 5141,
  'token_str': ' Mogh',
  'sequence': 'The Ferengis follow the Rules of Mogh.'},
 {'score': 0.01585402898490429,
  'token': 2319,
  'token_str': ' latinum',
  'sequence': 'The Ferengis follow the Rules of latinum.'},
 {'score': 0.01244649849832058,
  'token': 583,
  'token_str': ' them',
  'sequence': 'The Ferengis follow the Rules of them.'},
 {'score': 0.00935013685375452,
  'token': 1340,
  'token_str': ' course',
  'sequence': 'The Ferengis follow the Rules of course.'}]

In [14]:
fill_mask("""In the Bajoran religion, the <mask> is worshipped 
             as the Celestial Temple of the Prophets""")

[{'score': 0.0455084890127182,
  'token': 784,
  'token_str': ' man',
  'sequence': 'In the Bajoran religion, the man is worshipped \n             as the Celestial Temple of the Prophets'},
 {'score': 0.0448014922440052,
  'token': 749,
  'token_str': ' ship',
  'sequence': 'In the Bajoran religion, the ship is worshipped \n             as the Celestial Temple of the Prophets'},
 {'score': 0.03111078403890133,
  'token': 970,
  'token_str': ' room',
  'sequence': 'In the Bajoran religion, the room is worshipped \n             as the Celestial Temple of the Prophets'},
 {'score': 0.023898139595985413,
  'token': 1832,
  'token_str': ' crowd',
  'sequence': 'In the Bajoran religion, the crowd is worshipped \n             as the Celestial Temple of the Prophets'},
 {'score': 0.0226119477301836,
  'token': 2770,
  'token_str': ' Kai',
  'sequence': 'In the Bajoran religion, the Kai is worshipped \n             as the Celestial Temple of the Prophets'}]

### Tensorboard

run in terminal

tensorboard dev upload --logdir ds9_scratch/transformer/runs
https://tensorboard.dev/experiment/Vo0w6k22TRacVuzZkogjrg