In [1]:
#@title Step 2:Installing Hugging Face Transformers

# Decided to Keep Tensorflow - feel free to delete if it gives any issues

# Install 'transformers' from master
# !pip install git+https://github.com/huggingface/transformers
# !pip install git+https://github.com/huggingface/accelerate
# !pip list | grep -E 'transformers|tokenizers'
# transformers version at notebook update --- 2.9.1
# tokenizers version at notebook update --- 0.7.0
!pip install -U transformers
!pip install -U accelerate



[0m

In [2]:
#@ Main Imports for functionality
import os
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

import transformers
import accelerate

from transformers import Trainer, TrainingArguments
from transformers import pipeline
from transformers import DataCollatorForLanguageModeling
from transformers import LineByLineTextDataset
from transformers import RobertaForMaskedLM
from transformers import RobertaTokenizer
from transformers import RobertaConfig


In [3]:
#@title Step 3: Training a Tokenizer
%%time

paths = [str(x) for x in Path(".").glob("**/*.txt")]
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()
# Customize training
tokenizer.train(files=paths, vocab_size=52_000,
min_frequency=2, special_tokens=[
"<s>",
"<pad>",
"</s>",
"<unk>",
"<mask>",
])

CPU times: user 5.74 s, sys: 251 ms, total: 5.99 s
Wall time: 5.11 s


In [4]:
#@title Step 4: Saving the files to disk

token_dir = '/content/KantaiBERT'
if not os.path.exists(token_dir):
  os.makedirs(token_dir)
tokenizer.save_model('KantaiBERT')

['KantaiBERT/vocab.json', 'KantaiBERT/merges.txt']

In [5]:
#@title Step 5 Loading the Trained Tokenizer Files
tokenizer = ByteLevelBPETokenizer("./KantaiBERT/vocab.json", "./KantaiBERT/merges.txt" )
tokenizer.encode("The Critique of Pure Reason.")

Encoding(num_tokens=6, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [15]:
#@title Step 7: Defining the configuration of the Model

Original Configuration
config = RobertaConfig(
vocab_size=52_000,
max_position_embeddings=514,
num_attention_heads=12,
num_hidden_layers=6,
type_vocab_size=1,
)

In [16]:
#@title Step 8: Re-creating the Tokenizer in Transformers
tokenizer = RobertaTokenizer.from_pretrained("./KantaiBERT", max_length=512)

In [17]:
#@title Step 9: Initializing a Model From Scratch

model = RobertaForMaskedLM(config=config)
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(26000, 768, padding_idx=1)
      (position_embeddings): Embedding(257, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-2): 3 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [18]:
#@title Step 10: Building the Dataset
%%time
dataset = LineByLineTextDataset(
tokenizer=tokenizer,
file_path="./kant.txt",
block_size=128,
)



CPU times: user 16.3 s, sys: 146 ms, total: 16.4 s
Wall time: 16.6 s


In [19]:
#@title Step 11: Defining a Data Collator
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [20]:
#@title Step 12: Initializing the Trainer

training_args = TrainingArguments(
output_dir="./KantaiBERT",
overwrite_output_dir=True,
num_train_epochs=1,
per_device_train_batch_size=64,
save_steps=10_000,
save_total_limit=2,
)

trainer = Trainer(
model=model,
args=training_args,
data_collator=data_collator,
train_dataset=dataset,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [21]:
#@title Step 13: Pre-training the Model
%%time
trainer.train()

Step,Training Loss


KeyboardInterrupt: 

In [14]:
#@title Step 14: Saving the Final Model(+tokenizer + config) to
disk
trainer.save_model("./KantaiBERT")

NameError: name 'disk' is not defined

In [None]:
#@title Step 15: Language Modeling with
the FillMaskPipeline
fill_mask = pipeline(
"fill-mask",
model="./KantaiBERT",
tokenizer="./KantaiBERT"
)

In [None]:
fill_mask("Human thinking involves human <mask>.")