In [1]:
from pathlib import Path

from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

import os

root_path = '/run/media/holmium/DATA/TwitterPLM/'

paths = [os.path.join(root_path, 'line_tmp.txt')]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=8_000, min_frequency=2, special_tokens=[
    '[CLS]', '[SEP]', '[MASK]', '[PAD]', '[UNK]'
])

# Save files to disk
tokenizer.save_model(os.path.join(root_path, 'tokenizer'))

['/run/media/holmium/DATA/TwitterPLM/tokenizer/vocab.json',
 '/run/media/holmium/DATA/TwitterPLM/tokenizer/merges.txt']

In [3]:
from transformers import RobertaForMaskedLM
from transformers import RobertaTokenizerFast
from transformers import RobertaConfig
# tokenizer._tokenizer.post_processor = BertProcessing(
#     ("[SEP]", tokenizer.token_to_id("[SEP]")),
#     ("[CLS]", tokenizer.token_to_id("[CLS]")),
# )
# tokenizer.enable_truncation(max_length=512)
config = RobertaConfig(
    vocab_size=8_000,
    max_position_embeddings=66,
    num_attention_heads=3,
    num_hidden_layers=3,
    type_vocab_size=1,
)
tokenizer = RobertaTokenizerFast.from_pretrained(os.path.join(root_path,'tokenizer'), max_len=128)
model = RobertaForMaskedLM(config=config)
model.num_parameters()
# => 84 million parameters

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


28060736

In [8]:
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=os.path.join(root_path,'line_tmp.txt'),
    block_size=64,
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
training_args = TrainingArguments(
    output_dir=os.path.join(root_path,'model'),
    overwrite_output_dir=True,
    num_train_epochs=30,
    per_gpu_train_batch_size=8,
    save_steps=10,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)



In [9]:
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Step,Training Loss


TrainOutput(global_step=300, training_loss=6.657333577473958, metrics={'train_runtime': 76.3188, 'train_samples_per_second': 3.931, 'total_flos': 25467475021824.0, 'epoch': 30.0})

In [10]:
trainer.save_model(os.path.join(root_path,'model'))

In [14]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model=os.path.join(root_path,'model'),
    tokenizer=tokenizer
)

In [17]:
fill_mask('RT to fap with a nude pussy in DM 💦<mask>💦')

[{'sequence': 'RT to fap with a nude pussy in DM 💦 @💦',
  'score': 0.02807973325252533,
  'token': 265,
  'token_str': ' @'},
 {'sequence': 'RT to fap with a nude pussy in DM 💦 💦',
  'score': 0.027658136561512947,
  'token': 225,
  'token_str': ' '},
 {'sequence': 'RT to fap with a nude pussy in DM 💦.💦',
  'score': 0.021921377629041672,
  'token': 18,
  'token_str': '.'},
 {'sequence': 'RT to fap with a nude pussy in DM 💦/💦',
  'score': 0.01836058869957924,
  'token': 19,
  'token_str': '/'},
 {'sequence': 'RT to fap with a nude pussy in DM 💦:💦',
  'score': 0.015311868861317635,
  'token': 30,
  'token_str': ':'}]