In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tokenizers import BertWordPieceTokenizer
from transformers import AlbertConfig
from transformers import AlbertForMaskedLM
from transformers import BertTokenizerFast
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import tensorboard


## 分词器训练

In [2]:
files = "./text.txt" # 训练文本文件
vocab_size = 20000
min_frequency = 1
limit_alphabet = 20000
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] #适用于Bert和Albert

# Initialize a tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
)

# Customize training
tokenizer.train(
    files,
    vocab_size = vocab_size,
    min_frequency=min_frequency,
    show_progress=True,
    special_tokens=special_tokens,
    limit_alphabet=limit_alphabet,
    wordpieces_prefix="##"
    )
    
# !mkdir tokenizer
tokenizer.save_model("tokenizer")  # to vocab.txt
tokenizer.save("./tokenizer/tokenizer")

## 模型配置

In [3]:
config = AlbertConfig(
    vocab_size = 20000,
    embedding_size = 256,
    hidden_size = 768,
    num_hidden_layers = 6,
    num_attention_heads = 12,
    intermediate_size = 3072,
    hidden_act = "gelu",
    hidden_dropout_prob = 0.1,
    attention_probs_dropout_prob = 0.1,
    max_position_embeddings = 512,
    type_vocab_size = 2,
    initializer_range = 0.02,
    layer_norm_eps = 1e-12,
)

model = AlbertForMaskedLM(config=config)
model.num_parameters()  # 查看参数量

13345312

## 导入训练好的分词器

In [4]:
%%time
tokenizer = BertTokenizerFast.from_pretrained("./tokenizer", max_len=100)
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./text.txt",
    block_size=50,
)

CPU times: user 12.8 s, sys: 40.8 s, total: 53.6 s
Wall time: 3.59 s


In [4]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)


training_args = TrainingArguments(
    output_dir="./textAlbert",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_gpu_train_batch_size=16,
    save_steps=2000,
    save_total_limit=2,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)

In [5]:
%%time
# 启动训练
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Iteration', max=6250, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Iteration', max=6250, style=ProgressStyle(description_width='…



CPU times: user 18min 48s, sys: 53.1 s, total: 19min 41s
Wall time: 13min 32s


TrainOutput(global_step=12500, training_loss=5.10019878900528)