### 端到端示例

In [3]:
import sentencepiece as spm

# Train a SentencePiece model on VexRiscv.v. With model_prefix=verilog the
# trainer produces verilog.model and verilog.vocab.
# verilog.vocab is only a human-readable reference; it is not used for tokenization.
spm.SentencePieceTrainer.train('--input=VexRiscv.v --model_prefix=verilog --vocab_size=512')

# Create a processor instance and load the freshly trained verilog.model.
sp = spm.SentencePieceProcessor()
sp.load('verilog.model')

# encode: text => pieces / ids
sample_text = 'module Vexriscv'
sample_pieces = sp.encode_as_pieces(sample_text)
sample_ids = sp.encode_as_ids(sample_text)
print(sample_pieces)
print(sample_ids)

# decode: pieces / ids => text
# Decode exactly what the encoder produced instead of hard-coded literals:
# piece strings and ids depend on the trained model, so literals copied from
# one training run go stale as soon as the corpus or vocab size changes.
print(sp.decode_pieces(sample_pieces))
print(sp.decode_ids(sample_ids))

['▁', 'module', '▁', 'V', 'ex', 'r', 'isc', 'v']
[4, 481, 4, 501, 23, 510, 438, 421]
module Vexriscv
module Vexriscv


sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=VexRiscv.v --model_prefix=verilog --vocab_size=512
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: VexRiscv.v
  input_format: 
  model_prefix: verilog
  model_type: UNIGRAM
  vocab_size: 512
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: 

In [6]:
# Report the vocabulary size of the loaded model.
print(f"词表大小={sp.get_piece_size()}")

print(sp.encode_as_ids("reg"))
# id <=> piece conversion
# Look the id up from the piece rather than hard-coding it: the numeric id
# of a piece depends on the trained model.
reg_id = sp.piece_to_id('reg')
print(sp.id_to_piece(reg_id))
print(reg_id)

# Id 0 is reserved for the UNK token (this default can be changed at
# training time); a piece that is not in the vocabulary maps to it.
print(sp.piece_to_id('__MUST_BE_UNKNOWN__'))

# Special symbols <unk>, <s>, </s> take ids 0, 1, 2 by default.
# Print each one together with whether the model reports it as a control symbol.
# The loop variable is deliberately not called `id`, which would shadow the
# builtin id() for every later cell in the notebook.
for token_id in range(3):
    print(sp.id_to_piece(token_id), sp.is_control(token_id))

词表大小=512
[4, 77]
reg
77
0
<unk> False
<s> True
</s> True


In [7]:
# Load a community-trained tokenizer and compare it with the verilog model
# trained in this notebook.

from sentencepiece import SentencePieceProcessor

# Both models go through the same code path so the printed summaries are
# directly comparable. Every value, including unk_id, is re-read from the
# model currently loaded; previously the second summary reused the unk_id
# of the first model.
for model_path in ("../tokenizer.model", "verilog.model"):
    sp_model = SentencePieceProcessor(model_file=model_path)
    print(f"Loaded SentencePiece model from {model_path}")

    # Vocabulary size and special-token ids of the current model.
    n_words: int = sp_model.vocab_size()
    bos_id: int = sp_model.bos_id()
    eos_id: int = sp_model.eos_id()
    pad_id: int = sp_model.pad_id()
    unk_id: int = sp_model.unk_id()
    print(f"#words: {n_words} - BOS ID: {bos_id} - EOS ID: {eos_id} - PAD ID: {pad_id} - UNK ID : {unk_id}")


Loaded SentencePiece model from ../tokenizer.model
#words: 32000 - BOS ID: 1 - EOS ID: 2 - PAD ID: -1 - UNK ID : 0
Loaded SentencePiece model from verilog.model
#words: 512 - BOS ID: 1 - EOS ID: 2 - PAD ID: -1 - UNK ID : 0


### BPE (Byte pair encoding) model

可通过 `--model_type=bpe` 指定模型类型。

In [16]:
import sentencepiece as spm
# Train again on the same corpus, this time selecting the BPE algorithm and a
# 1024-entry vocabulary.
# NOTE(review): model_prefix is the same "verilog" used by the earlier
# training call, so both runs target the same output file names.
spm.SentencePieceTrainer.train('--input=VexRiscv.v --model_prefix=verilog --vocab_size=1024 --model_type=bpe')
sp_bpe = spm.SentencePieceProcessor(model_file='verilog.model')

# A single Verilog statement reused by every call below.
bpe_sample = 'assign ptrDif = (pushPtr_value - popPtr_value);'

print('*** BPE ***')
print(sp_bpe.encode_as_pieces(bpe_sample))
# With this BPE model the n-best request comes back as an empty list.
print(sp_bpe.nbest_encode_as_pieces(bpe_sample, 5))

# encode: text => pieces, then text => ids
print(sp_bpe.encode_as_pieces(bpe_sample))
print(sp_bpe.encode_as_ids(bpe_sample))

# decode: id => text (left disabled, as in the original cell)
# print(sp_bpe.decode_pieces())
# print(sp_bpe.decode_ids())

*** BPE ***
['▁assign', '▁ptr', 'D', 'if', '▁=', '▁(', 'pushPtr', '_', 'value', '▁', '-', '▁popPtr', '_', 'value', ');']
[]
['▁assign', '▁ptr', 'D', 'if', '▁=', '▁(', 'pushPtr', '_', 'value', '▁', '-', '▁popPtr', '_', 'value', ');']
[61, 898, 973, 109, 19, 77, 464, 943, 324, 945, 0, 563, 943, 324, 91]


sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=VexRiscv.v --model_prefix=verilog --vocab_size=1024 --model_type=bpe
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: VexRiscv.v
  input_format: 
  model_prefix: verilog
  model_type: BPE
  vocab_size: 1024
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s