In [28]:
import sentencepiece as spm

In [29]:
# List everything the sentencepiece module exposes, hiding only dunder names
# (single-underscore helpers are intentionally kept in the listing).
[name for name in dir(spm) if not name.startswith("__")]

['BytesIO',
 'ImmutableNBestSentencePieceText',
 'ImmutableSentencePieceText',
 'ImmutableSentencePieceText_ImmutableSentencePiece',
 'SentencePieceNormalizer',
 'SentencePieceProcessor',
 'SentencePieceTrainer',
 'SetMinLogLevel',
 'SetRandomGeneratorSeed',
 'StringIO',
 '_LogStream',
 '_SwigNonDynamicMeta',
 '_add_snake_case',
 '_batchnize',
 '_sentencepiece',
 '_sentencepiece_normalizer_init_native',
 '_sentencepiece_processor_init_native',
 '_swig_add_metaclass',
 '_swig_python_version_info',
 '_swig_repr',
 '_swig_setattr_nondynamic_class_variable',
 '_swig_setattr_nondynamic_instance_variable',
 '_version',
 'csv',
 'm',
 'os',
 're',
 'set_min_log_level',
 'set_random_generator_seed',
 'sys']

# Load Model

In [30]:
# Location of the tokenizer model file, given relative to the working directory;
# it is handed to SentencePieceProcessor in the next cell.
model_path = "Llama-2-7b-chat-hf/tokenizer.model"

In [31]:
# Build a processor from the model file; the bare name on the last line makes
# the notebook display the object's repr.
sp_model = spm.SentencePieceProcessor(model_file=model_path)
sp_model

<sentencepiece.SentencePieceProcessor; proxy of <Swig Object of type 'sentencepiece::SentencePieceProcessor *' at 0x000002101A136580> >

In [32]:
# List the public attributes of the processor instance (anything starting
# with an underscore is filtered out).
[attr for attr in dir(sp_model) if not attr.startswith("_")]

['CalculateEntropy',
 'Decode',
 'DecodeIds',
 'DecodeIdsAsImmutableProto',
 'DecodeIdsAsSerializedProto',
 'DecodePieces',
 'DecodePiecesAsImmutableProto',
 'DecodePiecesAsSerializedProto',
 'Detokenize',
 'Encode',
 'EncodeAsIds',
 'EncodeAsImmutableProto',
 'EncodeAsPieces',
 'EncodeAsSerializedProto',
 'GetPieceSize',
 'GetScore',
 'IdToPiece',
 'Init',
 'IsByte',
 'IsControl',
 'IsUnknown',
 'IsUnused',
 'Load',
 'LoadFromFile',
 'LoadFromSerializedProto',
 'LoadVocabulary',
 'NBestEncode',
 'NBestEncodeAsIds',
 'NBestEncodeAsImmutableProto',
 'NBestEncodeAsPieces',
 'NBestEncodeAsSerializedProto',
 'Normalize',
 'OverrideNormalizerSpec',
 'PieceToId',
 'ResetVocabulary',
 'SampleEncodeAndScore',
 'SampleEncodeAndScoreAsIds',
 'SampleEncodeAndScoreAsImmutableProto',
 'SampleEncodeAndScoreAsPieces',
 'SampleEncodeAndScoreAsSerializedProto',
 'SampleEncodeAsIds',
 'SampleEncodeAsImmutableProto',
 'SampleEncodeAsPieces',
 'SampleEncodeAsSerializedProto',
 'SetDecodeExtraOptions',
 'S

In [33]:
# Ids of the special tokens, shown as a (bos, eos, pad, unk) tuple.
(
    sp_model.bos_id(),
    sp_model.eos_id(),
    sp_model.pad_id(),
    sp_model.unk_id(),
)

(1, 2, -1, 0)

In [34]:
# Decode each special token on its own, in the order (bos, eos, unk).
# The ids are looked up from the model instead of being hard-coded as 1 / 2 / 0,
# so the cell documents itself and stays correct if a different model is loaded.
(
    sp_model.Decode([sp_model.bos_id()]),
    sp_model.Decode([sp_model.eos_id()]),
    sp_model.Decode([sp_model.unk_id()]),
)

('', '', ' ⁇ ')

In [35]:
# Vocabulary size of the loaded model.
sp_model.vocab_size()

32000

In [36]:
# snake_case and CamelCase spellings side by side; both report the same size
# for this model, and it matches vocab_size().
sp_model.get_piece_size(), sp_model.GetPieceSize()

(32000, 32000)

In [37]:
# Sample batch used by the Encode / Decode examples: an English pangram and a
# string of Chinese numerals (zero through ten).
inputs = [
    "The quick brown fox jumps over the lazy dog",
    "零一二三四五六七八九十",
]

# Encode / Tokenize

In [38]:
# Print the docstring of Encode to see the keyword arguments it accepts.
help(sp_model.Encode)

Help on method Encode in module sentencepiece:

Encode(input, out_type=None, add_bos=None, add_eos=None, reverse=None, emit_unk_piece=None, enable_sampling=None, nbest_size=None, alpha=None, num_threads=None) method of sentencepiece.SentencePieceProcessor instance
    Encode text input to segmented ids or tokens.
    
    Args:
    input: input string. accepsts list of string.
    out_type: output type. int or str.
    add_bos: Add <s> to the result (Default = false)
    add_eos: Add </s> to the result (Default = false) <s>/</s> is added after
             reversing (if enabled).
    reverse: Reverses the tokenized sequence (Default = false)
    emit_unk_piece: Emits the unk literal string (Default = false)
    nbest_size: sampling parameters for unigram. Invalid in BPE-Dropout.
                nbest_size = {0,1}: No sampling is performed.
                nbest_size > 1: samples from the nbest_size results.
                nbest_size < 0: assuming that nbest_size is infinite and sample

In [39]:
# Encode the whole batch to token ids, asking for the <s> / </s> markers to be
# added around each sequence. The result is kept in `ids` for the Decode cells.
ids = sp_model.Encode(
    input=inputs,
    out_type=int,
    add_bos=True,
    add_eos=True,
)
ids

[[1, 450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203, 2],
 [1,
  29871,
  236,
  158,
  185,
  30287,
  30685,
  30457,
  30928,
  30904,
  31304,
  31425,
  31044,
  31321,
  30802,
  2]]

In [40]:
# Encode the batch with out_type=str to get piece strings instead of ids.
# Note in the output that the first character of the second input comes out
# as three <0x..> byte pieces rather than a single piece.
sp_model.Encode(
    input=inputs,
    out_type=str,
    add_bos=True,
    add_eos=True,
)

[['<s>',
  '▁The',
  '▁quick',
  '▁brown',
  '▁fo',
  'x',
  '▁j',
  'umps',
  '▁over',
  '▁the',
  '▁lazy',
  '▁dog',
  '</s>'],
 ['<s>',
  '▁',
  '<0xE9>',
  '<0x9B>',
  '<0xB6>',
  '一',
  '二',
  '三',
  '四',
  '五',
  '六',
  '七',
  '八',
  '九',
  '十',
  '</s>']]

In [41]:
# Tokenize is called with the same keyword arguments as Encode; the ids it
# returns here are identical to the Encode(out_type=int) result.
sp_model.Tokenize(
    input=inputs,
    out_type=int,
    add_bos=True,
    add_eos=True,
)

[[1, 450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203, 2],
 [1,
  29871,
  236,
  158,
  185,
  30287,
  30685,
  30457,
  30928,
  30904,
  31304,
  31425,
  31044,
  31321,
  30802,
  2]]

In [42]:
# EncodeAsIds calls Encode under the hood and returns ids.
# No bos/eos flags are passed here, so the output has no leading 1 / trailing 2.
sp_model.EncodeAsIds(input=inputs)

[[450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203],
 [29871,
  236,
  158,
  185,
  30287,
  30685,
  30457,
  30928,
  30904,
  31304,
  31425,
  31044,
  31321,
  30802]]

In [43]:
# EncodeAsPieces calls Encode under the hood and returns piece strings.
# No bos/eos flags are passed here, so the output has no <s> / </s> pieces.
sp_model.EncodeAsPieces(input=inputs)

[['▁The',
  '▁quick',
  '▁brown',
  '▁fo',
  'x',
  '▁j',
  'umps',
  '▁over',
  '▁the',
  '▁lazy',
  '▁dog'],
 ['▁',
  '<0xE9>',
  '<0x9B>',
  '<0xB6>',
  '一',
  '二',
  '三',
  '四',
  '五',
  '六',
  '七',
  '八',
  '九',
  '十']]

# Decode

In [44]:
# Print the docstring of Decode to see the keyword arguments it accepts.
help(sp_model.Decode)

Help on method Decode in module sentencepiece:

Decode(input, out_type=<class 'str'>, num_threads=None) method of sentencepiece.SentencePieceProcessor instance
    Decode processed id or token sequences.
    
    Args:
      out_type: output type. str, bytes or 'serialized_proto' or 'immutable_proto' (Default = str)
      num_threads: the number of threads used in the batch processing (Default = -1).



In [48]:
# Re-display the ids produced by the Encode cell; they are the input to the
# Decode calls in this section.
ids

[[1, 450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203, 2],
 [1,
  29871,
  236,
  158,
  185,
  30287,
  30685,
  30457,
  30928,
  30904,
  31304,
  31425,
  31044,
  31321,
  30802,
  2]]

In [52]:
# Special tokens are dropped by default: decoding only <s> and </s> yields an
# empty string. The ids are looked up from the model instead of being
# hard-coded as 1 and 2.
sp_model.Decode([sp_model.bos_id(), sp_model.eos_id()])

''

In [45]:
# Decode takes the whole batch (a list of id lists) and returns one string per
# input; the bos/eos ids present in `ids` do not appear in the text.
sp_model.Decode(ids)

['The quick brown fox jumps over the lazy dog', '零一二三四五六七八九十']

In [46]:
# DecodeIds calls Decode under the hood and returns str.
sp_model.DecodeIds(ids)

['The quick brown fox jumps over the lazy dog', '零一二三四五六七八九十']

In [47]:
# DecodePieces calls Decode under the hood and returns str.
# Round trip: the pieces from EncodeAsPieces decode back to the original inputs.
sp_model.DecodePieces(sp_model.EncodeAsPieces(input=inputs))

['The quick brown fox jumps over the lazy dog', '零一二三四五六七八九十']