In [19]:
import sentencepiece as spm

In [20]:
# List every attribute of the sentencepiece module except dunder names.
[name for name in dir(spm) if not name.startswith("__")]

['BytesIO',
 'ImmutableNBestSentencePieceText',
 'ImmutableSentencePieceText',
 'ImmutableSentencePieceText_ImmutableSentencePiece',
 'SentencePieceProcessor',
 'SentencePieceTrainer',
 'SentencePieceTrainer__TrainFromMap',
 'SentencePieceTrainer__TrainFromMap2',
 'SentencePieceTrainer__TrainFromMap3',
 'SentencePieceTrainer__TrainFromMap4',
 'SentencePieceTrainer__TrainFromString',
 'SetRandomGeneratorSeed',
 'StringIO',
 '_LogStream',
 '_SwigNonDynamicMeta',
 '_add_snake_case',
 '_batchnize',
 '_sentencepiece',
 '_sentencepiece_processor_init_native',
 '_swig_add_metaclass',
 '_swig_python_version_info',
 '_swig_repr',
 '_swig_setattr_nondynamic_class_variable',
 '_swig_setattr_nondynamic_instance_variable',
 '_version',
 'csv',
 'm',
 'os',
 're',
 'set_random_generator_seed',
 'sys']

# Load Model

In [21]:
# Relative path to the SentencePiece model file; it is handed to
# SentencePieceProcessor(model_file=...) when the tokenizer is loaded.
model_path = "Llama-2-70b-chat-hf/tokenizer.model"

In [22]:
# Build the processor directly from the model file at `model_path`.
sp_model = spm.SentencePieceProcessor(model_file=model_path)
# Bare expression so the notebook displays the object's repr.
sp_model

<sentencepiece.SentencePieceProcessor; proxy of <Swig Object of type 'sentencepiece::SentencePieceProcessor *' at 0x000001EF48DDCBD0> >

In [38]:
# List the processor's attributes, skipping any name that starts with an underscore.
[attr for attr in dir(sp_model) if not attr.startswith("_")]

['CalculateEntropy',
 'Decode',
 'DecodeIds',
 'DecodeIdsAsImmutableProto',
 'DecodeIdsAsSerializedProto',
 'DecodePieces',
 'DecodePiecesAsImmutableProto',
 'DecodePiecesAsSerializedProto',
 'Detokenize',
 'Encode',
 'EncodeAsIds',
 'EncodeAsImmutableProto',
 'EncodeAsPieces',
 'EncodeAsSerializedProto',
 'GetPieceSize',
 'GetScore',
 'IdToPiece',
 'Init',
 'IsByte',
 'IsControl',
 'IsUnknown',
 'IsUnused',
 'Load',
 'LoadFromFile',
 'LoadFromSerializedProto',
 'LoadVocabulary',
 'NBestEncode',
 'NBestEncodeAsIds',
 'NBestEncodeAsImmutableProto',
 'NBestEncodeAsPieces',
 'NBestEncodeAsSerializedProto',
 'PieceToId',
 'ResetVocabulary',
 'SampleEncodeAndScore',
 'SampleEncodeAndScoreAsIds',
 'SampleEncodeAndScoreAsImmutableProto',
 'SampleEncodeAndScoreAsPieces',
 'SampleEncodeAndScoreAsSerializedProto',
 'SampleEncodeAsIds',
 'SampleEncodeAsImmutableProto',
 'SampleEncodeAsPieces',
 'SampleEncodeAsSerializedProto',
 'SetDecodeExtraOptions',
 'SetEncodeExtraOptions',
 'SetVocabulary',


In [24]:
# Special-token ids, in order: BOS, EOS, PAD, UNK.
# NOTE(review): the recorded output shows -1 for PAD — presumably this model
# defines no padding token; confirm against the sentencepiece docs.
sp_model.bos_id(), sp_model.eos_id(), sp_model.pad_id(), sp_model.unk_id()

(1, 2, -1, 0)

In [37]:
# Decode the ids 1, 2 and 0 one at a time (reported as BOS, EOS and UNK by the
# *_id() calls) to see what text each special token produces.
sp_model.Decode([1]), sp_model.Decode([2]), sp_model.Decode([0])

('', '', ' ⁇ ')

In [26]:
# Number of pieces in the loaded vocabulary.
sp_model.vocab_size()

32000

In [27]:
# snake_case and CamelCase spellings of the piece-count query, called side by
# side to compare their results.
sp_model.get_piece_size(), sp_model.GetPieceSize()

(32000, 32000)

# Encode / Tokenize

In [28]:
# Sample batch used by the encode/tokenize cells: one English sentence and one
# string of Chinese numerals.
inputs = [
    "The quick brown fox jumps over the lazy dog",
    "一二三四五六七八九十",
]

In [40]:
# Tokenize the whole batch at once (snake_case spelling); the result is one
# list of ids per input string.
sp_model.tokenize(input=inputs)

[[450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203],
 [29871, 30287, 30685, 30457, 30928, 30904, 31304, 31425, 31044, 31321, 30802]]

In [41]:
# Same batch through the CamelCase spelling, for comparison with tokenize().
sp_model.Tokenize(input=inputs)

[[450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203],
 [29871, 30287, 30685, 30457, 30928, 30904, 31304, 31425, 31044, 31321, 30802]]

In [29]:
# Encode the batch with the snake_case method and keep the result in `ids`.
ids = sp_model.encode(input=inputs)
# Bare expression so the notebook displays the id lists.
ids

[[450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203],
 [29871, 30287, 30685, 30457, 30928, 30904, 31304, 31425, 31044, 31321, 30802]]

In [30]:
# Encode the batch with the CamelCase method; this rebinds `ids`, which the
# decode cells read.
ids = sp_model.Encode(input=inputs)
# Bare expression so the notebook displays the id lists.
ids

[[450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203],
 [29871, 30287, 30685, 30457, 30928, 30904, 31304, 31425, 31044, 31321, 30802]]

In [31]:
# Calls Encode under the hood.
sp_model.encode_as_ids(input=inputs)

[[450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203],
 [29871, 30287, 30685, 30457, 30928, 30904, 31304, 31425, 31044, 31321, 30802]]

In [32]:
# Calls Encode under the hood.
sp_model.EncodeAsIds(input=inputs)

[[450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203],
 [29871, 30287, 30685, 30457, 30928, 30904, 31304, 31425, 31044, 31321, 30802]]

# Decode

In [33]:
# Turn the batch of id lists in `ids` back into text (snake_case spelling).
sp_model.decode(ids)

['The quick brown fox jumps over the lazy dog', '一二三四五六七八九十']

In [34]:
# Same decode through the CamelCase spelling, for comparison with decode().
sp_model.Decode(ids)

['The quick brown fox jumps over the lazy dog', '一二三四五六七八九十']

In [35]:
# Calls Decode under the hood.
sp_model.decode_ids(ids)

['The quick brown fox jumps over the lazy dog', '一二三四五六七八九十']

In [36]:
# Calls Decode under the hood.
sp_model.DecodeIds(ids)

['The quick brown fox jumps over the lazy dog', '一二三四五六七八九十']