In [1]:
from transformers import AutoTokenizer

# Load Tokenizer

In [2]:
# Name or local path of the checkpoint whose tokenizer is loaded below.
model_path = "Llama-2-70b-chat-hf"

In [3]:
# Load the tokenizer for the checkpoint; the bare name on the last line displays its repr.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer

LlamaTokenizerFast(name_or_path='Llama-2-70b-chat-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [4]:
# List the tokenizer's public attributes and methods (names without a leading underscore).
[attr_name for attr_name in dir(tokenizer) if not attr_name.startswith("_")]

['SPECIAL_TOKENS_ATTRIBUTES',
 'add_bos_token',
 'add_eos_token',
 'add_special_tokens',
 'add_tokens',
 'added_tokens_decoder',
 'added_tokens_encoder',
 'additional_special_tokens',
 'additional_special_tokens_ids',
 'all_special_ids',
 'all_special_tokens',
 'all_special_tokens_extended',
 'apply_chat_template',
 'as_target_tokenizer',
 'backend_tokenizer',
 'batch_decode',
 'batch_encode_plus',
 'bos_token',
 'bos_token_id',
 'build_inputs_with_special_tokens',
 'can_save_slow_tokenizer',
 'chat_template',
 'clean_up_tokenization',
 'clean_up_tokenization_spaces',
 'cls_token',
 'cls_token_id',
 'convert_added_tokens',
 'convert_ids_to_tokens',
 'convert_tokens_to_ids',
 'convert_tokens_to_string',
 'create_token_type_ids_from_sequences',
 'decode',
 'decoder',
 'default_chat_template',
 'encode',
 'encode_plus',
 'eos_token',
 'eos_token_id',
 'from_pretrained',
 'get_added_vocab',
 'get_special_tokens_mask',
 'get_vocab',
 'init_inputs',
 'init_kwargs',
 'is_fast',
 'mask_token',

In [5]:
# Vocabulary ids of all special tokens known to the tokenizer.
tokenizer.all_special_ids

[1, 2, 0]

In [6]:
# String forms of the same special tokens.
tokenizer.all_special_tokens

['<s>', '</s>', '<unk>']

In [7]:
# Beginning-of-sequence token and its id.
tokenizer.bos_token, tokenizer.bos_token_id

('<s>', 1)

In [8]:
# End-of-sequence token and its id.
tokenizer.eos_token, tokenizer.eos_token_id

('</s>', 2)

In [9]:
# Unknown-token placeholder and its id.
tokenizer.unk_token, tokenizer.unk_token_id

('<unk>', 0)

In [10]:
# Padding token and its id; this tokenizer defines none, so both display as None.
tokenizer.pad_token, tokenizer.pad_token_id

(None, None)

In [11]:
# No pad token is defined by default (see the (None, None) result above),
# so reuse EOS as the pad token to make batch padding possible.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token, tokenizer.pad_token_id

('</s>', 2)

In [12]:
# Vocabulary size reported by the tokenizer.
tokenizer.vocab_size

32000

In [13]:
# Sample inputs used throughout: one English sentence and one string of Chinese numerals.
sequences = [
    "The quick brown fox jumps over the lazy dog",
    "零一二三四五六七八九十",
]

# tokenize(sequence) 对文本进行分词

In [14]:
# Passing the whole list yields one flat token list rather than one list per sequence.
# NOTE(review): presumably the two-element list is interpreted as a (text, text_pair) input - confirm.
tokenizer.tokenize(sequences)

['▁The',
 '▁quick',
 '▁brown',
 '▁fo',
 'x',
 '▁j',
 'umps',
 '▁over',
 '▁the',
 '▁lazy',
 '▁dog',
 '▁',
 '<0xE9>',
 '<0x9B>',
 '<0xB6>',
 '一',
 '二',
 '三',
 '四',
 '五',
 '六',
 '七',
 '八',
 '九',
 '十']

In [15]:
# So tokenize a single sequence at a time.
tokens = tokenizer.tokenize(sequences[0])
tokens

['▁The',
 '▁quick',
 '▁brown',
 '▁fo',
 'x',
 '▁j',
 'umps',
 '▁over',
 '▁the',
 '▁lazy',
 '▁dog']

# convert_tokens_to_ids 将分词后的token映射为数字

In [16]:
# Look up the vocabulary id of each token string.
tokenizer.convert_tokens_to_ids(tokens)

[450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203]

# encode = tokenize + convert_tokens_to_ids

In [17]:
# Go straight from text to ids without special tokens; the result matches convert_tokens_to_ids above.
tokenizer.encode(sequences[0], add_special_tokens=False)

[450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203]

In [18]:
# With add_special_tokens=True the BOS id (1) is prepended to the sequence.
ids = tokenizer.encode(sequences[0], add_special_tokens=True)
ids

[1, 450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203]

# batch_encode_plus = batch encode

In [19]:
# Encode the whole batch in one call; padding=True pads the shorter row with the pad id (2)
# and the attention mask marks those padded positions with 0.
# NOTE(review): the transformers docs mark batch_encode_plus as deprecated in favour of
# calling the tokenizer directly - confirm for the installed version.
tokenizer.batch_encode_plus(sequences, add_special_tokens=True, padding=True)

{'input_ids': [[1, 450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203, 2, 2, 2], [1, 29871, 236, 158, 185, 30287, 30685, 30457, 30928, 30904, 31304, 31425, 31044, 31321, 30802]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

# convert_ids_to_tokens 将数字映射为token

In [20]:
# Map ids back to token strings; special tokens such as <s> are kept.
tokenizer.convert_ids_to_tokens(ids)

['<s>',
 '▁The',
 '▁quick',
 '▁brown',
 '▁fo',
 'x',
 '▁j',
 'umps',
 '▁over',
 '▁the',
 '▁lazy',
 '▁dog']

In [21]:
# Same lookup, but with special tokens dropped from the result.
tokens_ = tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
tokens_

['▁The',
 '▁quick',
 '▁brown',
 '▁fo',
 'x',
 '▁j',
 'umps',
 '▁over',
 '▁the',
 '▁lazy',
 '▁dog']

# convert_tokens_to_string 将token转换为string

In [22]:
# Join the token pieces back into a plain text string.
tokenizer.convert_tokens_to_string(tokens_)

'The quick brown fox jumps over the lazy dog'

# decode = convert_ids_to_tokens + convert_tokens_to_string

In [23]:
# Decode ids directly to text; special tokens are kept in the output.
tokenizer.decode(ids)

'<s> The quick brown fox jumps over the lazy dog'

In [24]:
# Decode again, this time dropping special tokens.
tokenizer.decode(ids, skip_special_tokens=True)

'The quick brown fox jumps over the lazy dog'

# batch_decode = batch decode

In [25]:
# batch_decode takes a list of id sequences and returns one string per sequence.
tokenizer.batch_decode([ids])

['<s> The quick brown fox jumps over the lazy dog']

In [26]:
# Batch decoding with special tokens dropped.
tokenizer.batch_decode([ids], skip_special_tokens=True)

['The quick brown fox jumps over the lazy dog']

# tokenizer([sequence])

In [27]:
# Encode the whole batch through the tokenizer's __call__ interface.
inputs = tokenizer(
    sequences,                        # batch of input sentences
    truncation=True,                  # truncate anything longer than max_length
    padding=True,                     # padding strategy: True, 'longest', 'max_length' or 'do_not_pad'
    max_length=2048,                  # maximum length; falls back to the model's maximum length when unset
    add_special_tokens=True,          # add special tokens to the text
    return_length=True,               # also return the length of each encoded row
    return_overflowing_tokens=False,  # when True, the chunks cut off by truncation are returned instead of being discarded
    return_tensors="pt",              # output format: np, pt, tf or jax
)

print(inputs.keys())
print(inputs["input_ids"])       # token ids
print(inputs["attention_mask"])  # 1 for real tokens, 0 for padding
print(inputs["length"])          # length of each row (the padded length in the output below)

dict_keys(['input_ids', 'attention_mask', 'length'])
tensor([[    1,   450,  4996, 17354,  1701, 29916,   432, 17204,   975,   278,
         17366, 11203,     2,     2,     2],
        [    1, 29871,   236,   158,   185, 30287, 30685, 30457, 30928, 30904,
         31304, 31425, 31044, 31321, 30802]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([15, 15])


In [28]:
# Decode the padded batch; padding shows up as trailing </s> because pad_token was set to eos_token.
tokenizer.batch_decode(inputs["input_ids"])

['<s> The quick brown fox jumps over the lazy dog</s></s></s>',
 '<s> 零一二三四五六七八九十']

In [29]:
# skip_special_tokens=True strips both the leading <s> and the </s> padding.
tokenizer.batch_decode(inputs["input_ids"], skip_special_tokens=True)

['The quick brown fox jumps over the lazy dog', '零一二三四五六七八九十']