In [1]:
from transformers import AutoTokenizer

# Llama-2-7b-chat-hf

## Load Model

In [164]:
# Identifier handed to AutoTokenizer.from_pretrained in the next cell.
# NOTE(review): presumably a local checkpoint directory relative to the working dir — confirm.
model_path = "Llama-2-7b-chat-hf"

In [165]:
# Load the tokenizer for `model_path`; the bare `tokenizer` expression displays its repr
# (class, vocab size, special tokens and added tokens).
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer

LlamaTokenizerFast(name_or_path='Llama-2-7b-chat-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [166]:
# List the public (non-underscore) attributes and methods of the tokenizer object.
[attr for attr in dir(tokenizer) if not attr.startswith("_")]

['SPECIAL_TOKENS_ATTRIBUTES',
 'add_bos_token',
 'add_eos_token',
 'add_special_tokens',
 'add_tokens',
 'added_tokens_decoder',
 'added_tokens_encoder',
 'additional_special_tokens',
 'additional_special_tokens_ids',
 'all_special_ids',
 'all_special_tokens',
 'all_special_tokens_extended',
 'apply_chat_template',
 'as_target_tokenizer',
 'backend_tokenizer',
 'batch_decode',
 'batch_encode_plus',
 'bos_token',
 'bos_token_id',
 'build_inputs_with_special_tokens',
 'can_save_slow_tokenizer',
 'chat_template',
 'clean_up_tokenization',
 'clean_up_tokenization_spaces',
 'cls_token',
 'cls_token_id',
 'convert_added_tokens',
 'convert_ids_to_tokens',
 'convert_tokens_to_ids',
 'convert_tokens_to_string',
 'create_token_type_ids_from_sequences',
 'decode',
 'decoder',
 'default_chat_template',
 'encode',
 'encode_plus',
 'eos_token',
 'eos_token_id',
 'from_pretrained',
 'get_added_vocab',
 'get_special_tokens_mask',
 'get_vocab',
 'init_inputs',
 'init_kwargs',
 'is_fast',
 'mask_token',

In [167]:
# Integer ids of the tokenizer's special tokens.
tokenizer.all_special_ids

[1, 2, 0]

In [168]:
# String form of the tokenizer's special tokens.
tokenizer.all_special_tokens

['<s>', '</s>', '<unk>']

In [169]:
# Beginning-of-sequence token and its id.
tokenizer.bos_token, tokenizer.bos_token_id

('<s>', 1)

In [170]:
# End-of-sequence token and its id.
tokenizer.eos_token, tokenizer.eos_token_id

('</s>', 2)

In [171]:
# Unknown token and its id.
tokenizer.unk_token, tokenizer.unk_token_id

('<unk>', 0)

In [172]:
# Padding token and its id; both are None here because no pad token is configured yet.
tokenizer.pad_token, tokenizer.pad_token_id

(None, None)

In [173]:
# No pad token is configured (previous cell shows None), so reuse the EOS token as
# the padding token; the padded batch calls later in the notebook rely on this.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token, tokenizer.pad_token_id

('</s>', 2)

In [174]:
# Vocabulary size reported by the tokenizer.
tokenizer.vocab_size

32000

In [175]:
# Two sample inputs used throughout this section: an English sentence and a
# string of Chinese numerals, to compare how each script is tokenized.
sequences = [
    "The quick brown fox jumps over the lazy dog",
    "零一二三四五六七八九十",
]

## tokenize(sequence) 对文本进行分词

In [176]:
# tokenize() flattens nested input: passing the whole list yields one flat token
# list with no boundary between the two sentences (see output), so a single
# string should be passed instead, as the following cells do.
tokenizer.tokenize(sequences)

['▁The',
 '▁quick',
 '▁brown',
 '▁fo',
 'x',
 '▁j',
 'umps',
 '▁over',
 '▁the',
 '▁lazy',
 '▁dog',
 '▁',
 '<0xE9>',
 '<0x9B>',
 '<0xB6>',
 '一',
 '二',
 '三',
 '四',
 '五',
 '六',
 '七',
 '八',
 '九',
 '十']

In [177]:
# tokenize() flattens nested input, so tokenize one string at a time.
tokens = tokenizer.tokenize(sequences[0])
tokens

['▁The',
 '▁quick',
 '▁brown',
 '▁fo',
 'x',
 '▁j',
 'umps',
 '▁over',
 '▁the',
 '▁lazy',
 '▁dog']

In [178]:
# Tokenize the Chinese sample: the first character is emitted as three byte tokens
# (<0xE9> <0x9B> <0xB6>), while each remaining numeral maps to a single token.
tokenizer.tokenize(sequences[1])

['▁',
 '<0xE9>',
 '<0x9B>',
 '<0xB6>',
 '一',
 '二',
 '三',
 '四',
 '五',
 '六',
 '七',
 '八',
 '九',
 '十']

In [179]:
# Tokenize the first Chinese character on its own: the result is a '▁' prefix
# followed by three byte tokens.
tokenizer.tokenize("零")

['▁', '<0xE9>', '<0x9B>', '<0xB6>']

## convert_tokens_to_ids 将分词后的token映射为数字

In [180]:
# Map each token string in `tokens` to its integer id.
tokenizer.convert_tokens_to_ids(tokens)

[450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203]

## encode = tokenize + convert_tokens_to_ids

In [183]:
# encode() flattens nested input, so encode one string at a time.
# With add_special_tokens=False the result equals convert_tokens_to_ids(tokens) above.
tokenizer.encode(sequences[0], add_special_tokens=False)

[450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203]

In [184]:
# With add_special_tokens=True the BOS id (1) is prepended; the output shows no EOS id appended.
ids = tokenizer.encode(sequences[0], add_special_tokens=True)
ids

[1, 450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203]

## batch_encode_plus = batch encode

In [185]:
# Encode the whole batch; padding=True pads the shorter row up to the longest one with
# pad_token_id (2, set earlier) and marks the padded positions with 0 in attention_mask.
tokenizer.batch_encode_plus(sequences, add_special_tokens=True, padding=True)

{'input_ids': [[1, 450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203, 2, 2, 2], [1, 29871, 236, 158, 185, 30287, 30685, 30457, 30928, 30904, 31304, 31425, 31044, 31321, 30802]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

## convert_ids_to_tokens 将数字映射为token

In [190]:
# Map ids back to token strings; the special BOS token '<s>' is kept.
tokenizer.convert_ids_to_tokens(ids)

['<s>',
 '▁The',
 '▁quick',
 '▁brown',
 '▁fo',
 'x',
 '▁j',
 'umps',
 '▁over',
 '▁the',
 '▁lazy',
 '▁dog']

In [191]:
# skip_special_tokens=True drops special tokens (here the leading '<s>').
tokens_ = tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
tokens_

['▁The',
 '▁quick',
 '▁brown',
 '▁fo',
 'x',
 '▁j',
 'umps',
 '▁over',
 '▁the',
 '▁lazy',
 '▁dog']

## convert_tokens_to_string 将token转换为string

In [192]:
# Join the token strings in `tokens_` back into a plain-text sentence.
tokenizer.convert_tokens_to_string(tokens_)

'The quick brown fox jumps over the lazy dog'

## decode = convert_ids_to_tokens + convert_tokens_to_string

In [197]:
# Input is one-dimensional: a single list of ids. Special tokens are kept in the text.
tokenizer.decode(ids)

'<s> The quick brown fox jumps over the lazy dog'

In [198]:
# skip_special_tokens=True removes the leading '<s>' from the decoded text.
tokenizer.decode(ids, skip_special_tokens=True)

'The quick brown fox jumps over the lazy dog'

## batch_decode = batch decode

In [199]:
# Input is two-dimensional: a list of id lists. One decoded string is returned per row.
tokenizer.batch_decode([ids])

['<s> The quick brown fox jumps over the lazy dog']

In [200]:
# Same as above, with special tokens stripped from every decoded row.
tokenizer.batch_decode([ids], skip_special_tokens=True)

['The quick brown fox jumps over the lazy dog']

## tokenizer([sequence])

In [201]:
# Call the tokenizer object directly on the whole batch; each keyword is annotated.
inputs = tokenizer(
    sequences,                        # batch of input sentences
    truncation=True,                  # truncate anything longer than max_length
    padding=True,                     # padding strategy: one of [True, 'longest', 'max_length', 'do_not_pad']
    max_length=2048,                  # maximum length; when unset, the model's maximum length is used
    add_special_tokens=True,          # add the model's special tokens to the text
    return_length=True,               # add a 'length' entry per row (the output shows the padded length, 15)
    return_overflowing_tokens=False,  # True would also return the chunks that truncation cuts off
    return_tensors="pt",              # output container format: np, pt, tf or jax
)

# One item per line: the available keys, then each returned field.
print(
    inputs.keys(),
    inputs["input_ids"],       # token ids
    inputs["attention_mask"],  # 1 = real token, 0 = padding
    inputs["length"],          # length of each returned row
    sep="\n",
)

dict_keys(['input_ids', 'attention_mask', 'length'])
tensor([[    1,   450,  4996, 17354,  1701, 29916,   432, 17204,   975,   278,
         17366, 11203,     2,     2,     2],
        [    1, 29871,   236,   158,   185, 30287, 30685, 30457, 30928, 30904,
         31304, 31425, 31044, 31321, 30802]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([15, 15])


In [204]:
# Decode each row of the padded batch separately, stripping BOS and the EOS padding.
(
    tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True),
    tokenizer.decode(inputs["input_ids"][1], skip_special_tokens=True),
)

('The quick brown fox jumps over the lazy dog', '零一二三四五六七八九十')

In [203]:
# Decode every row of the batch in one call; special tokens (BOS and the EOS padding) are stripped.
tokenizer.batch_decode(inputs["input_ids"], skip_special_tokens=True)

['The quick brown fox jumps over the lazy dog', '零一二三四五六七八九十']

# Meta-Llama-3-8B-Instruct

## Load Model

In [205]:
# Rebinds `model_path` for the second model compared in this notebook.
# NOTE(review): presumably a local checkpoint directory relative to the working dir — confirm.
model_path = "Meta-Llama-3-8B-Instruct"

In [206]:
# Rebinds `tokenizer` to the Llama-3 tokenizer (replacing the Llama-2 one); the bare
# expression displays its repr. The output shows a PreTrainedTokenizerFast instance.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


PreTrainedTokenizerFast(name_or_path='Meta-Llama-3-8B-Instruct', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|begin_of_text|>', 'eos_token': '<|end_of_text|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	128000: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128002: AddedToken("<|reserved_special_token_0|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128003: AddedToken("<|reserved_special_token_1|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128004: AddedToken("<|reserved_special_token_2|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128005: AddedToken("<|res

In [207]:
# List the public (non-underscore) attributes and methods of the tokenizer object.
[attr for attr in dir(tokenizer) if not attr.startswith("_")]

['SPECIAL_TOKENS_ATTRIBUTES',
 'add_special_tokens',
 'add_tokens',
 'added_tokens_decoder',
 'added_tokens_encoder',
 'additional_special_tokens',
 'additional_special_tokens_ids',
 'all_special_ids',
 'all_special_tokens',
 'all_special_tokens_extended',
 'apply_chat_template',
 'as_target_tokenizer',
 'backend_tokenizer',
 'batch_decode',
 'batch_encode_plus',
 'bos_token',
 'bos_token_id',
 'build_inputs_with_special_tokens',
 'can_save_slow_tokenizer',
 'chat_template',
 'clean_up_tokenization',
 'clean_up_tokenization_spaces',
 'cls_token',
 'cls_token_id',
 'convert_added_tokens',
 'convert_ids_to_tokens',
 'convert_tokens_to_ids',
 'convert_tokens_to_string',
 'create_token_type_ids_from_sequences',
 'decode',
 'decoder',
 'default_chat_template',
 'encode',
 'encode_plus',
 'eos_token',
 'eos_token_id',
 'from_pretrained',
 'get_added_vocab',
 'get_special_tokens_mask',
 'get_vocab',
 'init_inputs',
 'init_kwargs',
 'is_fast',
 'mask_token',
 'mask_token_id',
 'max_len_sentenc

In [208]:
# Integer ids of the tokenizer's special tokens.
tokenizer.all_special_ids

[128000, 128001]

In [209]:
# String form of the tokenizer's special tokens.
tokenizer.all_special_tokens

['<|begin_of_text|>', '<|end_of_text|>']

In [210]:
# Beginning-of-sequence token and its id.
tokenizer.bos_token, tokenizer.bos_token_id

('<|begin_of_text|>', 128000)

In [211]:
# End-of-sequence token and its id.
tokenizer.eos_token, tokenizer.eos_token_id

('<|end_of_text|>', 128001)

In [212]:
# Unknown token and its id; both are None, so this tokenizer defines no unknown token.
tokenizer.unk_token, tokenizer.unk_token_id

(None, None)

In [213]:
# Padding token and its id; both are None here because no pad token is configured yet.
tokenizer.pad_token, tokenizer.pad_token_id

(None, None)

In [214]:
# No pad token is configured (previous cell shows None), so reuse the EOS token as
# the padding token; the padded batch calls later in the notebook rely on this.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token, tokenizer.pad_token_id

('<|end_of_text|>', 128001)

In [215]:
# Vocabulary size reported by the tokenizer (128000). The added special tokens in the
# repr above start at id 128000, so they are not included in this count.
tokenizer.vocab_size

128000

In [216]:
# Same two sample inputs as in the Llama-2 section: an English sentence and a
# string of Chinese numerals.
sequences = [
    "The quick brown fox jumps over the lazy dog",
    "零一二三四五六七八九十",
]

## tokenize(sequence) 对文本进行分词

In [217]:
# tokenize() flattens nested input: passing the whole list yields one flat token
# list with no boundary between the two sentences (see output), so a single
# string should be passed instead, as the following cells do.
tokenizer.tokenize(sequences)

['The',
 'Ġquick',
 'Ġbrown',
 'Ġfox',
 'Ġjumps',
 'Ġover',
 'Ġthe',
 'Ġlazy',
 'Ġdog',
 'éĽ¶',
 'ä¸Ģ',
 'äºĮ',
 'ä¸ī',
 'åĽĽ',
 'äºĶ',
 'åħŃ',
 'ä¸ĥ',
 'åħ«',
 'ä¹Ŀ',
 'åįģ']

In [218]:
# tokenize() flattens nested input, so tokenize one string at a time.
tokens = tokenizer.tokenize(sequences[0])
tokens

['The', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġjumps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog']

In [219]:
# Tokenize the Chinese sample: the output has one token per numeral (11 tokens for
# 11 characters), displayed in an encoded form rather than as the original characters.
tokenizer.tokenize(sequences[1])

['éĽ¶', 'ä¸Ģ', 'äºĮ', 'ä¸ī', 'åĽĽ', 'äºĶ', 'åħŃ', 'ä¸ĥ', 'åħ«', 'ä¹Ŀ', 'åįģ']

In [220]:
# Tokenize the first Chinese character on its own: it maps to a single token here.
tokenizer.tokenize("零")

['éĽ¶']

## convert_tokens_to_ids 将分词后的token映射为数字

In [221]:
# Map each token string in `tokens` to its integer id.
tokenizer.convert_tokens_to_ids(tokens)

[791, 4062, 14198, 39935, 35308, 927, 279, 16053, 5679]

## encode = tokenize + convert_tokens_to_ids

In [222]:
# encode() flattens nested input, so encode one string at a time.
# With add_special_tokens=False the result equals convert_tokens_to_ids(tokens) above.
tokenizer.encode(sequences[0], add_special_tokens=False)

[791, 4062, 14198, 39935, 35308, 927, 279, 16053, 5679]

In [223]:
# With add_special_tokens=True the BOS id (128000) is prepended; the output shows no EOS id appended.
ids = tokenizer.encode(sequences[0], add_special_tokens=True)
ids

[128000, 791, 4062, 14198, 39935, 35308, 927, 279, 16053, 5679]

## batch_encode_plus = batch encode

In [224]:
# Encode the whole batch; padding=True pads the shorter row up to the longest one with
# pad_token_id (128001, set earlier) and marks the padded positions with 0 in attention_mask.
tokenizer.batch_encode_plus(sequences, add_special_tokens=True, padding=True)

{'input_ids': [[128000, 791, 4062, 14198, 39935, 35308, 927, 279, 16053, 5679, 128001, 128001], [128000, 110260, 15120, 41920, 46091, 64803, 76208, 103070, 103305, 102397, 103178, 95598]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

## convert_ids_to_tokens 将数字映射为token

In [225]:
# Map ids back to token strings; the special BOS token '<|begin_of_text|>' is kept.
tokenizer.convert_ids_to_tokens(ids)

['<|begin_of_text|>',
 'The',
 'Ġquick',
 'Ġbrown',
 'Ġfox',
 'Ġjumps',
 'Ġover',
 'Ġthe',
 'Ġlazy',
 'Ġdog']

In [226]:
# skip_special_tokens=True drops special tokens (here the leading '<|begin_of_text|>').
tokens_ = tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
tokens_

['The', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġjumps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog']

## convert_tokens_to_string 将token转换为string

In [227]:
# Join the token strings in `tokens_` back into a plain-text sentence.
tokenizer.convert_tokens_to_string(tokens_)

'The quick brown fox jumps over the lazy dog'

## decode = convert_ids_to_tokens + convert_tokens_to_string

In [228]:
# Input is one-dimensional: a single list of ids. Special tokens are kept in the text.
tokenizer.decode(ids)

'<|begin_of_text|>The quick brown fox jumps over the lazy dog'

In [229]:
# skip_special_tokens=True removes the leading '<|begin_of_text|>' from the decoded text.
tokenizer.decode(ids, skip_special_tokens=True)

'The quick brown fox jumps over the lazy dog'

## batch_decode = batch decode

In [230]:
# Input is two-dimensional: a list of id lists. One decoded string is returned per row.
tokenizer.batch_decode([ids])

['<|begin_of_text|>The quick brown fox jumps over the lazy dog']

In [231]:
# Same as above, with special tokens stripped from every decoded row.
tokenizer.batch_decode([ids], skip_special_tokens=True)

['The quick brown fox jumps over the lazy dog']

## tokenizer([sequence])

In [232]:
# Call the tokenizer object directly on the whole batch; each keyword is annotated.
inputs = tokenizer(
    sequences,                        # batch of input sentences
    truncation=True,                  # truncate anything longer than max_length
    padding=True,                     # padding strategy: one of [True, 'longest', 'max_length', 'do_not_pad']
    max_length=2048,                  # maximum length; when unset, the model's maximum length is used
    add_special_tokens=True,          # add the model's special tokens to the text
    return_length=True,               # add a 'length' entry per row (the output shows the padded length, 12)
    return_overflowing_tokens=False,  # True would also return the chunks that truncation cuts off
    return_tensors="pt",              # output container format: np, pt, tf or jax
)

# One item per line: the available keys, then each returned field.
print(
    inputs.keys(),
    inputs["input_ids"],       # token ids
    inputs["attention_mask"],  # 1 = real token, 0 = padding
    inputs["length"],          # length of each returned row
    sep="\n",
)

dict_keys(['input_ids', 'attention_mask', 'length'])
tensor([[128000,    791,   4062,  14198,  39935,  35308,    927,    279,  16053,
           5679, 128001, 128001],
        [128000, 110260,  15120,  41920,  46091,  64803,  76208, 103070, 103305,
         102397, 103178,  95598]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([12, 12])


In [233]:
# Decode each row of the padded batch separately, stripping BOS and the EOS padding.
(
    tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True),
    tokenizer.decode(inputs["input_ids"][1], skip_special_tokens=True),
)

('The quick brown fox jumps over the lazy dog', '零一二三四五六七八九十')

In [234]:
# Decode every row of the batch in one call; special tokens (BOS and the EOS padding) are stripped.
tokenizer.batch_decode(inputs["input_ids"], skip_special_tokens=True)

['The quick brown fox jumps over the lazy dog', '零一二三四五六七八九十']