In [1]:
import torch
from transformers import CLIPTokenizer, CLIPTokenizerFast
from transformers import AutoTokenizer

In [2]:
# Use the first CUDA GPU when torch reports one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [3]:
# Sample sentence that the tokenizer calls below operate on.
sequence = "The quick brown fox jumps over the lazy dog."
# Checkpoint identifier passed to every from_pretrained() call below.
version = "openai/clip-vit-base-patch32"
# Not passed to any tokenizer call in the cells shown here; it appears only in
# the commented-out `max_length=` arguments.
max_length = 20

# CLIPTokenizer

In [4]:
# Load CLIPTokenizer from the checkpoint named by `version`.
# The bare name on the last line displays the tokenizer's repr as the cell output.
tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(version)
tokenizer

CLIPTokenizer(name_or_path='openai/clip-vit-base-patch32', vocab_size=49408, model_max_length=77, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

## special ids and tokens

In [7]:
# Show the special-token ids first and the matching token strings on the next line.
print(tokenizer.all_special_ids, tokenizer.all_special_tokens, sep="\n")

[49406, 49407, 49407]
['<|startoftext|>', '<|endoftext|>', '<|endoftext|>']


## tokenize(sequence) 对文本进行分词

In [7]:
# Split the sentence into a list of token strings (no ids yet).
tokens = tokenizer.tokenize(sequence)
tokens

['the</w>',
 'quick</w>',
 'brown</w>',
 'fox</w>',
 'jumps</w>',
 'over</w>',
 'the</w>',
 'lazy</w>',
 'dog</w>',
 '.</w>']

## convert_tokens_to_ids 将分词后的token映射为数字

In [8]:
# Map each token string produced by tokenize() to its integer vocabulary id.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[518, 3712, 2866, 3240, 18911, 962, 518, 10753, 1929, 269]

## encode = tokenize + convert_tokens_to_ids

In [9]:
# encode() performs tokenize + convert_tokens_to_ids in one call.
# Printed first without special tokens, for comparison with the ids computed step by step.
print(tokenizer.encode(sequence, add_special_tokens=False))
# With add_special_tokens=True the recorded output gains id 49406 at the front and 49407 at the end.
ids = tokenizer.encode(sequence, add_special_tokens=True)
ids

[518, 3712, 2866, 3240, 18911, 962, 518, 10753, 1929, 269]


[49406, 518, 3712, 2866, 3240, 18911, 962, 518, 10753, 1929, 269, 49407]

In [10]:
# The ids that encode() puts at the front and back mark the start and end of the text.
tokenizer.convert_ids_to_tokens([49406, 49407])

['<|startoftext|>', '<|endoftext|>']

## batch_encode_plus = batch encode

In [11]:
# Encode a batch (here a one-element list) and return input_ids plus attention_mask.
# NOTE(review): the transformers docs mark batch_encode_plus as deprecated in favour of
# calling the tokenizer directly; confirm against the installed version.
tokenizer.batch_encode_plus([sequence], add_special_tokens=True, padding=True)

{'input_ids': [[49406, 518, 3712, 2866, 3240, 18911, 962, 518, 10753, 1929, 269, 49407]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

## convert_ids_to_tokens 将数字映射为token

In [12]:
# Map ids back to token strings: first with every token kept, then with
# skip_special_tokens=True so the start/end markers are left out.
print(tokenizer.convert_ids_to_tokens(ids))
tokens = tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
tokens

['<|startoftext|>', 'the</w>', 'quick</w>', 'brown</w>', 'fox</w>', 'jumps</w>', 'over</w>', 'the</w>', 'lazy</w>', 'dog</w>', '.</w>', '<|endoftext|>']


['the</w>',
 'quick</w>',
 'brown</w>',
 'fox</w>',
 'jumps</w>',
 'over</w>',
 'the</w>',
 'lazy</w>',
 'dog</w>',
 '.</w>']

## convert_tokens_to_string 将token转换为string

In [13]:
# Join the list of token strings back into one plain-text string.
tokenizer.convert_tokens_to_string(tokens)

'the quick brown fox jumps over the lazy dog .'

## decode = convert_ids_to_tokens + convert_tokens_to_string

In [14]:
# decode() turns ids back into text in one call; shown once with the special
# tokens kept and once with them skipped.
text_with_special = tokenizer.decode(ids)
text_plain = tokenizer.decode(ids, skip_special_tokens=True)
print(text_with_special)
print(text_plain)

<|startoftext|>the quick brown fox jumps over the lazy dog. <|endoftext|>
the quick brown fox jumps over the lazy dog.


## batch_decode = batch decode

In [15]:
# batch_decode() takes a list of id sequences (here a single one) and yields a
# list of decoded strings; the second call skips the special tokens.
print(tokenizer.batch_decode([ids]))
print(tokenizer.batch_decode([ids], skip_special_tokens=True))

['<|startoftext|>the quick brown fox jumps over the lazy dog. <|endoftext|>']
['the quick brown fox jumps over the lazy dog.']


## tokenizer([sequence])

In [16]:
# Tokenize one sentence by calling the tokenizer directly, then move the result to `device`.
inputs = tokenizer(
    # A single sentence.
    sequence,
    # Truncate input that exceeds max_length.
    truncation=True,
    # Padding strategy; options: [True, 'longest', 'max_length', 'do_not_pad'].
    padding=True,
    # Maximum length; falls back to the model's maximum length when not set.
    # max_length=max_length,
    # Add the special tokens to the text.
    add_special_tokens=True,
    # Also return the sequence length.
    return_length=True,
    # Whether to return every text fragment. By default, tokens beyond the
    # truncation length are dropped; with return_overflowing_tokens=True all
    # token fragments are returned.
    return_overflowing_tokens=False,
    # Output format: np, pt, tf or jax.
    return_tensors="pt",
).to(device)

print(inputs.keys())
# Token ids of the text.
print(inputs["input_ids"])
# Marks which positions hold text.
print(inputs["attention_mask"])
# Total length.
print(inputs["length"])

dict_keys(['input_ids', 'attention_mask', 'length'])
tensor([[49406,   518,  3712,  2866,  3240, 18911,   962,   518, 10753,  1929,
           269, 49407]], device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')
tensor([12], device='cuda:0')


In [17]:
# Tokenize a batch of two identical sentences, then move the result to `device`.
inputs = tokenizer(
    # A batch of sentences.
    [sequence] * 2,
    # Truncate input that exceeds max_length.
    truncation=True,
    # Padding strategy; options: [True, 'longest', 'max_length', 'do_not_pad'].
    padding=True,
    # Maximum length; falls back to the model's maximum length when not set.
    # max_length=max_length,
    # Add the special tokens to the text.
    add_special_tokens=True,
    # Also return the sequence length.
    return_length=True,
    # Whether to return every text fragment. By default, tokens beyond the
    # truncation length are dropped; with return_overflowing_tokens=True all
    # token fragments are returned.
    return_overflowing_tokens=False,
    # Output format: np, pt, tf or jax.
    return_tensors="pt",
).to(device)

print(inputs.keys())
# Token ids of the text; includes '<|startoftext|>' and '<|endoftext|>', same as encode().
print(inputs["input_ids"])
# Marks which positions hold text.
print(inputs["attention_mask"])
# Effective text length of each sequence.
print(inputs["length"])

dict_keys(['input_ids', 'length', 'attention_mask'])
tensor([[49406,   518,  3712,  2866,  3240, 18911,   962,   518, 10753,  1929,
           269, 49407],
        [49406,   518,  3712,  2866,  3240, 18911,   962,   518, 10753,  1929,
           269, 49407]], device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')
tensor([12, 12], device='cuda:0')


# CLIPTokenizerFast

In [18]:
# Rebind `tokenizer` to CLIPTokenizerFast loaded from the same checkpoint, so the
# cells below run against the fast implementation.
tokenizer: CLIPTokenizerFast = CLIPTokenizerFast.from_pretrained(version)
tokenizer

CLIPTokenizerFast(name_or_path='openai/clip-vit-base-patch32', vocab_size=49408, model_max_length=77, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [19]:
# Same batch call as for the slow tokenizer, now with whatever `tokenizer` is bound to.
inputs = tokenizer(
    # A batch of sentences.
    [sequence] * 2,
    # Truncate input that exceeds max_length.
    truncation=True,
    # Padding strategy; options: [True, 'longest', 'max_length', 'do_not_pad'].
    padding=True,
    # Maximum length; falls back to the model's maximum length when not set.
    # max_length=max_length,
    # Add the special tokens to the text.
    add_special_tokens=True,
    # Also return the sequence length.
    return_length=True,
    # Whether to return every text fragment. By default, tokens beyond the
    # truncation length are dropped; with return_overflowing_tokens=True all
    # token fragments are returned.
    return_overflowing_tokens=False,
    # Output format: np, pt, tf or jax.
    return_tensors="pt",
).to(device)

print(inputs.keys())
# Token ids of the text; includes '<|startoftext|>' and '<|endoftext|>', same as encode().
print(inputs["input_ids"])
# Marks which positions hold text.
print(inputs["attention_mask"])
# NOTE(review): the original author states that the value returned here is the
# longest length and considers that result incorrect; confirm with inputs of
# different lengths.
print(inputs["length"])

dict_keys(['input_ids', 'attention_mask', 'length'])
tensor([[49406,   518,  3712,  2866,  3240, 18911,   962,   518, 10753,  1929,
           269, 49407],
        [49406,   518,  3712,  2866,  3240, 18911,   962,   518, 10753,  1929,
           269, 49407]], device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')
tensor([12, 12], device='cuda:0')


# AutoTokenizer 通用封装，根据载入预训练模型来自适应

https://huggingface.co/docs/transformers/main/autoclass_tutorial#autotokenizer

In [6]:
# AutoTokenizer.from_pretrained() returns an instance of the tokenizer class that
# matches the checkpoint, not an AutoTokenizer, so the variable is annotated with
# that concrete class.
# Original author's note on `use_fast`: the fast implementation is the default, so
# the argument can be left out when the fast tokenizer is wanted. use_fast=False is
# passed here, and the displayed repr reports is_fast=False.
tokenizer: CLIPTokenizer = AutoTokenizer.from_pretrained(version, use_fast=False)
tokenizer

CLIPTokenizer(name_or_path='openai/clip-vit-base-patch32', vocab_size=49408, model_max_length=77, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [7]:
# Same batch call again, now with whatever `tokenizer` is bound to.
inputs = tokenizer(
    # A batch of sentences.
    [sequence] * 2,
    # Truncate input that exceeds max_length.
    truncation=True,
    # Padding strategy; options: [True, 'longest', 'max_length', 'do_not_pad'].
    padding=True,
    # Maximum length; falls back to the model's maximum length when not set.
    # max_length=max_length,
    # Add the special tokens to the text.
    add_special_tokens=True,
    # Also return the sequence length.
    return_length=True,
    # Whether to return every text fragment. By default, tokens beyond the
    # truncation length are dropped; with return_overflowing_tokens=True all
    # token fragments are returned.
    return_overflowing_tokens=False,
    # Output format: np, pt, tf or jax.
    return_tensors="pt",
).to(device)

print(inputs.keys())
# Token ids of the text; includes '<|startoftext|>' and '<|endoftext|>', same as encode().
print(inputs["input_ids"])
# Marks which positions hold text.
print(inputs["attention_mask"])
# Per the original author: the longest text length.
print(inputs["length"])

dict_keys(['input_ids', 'length', 'attention_mask'])
tensor([[49406,   518,  3712,  2866,  3240, 18911,   962,   518, 10753,  1929,
           269, 49407],
        [49406,   518,  3712,  2866,  3240, 18911,   962,   518, 10753,  1929,
           269, 49407]], device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')
tensor([12, 12], device='cuda:0')
