In [1]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint identifier handed to AutoTokenizer.from_pretrained below.
model_name = "bert-base-uncased"
# Load the tokenizer that belongs to `model_name`; every later cell reuses this object.
tokenizer = AutoTokenizer.from_pretrained(model_name)

下面对 `data` 中的句子进行分词（tokenize）。

In [4]:
# Example batch: ten short English sentences. They tokenize to different
# lengths, which is what makes the padding demonstrations below meaningful.
data = [
    "The sun is shining brightly today.",
    "She enjoys reading books in the park.",
    "It started raining unexpectedly.",
    "He ran as fast as he could.",
    "This restaurant has amazing food.",
    "We are going on a vacation next week.",
    "Learning new things is always exciting.",
    "Please remember to lock the door before leaving.",
    "They won the first prize in the competition.",
    "Time flies when you're having fun.",
]

In [None]:
# Common tokenizer usage: pad every sequence to the longest one in the current batch.
tokenizer(data, padding=True, return_tensors="pt")["input_ids"].shape

torch.Size([10, 11])

后续的代码围绕 `tokenizer.pad` 方法展开。

tokenizer 不仅可以处理单个字符串，还可以处理字符串列表

In [5]:
# Input is a single string.
tokenizer(data[0])

{'input_ids': [101, 1996, 3103, 2003, 9716, 14224, 2651, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
# Input is a list of strings; no padding or return_tensors is requested here.
raw_tokens = tokenizer(data)
raw_tokens

{'input_ids': [[101, 1996, 3103, 2003, 9716, 14224, 2651, 1012, 102], [101, 2016, 15646, 3752, 2808, 1999, 1996, 2380, 1012, 102], [101, 2009, 2318, 24057, 14153, 1012, 102], [101, 2002, 2743, 2004, 3435, 2004, 2002, 2071, 1012, 102], [101, 2023, 4825, 2038, 6429, 2833, 1012, 102], [101, 2057, 2024, 2183, 2006, 1037, 10885, 2279, 2733, 1012, 102], [101, 4083, 2047, 2477, 2003, 2467, 10990, 1012, 102], [101, 3531, 3342, 2000, 5843, 1996, 2341, 2077, 2975, 1012, 102], [101, 2027, 2180, 1996, 2034, 3396, 1999, 1996, 2971, 1012, 102], [101, 2051, 10029, 2043, 2017, 1005, 2128, 2383, 4569, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 

不使用 `return_tensors` 参数时，返回的是 `Dict[str, List]` 类型，其中的列表不会被填充。
从下面的代码可以看出，各个序列的长度并不相同。

In [8]:
# Print the token count of each un-padded sequence, one per line.
for token_ids in raw_tokens["input_ids"]:
    print(len(token_ids))

9
10
7
10
8
11
9
11
11
11


In [10]:
# Pad the already-tokenized sequences to the longest length in the current batch.
tokens_pt1 = tokenizer.pad(raw_tokens, padding=True, return_tensors="pt")
print(tokens_pt1["input_ids"].shape)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


torch.Size([10, 11])


In [None]:
# Pad the sequences so the padded length is a multiple of 8.
tokens_pt_multiple = tokenizer.pad(
    raw_tokens, padding=True, pad_to_multiple_of=8, return_tensors="pt"
)
print(tokens_pt_multiple["input_ids"].shape)

torch.Size([10, 16])


In [13]:
# Pad the sequences to a fixed maximum length, for example 32.
tokens_pt2 = tokenizer.pad(raw_tokens, padding="max_length", max_length=32, return_tensors="pt")
print(tokens_pt2["input_ids"].shape)

torch.Size([10, 32])


参考资料：
- [https://github.com/FlagOpen/FlagEmbedding/blob/ca91f2b5d10c062c5e3410e28825a1752f0fdada/FlagEmbedding/abc/finetune/embedder/AbsDataset.py](https://github.com/FlagOpen/FlagEmbedding/blob/ca91f2b5d10c062c5e3410e28825a1752f0fdada/FlagEmbedding/abc/finetune/embedder/AbsDataset.py)