# 自己数据集内训练新的tokenizer

In [1]:
# 加载数据
from datasets import load_dataset
raw_datasets = load_dataset("code_search_net", "python")

In [2]:
print(raw_datasets['train'][:1]['whole_func_string'])

['def addidsuffix(self, idsuffix, recursive = True):\n        """Appends a suffix to this element\'s ID, and optionally to all child IDs as well. There is sually no need to call this directly, invoked implicitly by :meth:`copy`"""\n        if self.id: self.id += idsuffix\n        if recursive:\n            for e in self:\n                try:\n                    e.addidsuffix(idsuffix, recursive)\n                except Exception:\n                    pass']


In [3]:
# python的加载器，使用才会加载，所以占用内存少
test = [i for i in range(10)]  # 中括号是转为了list
print(test)
# python的加载器
test = (i for i in range(10))
print(test)
print(next(test))
print(next(test))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
<generator object <genexpr> at 0x120d7bac0>
0
1


In [4]:
# 文本加载器 但是加载器只能用一次
training_corpus = (
    raw_datasets["train"][i : i + 1000]["whole_func_string"] for i in range(0, len(raw_datasets["train"]), 1000)
)
# 生成文本加载器方法
def get_training_corpus():
    return (
        raw_datasets["train"][i : i + 1000]["whole_func_string"] for i in range(0, len(raw_datasets["train"]), 1000)
    )
# 也可以使用yield来获得底层控制
def get_training_corpus():
    train = raw_datasets['train']
    for i in range(0,len(raw_datasets),1000):
        tmp = train[i:i+1000]
        yield tmp['whole_func_string']

test = get_training_corpus()
print(next(test)[999])

def brpoplpush(self, source, destination, timeout=0):
        """Emulate brpoplpush"""
        transfer_item = self.brpop(source, timeout)
        if transfer_item is None:
            return None

        key, val = transfer_item
        self.lpush(destination, val)
        return val


## 训练新的tokenizer


In [5]:
from transformers import AutoTokenizer

# 下载gpt2的tokenizer
old_tokenizer = AutoTokenizer.from_pretrained('gpt2')



In [6]:
# gpt2的tokenizer是没有对python代码分词的能力的
# 看看效果
example = '''def add_numbers(a, b):
    """Add the two numbers `a` and `b`."""
    return a + b'''

tokens = old_tokenizer.tokenize(example)
tokens

['def',
 'Ġadd',
 '_',
 'n',
 'umbers',
 '(',
 'a',
 ',',
 'Ġb',
 '):',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ"""',
 'Add',
 'Ġthe',
 'Ġtwo',
 'Ġnumbers',
 'Ġ`',
 'a',
 '`',
 'Ġand',
 'Ġ`',
 'b',
 '`',
 '."',
 '""',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġreturn',
 'Ġa',
 'Ġ+',
 'Ġb']

In [7]:
# 训练新的tokenizer
training_corpus = get_training_corpus()
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus,52000)






In [8]:
tokenizer.tokenize(example)

['def',
 'Ġadd',
 '_',
 'num',
 'bers',
 '(',
 'a',
 ',',
 'Ġb',
 '):',
 'ĊĠĠĠ',
 'Ġ"""',
 'Add',
 'Ġthe',
 'Ġtwo',
 'Ġnumbers',
 'Ġ`',
 'a',
 '`',
 'Ġand',
 'Ġ`',
 'b',
 '`."""',
 'ĊĠĠĠ',
 'Ġreturn',
 'Ġa',
 'Ġ+',
 'Ġb']

In [9]:
test = tokenizer(example)
test.tokens()

['def',
 'Ġadd',
 '_',
 'num',
 'bers',
 '(',
 'a',
 ',',
 'Ġb',
 '):',
 'ĊĠĠĠ',
 'Ġ"""',
 'Add',
 'Ġthe',
 'Ġtwo',
 'Ġnumbers',
 'Ġ`',
 'a',
 '`',
 'Ġand',
 'Ġ`',
 'b',
 '`."""',
 'ĊĠĠĠ',
 'Ġreturn',
 'Ġa',
 'Ġ+',
 'Ġb']

In [10]:
# 保存训练好的tokenizer
tokenizer.save_pretrained('./my_tokenizer')

('./my_tokenizer/tokenizer_config.json',
 './my_tokenizer/special_tokens_map.json',
 './my_tokenizer/vocab.json',
 './my_tokenizer/merges.txt',
 './my_tokenizer/added_tokens.json',
 './my_tokenizer/tokenizer.json')

# 快速tokenizer和慢速

In [11]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
encoding = tokenizer(example)
print(type(encoding))

<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [12]:
print(encoding.is_fast)
print(encoding.tokens())

True
['[CLS]', 'My', 'name', 'is', 'S', '##yl', '##va', '##in', 'and', 'I', 'work', 'at', 'Hu', '##gging', 'Face', 'in', 'Brooklyn', '.', '[SEP]']


In [None]:
import transformers
print(transformers.__version__)

# !pip install transformers==4.25.1

4.25.1
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
from transformers import pipeline

token_classifier = pipeline("token-classification")
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

[{'entity': 'I-PER',
  'score': np.float32(0.99938285),
  'index': 4,
  'word': 'S',
  'start': 11,
  'end': 12},
 {'entity': 'I-PER',
  'score': np.float32(0.99815494),
  'index': 5,
  'word': '##yl',
  'start': 12,
  'end': 14},
 {'entity': 'I-PER',
  'score': np.float32(0.99590707),
  'index': 6,
  'word': '##va',
  'start': 14,
  'end': 16},
 {'entity': 'I-PER',
  'score': np.float32(0.99923277),
  'index': 7,
  'word': '##in',
  'start': 16,
  'end': 18},
 {'entity': 'I-ORG',
  'score': np.float32(0.9738932),
  'index': 12,
  'word': 'Hu',
  'start': 33,
  'end': 35},
 {'entity': 'I-ORG',
  'score': np.float32(0.976115),
  'index': 13,
  'word': '##gging',
  'start': 35,
  'end': 40},
 {'entity': 'I-ORG',
  'score': np.float32(0.9887977),
  'index': 14,
  'word': 'Face',
  'start': 41,
  'end': 45},
 {'entity': 'I-LOC',
  'score': np.float32(0.9932106),
  'index': 16,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [15]:
token_classifier = pipeline("token-classification",aggregation_strategy="simple")
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'entity_group': 'PER',
  'score': np.float32(0.9981694),
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': np.float32(0.9796019),
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': np.float32(0.9932106),
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [16]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
inputs = tokenizer(example, return_tensors="pt")
outputs = model(**inputs)

In [21]:
print(inputs['input_ids'].shape)
print(outputs.logits.shape)

torch.Size([1, 19])
torch.Size([1, 19, 9])


In [65]:
import torch
# res = torch.argmax(outputs.logits,dim=-1)
# res
probabilities = torch.nn.functional.softmax(outputs.logits,dim=-1)[0].tolist()
predictions = torch.argmax(outputs.logits,dim=-1)[0].tolist()
predictions

[0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 6, 6, 6, 0, 8, 0, 0]

In [66]:
model.config.id2label


{0: 'O',
 1: 'B-MISC',
 2: 'I-MISC',
 3: 'B-PER',
 4: 'I-PER',
 5: 'B-ORG',
 6: 'I-ORG',
 7: 'B-LOC',
 8: 'I-LOC'}

In [69]:
# 手动处理预测的结果
results = []
for idx,pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label!='0':
        results.append(           
                {"entity": label, "score": probabilities[idx][pred], "word": tokens[idx]}
             )
results

[{'entity': 'O', 'score': 0.9994322657585144, 'word': 'def'},
 {'entity': 'O', 'score': 0.9989631175994873, 'word': 'Ġadd'},
 {'entity': 'O', 'score': 0.999708354473114, 'word': '_'},
 {'entity': 'O', 'score': 0.9998350143432617, 'word': 'n'},
 {'entity': 'I-PER', 'score': 0.9993828535079956, 'word': 'umbers'},
 {'entity': 'I-PER', 'score': 0.9981549382209778, 'word': '('},
 {'entity': 'I-PER', 'score': 0.995907187461853, 'word': 'a'},
 {'entity': 'I-PER', 'score': 0.9992326498031616, 'word': ','},
 {'entity': 'O', 'score': 0.999804675579071, 'word': 'Ġb'},
 {'entity': 'O', 'score': 0.9995046854019165, 'word': '):'},
 {'entity': 'O', 'score': 0.9996776580810547, 'word': 'Ċ'},
 {'entity': 'O', 'score': 0.999434769153595, 'word': 'Ġ'},
 {'entity': 'I-ORG', 'score': 0.9738931059837341, 'word': 'Ġ'},
 {'entity': 'I-ORG', 'score': 0.9761149883270264, 'word': 'Ġ'},
 {'entity': 'I-ORG', 'score': 0.9887974858283997, 'word': 'Ġ"""'},
 {'entity': 'O', 'score': 0.9995326995849609, 'word': 'Add'},

# tokenize分词算法

### BPE算法

BPE 训练首先计算语料库中使用的唯一单词集合

In [1]:
# 定义语料库
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [2]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")



In [3]:
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(corpus[0])

[('This', (0, 4)),
 ('Ġis', (4, 7)),
 ('Ġthe', (7, 11)),
 ('ĠHugging', (11, 19)),
 ('ĠFace', (19, 24)),
 ('ĠCourse', (24, 31)),
 ('.', (31, 32))]

In [4]:
# 统计语料库的词频
from collections import defaultdict
word_freqs = defaultdict(int)
for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word,offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1
print(word_freqs)

defaultdict(<class 'int'>, {'This': 3, 'Ġis': 2, 'Ġthe': 1, 'ĠHugging': 1, 'ĠFace': 1, 'ĠCourse': 1, '.': 4, 'Ġchapter': 1, 'Ġabout': 1, 'Ġtokenization': 1, 'Ġsection': 1, 'Ġshows': 1, 'Ġseveral': 1, 'Ġtokenizer': 1, 'Ġalgorithms': 1, 'Hopefully': 1, ',': 1, 'Ġyou': 1, 'Ġwill': 1, 'Ġbe': 1, 'Ġable': 1, 'Ġto': 1, 'Ġunderstand': 1, 'Ġhow': 1, 'Ġthey': 1, 'Ġare': 1, 'Ġtrained': 1, 'Ġand': 1, 'Ġgenerate': 1, 'Ġtokens': 1})


In [5]:
# 计算基础词汇表，初始时只有单个字符
alphabet = []

for word in word_freqs.keys():
    for letter in word:
        if letter not in alphabet:
            alphabet.append(letter)
alphabet.sort()
print(alphabet)

[',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ']


In [6]:
vocab = ["<|endoftext|>"] + alphabet.copy()
# vocab = ["<|endoftext|>"] + alphabet.copy()

In [10]:
# 将每个单词拆分为单独的字符，以便能够开始训练
splits = {word:[l for l in word] for word in word_freqs.keys()}
splits

{'This': ['T', 'h', 'i', 's'],
 'Ġis': ['Ġ', 'i', 's'],
 'Ġthe': ['Ġ', 't', 'h', 'e'],
 'ĠHugging': ['Ġ', 'H', 'u', 'g', 'g', 'i', 'n', 'g'],
 'ĠFace': ['Ġ', 'F', 'a', 'c', 'e'],
 'ĠCourse': ['Ġ', 'C', 'o', 'u', 'r', 's', 'e'],
 '.': ['.'],
 'Ġchapter': ['Ġ', 'c', 'h', 'a', 'p', 't', 'e', 'r'],
 'Ġabout': ['Ġ', 'a', 'b', 'o', 'u', 't'],
 'Ġtokenization': ['Ġ',
  't',
  'o',
  'k',
  'e',
  'n',
  'i',
  'z',
  'a',
  't',
  'i',
  'o',
  'n'],
 'Ġsection': ['Ġ', 's', 'e', 'c', 't', 'i', 'o', 'n'],
 'Ġshows': ['Ġ', 's', 'h', 'o', 'w', 's'],
 'Ġseveral': ['Ġ', 's', 'e', 'v', 'e', 'r', 'a', 'l'],
 'Ġtokenizer': ['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r'],
 'Ġalgorithms': ['Ġ', 'a', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's'],
 'Hopefully': ['H', 'o', 'p', 'e', 'f', 'u', 'l', 'l', 'y'],
 ',': [','],
 'Ġyou': ['Ġ', 'y', 'o', 'u'],
 'Ġwill': ['Ġ', 'w', 'i', 'l', 'l'],
 'Ġbe': ['Ġ', 'b', 'e'],
 'Ġable': ['Ġ', 'a', 'b', 'l', 'e'],
 'Ġto': ['Ġ', 't', 'o'],
 'Ġunderstand': ['Ġ', 'u', 'n'

In [13]:
# 编写一个函数来计算每对字符的频率
def compute_pair_freqs(splits):
    pair_freqs = defaultdict(int)
    for word,freq in word_freqs.items():
        split = splits[word]
        if len(split)==1:
            continue
        for i in range(len(split)-1):
            pair = (split[i],split[i+1])
            pair_freqs[pair] += freq
    return pair_freqs
# 第一次合并后结果
pair_freqs = compute_pair_freqs(splits)
for i,key in enumerate(pair_freqs.keys()):
    print(f'{key}:{pair_freqs[key]}')
    if i>=5:
        break

('T', 'h'):3
('h', 'i'):3
('i', 's'):5
('Ġ', 'i'):2
('Ġ', 't'):7
('t', 'h'):3


In [16]:
# 一个简单的循环就可以找到出现频率最高的对
best_pair = ""
max_freq = None

for pair,freq in pair_freqs.items():
    if max_freq==None or max_freq<freq:
        best_pair = pair
        max_freq = freq
    
print(best_pair)
print(max_freq)

('Ġ', 't')
7


In [21]:
# 出现频率最高的合并结果录入词汇表
merges = {best_pair:best_pair[0]+best_pair[1]}
print(merges)
vocab.append(best_pair[0]+best_pair[1])

{('Ġ', 't'): 'Ġt'}


In [25]:
# splits 字典中进行这个合并
print(['a'+'b']+['c'])
def merge_pair(a, b, splits):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue

        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                split = split[:i] + [a + b] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

['ab', 'c']


In [26]:
splits = merge_pair(best_pair[0],best_pair[1],splits)
print(splits["Ġtrained"]) # 可以看到Ġt确实合并了

['Ġt', 'r', 'a', 'i', 'n', 'e', 'd']


In [30]:
# 现在我们有了我们需要的所有代码，可以循环直到我们学习到我们想要的所有合并。让我们把目标词汇表的大小设定为 50 到50之后就不再合并
vocab = ["<|endoftext|>"] + alphabet.copy()
splits = {word:[l for l in word] for word in word_freqs.keys()}

while len(vocab)<50:
    pair_freqs = compute_pair_freqs(splits)
    # 找到合并的两个词对
    best_pair = ""
    max_freq = None
    for pair,freq in pair_freqs.items():
        if max_freq==None or max_freq<freq:
            best_pair = pair
            max_freq = freq
    # 合并
    merges[best_pair] = best_pair[0]+best_pair[1]
    vocab.append(best_pair[0]+best_pair[1])
    splits = merge_pair(*best_pair,splits)

print(merges)


{('Ġ', 't'): 'Ġt', ('i', 's'): 'is', ('e', 'r'): 'er', ('Ġ', 'a'): 'Ġa', ('Ġt', 'o'): 'Ġto', ('e', 'n'): 'en', ('T', 'h'): 'Th', ('Th', 'is'): 'This', ('o', 'u'): 'ou', ('s', 'e'): 'se', ('Ġto', 'k'): 'Ġtok', ('Ġtok', 'en'): 'Ġtoken', ('n', 'd'): 'nd', ('Ġ', 'is'): 'Ġis', ('Ġt', 'h'): 'Ġth', ('Ġth', 'e'): 'Ġthe', ('i', 'n'): 'in', ('Ġa', 'b'): 'Ġab', ('Ġtoken', 'i'): 'Ġtokeni', ('Ġtokeni', 'z'): 'Ġtokeniz'}


In [31]:
print(vocab)

['<|endoftext|>', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ', 'Ġt', 'is', 'er', 'Ġa', 'Ġto', 'en', 'Th', 'This', 'ou', 'se', 'Ġtok', 'Ġtoken', 'nd', 'Ġis', 'Ġth', 'Ġthe', 'in', 'Ġab', 'Ġtokeni']


In [33]:
# 现在有了字典，根据字典对新文本进行分词
def my_tokenize(text):
    pre_tokenize_result = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word,offset in pre_tokenize_result]
    splits = [[l for l in word] for word in pre_tokenized_text]
    for pair,merge in merges.items():
        for idx,split in enumerate(splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    split = split[:i] + [merge] + split[i + 2 :]
                else:
                    i += 1
            splits[idx] = split
    return sum(splits,[])

# 测试分词
my_tokenize("This is not a token.")

['This', 'Ġis', 'Ġ', 'n', 'o', 't', 'Ġa', 'Ġtoken', '.']