# Training a new tokenizer from an old one

In [1]:
import os

# Configure the proxy for this session. Each variable is written under both
# its lower-case and its upper-case name.
_proxy_url = 'http://127.0.0.1:7893'
_proxy_bypass = '127.0.0.1,localhost'  # value for the no_proxy variables

for _name in ('http_proxy', 'https_proxy', 'no_proxy'):
    _value = _proxy_bypass if _name == 'no_proxy' else _proxy_url
    os.environ[_name] = _value
    os.environ[_name.upper()] = _value

In [2]:
from datasets import load_dataset

# The old "code_search_net" dataset name format is deprecated, so a dataset
# published under the newer identifier format is loaded instead.
# "codeparrot/github-code" (config "python-all") was the other candidate; the
# dataset chosen here is the smaller of the two.
raw_datasets = load_dataset(
    path="Nan-Do/code-search-net-python",
    split="train",
)

In [3]:
# Display the dataset summary (its features and number of rows).
raw_datasets


Dataset({
    features: ['repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'partition', 'summary'],
    num_rows: 455243
})

In [4]:
# Print the "code" field of one arbitrary sample (row 123456) to see what the
# training text looks like.
print(raw_datasets[123456]["code"])

def align_and_build_tree(seqs, moltype, best_tree=False, params=None):
    """Returns an alignment and a tree from Sequences object seqs.

    seqs: a cogent.core.alignment.SequenceCollection object, or data that can
    be used to build one.

    moltype: cogent.core.moltype.MolType object

    best_tree: if True (default:False), uses a slower but more accurate
    algorithm to build the tree.

    params: dict of parameters to pass in to the Muscle app controller.

    The result will be a tuple containing a cogent.core.alignment.Alignment
    and a cogent.core.tree.PhyloNode object (or None for the alignment
    and/or tree if either fails).
    """
    aln = align_unaligned_seqs(seqs, moltype=moltype, params=params)
    tree = build_tree_from_alignment(aln, moltype, best_tree, params)
    return {'Align':aln, 'Tree':tree}


In [5]:
def get_training_corpus(dataset, batch_size=1000):
    """Yield the "code" column of ``dataset`` in consecutive batches.

    A generator can be iterated only once. Keeping the batching logic in a
    function means a fresh iterator is one call away, instead of having to
    re-type (or re-run) a generator expression.

    Args:
        dataset: object that supports ``len()`` and slice indexing, where a
            slice can be indexed with ``"code"`` (here: ``raw_datasets``).
        batch_size: number of rows per yielded batch.

    Yields:
        ``dataset[start : start + batch_size]["code"]`` for every batch start
        from 0 up to ``len(dataset)``.
    """
    for start in range(0, len(dataset), batch_size):
        yield dataset[start : start + batch_size]["code"]


# Same batches as before (1000 rows each), still produced lazily.
training_corpus = get_training_corpus(raw_datasets)

In [6]:
# A generator can be consumed only once: the first list() call drains it, so
# the second call prints an empty list.
gen = (n for n in range(10))
for _attempt in range(2):
    print(list(gen))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[]


In [7]:
from transformers import AutoTokenizer

# Load the pretrained "gpt2" tokenizer; the new tokenizer is trained from it.
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")



In [8]:
# Display the object's repr only; no batches are pulled from it here.
training_corpus

<generator object <genexpr> at 0x7fae7b81f4c0>

In [9]:
# Train a new tokenizer from old_tokenizer on the code corpus, with a target
# vocabulary size of 52000.
# NOTE: training_corpus is a generator and can be iterated only once, so it
# has to be re-created before this cell is run a second time.
tokenizer = old_tokenizer.train_new_from_iterator(
    training_corpus,
    vocab_size=52000,
)






In [10]:
# Small Python function used as the text to tokenize.
example = '''def add_numbers(a, b):
    """Add the two numbers `a` and `b`."""
    return a + b'''

# Tokenize it with the original GPT-2 tokenizer and display the tokens.
tokens = old_tokenizer.tokenize(example)
tokens

['def',
 'Ġadd',
 '_',
 'n',
 'umbers',
 '(',
 'a',
 ',',
 'Ġb',
 '):',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ"""',
 'Add',
 'Ġthe',
 'Ġtwo',
 'Ġnumbers',
 'Ġ`',
 'a',
 '`',
 'Ġand',
 'Ġ`',
 'b',
 '`',
 '."',
 '""',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġreturn',
 'Ġa',
 'Ġ+',
 'Ġb']

In [11]:
# Tokenize the same example with the newly trained tokenizer. This rebinds
# `tokens`, replacing the value produced with old_tokenizer.
tokens = tokenizer.tokenize(example)
tokens

['def',
 'Ġadd',
 '_',
 'numbers',
 '(',
 'a',
 ',',
 'Ġb',
 '):',
 'ĊĠĠĠ',
 'Ġ"""',
 'Add',
 'Ġthe',
 'Ġtwo',
 'Ġnumbers',
 'Ġ`',
 'a',
 '`',
 'Ġand',
 'Ġ`',
 'b',
 '`."""',
 'ĊĠĠĠ',
 'Ġreturn',
 'Ġa',
 'Ġ+',
 'Ġb']

In [12]:
# Compare token counts for the same example: new tokenizer first, then the
# original one. Both lengths come from explicit tokenize() calls instead of the
# shared `tokens` variable, which is assigned by two different cells and would
# make the first number depend on which of them ran last.
print(len(tokenizer.tokenize(example)))
print(len(old_tokenizer.tokenize(example)))

27
36


In [13]:
# A second, larger sample (a small class) to tokenize with the new tokenizer.
# NOTE: inside the sample text, __init__ assigns `self.weight` while __call__
# reads `self.weights`. The string is only tokenized here, never executed, so
# the mismatch does not affect this cell.
example = """class LinearLayer():
    def __init__(self, input_size, output_size):
        self.weight = torch.randn(input_size, output_size)
        self.bias = torch.zeros(output_size)

    def __call__(self, x):
        return x @ self.weights + self.bias
    """
tokenizer.tokenize(example)

['class',
 'ĠLinear',
 'Layer',
 '():',
 'ĊĠĠĠ',
 'Ġdef',
 'Ġ__',
 'init',
 '__(',
 'self',
 ',',
 'Ġinput',
 '_',
 'size',
 ',',
 'Ġoutput',
 '_',
 'size',
 '):',
 'ĊĠĠĠĠĠĠĠ',
 'Ġself',
 '.',
 'weight',
 'Ġ=',
 'Ġtorch',
 '.',
 'randn',
 '(',
 'input',
 '_',
 'size',
 ',',
 'Ġoutput',
 '_',
 'size',
 ')',
 'ĊĠĠĠĠĠĠĠ',
 'Ġself',
 '.',
 'bias',
 'Ġ=',
 'Ġtorch',
 '.',
 'zeros',
 '(',
 'output',
 '_',
 'size',
 ')',
 'ĊĊĠĠĠ',
 'Ġdef',
 'Ġ__',
 'call',
 '__(',
 'self',
 ',',
 'Ġx',
 '):',
 'ĊĠĠĠĠĠĠĠ',
 'Ġreturn',
 'Ġx',
 'Ġ@',
 'Ġself',
 '.',
 'weights',
 'Ġ+',
 'Ġself',
 '.',
 'bias',
 'ĊĠĠĠĠ']

In [14]:
# Write the trained tokenizer's files into the local directory
# "code-search-net-tokenizer" (relative to the current working directory).
tokenizer.save_pretrained("code-search-net-tokenizer")

('code-search-net-tokenizer/tokenizer_config.json',
 'code-search-net-tokenizer/special_tokens_map.json',
 'code-search-net-tokenizer/vocab.json',
 'code-search-net-tokenizer/merges.txt',
 'code-search-net-tokenizer/added_tokens.json',
 'code-search-net-tokenizer/tokenizer.json')

In [16]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget; presumably required before the upload
# cells that follow can write to the Hub — confirm for your setup.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
# Upload the tokenizer to the Hub repository named "code-search-net-tokenizer".
tokenizer.push_to_hub("code-search-net-tokenizer")

README.md: 0.00B [00:00, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/yiwenX/code-search-net-tokenizer/commit/dd08293e2269d73a1e47736f8b96f7aeddcbc9b9', commit_message='Upload tokenizer', commit_description='', oid='dd08293e2269d73a1e47736f8b96f7aeddcbc9b9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/yiwenX/code-search-net-tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='yiwenX/code-search-net-tokenizer'), pr_revision=None, pr_num=None)

In [21]:
from huggingface_hub import upload_file


# Upload the README that sits next to the saved tokenizer files. The path is
# relative to the working directory (the same place that
# save_pretrained("code-search-net-tokenizer") writes to) rather than an
# absolute path that exists on only one machine.
upload_file(
    path_or_fileobj="code-search-net-tokenizer/README.md",
    path_in_repo="README.md",
    repo_id="yiwenX/code-search-net-tokenizer",
    repo_type="model",
    commit_message="更新 README 文档"
)

CommitInfo(commit_url='https://huggingface.co/yiwenX/code-search-net-tokenizer/commit/37c862efae6d6da475f90becb8e7c006563030a0', commit_message='更新 README 文档', commit_description='', oid='37c862efae6d6da475f90becb8e7c006563030a0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/yiwenX/code-search-net-tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='yiwenX/code-search-net-tokenizer'), pr_revision=None, pr_num=None)