In [1]:
import os
import sentencepiece as spm
# from transformers import LlamaTokenizer
from transformers import AutoTokenizer
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model

# https://www.cnblogs.com/wangzhilun/p/17727243.html


In [2]:
# 1. 训练新的中文 BPE 词表
spm.SentencePieceTrainer.train(
    input='./news-commentary-v13-zh-en.txt',  # 输入的中文语料文件
    input_format='text',
    model_prefix='chinese_bpe',  # 输出的模型文件前缀
    model_type='bpe',  # 使用 BPE 算法
    vocab_size=10000,  # 生成的词表大小
    character_coverage=0.9995,  # 字符覆盖率
    num_threads=40,  # 训练线程数
    split_digits=True,  # 是否分割数字
    byte_fallback=True,  # 是否启用字节回退
    max_sentence_length=24000  # 最大句子长度
)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ./news-commentary-v13-zh-en.txt
  input_format: text
  model_prefix: chinese_bpe
  model_type: BPE
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 24000
  num_threads: 40
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 1
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 1
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differe

In [3]:
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=411 min_freq=13
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=409 size=6420 all=1697434 active=87227 piece=身上
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=407 size=6440 all=1700109 active=89902 piece=声明
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=407 size=6460 all=1703393 active=93186 piece=▁resolve
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=405 size=6480 all=1706630 active=96423 piece=▁诚然
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=404 size=6500 all=1709336 active=99129 piece=▁truly
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=404 min_freq=13
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=402 size=6520 all=1711799 active=87930 piece=▁alloc
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=400 size=6540 all=1715059 active=91190 piece=▁bear
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=398 size=6560 all=1716602 active=92733 piece=违反
bpe_model_trainer.cc(268) LOG(INFO) Added:

In [4]:
cache_directory = "/root/autodl-tmp"

# 2. 加载 LLaMa 和中文词表
llama_tokenizer_dir = "./llama_tokenizer"  # LLaMa 模型的分词器路径
chinese_sp_model_file = "./chinese_bpe.model"  # 新训练的中文词表路径

# baichuan2是BPE分词的，LLama2也是，但是被墙了下不了，所以用baichuan2代替一下
llama_tokenizer = AutoTokenizer.from_pretrained('baichuan-inc/Baichuan2-7B-Chat', cache_dir=cache_directory)
chinese_sp_model = spm.SentencePieceProcessor()
chinese_sp_model.Load(chinese_sp_model_file)
# llama2这里被墙了 申请的时候不要写china

The repository for baichuan-inc/Baichuan2-7B-Chat contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/baichuan-inc/Baichuan2-7B-Chat.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


True

In [5]:
# 3. 解析 LLaMa 和中文词表的内部结构
llama_spm = sp_pb2_model.ModelProto()
llama_spm.ParseFromString(llama_tokenizer.sp_model.serialized_model_proto())

chinese_spm = sp_pb2_model.ModelProto()
chinese_spm.ParseFromString(chinese_sp_model.serialized_model_proto())

# 4. 打印两个词表的大小和特殊 token
print(f"LLaMa tokenizer size: {len(llama_tokenizer)}")
print(f"Chinese tokenizer size: {len(chinese_sp_model)}")
print("LLaMa special tokens:", llama_tokenizer.all_special_tokens)
print("LLaMa special token IDs:", llama_tokenizer.all_special_ids)
print("LLaMa special token map:", llama_tokenizer.special_tokens_map)


LLaMa tokenizer size: 125696
Chinese tokenizer size: 10000
LLaMa special tokens: ['<s>', '</s>', '<unk>']
LLaMa special token IDs: [1, 2, 0]
LLaMa special token map: {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}


In [6]:

# 5. 合并词表
llama_spm_tokens_set = set(p.piece for p in llama_spm.pieces)
print(f"Before: {len(llama_spm_tokens_set)}")

for p in chinese_spm.pieces:
    piece = p.piece
    if piece not in llama_spm_tokens_set:
        # print(1)
        new_p = sp_pb2_model.ModelProto().SentencePiece()
        new_p.piece = piece
        new_p.score = 0
        llama_spm.pieces.append(new_p)

print(f"New model pieces: {len(llama_spm.pieces)}")




Before: 125696
New model pieces: 126211


In [7]:
# 6. 保存合并后的词表
from transformers import  LlamaTokenizer

output_sp_dir = 'merged_tokenizer_sp'
output_hf_dir = 'merged_tokenizer_hf'
os.makedirs(output_sp_dir, exist_ok=True)

with open(output_sp_dir + '/chinese_llama.model', 'wb') as f:
    f.write(llama_spm.SerializeToString())

tokenizer = LlamaTokenizer(vocab_file=output_sp_dir+'/chinese_llama.model')
tokenizer.save_pretrained(output_hf_dir)

print(f"Chinese-LLaMA tokenizer has been saved to {output_hf_dir}")

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


Chinese-LLaMA tokenizer has been saved to merged_tokenizer_hf


In [8]:

# 7. 测试效果
text_en = "The excellence of a translation can only be judged by noting"
text_cn = "麒麟，是中国古代神话中的一种瑞兽"

print("Test English text:\n", text_en)
print(f"Tokenized by LLaMA tokenizer: {llama_tokenizer.tokenize(text_en)}")
print(f"Tokenized length by LLaMA tokenizer: {len(llama_tokenizer.tokenize(text_en))}")
print(f"Tokenized by Chinese-LLaMA tokenizer: {tokenizer.tokenize(text_en)}")
print(f"Tokenized length by Chinese-LLaMA tokenizer: {len(tokenizer.tokenize(text_en))}")

print("Test Chinese text:\n", text_cn)
print(f"Tokenized by LLaMA tokenizer: {llama_tokenizer.tokenize(text_cn)}")
print(f"Tokenized length by LLaMA tokenizer: {len(llama_tokenizer.tokenize(text_cn))}")
print(f"Tokenized by Chinese-LLaMA tokenizer: {tokenizer.tokenize(text_cn)}")
print(f"Tokenized length by Chinese-LLaMA tokenizer: {len(tokenizer.tokenize(text_cn))}")

Test English text:
 The excellence of a translation can only be judged by noting
Tokenized by LLaMA tokenizer: ['The', '▁excellence', '▁of', '▁a', '▁translation', '▁can', '▁only', '▁be', '▁judged', '▁by', '▁noting']
Tokenized length by LLaMA tokenizer: 11
Tokenized by Chinese-LLaMA tokenizer: ['The', '▁excellence', '▁of', '▁a', '▁translation', '▁can', '▁only', '▁be', '▁judged', '▁by', '▁noting']
Tokenized length by Chinese-LLaMA tokenizer: 11
Test Chinese text:
 麒麟，是中国古代神话中的一种瑞兽
Tokenized by LLaMA tokenizer: ['麒麟', '，', '是中国', '古代', '神话', '中', '的一种', '瑞', '兽']
Tokenized length by LLaMA tokenizer: 9
Tokenized by Chinese-LLaMA tokenizer: ['麒麟', '，', '是中国', '古代', '神话', '中', '的一种', '瑞', '兽']
Tokenized length by Chinese-LLaMA tokenizer: 9


In [9]:
# 测试完成 可以直接使用了
tokenizer_hf = AutoTokenizer.from_pretrained(output_hf_dir)
# 输入文本
text = "这是一个测试句子。"

# 分词
tokens = tokenizer_hf.tokenize(text)
print("Tokens:", tokens)

# 将 tokens 转换为 token IDs
token_ids = tokenizer_hf.convert_tokens_to_ids(tokens)
print("Token IDs:", token_ids)

# 编码文本为输入 ID
encoded_input = tokenizer_hf.encode(text, add_special_tokens=True)
print("Encoded Input:", encoded_input)

# 解码回文本
decoded_text = tokenizer_hf.decode(encoded_input)
print("Decoded Text:", decoded_text)


Tokens: ['▁这是', '一个', '测试', '句子', '。']
Token IDs: [125939, 1558, 4330, 13937, 66]
Encoded Input: [1, 125939, 1558, 4330, 13937, 66]
Decoded Text: <s> 这是一个测试句子。
