### 加载模型

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Filesystem location of the base model checkpoint -- point this at your copy.
model_name = "/path/llama-2-7b-hf"
# Filesystem location where the merged tokenizer was saved -- point this at yours.
merged_tokenizer_path = "/path/to/merged_tokenizer_hf"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)
# Tokenizer that ships with the base model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The new tokenizer.
new_tokenizer = AutoTokenizer.from_pretrained(merged_tokenizer_path)
# Bare expression: in a notebook this shows the model's repr as the cell output.
model

### 随机扩充

In [None]:
# Random expansion: let the model resize its own token embeddings and compare
# one pre-existing row before and after the resize.

# Fetch the original input embedding layer.
embeddings = model.get_input_embeddings()
print(embeddings)

# Probe the last row of the current table instead of a hard-coded id.
last_original_id = embeddings.num_embeddings - 1
probe = torch.tensor([last_original_id], dtype=torch.long)
# The model is loaded with device_map="auto", so the weights may not be on the
# CPU; the lookup needs the ids on the same device as the weights.
# NOTE(review): weights reported on the "meta" device are presumably offloaded
# by the loader; in that case the ids are left on the CPU -- confirm.
if embeddings.weight.device.type != "meta":
    probe = probe.to(embeddings.weight.device)
print(embeddings(probe))

# Expand. The target size comes from the new tokenizer rather than a literal,
# so it cannot drift from the vocabulary that will actually be used.
model.resize_token_embeddings(len(new_tokenizer))
new_embeddings = model.get_input_embeddings()
print(new_embeddings)
if new_embeddings.weight.device.type != "meta":
    probe = probe.to(new_embeddings.weight.device)
print(new_embeddings(probe))

### 均值扩充

In [None]:
# Mean expansion: initialise the embedding row of every newly added token with
# the mean of the original-vocabulary rows that the token decomposes into.

# Vocabulary sizes are read from the tokenizers instead of being hard-coded.
old_vocab_size = len(tokenizer)
new_vocab_size = len(new_tokenizer)
# Id that is dropped when it directly follows the first id of an encoding
# (the original comment describes it as the "_" piece).
underscore_piece_id = 29871

# Dictionary: new token id -> tensor of ids in the original vocabulary.
token_mapping = {}
for new_id in range(old_vocab_size, new_vocab_size):
    # Turn the id into its token string with the new tokenizer ...
    token = new_tokenizer.convert_ids_to_tokens(new_id)
    # ... and encode that string with the original tokenizer.
    input_ids = tokenizer(token, return_tensors="pt").input_ids[0]
    # Index 0 is always discarded (assumes the tokenizer prepends one special
    # id, as the original slicing did -- TODO confirm); the "_" piece is
    # discarded too when it comes right after it.
    has_leading_underscore = len(input_ids) > 1 and input_ids[1] == underscore_piece_id
    piece_ids = input_ids[2:] if has_leading_underscore else input_ids[1:]
    if len(piece_ids) == 0:
        # Nothing left after stripping: fall back to keeping the "_" piece so
        # the mean below is never taken over an empty set (which yields NaN).
        piece_ids = input_ids[1:]
    if len(piece_ids) == 0:
        # Still nothing to average; leave this row at its random initial value.
        continue
    # Ids stay on the CPU here; they are moved next to the weights on use.
    token_mapping[new_id] = piece_ids

# Original input embedding.
embeddings = model.get_input_embeddings()
old_weight = embeddings.weight.data

# Freshly initialised embedding of the new size. It is filled in float32 on
# the CPU and converted afterwards, so the averaging is not done in half
# precision and does not depend on half-precision CPU kernels.
embedding_dim = embeddings.embedding_dim
new_embedding = torch.nn.Embedding(
    new_vocab_size,
    embedding_dim,
    padding_idx=embeddings.padding_idx,
)

with torch.no_grad():
    # Copy the existing rows into the leading rows of the new table.
    num_to_copy = min(new_vocab_size, old_weight.shape[0])
    new_embedding.weight[:num_to_copy] = old_weight[:num_to_copy]

    # Fill in the rows of the added tokens.
    for new_token, original_tokens in token_mapping.items():
        # Index the weight matrix directly; the ids must sit on its device.
        rows = old_weight[original_tokens.to(old_weight.device)]
        new_embedding.weight[new_token] = rows.float().mean(dim=0)

# Match the dtype and device of the layer being replaced; otherwise the model
# would end up with a float32 CPU embedding among its other parameters.
new_embedding = new_embedding.to(device=old_weight.device, dtype=old_weight.dtype)

# Swap in the new embedding layer.
model.set_input_embeddings(new_embedding)

#### 扩充lm_head

In [None]:
# Expand lm_head the same way as the input embedding: existing rows are kept,
# each added token's row is the mean of the rows of its original tokens.
lm_head = model.lm_head
old_head_weight = lm_head.weight.data

# Sizes are derived from the objects at hand instead of literals.
new_output_size = len(new_tokenizer)
# Number of existing rows that can be carried over unchanged.
output_size = min(lm_head.out_features, new_output_size)

# New lm_head, built in float32 on the CPU and converted once it is filled.
new_lm_head = torch.nn.Linear(
    in_features=lm_head.in_features,
    out_features=new_output_size,
    bias=False,
)

with torch.no_grad():
    # Existing rows stay unchanged.
    new_lm_head.weight[:output_size] = old_head_weight[:output_size]

    # Rows of the added tokens.
    for new_token, original_tokens in token_mapping.items():
        if len(original_tokens) == 0:
            # Nothing to average (would otherwise divide by zero); keep the
            # random initial value of this row.
            continue
        rows = old_head_weight[original_tokens.to(old_head_weight.device)]
        new_lm_head.weight[new_token] = rows.float().mean(dim=0)

# Match the dtype and device of the layer being replaced.
new_lm_head = new_lm_head.to(
    device=old_head_weight.device, dtype=old_head_weight.dtype
)

# Replace the model's original lm_head.
model.lm_head = new_lm_head

# Replacing layers by hand does not touch the config, so record the new
# vocabulary size there before saving. Otherwise the saved config still
# advertises the old size while the weights have the new one.
model.config.vocab_size = new_output_size

# Both the embedding and lm_head are replaced now, so save the model.
model.save_pretrained("llama-2-7b-extent", max_shard_size="8GB")