In [1]:
import os
import json
import torch
import base64
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
from utils import make_context

from langdetect import detect as langdetect
from langdetect import DetectorFactory
# Ensure language-detection results are consistent between runs
DetectorFactory.seed = 0

In [None]:
########## Prepare Dataset ###########
# Folder that is scanned for the JSON sample files.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_path = "/home/z00533370/projects/VLMEvalKit/raw_data/"

def get_text_list(folder_path):
    """Collect the 'query' and 'response' fields of the JSON files in a folder.

    Args:
        folder_path: Directory to scan. Every entry whose name ends with
            'json' is parsed as a JSON object that must contain the keys
            'query' and 'response'.

    Returns:
        A ``(query_list, response_list)`` tuple of equally long lists that are
        aligned by file and ordered by file name.

    Raises:
        KeyError: If a parsed file lacks 'query' or 'response'.
    """
    query_list = []
    response_list = []
    # os.listdir returns entries in arbitrary order; sorting keeps sample
    # indices stable between runs and machines.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('json'):
            continue
        # The context manager closes the handle right after parsing instead of
        # leaving one open file per sample to the garbage collector.
        with open(os.path.join(folder_path, file_name), encoding='utf-8') as f:
            sample = json.load(f)
        query_list.append(sample['query'])
        response_list.append(sample['response'])
    return query_list, response_list

# Load all samples; the two lists are index-aligned (one entry per JSON file).
query_list, response_list = get_text_list(data_path)

In [None]:
########## Count Token Freqs ###########
model_path = "/home/z00533370/projects/MoH/exp0731_qwenvl_chat_moh_layer16-31_sigmoid_prob_no_norm/checkpoint-5197/"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# One counter per token id; 151936 is the full vocabulary size used throughout
# this notebook.
token_counts = [0] * 151936
assert len(query_list) == len(response_list)
for query, response in tqdm(zip(query_list, response_list), total=len(query_list)):
    # Count the ids of the full chat-formatted prompt built by make_context ...
    _, context_tokens = make_context(tokenizer, query, history=[], system="You are a helpful assistant.")
    for token in context_tokens:
        token_counts[token] += 1
    # ... and the ids of the plain encoded response.
    for token in tokenizer.encode(response):
        token_counts[token] += 1

In [None]:
########## Load the original BPE vocabulary (needed to pass multi-byte token counts on to sub-tokens) ###########
tiktoken_bpe_file = "/home/z00533370/projects/MoH/exp0731_qwenvl_chat_moh_layer16-31_sigmoid_prob_no_norm/checkpoint-5197/qwen.tiktoken"
with open(tiktoken_bpe_file, "rb") as f:
    contents = f.read()
# Every non-empty line holds two whitespace-separated fields: the base64-encoded
# token and its rank. Keep both the encoded and the decoded form, index-aligned.
old_bytes_list = []
old_token_list = []
for line in contents.splitlines():
    if not line:
        continue
    token, rank = line.split()
    old_bytes_list.append(token)
    old_token_list.append(base64.b64decode(token))

In [None]:
# Build inherit_counts: every token that occurs in the data passes its count on
# to each vocabulary entry that is a shorter contiguous byte-slice of it (one
# increment per slice position, so a repeated slice is counted repeatedly).
inherit_counts = [0 for _ in range(151936)]
# Map token bytes -> first index once, instead of a linear list.index() scan
# for every sub-token. setdefault keeps the first index, matching list.index.
token_to_index = {}
for idx, tok in enumerate(old_token_list):
    token_to_index.setdefault(tok, idx)
for i in tqdm(range(len(old_token_list))):
    t_count = token_counts[i]
    token_bytes = old_token_list[i]
    b_len = len(token_bytes)
    if t_count > 0 and b_len > 1:
        # j is the slice length (1 .. b_len-1), k the slice start offset.
        for j in range(1, b_len):
            for k in range(b_len + 1 - j):
                sub_index = token_to_index.get(token_bytes[k:j + k])
                # Slices that are not vocabulary entries are simply skipped.
                if sub_index is not None:
                    inherit_counts[sub_index] += t_count

In [None]:
########## Dictionary Pruning ###########
def is_special_token(token):
    """Tell whether a token string looks like a bracketed special token.

    A token qualifies when it is longer than two characters and is wrapped
    either in '<' ... '>' or in '[' ... ']'.

    Args:
        token: The decoded token text.

    Returns:
        True for a bracketed token such as '<pad>' or '[CLS]', else False.
    """
    if len(token) <= 2:
        return False
    for opener, closer in (('<', '>'), ('[', ']')):
        if token.startswith(opener) and token.endswith(closer):
            return True
    return False

new_token_list = []
new_bytes_list = []
mapping_new2old = []
# Decide for every original token whether it survives pruning. A token is kept
# when ANY of the following holds:
#   * it (or a longer token containing it) occurred in the data,
#   * it looks like a bracketed special token,
#   * langdetect classifies its text as 'zh-cn' or 'en',
#   * its bytes are not valid UTF-8 on their own, or langdetect cannot classify
#     it (e.g. numbers and symbols) — both surface as exceptions below.
# Stricter alternatives are to drop the language test, or to also drop
# inherit_counts and rely on token_counts only.
# NOTE(review): only 'zh-cn' is accepted for Chinese; confirm that text
# detected as 'zh-tw' is meant to be pruned unless it occurs in the data.
for i in tqdm(range(len(old_token_list))):
    token = old_token_list[i]
    try:
        token_str = token.decode("utf-8")
        # Cheap checks first so the comparatively slow langdetect call only
        # runs for tokens that are not already kept; the decision is unchanged.
        keep = (
            (token_counts[i] + inherit_counts[i] > 0)
            or is_special_token(token_str)
            or (langdetect(token_str) in ['zh-cn', 'en'])
        )
    except Exception:
        # Undecodable bytes or undetectable text: keep the token to be safe.
        # (Exception rather than a bare except, so Ctrl-C still interrupts.)
        keep = True
    if keep:
        new_token_list.append(token)
        new_bytes_list.append(old_bytes_list[i])
        mapping_new2old.append(i)

In [None]:
########## Add Special Token Mapping ###########
qwen_vocab_size = 151936
# Ids from the end of the BPE list up to the full vocabulary size are appended
# unchanged and in order, so none of them is pruned.
# NOTE: this appends in place — re-running the cell appends the ids again.
mapping_new2old.extend(range(len(old_token_list), qwen_vocab_size))

In [None]:
# Number of entries currently in the new-id -> old-id mapping
len(mapping_new2old)

In [None]:
########## Save New Vocab ###########
new_tiktoken_bpe_file = "/home/z00533370/projects/MoH/exp0731_qwenvl_chat_moh_layer16-31_sigmoid_prob_no_norm/checkpoint-5197-new-vocab/qwen.tiktoken"
vocab_mapping_file = "/home/z00533370/projects/MoH/exp0731_qwenvl_chat_moh_layer16-31_sigmoid_prob_no_norm/checkpoint-5197-new-vocab/token_vocab_mapping.torch"
# Nothing before this point creates the output folder; create it so the writes
# below do not fail with FileNotFoundError on a fresh run.
os.makedirs(os.path.dirname(new_tiktoken_bpe_file), exist_ok=True)
os.makedirs(os.path.dirname(vocab_mapping_file), exist_ok=True)
# saving new tiktoken_bpe_file: one "<base64 token> <new rank>" line per kept
# token, where the rank is the token's position in new_token_list
with open(new_tiktoken_bpe_file, "w", encoding="utf8") as w:
    for i, token in enumerate(new_token_list):
        w.write(base64.b64encode(token).decode("utf8") + " " + str(i) + "\n")
# saving mapping index (position = new id, value = old id)
torch.save(torch.LongTensor(mapping_new2old), vocab_mapping_file)

In [None]:
########## update model ###########
old_model_path = "/home/z00533370/projects/MoH/exp0731_qwenvl_chat_moh_layer16-31_sigmoid_prob_no_norm/checkpoint-5197/"
new_model_path = "/home/z00533370/projects/MoH/exp0731_qwenvl_chat_moh_layer16-31_sigmoid_prob_no_norm/checkpoint-5197-new-vocab/"
model = AutoModelForCausalLM.from_pretrained(old_model_path, trust_remote_code=True)
# Each old id may be selected at most once, otherwise rows would be duplicated
# and the .index() lookups below would be ambiguous.
assert len(set(mapping_new2old)) == len(mapping_new2old)
new_vocab_size = len(mapping_new2old)
old_embed_weight = model.transformer.wte.weight.data
old_head_weight = model.lm_head.weight.data
# Build the row index with torch.tensor on the device of the tensor it indexes.
# The legacy torch.LongTensor(...) constructor is a CPU tensor type and rejects
# a non-CPU `device`, so the previous form only worked for a CPU-resident model.
embed_index = torch.tensor(mapping_new2old, dtype=torch.long, device=old_embed_weight.device)
head_index = torch.tensor(mapping_new2old, dtype=torch.long, device=old_head_weight.device)
# Indexing with a tensor copies the selected rows (row k = old row
# mapping_new2old[k]); dtype and device are inherited from the old weights.
# Wrapping the result directly avoids allocating randomly initialised
# vocabulary-sized modules only to overwrite their weights.
model.transformer.wte.weight = torch.nn.Parameter(old_embed_weight[embed_index])
model.lm_head.weight = torch.nn.Parameter(old_head_weight[head_index])
# keep the module metadata in sync with the new weight shapes
model.transformer.wte.num_embeddings = new_vocab_size
model.lm_head.out_features = new_vocab_size
# update config: ids stored in the configs are old ids and must be translated
# to their position in the new vocabulary (ValueError if one was pruned)
model.config.__dict__['vocab_size'] = new_vocab_size
model.config.__dict__['_name_or_path'] = new_model_path
model.config.__dict__['visual']["image_start_id"] = mapping_new2old.index(model.config.__dict__['visual']["image_start_id"])
model.generation_config.__dict__['eos_token_id'] = mapping_new2old.index(model.generation_config.__dict__['eos_token_id'])
model.generation_config.__dict__['pad_token_id'] = mapping_new2old.index(model.generation_config.__dict__['pad_token_id'])
# save new model
model.save_pretrained(new_model_path)

In [None]:
# Load the original and the pruned checkpoint side by side for verification.
old_model_path = "/home/z00533370/projects/MoH/exp0731_qwenvl_chat_moh_layer16-31_sigmoid_prob_no_norm/checkpoint-5197/"
new_model_path = "/home/z00533370/projects/MoH/exp0731_qwenvl_chat_moh_layer16-31_sigmoid_prob_no_norm/checkpoint-5197-new-vocab/"

# Both models are put in eval mode and moved to CUDA device index 5.
old_model, new_model = (
    AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True).eval().cuda(5)
    for path in (old_model_path, new_model_path)
)

old_tokenizer, new_tokenizer = (
    AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    for path in (old_model_path, new_model_path)
)

In [None]:
# Tokenize the same prompt (first sample) with each tokenizer.
_, old_context_tokens = make_context(old_tokenizer, query_list[0], history=[], system="You are a helpful assistant.")
_, new_context_tokens = make_context(new_tokenizer, query_list[0], history=[], system="You are a helpful assistant.")

# Batch of one sequence, placed on each model's device.
old_input_ids = torch.tensor([old_context_tokens]).to(old_model.device)
new_input_ids = torch.tensor([new_context_tokens]).to(new_model.device)

# Inference only: without no_grad every forward pass would record an autograd
# graph and keep its activations alive, for two models and two passes each.
with torch.no_grad():
    old_output = old_model(old_input_ids, output_hidden_states=True)
    new_output = new_model(new_input_ids, output_hidden_states=True)

    # Second, identical pass over the same inputs.
    old_output_2 = old_model(old_input_ids, output_hidden_states=True)
    new_output_2 = new_model(new_input_ids, output_hidden_states=True)
#old_result = old_model.chat(old_tokenizer, query=query_list[0], history=None)
#new_result = new_model.chat(new_tokenizer, query=query_list[0], history=None)

In [None]:
# Do both models produce element-wise identical input embeddings for the prompt?
(old_model.transformer.wte(old_input_ids) == new_model.transformer.wte(new_input_ids)).all()

In [None]:
# Exact element-wise comparison of the last entry of hidden_states of both models
(old_output.hidden_states[-1] == new_output.hidden_states[-1]).all()

In [None]:
# Exact comparison of the maximum logit value along the last dimension
(old_output.logits.max(-1)[0] == new_output.logits.max(-1)[0]).all()

In [None]:
# Index of the maximum logit along the last dimension for the old model
old_output.logits.max(-1)[1]

In [None]:
# Same for the new model; presumably these are ids in the pruned vocabulary, to
# be mapped through mapping_new2old before comparing with the old ids — confirm
new_output.logits.max(-1)[1]

In [None]:
# Position (new id) of old token id 136992; raises ValueError if it was pruned
mapping_new2old.index(136992)

In [None]:
# (Re)load only the two tokenizers — original checkpoint first ...
old_model_path = "/home/z00533370/projects/MoH/exp0731_qwenvl_chat_moh_layer16-31_sigmoid_prob_no_norm/checkpoint-5197/"
old_tokenizer = AutoTokenizer.from_pretrained(old_model_path, trust_remote_code=True)

# ... then the pruned-vocabulary checkpoint.
new_model_path = "/home/z00533370/projects/MoH/exp0731_qwenvl_chat_moh_layer16-31_sigmoid_prob_no_norm/checkpoint-5197-new-vocab/"
new_tokenizer = AutoTokenizer.from_pretrained(new_model_path, trust_remote_code=True)

In [None]:
# Print the index of every query whose tokenization under the pruned vocabulary
# (translated back to old ids) differs from the original tokenization.
for i in range(len(query_list)):
    item = query_list[i]
    old_result = list(old_tokenizer.encode(item))
    # Translate new ids to old ids so both sequences are directly comparable.
    new_result = [mapping_new2old[new_id] for new_id in new_tokenizer.encode(item)]

    # List inequality covers both a length difference and any differing id, so
    # each affected sample is reported exactly once. (The earlier per-token loop
    # ended in a no-op `continue` and printed the index once per differing id.)
    if old_result != new_result:
        print(i)

In [None]:
# Tokenize sample 26 with both tokenizers and print the two sequence lengths;
# the new ids are translated back to old ids for comparison.
sample_text = query_list[26]
old_result = old_tokenizer.encode(sample_text)
print(len(old_result))

new_result = [mapping_new2old[new_id] for new_id in new_tokenizer.encode(sample_text)]
print(len(new_result))

In [None]:
for i in range(len(new_result)):
    if old_result[i] != new_result[i]:
        print(i)
        break

In [None]:
# Old-tokenizer ids at positions 344..349 (presumably around the first mismatch
# found for sample 26 — confirm)
old_result[344:350]

In [None]:
# Same positions of the new tokenization (already translated to old ids if
# new_result comes from the sample-26 cell — confirm)
new_result[344:350]

In [None]:
# Raw bytes of old token id 136992
old_token_list[136992]

In [None]:
# Raw bytes of old token ids 291, 70 and 288
old_token_list[291], old_token_list[70], old_token_list[288]

In [None]:
# New id of the token b'ges'; raises ValueError if that token was pruned
new_token_list.index(b'ges')

In [None]:
# List every contiguous byte-slice of old token 16900 that is shorter than the
# token itself, ordered by slice length and then by start offset.
inspected_token = old_token_list[16900]
b_len = len(inspected_token)
sub_token_list = [
    inspected_token[start:length + start]
    for length in range(1, b_len)
    for start in range(b_len + 1 - length)
]
sub_token_list

In [None]:
# Text the old tokenizer decodes token id 136992 to
old_tokenizer.decode([136992])

In [None]:
# Is the base64 entry b'aXJ0' (which decodes to b'irt') in the original vocabulary?
b'aXJ0' in old_bytes_list

In [None]:
# Did the base64 entry b'aXJ0' (which decodes to b'irt') survive pruning?
b'aXJ0' in new_bytes_list

In [None]:
# Base64-encoded form of old token id 44904
old_bytes_list[44904]
#old_token_list[44904], old_token_list[30942]

In [None]:
# Translate new ids from position 294 onwards back to old ids.
# NOTE(review): `result` is not defined anywhere in this notebook (hidden kernel
# state); presumably the raw output of new_tokenizer.encode(...) — confirm.
[mapping_new2old[i] for i in result[294:]]

In [None]:
# Scratch check: a tuple works as a dict key and its value can be incremented
# in place (the cell evaluates to 2).
a = {}
a[('c', 'd')] = 1
a[('c', 'd')] += 1
a[('c', 'd')]

In [None]:
# Base64 and raw-byte forms of old token ids 404, 665 and 268.
# NOTE: a notebook cell displays only its last expression, so the first tuple
# is evaluated but not shown.
old_bytes_list[404], old_bytes_list[665], old_bytes_list[268]
old_token_list[404], old_token_list[665], old_token_list[268]

In [None]:
# Print every position where the back-translated new ids differ from old_result.
# NOTE(review): `result` is not defined anywhere in this notebook (hidden kernel
# state), and old_result[i] raises IndexError if `result` is longer — confirm.
for i, item in enumerate([mapping_new2old[i] for i in result]):
    if old_result[i] != item:
        print(i)

In [None]:
# Print the index of every query whose token count differs between the two
# tokenizers, followed by the total number of such queries.
# NOTE(review): `tokenizer` is loaded from the same checkpoint path as
# old_tokenizer in this notebook, so on a fresh run the lengths cannot differ;
# presumably new_tokenizer was meant — confirm.
count = 0
for i, item in enumerate(query_list):
    if len(old_tokenizer.encode(item)) != len(tokenizer.encode(item)):
        print(i)
        # The counter was never incremented, so the total always printed 0.
        count += 1
print(count)

In [None]:
# Size reported by the tokenizer (presumably the vocabulary size including
# special tokens — TODO confirm for this tokenizer class)
len(tokenizer)

In [None]:
# Encode one complete multi-line prompt. The literal has to be closed with a
# triple quote: with a single closing quote the triple-quoted string never
# terminates and the cell is a SyntaxError.
tokenizer.encode("""Hint: People can use the engineering-design process to develop solutions to problems. One step in the process is testing if a potential solution meets the requirements of the design.
The passage below describes how the engineering-design process was used to test a solution to a problem. Read the passage. Then answer the question below.

Devin was a mechanical engineer who was designing  to record temperature, precipitation, and wind speed. The weather station would be used in a town where the highest recorded temperature was 40¬∞C. Devin wanted to make sure the weather station would work even in unusually warm weather.
So, he set an indoor test chamber to 50¬∞C with low moisture and no wind. He left the weather station in the chamber overnight. The next day, he checked to see if the weather station displayed accurate measurements after 24 hours at 50¬∞C.
Figure: a weather station.
Question: Which of the following could Devin's test show?
Options:
A. if the weather station would work when the temperature was 50¬∞C
B. how well the weather station would work when it was windy
Please select the correct answer from the options above.""")

In [None]:
for i, item in enumerate(old_token_list):
    try:
        #print(i, item.decode('utf-8'))
        item.decode('utf-8')
    except:
        print(i, item)

In [None]:
# Persist the per-token counts (a plain Python list) next to the notebook's
# working directory
torch.save(token_counts, 'token_counts.torch')

In [None]:
# Plain-text dump of the counts: one count per line, line number = token id + 1.
with open("token_counts.txt", "w") as file:
    file.writelines(f"{item}\n" for item in token_counts)