In [14]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "5"

import torch
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LlamaTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    TextIteratorStreamer,
    LlamaForCausalLM,
    GPTNeoXForCausalLM,
)

# gpt_evol koalpaca lima alpaca-gpt4 korquad summary_tech sharegpt_V3_selected sharegpt_gpt4 koen enko

# --- Merge the DPO LoRA adapter into the SFT checkpoint and export the result ---
# model_path:    checkpoint the adapter is applied on top of
# adapters_path: PEFT adapter checkpoint to fold into the base weights
# save_path:     directory that receives the merged, standalone checkpoint
model_path = "/workspaces/data/llm_weights/custom_trained/DIE-MoE-10.7Bx2_sft/"
adapters_path = "../runs/DIE-MoE-10.7Bx2_dpo/checkpoint-38348/"
save_path = "/workspaces/data/llm_weights/custom_trained/DIE-MoE-10.7Bx2_dpo_ep2"

print(f"Starting to load the model {model_path} into memory")

# Load the base weights in fp16 with every module mapped to the CPU.
base_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map={"": "cpu"},
)

# Keep a live handle on one projection weight plus a snapshot of its values;
# the pair is used below to verify that merging really rewrote the weights.
first_weight = base_model.model.layers[0].self_attn.q_proj.weight
first_weight_old = first_weight.clone()

lora_model = PeftModel.from_pretrained(
    base_model,
    adapters_path,
    device_map={"": "cpu"},
    torch_dtype=torch.float16,
)

# Attaching the adapter alone must leave the base weights untouched.
assert torch.allclose(first_weight_old, first_weight), (
    "base weights changed while attaching the adapter (before merging)"
)

# Fold the adapter deltas into the base weights and drop the PEFT wrappers.
lora_model = lora_model.merge_and_unload()
lora_model.eval()

# Did the merge do anything? This check assumes the adapter targets q_proj.
assert not torch.allclose(first_weight_old, first_weight), (
    "merge_and_unload() left layer 0 q_proj unchanged; check that the adapter "
    "targets q_proj and contains trained weights"
)

# Guard: make sure no LoRA tensors or PEFT key prefixes reach the checkpoint.
lora_model_sd = lora_model.state_dict()
deloreanized_sd = {
    k.replace("base_model.model.", ""): v
    for k, v in lora_model_sd.items()
    if "lora" not in k
}

# Saving through the instance (rather than a hard-coded model class) keeps this
# cell independent of which architecture AutoModelForCausalLM resolved.
base_model.save_pretrained(
    save_path, state_dict=deloreanized_sd, max_shard_size="10GB"
)

print(f"Successfully saved the model {save_path}")

# Copy the tokenizer files next to the merged weights so the export is usable
# on its own. Files that do not exist in the source directory are skipped.
import shutil

for f_path in (
    "special_tokens_map.json",
    "tokenizer_config.json",
    "tokenizer.json",
    "tokenizer.model",
    "added_tokens.json",  # present only when tokens were added to the tokenizer
):
    src = os.path.join(model_path, f_path)
    dst = os.path.join(save_path, f_path)
    if os.path.exists(src):
        shutil.copy(src, dst)

Starting to load the model /workspaces/data/llm_weights/custom_trained/DIE-MoE-10.7Bx2_sft/ into memory


Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.05it/s]


Successfully saved the model /workspaces/data/llm_weights/custom_trained/DIE-MoE-10.7Bx2_dpo_ep2


In [2]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "4"

from transformers import AutoTokenizer, AutoModelForCausalLM
    
# Read the Hub credential from the environment instead of embedding it in the
# notebook; a token stored in a cell is leaked to everyone who can read the file.
# Create one at <https://huggingface.co/settings/token> and export it as HF_TOKEN.
# Checked before loading the model so a missing token fails fast.
AUTH_TOKEN = os.environ.get("HF_TOKEN")
if not AUTH_TOKEN:
    raise RuntimeError(
        "HF_TOKEN is not set; export a Hugging Face access token with write "
        "permission before running this cell."
    )

print("model loading...")

# Model & Tokenizer loading
MERGED_MODEL_DIR = "/data/llm_weights/merged/M-DIE-M-10.7B"
tokenizer = AutoTokenizer.from_pretrained(MERGED_MODEL_DIR)
model = AutoModelForCausalLM.from_pretrained(MERGED_MODEL_DIR)

# Target repository for the model upload
# NOTE(review): the recorded run of this cell ended in a 404
# RepositoryNotFoundError; confirm the token has write scope and, if needed,
# use a fully qualified "namespace/name" repo id.
REPO_NAME = "M-DIE-M-10.7B"  # ex) 'my-bert-fine-tuned'

## Upload to Huggingface Hub
model.push_to_hub(
    REPO_NAME,
    use_temp_dir=True,
    token=AUTH_TOKEN,
)
tokenizer.push_to_hub(
    REPO_NAME,
    use_temp_dir=True,
    token=AUTH_TOKEN,
)

model loading...


Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.23s/it]


RepositoryNotFoundError: 404 Client Error. (Request ID: Root=1-659e452c-7e3cdac67ffa45fd76b8ddc2;76472165-2277-498b-9efa-8fb286d5d426)

Repository Not Found for url: https://huggingface.co/api/models/M-DIE-M-10.7B.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated.

In [71]:
# Load only the tokenizer stored at `model_path` so its padding configuration
# can be inspected in the following cells. `local_files_only=True` restricts
# the lookup to files already on disk.
# model_path = '/disk1/data/llm_weights/gptq/MingAI-70B-chat-orca_v0.5a-GPTQ/'
model_path = "runs/MingAI-70B-chat-orca_v0.5a-retrained/checkpoint-10/"
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)

In [72]:
# Side on which padding is applied (the recorded run shows 'right').
tokenizer.padding_side

'right'

In [73]:
# Padding token of the loaded tokenizer (the recorded run shows '<unk>').
tokenizer.pad_token

'<unk>'

In [51]:
# Full tokenizer repr.
# NOTE(review): the recorded output names a different checkpoint path than the
# current `model_path`, and this cell's execution count (51) is lower than the
# load cell's (71) -- the output looks stale; re-run to refresh it.
tokenizer

LlamaTokenizerFast(name_or_path='/disk1/data/llm_weights/custom_trained/MingAI-70B-chat-orca_v0.4_v0.2_retrained_checkpoint-2//', vocab_size=32000, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False)

In [None]:
# --- Merge the LoRA adapter into the 70B base model and export the result ---
# model_path:    checkpoint the adapter is applied on top of
# adapters_path: PEFT adapter checkpoint to fold into the base weights
# save_path:     directory that receives the merged, standalone checkpoint
model_path = "/disk1/data/llm_weights/llama-2-ko-70b/"
adapters_path = "/home/ados/workspaces/FastChat_train/runs/MingAI-70B-chat-orca_v0.5a/checkpoint-44362"
save_path = (
    "/disk1/data/llm_weights/custom_trained/MingAI-70B-chat-orca_v0.5a-checkpoint-44362"
)

print(f"Starting to load the model {model_path} into memory")

# Load the base weights in fp16 with every module mapped to the CPU.
base_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map={"": "cpu"},
)

# Keep a live handle on one projection weight plus a snapshot of its values;
# the pair is used below to verify that merging really rewrote the weights.
first_weight = base_model.model.layers[0].self_attn.q_proj.weight
first_weight_old = first_weight.clone()

lora_model = PeftModel.from_pretrained(
    base_model,
    adapters_path,
    device_map={"": "cpu"},
    torch_dtype=torch.float16,
)

# Attaching the adapter alone must leave the base weights untouched.
assert torch.allclose(first_weight_old, first_weight), (
    "base weights changed while attaching the adapter (before merging)"
)

# Fold the adapter deltas into the base weights and drop the PEFT wrappers.
lora_model = lora_model.merge_and_unload()
lora_model.eval()

# Did the merge do anything? This check assumes the adapter targets q_proj.
assert not torch.allclose(first_weight_old, first_weight), (
    "merge_and_unload() left layer 0 q_proj unchanged; check that the adapter "
    "targets q_proj and contains trained weights"
)

# Guard: make sure no LoRA tensors or PEFT key prefixes reach the checkpoint.
lora_model_sd = lora_model.state_dict()
deloreanized_sd = {
    k.replace("base_model.model.", ""): v
    for k, v in lora_model_sd.items()
    if "lora" not in k
}

# Saving through the instance (rather than a hard-coded model class) keeps this
# cell independent of which architecture AutoModelForCausalLM resolved.
base_model.save_pretrained(
    save_path, state_dict=deloreanized_sd, max_shard_size="10GB"
)

print(f"Successfully saved the model {save_path}.")

# Copy the tokenizer files next to the merged weights so the export is usable
# on its own. Files that do not exist in the source directory are skipped.
import shutil

for f_path in (
    "special_tokens_map.json",
    "tokenizer_config.json",
    "tokenizer.json",
    "tokenizer.model",
    "added_tokens.json",  # present only when tokens were added to the tokenizer
):
    src = os.path.join(model_path, f_path)
    dst = os.path.join(save_path, f_path)
    if os.path.exists(src):
        shutil.copy(src, dst)

Starting to load the model /disk1/data/llm_weights/llama-2-ko-70b/ into memory


Loading checkpoint shards: 100%|██████████| 49/49 [00:19<00:00,  2.57it/s]
Some weights of LlamaForCausalLM were not initialized from the model checkpoint at /disk1/data/llm_weights/llama-2-ko-70b/ and are newly initialized: ['model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.41.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.26.self_attn.rotary_emb.inv_freq', 'model.layers.59.self_attn.rotary_emb.inv_freq', 'model.layers.33.self_attn.rotary_emb.inv_freq', 'model.layers.58.self_attn.rotary_emb.inv_freq', 'model.layers.77.self_attn.rotary_emb.inv_freq', 'model.layers.74.self_attn.rotary_emb.inv_freq', 'model.layers.69.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.44.self_attn.rotary_emb.inv_freq', 'model.layers.71.self_attn.rotary_emb.in

Successfully saved the model /disk1/data/llm_weights/custom_trained/MingAI-70B-chat-orca_v0.5a-checkpoint-44362.


In [18]:
def gen(model, tokenizer, x, max_new_tokens=50):
    """Sample a short completion for ``x`` and print the decoded sequence.

    The user message is wrapped in the fixed "### System / ### User /
    ### Assistant" prompt template, tokenized, passed to ``model.generate``
    with sampling enabled, and the first returned sequence is decoded and
    printed (prompt included).

    Args:
        model: Object exposing ``generate(**inputs, ...)``. If it has a
            ``device`` attribute, the inputs are moved there first.
        tokenizer: Callable returning the model inputs for a prompt string and
            exposing ``decode``. Its ``eos_token_id`` is used as the stop
            token; 2 is used when the tokenizer does not define one.
        x: User message inserted into the prompt template.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        None. The decoded text is written to stdout.
    """
    q = f"### System:\nThis is a system prompt, please behave and help the user.\n\n### User: {x}\n\n### Assistant:"
    inputs = tokenizer(q, return_tensors="pt", return_token_type_ids=False)

    # Put the inputs on the model's device so the same helper works whether
    # the model lives on CPU or GPU.
    device = getattr(model, "device", None)
    if device is not None and hasattr(inputs, "to"):
        inputs = inputs.to(device)

    # Ask the tokenizer for its end-of-sequence id instead of assuming 2, so
    # tokenizers with a different vocabulary still stop correctly.
    eos_token_id = getattr(tokenizer, "eos_token_id", None)
    if eos_token_id is None:
        eos_token_id = 2

    gened = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        early_stopping=True,  # only relevant to beam-based decoding; kept as-is
        do_sample=True,
        eos_token_id=eos_token_id,
    )
    print(tokenizer.decode(gened[0]))

In [None]:
# Smoke-test generation with a Korean prompt
# ("What are three ways to live healthily?").
# NOTE(review): relies on `lora_model` and `tokenizer` left in the kernel by
# other cells -- confirm they belong to the same checkpoint before trusting
# the output.
gen(lora_model, tokenizer, "건강하게 살기 위한 세 가지 방법은?")

In [20]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load only the tokenizer stored at `model_path`; the model weights are not
# needed to inspect the special-token setup in the following cells.
model_path = "/disk1/data/llm_weights/polyglot-ko-12.8b-pad/"

# No `device_map` here: it is a model-loading option and has no meaning for a
# tokenizer.
base_tokenizer = AutoTokenizer.from_pretrained(model_path)

In [21]:
# Full repr of the tokenizer loaded above, including its special-token map.
base_tokenizer

PreTrainedTokenizerFast(name_or_path='/disk1/data/llm_weights/polyglot-ko-12.8b-pad/', vocab_size=30000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|endoftext|>', 'unk_token': '<|unused1|>', 'pad_token': '<|unused1|>', 'additional_special_tokens': ['<|endoftext|>', '<|sep|>', '<|acc|>', '<|tel|>', '<|rrn|>']}, clean_up_tokenization_spaces=True)

In [23]:
# Unknown-token string (the recorded run shows '<|unused1|>').
base_tokenizer.unk_token

'<|unused1|>'

In [22]:
# Padding-token string; in the recorded run it is '<|unused1|>', the same
# string reported for `unk_token`.
base_tokenizer.pad_token

'<|unused1|>'

In [9]:
# Load only the tokenizer stored at `model_path`; the model weights are not
# needed to inspect the special-token setup in the following cells.
model_path = "/disk1/data/llm_weights/Llama-2-13b-hf/"

# No `device_map` here: it is a model-loading option and has no meaning for a
# tokenizer.
llama_tokenizer = AutoTokenizer.from_pretrained(model_path)

In [11]:
# Full repr of the tokenizer loaded above; the recorded output lists bos, eos
# and unk tokens but no pad token.
llama_tokenizer

LlamaTokenizerFast(name_or_path='/disk1/data/llm_weights/Llama-2-13b-hf/', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False)}, clean_up_tokenization_spaces=False)

In [15]:
# Padding token; the recorded run only emits the "not set yet" warning, i.e.
# this tokenizer has no pad token configured.
llama_tokenizer.pad_token

Using pad_token, but it is not set yet.


In [16]:
# Unknown-token string (the recorded run shows '<unk>').
llama_tokenizer.unk_token

'<unk>'