In [1]:
import os

# Process-wide runtime settings. They are exported here, ahead of the heavy
# imports below (numpy / torch / auto_gptq), so those libraries see them.
os.environ["OMP_NUM_THREADS"] = "8"
os.environ["MKL_NUM_THREADS"] = "8"
os.environ["NCCL_P2P_LEVEL"] = "PIX"
os.environ["MAX_JOBS"] = "16"

# GPU selection lives in exactly one place. An earlier assignment of "1" was
# immediately overwritten by this one, so it has been dropped.
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

import json
import random
import numpy as np
import torch
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    TextGenerationPipeline,
    LlamaTokenizer,
    PreTrainedTokenizer,
)

from fastchat.modules.gptq_utils.llm_dataset_adapter import get_dataset_adapter
from fastchat.train.data_modules.sft_dataset import load_sft_dataset, combine_dataset
from fastchat.train.train import LazySupervisedDataset
# from utils.llm_dataset_adapter import BaseDatasetAdapter
# from typing import List, Optional
# from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm
`AnnotionFormat` is deprecated and will be removed in v4.38. Please use `transformers.image_utils.AnnotationFormat` instead.


[2024-01-17 10:39:07,197] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)




In [9]:
# Top-level copy of the tokenizer / dataset preparation steps, so the sampled
# calibration examples can be inspected directly. No model is loaded in this cell.
tokenizer = AutoTokenizer.from_pretrained(
    model_config["model_path"],
    model_max_length=model_config["max_length"],
    local_files_only=True,
)

# Pad with the unknown token (original note points at train_lora.py — presumably
# mirrors the training setup; confirm).
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = model_config["padding_side"]

raw_data = load_sft_dataset(model_config["data_path"])
dataset = LazySupervisedDataset(raw_data, tokenizer, model_config["data_format"])

# Draw n_samples distinct dataset indices (sampling without replacement).
choices = np.random.choice(
    range(len(dataset)),
    size=(model_config["n_samples"],),
    replace=False,
).tolist()

# Keep only the two fields each sampled item contributes.
examples = [
    dict(
        input_ids=dataset[i]["input_ids"],
        attention_mask=dataset[i]["attention_mask"],
    )
    for i in choices
]

In [10]:
# Sanity check: turn the first sampled example back into text, padding included.
tokenizer.decode(examples[0]["input_ids"])

'<|im_start|>system\n<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n주어진 명사에 다음 형용사를 할당하십시오: 한 번에 한 단어씩, 요구하는.\n명사: 일, 상사\n형용사: 한 번에 한 단어씩, 요구하는.<|im_end|>\n<|im_start|>assistant\n요구가 많은 일, 요구가 많은 상사.<|im_end|>\n<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|en

In [2]:
import time


def run_autogptq(model_config):
    """Quantize a causal LM with AutoGPTQ and save the quantized weights.

    Loads the tokenizer and the un-quantized model from
    ``model_config["model_path"]``, samples calibration examples from the SFT
    dataset at ``model_config["data_path"]``, runs ``model.quantize`` on them and
    saves the result to ``model_config["save_path"]``.

    Args:
        model_config: Mapping holding the run settings.
            Required keys: "model_path", "data_path", "save_path",
            "data_format", "max_length", "bits", "group_size", "desc_act",
            "n_samples".
            Optional keys:
              * "damp_percent": forwarded to ``BaseQuantizeConfig`` only when
                present; otherwise the library's own default applies.
              * "padding_side": assigned to the tokenizer only when present;
                otherwise the tokenizer keeps its configured padding side.
              * "batch_size": batch size passed to ``model.quantize``
                (defaults to 4).

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        KeyError: If a required key is missing from ``model_config``.
    """
    quantize_kwargs = {
        "bits": model_config["bits"],  # e.g. 4 quantizes the model to 4-bit
        # original note: 128 is the recommended group size
        "group_size": model_config["group_size"],
        # original note: False can significantly speed up inference, at a slight
        # perplexity cost
        "desc_act": model_config["desc_act"],
    }
    # Several configs in this notebook carry no "damp_percent"; indexing it
    # unconditionally raised KeyError before quantization could even start.
    if "damp_percent" in model_config:
        quantize_kwargs["damp_percent"] = model_config["damp_percent"]
    quantize_config = BaseQuantizeConfig(**quantize_kwargs)

    # load un-quantized model, by default, the model will always be loaded into CPU memory
    tokenizer = AutoTokenizer.from_pretrained(
        model_config["model_path"],
        local_files_only=True,
        model_max_length=model_config["max_length"],
    )
    model = AutoGPTQForCausalLM.from_pretrained(
        model_config["model_path"],
        quantize_config,
        # use_safetensors=True,
        local_files_only=True,
        # device_map='auto',
        # max_memory={i: "80GIB" for i in range(torch.cuda.device_count())},
    )

    # Pad with the unknown token (original note points at train_lora.py —
    # presumably mirrors the training setup; confirm).
    tokenizer.pad_token = tokenizer.unk_token
    # "padding_side" is optional for the same reason as "damp_percent".
    padding_side = model_config.get("padding_side")
    if padding_side is not None:
        tokenizer.padding_side = padding_side

    raw_data = load_sft_dataset(model_config["data_path"])
    dataset = LazySupervisedDataset(raw_data, tokenizer, model_config["data_format"])

    # Sample n_samples distinct calibration indices (no replacement).
    choices = np.random.choice(
        range(len(dataset)), (model_config["n_samples"],), replace=False
    ).tolist()
    examples = [
        {
            "input_ids": dataset[idx]["input_ids"],
            "attention_mask": dataset[idx]["attention_mask"],
        }
        for idx in choices
    ]

    start = time.time()
    model.quantize(examples, batch_size=model_config.get("batch_size", 4))
    print("quantization ellapse:", time.time() - start)
    # save quantized model (safetensors option left disabled, as before)
    model.save_quantized(model_config["save_path"])  # , use_safetensors=True
    print(f"Successfully quantized at {model_config['save_path']}.")

In [8]:
%%time

# Run settings for the MoMo-70B-lora-1.8.4-DPO checkpoint; the actual
# quantization call below is currently disabled.
model_config = dict(
    model_path="/workspaces/disk0/data/llm_weights/MoMo-70B-lora-1.8.4-DPO/",
    # data_path="/workspaces/data/llm_datasets/koalpaca/KoAlpaca_v1.1.jsonl",
    data_path="/data/llm_datasets/custom/ados/sft/ados_msft_v4.json",
    save_path="/workspaces/data/llm_weights/gptq/MoMo-70B-lora-1.8.4-DPO-GPTQ",
    data_format="qwen",
    padding_side="right",
    max_length=32768,
    bits=4,
    group_size=-1,
    desc_act=True,
    n_samples=128,
    damp_percent=0.1,
)

# run_autogptq(model_config)

CPU times: user 7 µs, sys: 11 µs, total: 18 µs
Wall time: 31.7 µs


In [13]:
# Scratch arithmetic: scaling factor that corresponds to a given base, i.e.
# (base / 10000) ** (63 / 64).
# NOTE(review): looks like a RoPE base / alpha conversion — confirm.
base = 100_000
(base / 1e4) ** (63 / 64)

9.646616199111993

In [12]:
# Scratch arithmetic: base obtained from a scaling factor alpha, i.e.
# 10000 * alpha ** (64 / 63).
# NOTE(review): looks like a RoPE base / alpha conversion — confirm.
alpha = 93.0572040929699
1e4 * alpha ** (64 / 63)

999999.9999999998

In [None]:
########### automatically moving necessities into output folder

import json
import os
import shutil

# Add the GPTQ settings to the saved model's config.json, rewriting it in place.
# To use the model with vLLM's AutoGPTQ support, the quantize config has to be
# present in config.json.
# TODO: once this is confirmed to run without errors, move it inside the function.
config_path = os.path.join(model_config["save_path"], "config.json")

with open(config_path, "r") as json_file:
    data = json.load(json_file)

data["quantization_config"] = dict(
    bits=model_config["bits"],
    group_size=model_config["group_size"],
    damp_percent=model_config["damp_percent"],
    desc_act=model_config["desc_act"],
    sym=True,
    true_sequential=True,
    model_name_or_path=None,
    model_file_base_name="model",
    quant_method="gptq",
)

with open(config_path, "w") as json_file:
    json.dump(data, json_file, indent=2)

# Copy the tokenizer files next to the quantized weights; any file that does
# not exist under model_path is skipped.
for f_path in (
    "special_tokens_map.json",
    "tokenizer_config.json",
    "tokenizer.json",
    "tokenizer.model",
):
    src = os.path.join(model_config["model_path"], f_path)
    dst = os.path.join(model_config["save_path"], f_path)
    if not os.path.exists(src):
        continue
    shutil.copy(src, dst)

# Changed pad_token 2 -> 0 and "padding_side": "right" -> applied to MingAI 0.5a

In [None]:
%%time

# Run settings for the COKAL-ko-v1-70B checkpoint, followed by the quantization run.
model_config = dict(
    model_path="/workspaces/disk0/data/llm_weights/COKAL-ko-v1-70B",
    data_path="/workspaces/data/llm_datasets/koalpaca/KoAlpaca_v1.1.jsonl",
    save_path="/workspaces/data/llm_weights/gptq/COKAL-ko-v1-70B-GPTQ",
    data_format="orca",
    padding_side="right",
    max_length=4096,
    bits=4,
    group_size=-1,
    desc_act=True,
    n_samples=128,
    damp_percent=0.1,
)

run_autogptq(model_config)

In [None]:
########### automatically moving necessities into output folder

import json
import os
import shutil

# Add the GPTQ settings to the saved model's config.json, rewriting it in place.
# To use the model with vLLM's AutoGPTQ support, the quantize config has to be
# present in config.json.
# TODO: once this is confirmed to run without errors, move it inside the function.
config_path = os.path.join(model_config["save_path"], "config.json")

with open(config_path, "r") as json_file:
    data = json.load(json_file)

data["quantization_config"] = dict(
    bits=model_config["bits"],
    group_size=model_config["group_size"],
    damp_percent=model_config["damp_percent"],
    desc_act=model_config["desc_act"],
    sym=True,
    true_sequential=True,
    model_name_or_path=None,
    model_file_base_name="model",
    quant_method="gptq",
)

with open(config_path, "w") as json_file:
    json.dump(data, json_file, indent=2)

# Copy the tokenizer files next to the quantized weights; any file that does
# not exist under model_path is skipped.
for f_path in (
    "special_tokens_map.json",
    "tokenizer_config.json",
    "tokenizer.json",
    "tokenizer.model",
):
    src = os.path.join(model_config["model_path"], f_path)
    dst = os.path.join(model_config["save_path"], f_path)
    if not os.path.exists(src):
        continue
    shutil.copy(src, dst)

# Changed pad_token 2 -> 0 and "padding_side": "right" -> applied to MingAI 0.5a

In [None]:
# Reload the tokenizer for the current model_config, without a max-length
# override, for ad-hoc checks in the following cells.
tokenizer = AutoTokenizer.from_pretrained(model_config["model_path"], local_files_only=True)

# Pad with the unknown token, padding on the right.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right"
# dataset = get_dataset(model_config, tokenizer)

In [None]:
# Probe: tokenize a short Korean string padded out to 4096 tokens, to see what
# the padded encoding looks like.
text = " 안녕"
tokenizer(text, max_length=4096, padding="max_length")

In [None]:
# `quantize_config` is otherwise only a local variable inside run_autogptq, so
# this cell raised NameError on a fresh kernel. Build it here from the current
# model_config, using the same fields the function uses.
quantize_kwargs = {
    "bits": model_config["bits"],
    "group_size": model_config["group_size"],
    "desc_act": model_config["desc_act"],
}
# "damp_percent" is absent from some configs in this notebook; when it is
# missing, the library default is used.
if "damp_percent" in model_config:
    quantize_kwargs["damp_percent"] = model_config["damp_percent"]
quantize_config = BaseQuantizeConfig(**quantize_kwargs)

# Load the un-quantized checkpoint from local files only.
model = AutoGPTQForCausalLM.from_pretrained(
    model_config["model_path"],
    quantize_config,
    # use_safetensors=True,
    local_files_only=True,
    # device_map='auto',
)

In [None]:
%%time

# Run settings for the MingAI-70B-chat-orca v0.5a checkpoint (44362), followed
# by the quantization run. Unlike the configs above, this one has no
# "padding_side" entry.
model_config = dict(
    model_path="/disk1/data/llm_weights/custom_trained/MingAI-70B-chat-orca_v0.5a-checkpoint-44362",
    data_path="/disk1/data/llm_datasets/koalpaca/KoAlpaca_v1.1.jsonl",
    save_path="/disk1/data/llm_weights/gptq/MingAI-70B-chat-orca_v0.5a-checkpoint-44362-GPTQ",
    data_format="orca",
    max_length=4096,
    bits=4,
    group_size=-1,
    desc_act=True,
    n_samples=128,
    damp_percent=0.1,
)

run_autogptq(model_config)

import json
import os
import shutil

# Add the GPTQ settings to the saved model's config.json, rewriting it in place.
# To use the model with vLLM's AutoGPTQ support, the quantize config has to be
# present in config.json.
# TODO: once this is confirmed to run without errors, move it inside the function.
config_path = os.path.join(model_config["save_path"], "config.json")

with open(config_path, "r") as json_file:
    data = json.load(json_file)

data["quantization_config"] = dict(
    bits=model_config["bits"],
    group_size=model_config["group_size"],
    damp_percent=model_config["damp_percent"],
    desc_act=model_config["desc_act"],
    sym=True,
    true_sequential=True,
    model_name_or_path=None,
    model_file_base_name="model",
    quant_method="gptq",
)

with open(config_path, "w") as json_file:
    json.dump(data, json_file, indent=2)

# Copy the tokenizer files next to the quantized weights; any file that does
# not exist under model_path is skipped.
for f_path in (
    "special_tokens_map.json",
    "tokenizer_config.json",
    "tokenizer.json",
    "tokenizer.model",
):
    src = os.path.join(model_config["model_path"], f_path)
    dst = os.path.join(model_config["save_path"], f_path)
    if not os.path.exists(src):
        continue
    shutil.copy(src, dst)

# For the Llama 70B Korean-based models, edit every pad_token-related file
# (2 -> 0), if that turns out to work.

In [None]:
import json
import os

# Read the config.json stored with the Upstage Llama-2 70B GPTQ weights into `data`.
upstage_config_path = os.path.join(
    "/disk1/data/llm_weights/gptq/Upstage-Llama-2-70B-instruct-v2-GPTQ",
    "config.json",
)

with open(upstage_config_path, "r") as json_file:
    data = json.load(json_file)

In [None]:
# Attach hard-coded GPTQ settings to the loaded config, replacing any existing
# "quantization_config" entry.
data["quantization_config"] = dict(
    bits=4,
    group_size=-1,
    damp_percent=0.01,
    desc_act=True,
    sym=True,
    true_sequential=True,
    model_name_or_path=None,
    model_file_base_name="model",
    quant_method="gptq",
)

In [None]:
# Write the patched config back to the file it was loaded from. The bare
# "config.json" used before resolved against the current working directory,
# leaving the model directory's config unchanged, unlike the other
# post-processing cells in this notebook, which rewrite the file they read.
# NOTE(review): assumes the in-place rewrite is what was intended — confirm.
with open(
    os.path.join(
        "/disk1/data/llm_weights/gptq/Upstage-Llama-2-70B-instruct-v2-GPTQ",
        "config.json",
    ),
    "w",
) as json_file:
    json.dump(data, json_file, indent=2)

In [None]:
%%time
# Run settings for the MingAI-70B-chat-orca v0.2_2 checkpoint, followed by the
# quantization run. Unlike the configs above, this one has no "padding_side"
# or "damp_percent" entries.
model_config = dict(
    model_path="/disk1/data/llm_weights/custom_trained/MingAI-70B-chat-orca_v0.2_2/",
    data_path="/disk1/data/llm_datasets/koalpaca/KoAlpaca_v1.1.jsonl",
    save_path="/disk1/data/llm_weights/gptq/MingAI-70B-chat-orca_v0.2_2-GPTQ",
    data_format="orca",
    max_length=4096,
    bits=4,
    group_size=-1,
    desc_act=True,
    n_samples=128,
)

run_autogptq(model_config)

# model_config = {
#     'model_path': "/disk1/data/llm_weights/custom_trained/MingAI-70B-chat-orca_v0.2_2/",
#     'data_path': "/disk1/data/llm_datasets/custom/merged_korean_datasets-vicuna-v1.json",
#     'save_path': "/disk1/data/llm_weights/gptq/MingAI-70B-chat-orca_v0.2_2-GPTQ",
#     'data_format': "orca",
#     'max_length': 4096,
#     'bits': 4,
#     'group_size': 128,
#     'desc_act': False,
#     'n_samples': 128,
# }

# run_autogptq(model_config)

In [None]:
%%time
# Run settings for the MingAI-70B-chat-orca v0.2 Llama2 checkpoint, followed by
# the quantization run. Unlike the configs above, this one has no
# "padding_side" or "damp_percent" entries.
model_config = dict(
    model_path="/disk1/data/llm_weights/custom_trained/MingAI-70B-chat-orca_v0.2_Llama2/",
    data_path="/disk1/data/llm_datasets/koalpaca/KoAlpaca_v1.1.jsonl",
    save_path="/disk1/data/llm_weights/gptq/MingAI-70B-chat-orca_v0.2_Llama2-GPTQ",
    data_format="orca",
    max_length=4096,
    bits=4,
    group_size=-1,
    desc_act=True,
    n_samples=128,
)

run_autogptq(model_config)