In [1]:
import os

# Process-wide runtime settings; assigned before the `import torch` further down.
os.environ["OMP_NUM_THREADS"] = "8"
os.environ["MKL_NUM_THREADS"] = "8"
os.environ["NCCL_P2P_LEVEL"] = "PIX"
os.environ["MAX_JOBS"] = "16"

# GPUs visible to this process. This cell used to assign "1" first and then
# overwrite it with the value below before anything read it; only the
# effective assignment is kept.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5"

import json
import random

import torch
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    TextGenerationPipeline,
    LlamaTokenizer,
    PreTrainedTokenizer,
)

from fastchat.modules.gptq_utils.llm_dataset_adapter import get_dataset_adapter

# from utils.llm_dataset_adapter import BaseDatasetAdapter
# from typing import List, Optional
# from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm
`AnnotionFormat` is deprecated and will be removed in v4.38. Please use `transformers.image_utils.AnnotationFormat` instead.


In [2]:
def get_dataset(model_config, tokenizer):
    """Load samples from ``model_config["data_path"]`` via the matching dataset adapter.

    Args:
        model_config: Mapping that provides ``"data_path"``, ``"data_format"``
            (lower-cased before the adapter lookup), ``"n_samples"`` and
            ``"max_length"``.
        tokenizer: Passed through unchanged to the adapter's ``load_data``.

    Returns:
        Whatever the adapter's ``load_data`` call returns.
    """
    data_path = model_config["data_path"]
    data_format = model_config["data_format"].lower()
    loader = get_dataset_adapter(data_path, format=data_format)

    # A `padding=False` argument was previously tried here and left disabled.
    return loader.load_data(
        data_path,
        tokenizer,
        n_samples=model_config["n_samples"],
        max_length=model_config["max_length"],
    )

import time
def run_autogptq(model_config):
    """Quantize a causal LM with AutoGPTQ and save the quantized weights.

    Args:
        model_config: Mapping describing the job.
            Required keys: ``"model_path"``, ``"save_path"``, ``"bits"``,
            ``"group_size"``, ``"desc_act"``, plus the keys read by
            ``get_dataset`` (``"data_path"``, ``"data_format"``,
            ``"n_samples"``, ``"max_length"``).
            Optional keys: ``"damp_percent"`` (when absent,
            ``BaseQuantizeConfig`` uses its own default) and
            ``"padding_side"`` (when absent, the tokenizer keeps the padding
            side it was loaded with).

    Raises:
        KeyError: If a required key is missing from ``model_config``.
    """
    quantize_kwargs = {
        "bits": model_config["bits"],  # quantize model to 4-bit
        "group_size": model_config["group_size"],  # it is recommended to set the value to 128
        # setting this to False can significantly speed up inference, but
        # perplexity may get slightly worse
        "desc_act": model_config["desc_act"],
    }
    # Several configs in this notebook omit "damp_percent"; forward it only
    # when given instead of failing with KeyError.
    if "damp_percent" in model_config:
        quantize_kwargs["damp_percent"] = model_config["damp_percent"]
    quantize_config = BaseQuantizeConfig(**quantize_kwargs)

    # load un-quantized model, by default, the model will always be loaded into CPU memory
    tokenizer = AutoTokenizer.from_pretrained(
        model_config["model_path"], local_files_only=True
    )
    # Options tried and left disabled: use_safetensors=True, device_map='auto',
    # max_memory={i: "80GIB" for i in range(torch.cuda.device_count())}
    model = AutoGPTQForCausalLM.from_pretrained(
        model_config["model_path"],
        quantize_config,
        local_files_only=True,
    )

    tokenizer.pad_token = tokenizer.unk_token  # train_lora.py
    # "padding_side" is optional for the same reason as "damp_percent".
    if "padding_side" in model_config:
        tokenizer.padding_side = model_config["padding_side"]

    dataset = get_dataset(model_config, tokenizer)

    # quantize model, the examples should be list of dict whose keys can only be "input_ids" and "attention_mask"
    examples = [
        {"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]}
        for example in dataset
    ]
    start = time.time()
    model.quantize(examples, batch_size=4)
    print("quantization ellapse:", time.time() - start)

    # save quantized model (use_safetensors=True was tried and left disabled)
    model.save_quantized(model_config["save_path"])
    print(f"Successfully quantized at {model_config['save_path']}.")

In [3]:
%%time

# Quantization job for MingAI-70B-chat-orca_v0.42_2_dpo.
model_config = {
    # Source model, input dataset and output directory.
    "model_path": "/workspaces/data/llm_weights/custom_trained/MingAI-70B-chat-orca_v0.42_2_dpo",
    "data_path": "/workspaces/data/llm_datasets/koalpaca/KoAlpaca_v1.1.jsonl",
    "save_path": "/workspaces/data/llm_weights/gptq/MingAI-70B-chat-orca_v0.42_2_dpo-GPTQ",
    # Dataset loading options read by get_dataset.
    "data_format": "orca",
    "n_samples": 128,
    "max_length": 4096,
    # Tokenizer padding side applied in run_autogptq.
    "padding_side": "right",
    # Values forwarded to BaseQuantizeConfig by run_autogptq.
    "bits": 4,
    "group_size": -1,
    "desc_act": True,
    "damp_percent": 0.1,
}

run_autogptq(model_config)

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:19<00:00,  1.32s/it]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 128/128 [00:00<00:00, 535.28 examples/s]


quantization ellapse: 14438.823736190796
Successfully quantized at /workspaces/data/llm_weights/gptq/MingAI-70B-chat-orca_v0.42_2_dpo-GPTQ.
CPU times: user 5h 8min 46s, sys: 58min 21s, total: 6h 7min 7s
Wall time: 4h 1min 46s


In [4]:
########### automatically moving necessities into output folder

import json
import os
import shutil

# vLLM's AutoGPTQ support needs the quantize config embedded in config.json,
# so the saved config is patched in place.
# TODO: once this is confirmed to run without errors, move it into the
# function (presumably run_autogptq).
config_path = os.path.join(model_config["save_path"], "config.json")

with open(config_path, "r") as json_file:
    data = json.load(json_file)

data["quantization_config"] = {
    "bits": model_config["bits"],
    "group_size": model_config["group_size"],
    "damp_percent": model_config["damp_percent"],
    "desc_act": model_config["desc_act"],
    "sym": True,
    "true_sequential": True,
    "model_name_or_path": None,
    "model_file_base_name": "model",
    "quant_method": "gptq",
}

with open(config_path, "w") as json_file:
    json.dump(data, json_file, indent=2)

# Copy the tokenizer files from the source model folder into the output
# folder; files that do not exist in the source are skipped.
for f_path in (
    "special_tokens_map.json",
    "tokenizer_config.json",
    "tokenizer.json",
    "tokenizer.model",
):
    src = os.path.join(model_config["model_path"], f_path)
    dst = os.path.join(model_config["save_path"], f_path)
    if not os.path.exists(src):
        continue
    shutil.copy(src, dst)

# Change pad_token 2 -> 0 and use "padding_side": "right" -> applied to MingAI 0.5a.

In [5]:
%%time

# Quantization job for COKAL-ko-v1-70B.
model_config = {
    # Source model, input dataset and output directory.
    "model_path": "/workspaces/disk0/data/llm_weights/COKAL-ko-v1-70B",
    "data_path": "/workspaces/data/llm_datasets/koalpaca/KoAlpaca_v1.1.jsonl",
    "save_path": "/workspaces/data/llm_weights/gptq/COKAL-ko-v1-70B-GPTQ",
    # Dataset loading options read by get_dataset.
    "data_format": "orca",
    "n_samples": 128,
    "max_length": 4096,
    # Tokenizer padding side applied in run_autogptq.
    "padding_side": "right",
    # Values forwarded to BaseQuantizeConfig by run_autogptq.
    "bits": 4,
    "group_size": -1,
    "desc_act": True,
    "damp_percent": 0.1,
}

run_autogptq(model_config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 61/61 [01:03<00:00,  1.05s/it]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 128/128 [00:00<00:00, 612.18 examples/s]


quantization ellapse: 15148.160724639893
Successfully quantized at /workspaces/data/llm_weights/gptq/COKAL-ko-v1-70B-GPTQ.
CPU times: user 5h 17min 54s, sys: 1h 11min 36s, total: 6h 29min 30s
Wall time: 4h 14min 19s


In [6]:
########### automatically moving necessities into output folder

import json
import os
import shutil

# vLLM's AutoGPTQ support needs the quantize config embedded in config.json,
# so the saved config is patched in place.
# TODO: once this is confirmed to run without errors, move it into the
# function (presumably run_autogptq).
config_path = os.path.join(model_config["save_path"], "config.json")

with open(config_path, "r") as json_file:
    data = json.load(json_file)

data["quantization_config"] = {
    "bits": model_config["bits"],
    "group_size": model_config["group_size"],
    "damp_percent": model_config["damp_percent"],
    "desc_act": model_config["desc_act"],
    "sym": True,
    "true_sequential": True,
    "model_name_or_path": None,
    "model_file_base_name": "model",
    "quant_method": "gptq",
}

with open(config_path, "w") as json_file:
    json.dump(data, json_file, indent=2)

# Copy the tokenizer files from the source model folder into the output
# folder; files that do not exist in the source are skipped.
for f_path in (
    "special_tokens_map.json",
    "tokenizer_config.json",
    "tokenizer.json",
    "tokenizer.model",
):
    src = os.path.join(model_config["model_path"], f_path)
    dst = os.path.join(model_config["save_path"], f_path)
    if not os.path.exists(src):
        continue
    shutil.copy(src, dst)

# Change pad_token 2 -> 0 and use "padding_side": "right" -> applied to MingAI 0.5a.

In [11]:
# Stand-alone tokenizer for inspection: same pad-token assignment as in
# run_autogptq, with the padding side fixed to "right".
tokenizer = AutoTokenizer.from_pretrained(model_config["model_path"], local_files_only=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.unk_token
# dataset = get_dataset(model_config, tokenizer)

In [None]:
# Probe: encode a short Korean string with padding="max_length" and
# max_length=4096 so the padded encoding can be inspected as the cell output.
text = " 안녕"
tokenizer(text, max_length=4096, padding="max_length")

In [None]:
# Scratch cell: load the un-quantized model on its own.
# `quantize_config` only exists as a local variable inside run_autogptq, so
# it is rebuilt here from the current model_config; otherwise this cell
# raises NameError on a fresh kernel.
quantize_config = BaseQuantizeConfig(
    bits=model_config["bits"],
    group_size=model_config["group_size"],
    desc_act=model_config["desc_act"],
    damp_percent=model_config["damp_percent"],
)
model = AutoGPTQForCausalLM.from_pretrained(
    model_config["model_path"],
    quantize_config,
    # use_safetensors=True,
    local_files_only=True,
    # device_map='auto',
)

In [4]:
%%time

import json
import os
import shutil

# Quantization job for MingAI-70B-chat-orca_v0.5a (checkpoint 44362).
model_config = {
    "model_path": "/disk1/data/llm_weights/custom_trained/MingAI-70B-chat-orca_v0.5a-checkpoint-44362",
    "data_path": "/disk1/data/llm_datasets/koalpaca/KoAlpaca_v1.1.jsonl",
    "save_path": "/disk1/data/llm_weights/gptq/MingAI-70B-chat-orca_v0.5a-checkpoint-44362-GPTQ",
    "data_format": "orca",
    # run_autogptq reads "padding_side"; without it this cell fails with
    # KeyError. "right" is the value the other runs in this notebook use.
    "padding_side": "right",
    "max_length": 4096,
    "bits": 4,
    "group_size": -1,
    "desc_act": True,
    "n_samples": 128,
    "damp_percent": 0.1,
}

run_autogptq(model_config)

# vLLM's AutoGPTQ support needs the quantize config embedded in config.json,
# so the saved config is patched in place.
# TODO: once this is confirmed to run without errors, move it into the
# function (presumably run_autogptq).
config_path = os.path.join(model_config["save_path"], "config.json")

with open(config_path, "r") as json_file:
    data = json.load(json_file)

data["quantization_config"] = {
    "bits": model_config["bits"],
    "group_size": model_config["group_size"],
    "damp_percent": model_config["damp_percent"],
    "desc_act": model_config["desc_act"],
    "sym": True,
    "true_sequential": True,
    "model_name_or_path": None,
    "model_file_base_name": "model",
    "quant_method": "gptq",
}

with open(config_path, "w") as json_file:
    json.dump(data, json_file, indent=2)

# Copy the tokenizer files from the source model folder into the output
# folder; files that do not exist in the source are skipped.
for f_path in [
    "special_tokens_map.json",
    "tokenizer_config.json",
    "tokenizer.json",
    "tokenizer.model",
]:
    src = os.path.join(model_config["model_path"], f_path)
    dst = os.path.join(model_config["save_path"], f_path)
    if os.path.exists(src):
        shutil.copy(src, dst)

Loading checkpoint shards: 100%|██████████| 15/15 [02:08<00:00,  8.56s/it]
Map: 100%|██████████| 128/128 [00:00<00:00, 686.12 examples/s]


Successfully quantized at /disk1/data/llm_weights/gptq/MingAI-70B-chat-orca_v0.5a-checkpoint-44362-GPTQ.
CPU times: user 5h 31min 21s, sys: 1h 23min 44s, total: 6h 55min 6s
Wall time: 4h 18min 54s


In [3]:
import json
import os

# Read the config.json stored in the Upstage Llama-2 70B GPTQ folder into `data`.
with open(
    os.path.join("/disk1/data/llm_weights/gptq/Upstage-Llama-2-70B-instruct-v2-GPTQ", "config.json"),
    "r",
) as json_file:
    data = json.load(json_file)

In [7]:
# Attach the GPTQ settings to the loaded config under "quantization_config".
data["quantization_config"] = dict(
    bits=4,
    group_size=-1,
    damp_percent=0.01,
    desc_act=True,
    sym=True,
    true_sequential=True,
    model_name_or_path=None,
    model_file_base_name="model",
    quant_method="gptq",
)

In [9]:
# Serialize the updated `data` dict as indented JSON.
# NOTE(review): the target is "config.json" relative to the current working
# directory, while the cell that loads `data` reads it from the
# Upstage-Llama-2-70B-instruct-v2-GPTQ folder -- confirm which destination
# is intended before relying on this cell.
with open("config.json", "w") as json_file:
    json.dump(data, json_file, indent=2)

In [None]:
%%time
# Quantization job for MingAI-70B-chat-orca_v0.2_2.
model_config = {
    "model_path": "/disk1/data/llm_weights/custom_trained/MingAI-70B-chat-orca_v0.2_2/",
    "data_path": "/disk1/data/llm_datasets/koalpaca/KoAlpaca_v1.1.jsonl",
    "save_path": "/disk1/data/llm_weights/gptq/MingAI-70B-chat-orca_v0.2_2-GPTQ",
    "data_format": "orca",
    # run_autogptq reads "padding_side" and "damp_percent"; without them this
    # cell fails with KeyError. The values mirror the other MingAI runs in
    # this notebook.
    # NOTE(review): confirm they are the intended settings for this checkpoint.
    "padding_side": "right",
    "max_length": 4096,
    "bits": 4,
    "group_size": -1,
    "desc_act": True,
    "n_samples": 128,
    "damp_percent": 0.1,
}

run_autogptq(model_config)

# model_config = {
#     'model_path': "/disk1/data/llm_weights/custom_trained/MingAI-70B-chat-orca_v0.2_2/",
#     'data_path': "/disk1/data/llm_datasets/custom/merged_korean_datasets-vicuna-v1.json",
#     'save_path': "/disk1/data/llm_weights/gptq/MingAI-70B-chat-orca_v0.2_2-GPTQ",
#     'data_format': "orca",
#     'max_length': 4096,
#     'bits': 4,
#     'group_size': 128,
#     'desc_act': False,
#     'n_samples': 128,
# }

# run_autogptq(model_config)

In [None]:
%%time
# Quantization job for MingAI-70B-chat-orca_v0.2_Llama2.
model_config = {
    "model_path": "/disk1/data/llm_weights/custom_trained/MingAI-70B-chat-orca_v0.2_Llama2/",
    "data_path": "/disk1/data/llm_datasets/koalpaca/KoAlpaca_v1.1.jsonl",
    "save_path": "/disk1/data/llm_weights/gptq/MingAI-70B-chat-orca_v0.2_Llama2-GPTQ",
    "data_format": "orca",
    # run_autogptq reads "padding_side" and "damp_percent"; without them this
    # cell fails with KeyError. The values mirror the other MingAI runs in
    # this notebook.
    # NOTE(review): confirm they are the intended settings for this checkpoint.
    "padding_side": "right",
    "max_length": 4096,
    "bits": 4,
    "group_size": -1,
    "desc_act": True,
    "n_samples": 128,
    "damp_percent": 0.1,
}

run_autogptq(model_config)