In [27]:
from huggingface_hub import hf_hub_download
model_name = "kharinaev/Llama-3.1-8B-Instruct-quip"
weights_loc = hf_hub_download(model_name, 'pytorch_model.bin')
weights_loc

'/root/.cache/huggingface/hub/models--kharinaev--Llama-3.1-8B-Instruct-quip/snapshots/696a3fa489a5db8e27cbd42fc5a55da249ee7ee0/pytorch_model.bin'

In [28]:
!ls -lha /root/.cache/huggingface/hub/models--kharinaev--Llama-3.1-8B-Instruct-quip/snapshots/696a3fa489a5db8e27cbd42fc5a55da249ee7ee0/

total 12K
drwxr-xr-x 2 root root 4.0K Aug 27 15:20 .
drwxr-xr-x 4 root root 4.0K Aug 26 19:26 ..
lrwxrwxrwx 1 root root   52 Aug 27 15:20 .gitattributes -> ../../blobs/a6344aac8c09253b3b630fb776ae94478aa0275b
lrwxrwxrwx 1 root root   52 Aug 26 19:26 config.json -> ../../blobs/f65b8d1f6fcb8d5c0bf9dcccd022b43cbc2e190e
lrwxrwxrwx 1 root root   52 Aug 26 19:26 generation_config.json -> ../../blobs/cc7276afd599de091142c6ed3005faf8a74aa257
lrwxrwxrwx 1 root root   76 Aug 26 20:25 pytorch_model.bin -> ../../blobs/988372232fb19988fbfb6873f11f14cb974d289fb0d4aa8ecfe6f5106304fb37
lrwxrwxrwx 1 root root   52 Aug 27 15:20 quantization_config.json -> ../../blobs/a5f43489371e0d69f188522bd04f82477342f6c8
lrwxrwxrwx 1 root root   52 Aug 26 19:26 special_tokens_map.json -> ../../blobs/02ee80b6196926a5ad790a004d9efd6ab1ba6542
lrwxrwxrwx 1 root root   52 Aug 26 19:26 tokenizer.json -> ../../blobs/5cc5f00a5b203e90a27a3bd60d1ec393b07971e8
lrwxrwxrwx 1 root root   52 Aug 26 19:26 tokenizer_config.json -> ..

In [21]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [None]:
from typing import List, Dict, Any, Optional, Union

import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.attention import Attention, AttentionMetadata
from vllm.sequence import IntermediateTensors, SamplerOutput

from accelerate import (
    Accelerator,
    cpu_offload_with_hook,
    init_empty_weights,
    load_checkpoint_and_dispatch
)

In [None]:
def replace_with_quant_layers(module, name=''):
    for child_name, child in module.named_children():
        full_name = f"{name}.{child_name}" if name else child_name
        
        if isinstance(child, torch.nn.Linear):
            setattr(module, child_name, QuantLinear(
                in_features=child.in_features,
                out_features=child.out_features,
                bias=child.bias is not None,
                codebook=E8P12RVQ4B_codebook()
            ))
            print(f"Replaced {full_name} with QuantLinear")
        else:
            replace_with_quant_layers(child, full_name)

def quantize_model(model):
    # Замена линейных слоев на квантизованные
    replace_with_quant_layers(model)
    
    # Замена LlamaAttention на LlamaSdpaAttention
    for layer in model.model.layers:
        old_attn = layer.self_attn
        layer.self_attn = LlamaSdpaAttention(
            hidden_size=old_attn.hidden_size,
            num_heads=old_attn.num_heads
        )
        print("Replaced LlamaAttention with LlamaSdpaAttention")
    
    # Замена VocabParallelEmbedding на обычный Embedding
    model.model.embed_tokens = torch.nn.Embedding(
        num_embeddings=model.model.embed_tokens.num_embeddings,
        embedding_dim=model.model.embed_tokens.embedding_dim
    )
    print("Replaced VocabParallelEmbedding with Embedding")
    
    # Замена ParallelLMHead на обычный Linear
    model.lm_head = torch.nn.Linear(
        in_features=model.lm_head.weight.shape[1],
        out_features=model.lm_head.weight.shape[0],
        bias=False
    )
    print("Replaced ParallelLMHead with Linear")
    
    return model

# Загрузка неквантизованной модели
model = LlamaForCausalLM.from_pretrained("path_to_your_model")

# Квантизация модели
quantized_model = quantize_model(model)

In [None]:
# from quip import QUIP
from qlinear import QuantLinear
# from codebook import codebook_id
from codebook.e8p12_rvq4 import E8P12RVQ4B_codebook
from quantizer import QuipQuantizer

quip_quant_config = {
    "quant_method": "QUiP",
    "rescale_WH": False,
    "use_rand": True,
    "codebook": "E8P12RVQ4B",
    "codesz": 8,
    "idx_dtype": "torch.int32",
    "merge_suv": False,
    "per_channel": False,
    "opt_resid_scale": -1,
    "modules_to_not_convert": None,
    "inference": True,
    "ft_epochs": 0
}

class QuipLlamaForCausalLM(LlamaForCausalLM):
    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)
        # self.codebook = codebook_id[config.quantization_config["codebook"]](inference=True)
        self.codebook = E8P12RVQ4B_codebook(inference=True)
        self._replace_with_quant_layers()
        self.config = config

    def _replace_with_quant_layers(self):
        for name, module in self.named_modules():
            if isinstance(module, torch.nn.Linear):
                in_features = module.in_features
                out_features = module.out_features
                bias = module.bias is not None
                new_module = QuantLinear(
                    in_features,
                    out_features,
                    self.codebook,
                    bias=bias,
                    use_rand=quip_quant_config["use_rand"],
                    per_channel=quip_quant_config["per_channel"]
                )
                parent_name, child_name = name.rsplit('.', 1)
                parent = self.get_submodule(parent_name)
                setattr(parent, child_name, new_module)

    # @staticmethod
    # def load_weights(model: "QuipLlamaForCausalLM", checkpoint_path: str):
    #     state_dict = torch.load(checkpoint_path, map_location="cpu")
    #     for name, param in model.named_parameters():
    #         if name in state_dict:
    #             if isinstance(param, QuantLinear):
    #                 param.pack(state_dict[name], state_dict[f"{name}_quantizer"])
    #             else:
    #                 param.data.copy_(state_dict[name])
    #     model.tie_weights()
    def load_weights(self, checkpoint_path: str):

        with init_empty_weights(include_buffers=False):
            model = AutoModelForCausalLM.from_config(
                self.config,
                trust_remote_code=True,
                torch_dtype=torch.float16)
        # model = self.model
            
        quantizer = QuipQuantizer.from_dict(quip_quant_config)
        model = quantizer.convert_model(model)

        load_checkpoint_and_dispatch(
            model,
            checkpoint=checkpoint_path,
            device_map="auto",
            no_split_module_classes=quantizer.get_no_split_module_classes(model),
            dtype=torch.float16,
        )

        model.is_quantized = True
        model.eval()
        self.model = model
        return model

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        model_output = self.model(input_ids, positions, kv_caches,
                                  attn_metadata, intermediate_tensors)
        return model_output

In [4]:
from vllm import ModelRegistry, LLM, SamplingParams
from transformers import AutoTokenizer

# from quip_llama import QuipLlamaForCausalLM
ModelRegistry.register_model("QuipLlamaForCausalLM", QuipLlamaForCausalLM)

In [5]:
from tqdm import tqdm

In [6]:
# for i in tqdm(range(10)):
#     pass

In [7]:
model_name = 'kharinaev/Llama-3.1-8B-Instruct-quip'

In [19]:
llm = LLM(
    model=model_name, 
    trust_remote_code=True,
    enforce_eager=True,
)

INFO 08-26 20:39:51 config.py:911] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 08-26 20:39:51 llm_engine.py:184] Initializing an LLM engine (v0.5.5) with config: model='kharinaev/Llama-3.1-8B-Instruct-quip', speculative_config=None, tokenizer='kharinaev/Llama-3.1-8B-Instruct-quip', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=kharinaev/Llama-3.1-8B-Instruct-qui

TypeError: stat: path should be string, bytes, os.PathLike or integer, not generator

In [20]:
%debug

> [0;32m/usr/lib/python3.8/genericpath.py[0m(30)[0;36misfile[0;34m()[0m
[0;32m     28 [0;31m    [0;34m"""Test whether a path is a regular file"""[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     29 [0;31m    [0;32mtry[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 30 [0;31m        [0mst[0m [0;34m=[0m [0mos[0m[0;34m.[0m[0mstat[0m[0;34m([0m[0mpath[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     31 [0;31m    [0;32mexcept[0m [0;34m([0m[0mOSError[0m[0;34m,[0m [0mValueError[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     32 [0;31m        [0;32mreturn[0m [0;32mFalse[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  globals()


All Rights Reserved.

Copyright (c) 2000 BeOpen.com.
All Rights Reserved.

Copyright (c) 1995-2001 Corporation for National Research Initiatives.
All Rights Reserved.

Copyright (c) 1991-1995 Stichting Mathematisch Centrum, Amsterdam.
All Rights Reserved., 'credits':     Thanks to CWI, CNRI, BeOpen.com, Zope Corporation and a cast of thousands
    for supporting Python development.  See www.python.org for more information., 'license': Type license() to see the full license text, 'help': Type help() for interactive help, or help(object) for help about object., 'execfile': <function execfile at 0x7fcf40093820>, 'runfile': <function runfile at 0x7fcf2b6bab80>, '__IPYTHON__': True, 'display': <function display at 0x7fcf44fee280>, '__pybind11_internals_v4_gcc_libstdcpp_cxxabi1011__': <capsule object NULL at 0x7fcf284713f0>, '__pybind11_internals_v4_gcc_libstdcpp_cxxabi1014__': <capsule object NULL at 0x7fcdc73820c0>, '__pybind11_internals_v4_gcc_libstdcpp_cxxabi1017__': <capsule object NULL

ipdb>  import sys
ipdb>  sys.modules['__main__'].weight_generator = path
ipdb>  q


In [10]:
llm

NameError: name 'llm' is not defined

In [21]:
globals()

{'__name__': '__main__',
 '__doc__': 'Automatically created module for IPython interactive environment',
 '__package__': None,
 '__loader__': None,
 '__spec__': None,
 '__builtin__': <module 'builtins' (built-in)>,
 '__builtins__': <module 'builtins' (built-in)>,
 '_ih': ['',
  'import os\nos.environ["CUDA_VISIBLE_DEVICES"] = "2"',
  'import torch\nfrom vllm.model_executor.models.llama import LlamaForCausalLM\nfrom transformers import AutoTokenizer, Auto\nConfig, AutoModelForCausalLM\n# from quip import QUIP\nfrom qlinear import QuantLinear\n# from codebook import codebook_id\nfrom codebook.e8p12_rvq4 import E8P12RVQ4B_codebook\nfrom accelerate import (\n    Accelerator,\n    cpu_offload_with_hook,\n    init_empty_weights,\n    load_checkpoint_and_dispatch\n)\nfrom quantizer import QuipQuantizer\n\nquip_quant_config = {\n    "quant_method": "QUiP",\n    "rescale_WH": False,\n    "use_rand": True,\n    "codebook": "E8P12RVQ4B",\n    "codesz": 8,\n    "idx_dtype": "torch.int32",\n    "me

In [23]:
next(iter(weight_generator))

Loading pt checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


  state = torch.load(bin_file, map_location="cpu")


('model.embed_tokens.weight',
 tensor([[ 0.0011,  0.0056, -0.0034,  ...,  0.0041, -0.0028, -0.0007],
         [-0.0037,  0.0010, -0.0018,  ...,  0.0015, -0.0023, -0.0014],
         [ 0.0014, -0.0170,  0.0032,  ...,  0.0030,  0.0095,  0.0049],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000, -0.0000],
         [ 0.0000, -0.0000, -0.0000,  ...,  0.0000,  0.0000, -0.0000],
         [-0.0000, -0.0000,  0.0000,  ...,  0.0000, -0.0000, -0.0000]]))