In [7]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import BitsAndBytesConfig
from peft import PeftModel

# Load the 4-bit quantized base model, attach the fine-tuned adapter, and
# fold the adapter weights into the base so one standalone model can be saved.
adapter_id = "JoonseoHyeon/not_finished"
base_model = "meta-llama/Llama-3.2-3B"

# 4-bit NF4 quantization with double quantization.
# NOTE(review): the checkpoint config printed on load reports
# torch_dtype=bfloat16, while compute runs in float16 here — confirm intended.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# `trust_remote_code=True` was dropped from every loader below: it allows
# Python shipped inside a Hub repo to run locally. Llama is a built-in
# architecture and the tokenizer loads from a plain tokenizer.json, so no
# remote code is needed.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
)

# The tokenizer is taken from the adapter repo rather than from the base model.
tokenizer = AutoTokenizer.from_pretrained(adapter_id)

# `trust_remote_code` is not a documented argument of PeftModel.from_pretrained.
model = PeftModel.from_pretrained(model, adapter_id)

# NOTE(review): the adapter is merged into 4-bit weights; PEFT warns this can
# change generations because of rounding. Confirm the merged model's quality
# is acceptable before serving it.
model = model.merge_and_unload()

loading configuration file config.json from cache at /home/tako/.cache/huggingface/hub/models--meta-llama--Llama-3.2-3B/snapshots/13afe5124825b4f3751f836b40dafda64c1ed062/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.52.3",

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing LlamaForCausalLM.

All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Llama-3.2-3B.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /home/tako/.cache/huggingface/hub/models--meta-llama--Llama-3.2-3B/snapshots/13afe5124825b4f3751f836b40dafda64c1ed062/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": 128001,
  "temperature": 0.6,
  "top_p": 0.9
}

loading file tokenizer.json from cache at /home/tako/.cache/huggingface/hub/models--JoonseoHyeon--not_finished/snapshots/05eba1dcfd9d8e3de4c53278e129b97aecbafc5f/tokenizer.json
loading file tokenizer.model from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_ma

In [12]:
# Write the merged model and its tokenizer to the same directory so it can be
# loaded as a single checkpoint.
SAVE_DIR = "quantized_model"

model.save_pretrained(SAVE_DIR)
# Kept as the last expression so the cell displays the returned file paths.
tokenizer.save_pretrained(SAVE_DIR)

Configuration saved in quantized_model/config.json
Configuration saved in quantized_model/generation_config.json
Model weights saved in quantized_model/model.safetensors
chat template saved in quantized_model/chat_template.jinja
tokenizer config file saved in quantized_model/tokenizer_config.json
Special tokens file saved in quantized_model/special_tokens_map.json


('quantized_model/tokenizer_config.json',
 'quantized_model/special_tokens_map.json',
 'quantized_model/chat_template.jinja',
 'quantized_model/tokenizer.json')

In [1]:
from autogen_ext.models.openai import OpenAIChatCompletionClient
from autogen_core.models import ModelInfo

# Capability flags for the served model; the client is handed these
# explicitly via `model_info`.
local_model_info = ModelInfo(
    family="unknown",
    vision=False,
    function_calling=False,
    json_output=False,
    structured_output=True,
)

# Chat-completion client pointed at a server on localhost:8000.
# NOTE(review): the API key looks like a placeholder for a local server that
# does not check it — confirm before pointing this at a real endpoint.
model_client = OpenAIChatCompletionClient(
    model="rag/test/quantized_model/",
    base_url="http://localhost:8000/v1",
    api_key="s",
    model_info=local_model_info,
)

In [2]:
from autogen_agentchat.agents import AssistantAgent

# Single assistant agent backed by the local model client defined earlier.
system_prompt = "You are a helpful assistant."

assistant_agent = AssistantAgent(
    name="quantized_model",
    system_message=system_prompt,
    model_client=model_client,
)

In [None]:
messages = "medqa에 대해서 알려줘."

response = await assistant_agent.run(task=messages)
content = response.messages[-1].content
content