# Phase 3: Optimization (CPU Version)

This notebook performs the following steps:
1.  **Setup**: Install dependencies and build `llama.cpp`.
2.  **Merge**: Locate the SFT and DPO LoRA adapters in the attached Kaggle dataset and merge each one with the Base Model (`Qwen2.5-Coder-0.5B-Instruct`).
3.  **Convert**: Convert the merged model to GGUF format.
4.  **Quantize**: Quantize the model to 4-bit (`q4_k_m`).
5.  **Benchmark**: Test performance using `llama-bench`.

**Hardware**: CPU (GPU is not required for this notebook).

## 1. Setup Environment

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

!pip uninstall -y -q onnx tensorflow-metadata bigframes opentelemetry-proto

!pip install -q -U transformers huggingface_hub sentencepiece
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q protobuf==3.20.3

print("SETUP COMPLETE.")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m91.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m60.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m100.5 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
# Clone and build llama.cpp using CMake
!git clone https://github.com/ggerganov/llama.cpp
%cd llama.cpp
!cmake -B build
!cmake --build build --config Release -j$(nproc)
%cd ..

Cloning into 'llama.cpp'...
remote: Enumerating objects: 72473, done.[K
remote: Counting objects: 100% (399/399), done.[K
remote: Compressing objects: 100% (307/307), done.[K
remote: Total 72473 (delta 263), reused 92 (delta 92), pack-reused 72074 (from 3)[K
Receiving objects: 100% (72473/72473), 242.47 MiB | 33.18 MiB/s, done.
Resolving deltas: 100% (52308/52308), done.
/kaggle/working/llama.cpp
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
[0mCMAKE_BUILD_TYPE=Release[0m
-- Found Git: /usr/bin/git (found version "2.34.1")
-- The

## 2. Prepare Data
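Upload the trained adapters to Kaggle as a dataset and attach it to this notebook. The cell below searches the dataset for the SFT and DPO adapter folders and copies them into the working directory.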

In [2]:
import os
import zipfile
import shutil
# Paths and names used by the adapter lookup, merge and conversion steps.
INPUT_PATH = "/kaggle/input/ai-auto"  # Kaggle dataset path
SFT_ADAPTER_NAME = "final_model"           # SFT adapter folder name
DPO_ADAPTER_NAME = "dpo_final_model"       # DPO adapter folder name
SFT_ADAPTER_DIR = "sft_adapter"            # working copy of the SFT adapter
DPO_ADAPTER_DIR = "dpo_adapter"            # working copy of the DPO adapter
SFT_MERGED_DIR = "sft_merged_model"        # output folder for base + SFT adapter
DPO_MERGED_DIR = "dpo_merged_model"        # output folder for base + DPO adapter
BASE_MODEL_ID = "Qwen/Qwen2.5-Coder-0.5B-Instruct"  # model id given to from_pretrained
def find_and_copy_adapter(input_path, adapter_name, output_dir):
    """Find an adapter folder under ``input_path`` and copy it to ``output_dir``.

    A folder qualifies when it contains ``adapter_config.json`` and either

    * it is a directory named exactly ``adapter_name``, or
    * ``adapter_name`` is one of the components of its path (for example the
      config sits in a sub-folder of the adapter directory).

    The name is compared against whole path components, never as a substring,
    so searching for ``final_model`` cannot pick up ``dpo_final_model``.

    Args:
        input_path: Root directory to search.
        adapter_name: Name of the adapter folder to look for.
        output_dir: Destination directory; replaced if it already exists.

    Returns:
        True if an adapter was found and copied, False otherwise (including
        when ``input_path`` is not a directory).
    """
    def _copy_to_output(src):
        # Replace any previous copy so the destination mirrors `src` exactly.
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)
        shutil.copytree(src, output_dir)
        print(f"✓ Found {adapter_name} -> {output_dir}")
        return True

    if not os.path.isdir(input_path):
        return False

    for root, dirs, files in os.walk(input_path):
        # Sort in place so the traversal order (and therefore which match wins)
        # does not depend on the filesystem's listing order.
        dirs.sort()

        if adapter_name in dirs:
            src = os.path.join(root, adapter_name)
            if os.path.exists(os.path.join(src, "adapter_config.json")):
                return _copy_to_output(src)

        # Compare against whole path components instead of `adapter_name in root`.
        path_parts = os.path.normpath(root).split(os.sep)
        if "adapter_config.json" in files and adapter_name in path_parts:
            return _copy_to_output(root)

    return False
# Copy each adapter out of the dataset, reporting a miss as soon as it happens.
sft_found = find_and_copy_adapter(
    input_path=INPUT_PATH,
    adapter_name=SFT_ADAPTER_NAME,
    output_dir=SFT_ADAPTER_DIR,
)
if not sft_found:
    print(f"SFT adapter '{SFT_ADAPTER_NAME}' not found")

dpo_found = find_and_copy_adapter(
    input_path=INPUT_PATH,
    adapter_name=DPO_ADAPTER_NAME,
    output_dir=DPO_ADAPTER_DIR,
)
if not dpo_found:
    print(f"DPO adapter '{DPO_ADAPTER_NAME}' not found")

# Final status summary: a blank line, then one line per adapter.
print(f"\nSFT adapter: {'✓' if sft_found else '✗'}")
print(f"DPO adapter: {'✓' if dpo_found else '✗'}")

✓ Found final_model -> sft_adapter
✓ Found dpo_final_model -> dpo_adapter

SFT adapter: ✓
DPO adapter: ✓


## 3. Merge Model (Base + Adapter)

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
import gc
def merge_lora_to_base(base_model_id, adapter_dir, output_dir, model_name="Model"):
    """Fold a LoRA adapter into its base model and save the merged result.

    The base model and tokenizer are loaded from ``base_model_id``, the adapter
    stored in ``adapter_dir`` is applied and merged, and the merged model plus
    tokenizer are written to ``output_dir``.

    Args:
        base_model_id: Identifier handed to ``from_pretrained`` for the base
            model and the tokenizer.
        adapter_dir: Folder holding the adapter to merge.
        output_dir: Folder the merged model and tokenizer are saved to.
        model_name: Label used in the progress messages.

    Returns:
        True once the merged model has been saved; False (after printing a
        notice) when ``adapter_dir`` does not exist.
    """
    # Guard clause: nothing to do without an adapter on disk.
    if not os.path.exists(adapter_dir):
        print(f"{model_name}: Adapter not found at {adapter_dir}")
        return False

    divider = "=" * 50
    print(f"\n{divider}")
    print(f"Processing: {model_name}")
    print(divider)

    print(f"Loading base model: {base_model_id}")
    load_options = {
        "torch_dtype": torch.float16,
        "device_map": "cpu",
        "trust_remote_code": True,
    }
    base_model = AutoModelForCausalLM.from_pretrained(base_model_id, **load_options)
    tokenizer = AutoTokenizer.from_pretrained(base_model_id)

    print(f"Loading adapter from: {adapter_dir}")
    peft_model = PeftModel.from_pretrained(base_model, adapter_dir)

    print("Merging...")
    merged_model = peft_model.merge_and_unload()

    print(f"Saving to: {output_dir}")
    merged_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Drop every reference to the model objects before collecting, so the
    # memory is released ahead of the next merge.
    del merged_model, peft_model, base_model
    gc.collect()

    print(f"{model_name} merged successfully")
    return True
# Merge every adapter that was copied into the working directory; adapters
# that are absent are skipped silently.
for adapter_dir, merged_dir, label in (
    (SFT_ADAPTER_DIR, SFT_MERGED_DIR, "SFT Model"),
    (DPO_ADAPTER_DIR, DPO_MERGED_DIR, "DPO Model"),
):
    if os.path.exists(adapter_dir):
        merge_lora_to_base(BASE_MODEL_ID, adapter_dir, merged_dir, label)

2025-12-17 09:48:03.883467: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765964884.105595      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765964884.167690      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'


Processing: SFT Model
Loading base model: Qwen/Qwen2.5-Coder-0.5B-Instruct


config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Loading adapter from: sft_adapter




Merging...
Saving to: sft_merged_model
SFT Model merged successfully

Processing: DPO Model
Loading base model: Qwen/Qwen2.5-Coder-0.5B-Instruct
Loading adapter from: dpo_adapter
Merging...
Saving to: dpo_merged_model
DPO Model merged successfully


## 4. Convert to GGUF and Quantize

In [5]:
!pip install -r llama.cpp/requirements.txt

def convert_and_quantize(merged_dir, output_name, quant_type="q4_k_m"):
    """Convert a merged model folder to GGUF and quantize it with llama.cpp.

    Runs ``llama.cpp/convert_hf_to_gguf.py`` to write a temporary f16 GGUF,
    then ``llama-quantize`` to produce ``{output_name}-{quant_type}.gguf``.
    The intermediate f16 file is removed before returning, whatever the
    outcome.

    Both steps are checked through their exit codes, so a quantized file left
    over from an earlier run is never reported as a fresh result.

    Args:
        merged_dir: Folder containing the merged model.
        output_name: Filename prefix for the generated GGUF files.
        quant_type: Quantization type passed to ``llama-quantize``.

    Returns:
        Path of the quantized GGUF file on success; None when ``merged_dir``
        is missing, the quantize binary cannot be found, or either step fails.
    """
    # Local imports: the notebook's import cells do not provide these modules.
    import subprocess
    import sys

    if not os.path.exists(merged_dir):
        print(f"{merged_dir} not found, skipping...")
        return None

    f16_path = f"{output_name}-f16.gguf"
    quantized_path = f"{output_name}-{quant_type}.gguf"

    print(f"\n{'='*50}")
    print(f"Converting: {merged_dir}")
    print(f"{'='*50}")

    # Prefer the CMake build output; fall back to a binary in the repo root.
    quantize_bin = "llama.cpp/build/bin/llama-quantize"
    if not os.path.exists(quantize_bin):
        quantize_bin = "llama.cpp/llama-quantize"

    try:
        # Argument lists (no shell) keep paths with spaces or special
        # characters intact; sys.executable is the kernel's own interpreter.
        convert = subprocess.run(
            [
                sys.executable,
                "llama.cpp/convert_hf_to_gguf.py",
                merged_dir,
                "--outfile", f16_path,
                "--outtype", "f16",
            ]
        )
        if convert.returncode != 0 or not os.path.exists(f16_path):
            print(f"✗ Conversion failed for {merged_dir} (exit code {convert.returncode})")
            return None

        if not os.path.exists(quantize_bin):
            print(f"✗ llama-quantize not found at {quantize_bin}")
            return None

        quantize = subprocess.run(
            [f"./{quantize_bin}", f16_path, quantized_path, quant_type]
        )
        if quantize.returncode != 0 or not os.path.exists(quantized_path):
            print(f"✗ Quantization failed for {f16_path} (exit code {quantize.returncode})")
            return None
    finally:
        # The f16 file is only an intermediate; never leave it behind.
        if os.path.exists(f16_path):
            os.remove(f16_path)

    size_mb = os.path.getsize(quantized_path) / (1024*1024)
    print(f"✓ Created: {quantized_path} ({size_mb:.1f} MB)")
    return quantized_path
# Convert and quantize both merged models; each result is a path or None.
sft_gguf = convert_and_quantize(SFT_MERGED_DIR, "qwen2.5-coder-sft")
dpo_gguf = convert_and_quantize(DPO_MERGED_DIR, "qwen2.5-coder-dpo")

# Summary: list only the GGUF files that were actually produced.
summary_rule = "=" * 50
print("\n" + summary_rule)
print("SUMMARY")
print(summary_rule)
for summary_label, summary_path in (("SFT", sft_gguf), ("DPO", dpo_gguf)):
    if summary_path:
        print(f"✓ {summary_label} GGUF: {summary_path}")

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/nightly, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/nightly, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/nightly
Ignoring torch: markers 'platform_machine == "s390x"' don't match your environment
Ignoring torch: markers 'platform_machine == "s390x"' don't match your environment
Collecting gguf>=0.1.0 (from -r llama.cpp/./requirements/requirements-convert_legacy_llama.txt (line 6))
  Downloading https://download.pytorch.org/whl/nightly/gguf-0.17.1-py3-none-any.whl.metadata (4.3 kB)
Collecting protobuf<5.0.0,>=4.21.0 (from -r llama.cpp/./requirements/requirements-convert_legacy_llama.txt (line 7))
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting aiohttp~=3.9.3 (from -r llama.cpp/./requirements/requirements-tool_bench.txt (line 1))
  Downloading https://download.pytorch.org

## 5. Benchmarking

In [7]:
# Run benchmark
bench_bin = "llama.cpp/build/bin/llama-bench"
cli_bin = "llama.cpp/build/bin/llama-cli"

# Check and build if missing
if not os.path.exists(bench_bin):
    if os.path.exists("llama.cpp/llama-bench"): bench_bin = "llama.cpp/llama-bench"
if not os.path.exists(cli_bin):
    if os.path.exists("llama.cpp/llama-cli"): cli_bin = "llama.cpp/llama-cli"
    elif os.path.exists("llama.cpp/main"): cli_bin = "llama.cpp/main"

print("=== THROUGHPUT BENCHMARK ===")
if os.path.exists(bench_bin):
    !./{bench_bin} -m {quantized_gguf} -n 128 -p 1024
else:
    print(f"ERROR: Could not find llama-bench at {bench_bin}")

print("\n=== LATENCY TEST (Single Request) ===")
if os.path.exists(cli_bin):
    !./{cli_bin} -m {quantized_gguf} -p "def fibonacci(n):" -n 32 -c 1024 -t 2
else:
    print(f"ERROR: Could not find llama-cli at {cli_bin}")

=== THROUGHPUT BENCHMARK ===
| model                          |       size |     params | backend    | threads |            test |                  t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | --------------: | -------------------: |
| qwen2 1B Q4_K - Medium         | 373.71 MiB |   494.03 M | CPU        |       2 |          pp1024 |         62.41 ± 0.08 |
| qwen2 1B Q4_K - Medium         | 373.71 MiB |   494.03 M | CPU        |       2 |           tg128 |         23.17 ± 0.09 |

build: 4d3726278 (7315)

=== LATENCY TEST (Single Request) ===
build: 7315 (4d3726278) with GNU 11.4.0 for Linux x86_64
main: llama backend init
main: load the model and apply lora adapter, if any
llama_model_loader: loaded meta data with 27 key-value pairs and 290 tensors from qwen2.5-coder-0.5b-merged-q4_k_m.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0

In [9]:
from IPython.display import FileLink

print("Click to download:")
# Link the quantized files produced by the conversion cell. The intermediate
# f16 GGUF is deleted during conversion, so there is nothing to link for it.
for gguf_path in (sft_gguf, dpo_gguf):
    if gguf_path and os.path.exists(gguf_path):
        display(FileLink(gguf_path))

Click to download:
