# Merge the base model with the LoRA adapter into a GGUF file that can run on a CPU

### Merging

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Hub identifiers: the original base model and the LoRA adapter trained on top of it.
base_model_path = "unsloth/Llama-3.2-3B-Instruct"  # Original model (e.g., LLaMA)
adapter_model_path = "ID2223-Lab/llama_lora_adapter"  # LoRA adapter files (e.g., adapter_model.safetensors)

# Local directory that receives the merged weights and the tokenizer files.
merged_model_dir = "merged_model"

# Fetch the tokenizer and the base weights.
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
base_model = AutoModelForCausalLM.from_pretrained(base_model_path)

# Attach the adapter to the base model, then fold its weights in so that a
# plain (adapter-free) model object is left in `model`.
model = PeftModel.from_pretrained(base_model, adapter_model_path).merge_and_unload()

# Persist the merged model first, then the tokenizer next to it. The tokenizer
# call stays last so its return value is what the cell displays.
model.save_pretrained(merged_model_dir)
tokenizer.save_pretrained(merged_model_dir)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/928 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/738 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

('merged_model/tokenizer_config.json',
 'merged_model/special_tokens_map.json',
 'merged_model/tokenizer.json')

In [None]:
!git clone https://github.com/ggerganov/llama.cpp.git

Cloning into 'llama.cpp'...
remote: Enumerating objects: 38883, done.[K
remote: Counting objects: 100% (24123/24123), done.[K
remote: Compressing objects: 100% (1218/1218), done.[K
remote: Total 38883 (delta 23337), reused 22931 (delta 22904), pack-reused 14760 (from 1)[K
Receiving objects: 100% (38883/38883), 56.74 MiB | 16.54 MiB/s, done.
Resolving deltas: 100% (28901/28901), done.


In [None]:
!pip install -r llama.cpp/requirements.txt

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu
Collecting gguf>=0.1.0 (from -r llama.cpp/./requirements/requirements-convert_legacy_llama.txt (line 4))
  Downloading gguf-0.10.0-py3-none-any.whl.metadata (3.5 kB)
Collecting torch~=2.2.1 (from -r llama.cpp/./requirements/requirements-convert_hf_to_gguf.txt (line 3))
  Downloading https://download.pytorch.org/whl/cpu/torch-2.2.2%2Bcpu-cp310-cp310-linux_x86_64.whl (186.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m186.8/186.8 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Downloading gguf-0.10.0-py3-none-any.whl (71 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.6/71.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gguf, torch
  Attempting uninstall: torch
    Found existing installation: torch 2.5.1+cu121
    Un

In [None]:
# Convert the Hugging Face checkpoint stored in `merged_model/` into a single
# GGUF file named FineTune_Llama.gguf, requesting the q8_0 output type.
!python llama.cpp/convert_hf_to_gguf.py merged_model --outfile FineTune_Llama.gguf --outtype q8_0

INFO:hf-to-gguf:Loading model: merged_model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00003.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.float32 --> Q8_0, shape = {3072, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.float32 --> F32, shape = {3072}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.float32 --> Q8_0, shape = {8192, 3072}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.float32 --> Q8_0, shape = {3072, 8192}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.float32 --> Q8_0, shape = {3072, 8192}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.float32 --> F32, shape = {3072}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.float32 --> Q8_0, shape = {3072, 

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hardcode an access token in a notebook: anyone who can read the file
# (or its version history) can use it. Any token that was ever committed here
# must be revoked at https://huggingface.co/settings/tokens.
# The token is taken from the HF_TOKEN environment variable when it is set;
# otherwise it is requested interactively without being echoed or stored.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

login(token=hf_token)

In [None]:
from huggingface_hub import HfApi, upload_file

# The GGUF file keeps the same name locally and inside the Hub repository.
gguf_filename = "FineTune_Llama.gguf"

# Client object used to talk to the Hugging Face Hub.
api = HfApi()

# Push the converted model to the model repository; the returned commit
# information is displayed as the cell's output.
api.upload_file(
    path_or_fileobj=gguf_filename,
    path_in_repo=gguf_filename,
    repo_id="ID2223-Lab/llama_lora_merged_GGUF",
    repo_type="model",
)

FineTune_Llama.gguf:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ID2223-Lab/llama_lora_merged_GGUF/commit/5a051aca1e68a75698deac7deeb389f9331ffa65', commit_message='Upload FineTune_Llama.gguf with huggingface_hub', commit_description='', oid='5a051aca1e68a75698deac7deeb389f9331ffa65', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ID2223-Lab/llama_lora_merged_GGUF', endpoint='https://huggingface.co', repo_type='model', repo_id='ID2223-Lab/llama_lora_merged_GGUF'), pr_revision=None, pr_num=None)