In [1]:
!pip install --upgrade pip --quiet

[0m

In [2]:
!pip install transformers peft accelerate huggingface_hub torc sentencepiece --quiet

[0m

In [3]:
# Authenticate this kernel with the Hugging Face Hub through the interactive login widget.
# NOTE(review): presumably needed because the base model downloaded later requires
# an authenticated account — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Locations used by every step of this cell.
adapter_path = "llama3-fingpt-lora"            # local directory holding the LoRA adapter
base_model_name = "meta-llama/Meta-Llama-3-8B"  # Hub id of the base checkpoint
save_path = "merged_fingpt-llama3"             # output directory for the merged model

# Step 1 - read the adapter's configuration and show it, so the base model it
# was trained against can be checked by eye before the large download starts.
peft_config = PeftConfig.from_pretrained(adapter_path)
print(f"Adapter config: {peft_config}")

# Step 2 - load the base model in half precision and let accelerate place the
# weights on the available devices.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    low_cpu_mem_usage=True,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Step 3 - wrap the base model with the adapter weights. The configuration read
# in step 1 is passed explicitly, and the adapter is loaded for inference only.
model = PeftModel.from_pretrained(
    base_model,
    adapter_path,
    config=peft_config,
    is_trainable=False,
    device_map="auto",
)

# Step 4 - fold the LoRA weights into the base weights and drop the PEFT wrapper.
merged_model = model.merge_and_unload()

# Step 5 - write the merged weights, then the base model's tokenizer, into the
# same directory.
merged_model.save_pretrained(save_path)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.save_pretrained(save_path)

Adapter config: LoraConfig(task_type='CAUSAL_LM', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='meta-llama/Meta-Llama-3-8B', revision=None, inference_mode=True, r=16, target_modules={'q_proj', 'v_proj'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

SafetensorError: Error while deserializing header: MetadataIncompleteBuffer

In [5]:
import os
from safetensors import safe_open

adapter_path = "llama3-fingpt-lora"
required_files = ["adapter_model.safetensors", "adapter_config.json"]

# Presence check: report, for each expected adapter file, whether it exists
# inside the adapter directory.
for file in required_files:
    is_present = os.path.exists(os.path.join(adapter_path, file))
    print(f"✅ Found: {file}" if is_present else f"❌ Missing file: {file}")

# Integrity check: opening the weights file forces its header to be parsed.
# Any failure (including a missing file) is reported instead of raised, so the
# cell always runs to completion.
try:
    with safe_open(f"{adapter_path}/adapter_model.safetensors", framework="pt") as f:
        print(f"✔ File is valid. Contains {len(f.keys())} weight tensors")
except Exception as e:
    print(f"❌ Corrupted file: {str(e)}")

✅ Found: adapter_model.safetensors
✅ Found: adapter_config.json
✔ File is valid. Contains 256 weight tensors


In [6]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Identifiers shared by the steps below.
base_model_name = "meta-llama/Meta-Llama-3-8B"
adapter_path = "llama3-fingpt-lora"
save_path = "merged_fingpt-llama3"

# Base checkpoint in half precision, placed on devices automatically.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    low_cpu_mem_usage=True,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Attach the LoRA adapter from the local adapter directory.
model = PeftModel.from_pretrained(base_model, adapter_path, device_map="auto")

# Fold the adapter into the base weights and persist the result.
merged_model = model.merge_and_unload()
merged_model.save_pretrained(save_path)

# Store the base model's tokenizer next to the merged weights.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.save_pretrained(save_path)

print("✅ Merged model saved")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

✅ Merged model saved


In [None]:
!git clone https://github.com/ggerganov/llama.cpp

In [9]:
!pip install gguf

Collecting gguf
  Downloading gguf-0.14.0-py3-none-any.whl.metadata (3.7 kB)
Downloading gguf-0.14.0-py3-none-any.whl (76 kB)
Installing collected packages: gguf
Successfully installed gguf-0.14.0
[0m

In [10]:
# Smoke test: confirm the `gguf` package is importable from this kernel.
import gguf

In [14]:
!git clone https://github.com/ggml-org/llama.cpp.git

Cloning into 'llama.cpp'...
remote: Enumerating objects: 48781, done.[K
remote: Counting objects: 100% (182/182), done.[K
remote: Compressing objects: 100% (116/116), done.[K
remote: Total 48781 (delta 134), reused 66 (delta 66), pack-reused 48599 (from 3)[K
Receiving objects: 100% (48781/48781), 102.95 MiB | 22.12 MiB/s, done.
Resolving deltas: 100% (35121/35121), done.
Updating files: 100% (1284/1284), done.


In [1]:
!python3 convert_lora_to_gguf.py merged-fingpt-llama3 --outfile fingpt-llama3.gguf --outtype f16

python3: can't open file '/workspace/convert_lora_to_gguf.py': [Errno 2] No such file or directory


In [19]:
!cd ~/llama.cpp

/bin/bash: line 1: cd: /root/llama.cpp: No such file or directory


In [2]:
!python3 convert_lora_to_gguf.py merged-fingpt-llama3 --outfile fingpt-llama3.gguf --outtype f16

python3: can't open file '/workspace/convert_lora_to_gguf.py': [Errno 2] No such file or directory
