# Installs

In [None]:
# Fetch the llama.cpp sources into ./llama.cpp; later cells build it and run its tools from this checkout.
# NOTE(review): no branch/commit is pinned, so the script and binary names used below depend on whatever HEAD is at clone time.
!git clone https://github.com/ggerganov/llama.cpp

In [None]:
!cd llama.cpp && LLAMA_CUBLAS=1 make && pip install -r requirements/requirements-convert-hf-to-gguf.txt

In [None]:
# NOTE(review): the llama.cpp build with LLAMA_CUBLAS=1 runs in an earlier cell, before this CUDA install — confirm the intended cell order.
# Refresh the apt package index.
!apt-get update;
# Download the CUDA 10.0 local-installer repo package (ubuntu1604 build) and save it with a .deb suffix.
!wget https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda-repo-ubuntu1604-10-0-local-10.0.130-410.48_1.0-1_amd64 -O cuda-repo-ubuntu1604-10-0-local-10.0.130-410.48_1.0-1_amd64.deb
# Install the repo package and register the public key it ships under /var/cuda-repo-10-0-local.
!dpkg -i cuda-repo-ubuntu1604-10-0-local-10.0.130-410.48_1.0-1_amd64.deb
!apt-key add /var/cuda-repo-10-0-local/7fa2af80.pub
# Re-read the package index so packages from the newly added repo are visible.
!apt-get update
# Install GCC/G++ 7, then the `cuda` package.
!apt-get -y install gcc-7 g++-7
!apt-get -y install cuda

import os

# Each `!` line runs in its own throwaway subshell, so `!export ...` never reaches
# the kernel or any later cell. Setting os.environ changes the kernel's environment,
# which subsequent `!` commands inherit.
for _var, _cuda_dir in (
    ("PATH", "/usr/local/cuda/bin"),
    ("LD_LIBRARY_PATH", "/usr/local/cuda/lib64"),
):
    _current = os.environ.get(_var, "")
    # Prepend the CUDA directory; only add the ':' separator when a value already exists
    # (same result as the shell's ${VAR:+:${VAR}} expansion).
    os.environ[_var] = f"{_cuda_dir}:{_current}" if _current else _cuda_dir

# Download Model from HF

##### To prepare the model for conversion, it must first be saved with `save_pretrained` (for a pre-trained model) or merged with its checkpoint.

In [26]:
from huggingface_hub import snapshot_download

# Hub repo id of the fine-tuned model that gets downloaded.
model_name = "Sparcos/TinyLlama_v1.1-qlora-finetunined-UFV"
# Local directory the snapshot is downloaded into; the same path is passed to the GGUF converter.
base_model = "./original_model/"
# Local directory for the GGUF outputs; the trailing slash matters because file names are appended directly.
quantized_path = "./quantized_model/"

In [41]:
# Download the files of `model_name` from the Hub into the `base_model` directory.
# `local_dir_use_symlinks` is no longer passed: it is deprecated, and the previous run
# of this cell printed the corresponding warning.
snapshot_download(repo_id=model_name, local_dir=base_model)

# Path of the FP16 GGUF file written by the conversion step. `quantized_path` already
# ends with "/", so the file name is appended directly (same construction as the
# quantize cell) instead of inserting a second slash.
original_model = quantized_path + "FP16.gguf"

For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/703 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

In [42]:
!mkdir ./quantized_model/

mkdir: cannot create directory ‘./quantized_model/’: File exists


# Convert Model to GGUF

In [43]:
# Convert the downloaded model in ./original_model/ to a single GGUF file with --outtype f16,
# written to ./quantized_model/FP16.gguf (the input of the quantize step).
!python llama.cpp/convert-hf-to-gguf.py ./original_model/ --outtype f16 --outfile ./quantized_model/FP16.gguf

INFO:hf-to-gguf:Loading model: original_model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Set model parameters
INFO:hf-to-gguf:gguf: context length = 2048
INFO:hf-to-gguf:gguf: embedding length = 2048
INFO:hf-to-gguf:gguf: feed forward length = 5632
INFO:hf-to-gguf:gguf: head count = 32
INFO:hf-to-gguf:gguf: key-value head count = 4
INFO:hf-to-gguf:gguf: rope theta = 10000.0
INFO:hf-to-gguf:gguf: rms norm epsilon = 1e-05
INFO:hf-to-gguf:gguf: file type = 1
INFO:hf-to-gguf:Set model tokenizer
INFO:gguf.vocab:Setting special token type bos to 1
INFO:gguf.vocab:Setting special token type eos to 2
INFO:gguf.vocab:Setting special token type unk to 0
INFO:gguf.vocab:Setting special token type pad to 2
INFO:gguf.vocab:Setting add_bos_token to True
INFO:gguf.vocab:Setting add_eos_token to False
INFO:hf-to-gguf:Exporting model to 'quantized_model/FP16.gguf'
INFO:hf-to-gguf:gguf: loading model part 'model.safetensors'
INFO:hf-to-gguf:output.weight,       

In [44]:
import os

# Quantize Models

In [55]:
import subprocess

# Quantization types to produce; each one yields "<quantized_path><TYPE>.gguf".
methods = ["Q4_K"]

for m in methods:
    # Output file for this quantization type.
    qtype = f"{quantized_path}{m.upper()}.gguf"
    # Run the llama.cpp quantize tool on the FP16 file. Arguments are passed as a list
    # (no shell string building) and check=True raises CalledProcessError on a non-zero
    # exit status instead of silently continuing as os.system did.
    subprocess.run(
        ["./llama.cpp/quantize", f"{quantized_path}FP16.gguf", qtype, m],
        check=True,
    )

# Push GGUF Model to HF

In [46]:
from huggingface_hub import notebook_login
# Show the Hugging Face login widget; the repo creation and upload cells below need an authenticated session.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [47]:
from huggingface_hub import HfApi, HfFolder, create_repo, upload_file

In [53]:
repo_name = "Sparcos/TinyLlama_v1.1-qlora-finetunined-UFV_GGUF"  # Desired HF Hub repository name
# Create the public target repo. exist_ok=True keeps the cell re-runnable: without it,
# a second run fails because the repo already exists.
repo_url = create_repo(repo_name, private=False, exist_ok=True)

In [57]:
# Client used for the uploads below.
api = HfApi()

# Files to publish: "path" is the local file, "repo_path" its name inside the Hub repo.
# NOTE(review): only the FP16 file is listed; the Q4_K file produced by the quantize
# cell is not uploaded — confirm this is intended.
models = [
    {"path": "./quantized_model/FP16.gguf", "repo_path": "FP16.gguf"},
]

# Upload every listed file to the model repo named by `repo_name`.
for archive in models:
    local_file, remote_name = archive["path"], archive["repo_path"]
    api.upload_file(
        path_or_fileobj=local_file,
        path_in_repo=remote_name,
        repo_id=repo_name,
        repo_type="model",
    )