### Install & Set Up llama.cpp

In [None]:
# Fetch the llama.cpp sources into ./llama.cpp.
!git clone https://github.com/ggerganov/llama.cpp
# Update the checkout, remove previous build output, and rebuild with LLAMA_CUBLAS=1 set
# (presumably this enables the cuBLAS/CUDA backend -- confirm against the Makefile in use).
# NOTE(review): this builds whatever revision `git pull` lands on. Later cells call
# llama.cpp/convert.py and ./llama.cpp/quantize, so confirm the pulled revision still
# provides those entry points and still honours the LLAMA_CUBLAS make flag.
!cd llama.cpp && git pull && make clean && LLAMA_CUBLAS=1 make
# Install the Python packages listed in llama.cpp's requirements file.
!pip install -r llama.cpp/requirements.txt

In [None]:
# Restart the env and run
# Runs Git LFS's one-time `install` setup command.
!git lfs install

### Download the fine-tuned model for quantization

In [None]:
from huggingface_hub import snapshot_download

# Hub repository that holds the fine-tuned model to be quantized.
MODEL_ID = "Mr-TD/Llama-2-7b-hf-finetune-MOM_Summary-Points"
# Last path component of the repo id; doubles as the local folder name.
MODEL_NAME = MODEL_ID.rsplit("/", 1)[-1]

# Download the repository contents into ./<MODEL_NAME>.
snapshot_download(
    repo_id=MODEL_ID,
    local_dir=MODEL_NAME,
    local_dir_use_symlinks=False,
)

### The download can take a while. Once it’s done, we need to convert the weights to GGML FP16 format.

In [None]:
# Output path for the converted model: <model folder>/<lower-cased model name>.fp16.bin
fp16 = f"{MODEL_NAME}/{MODEL_NAME.lower()}.fp16.bin"
# Run llama.cpp's convert.py on the downloaded model folder with --outtype f16,
# writing the result to the path held in `fp16`.
!python llama.cpp/convert.py {MODEL_NAME} --outtype f16 --outfile {fp16}

#### Here is a list of all the possible quant methods and their corresponding use cases, based on model cards made by TheBloke:

q2_k: Uses Q4_K for the attention.wv and feed_forward.w2 tensors, Q2_K for the other tensors.<br>

q3_k_l: Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K<br>

q3_k_m: Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K<br>

q3_k_s: Uses Q3_K for all tensors<br>

q4_0: Original quant method, 4-bit.<br>

q4_1: Higher accuracy than q4_0 but not as high as q5_0. However, it has quicker inference than q5 models.<br>

q4_k_m: Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K<br>

q4_k_s: Uses Q4_K for all tensors<br>

q5_0: Higher accuracy, higher resource usage and slower inference.<br>

q5_1: Even higher accuracy, resource usage and slower inference.<br>

q5_k_m: Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K<br>

q5_k_s: Uses Q5_K for all tensors<br>

q6_k: Uses Q8_K for all tensors<br>

q8_0: Almost indistinguishable from float16. High resource use and slow. Not recommended for most users.<br>

### Finally, we can quantize the model using one or several methods. In this case, we will try all of the methods and test each one. This is the only step that actually requires a GPU.

In [None]:
# Quantization method to apply; change this to any of the method names listed above.
method = 'q2_k'
# Output path: <model folder>/<lower-cased model name>.<METHOD>.gguf
qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf"
# Run llama.cpp's quantize binary: input is the FP16 file from the conversion step,
# output is `qtype`, and the last argument selects the quantization method.
!./llama.cpp/quantize {fp16} {qtype} {method}

### Now we can push the quantized model to the Hub

In [None]:
from huggingface_hub import notebook_login
# Start the interactive Hugging Face login from inside the notebook
# (presumably required so the upload that follows can authenticate).
notebook_login()

In [None]:
from huggingface_hub import HfApi

api = HfApi()

# Local quantized file to publish, the Hub repo that receives it, and the
# file name it is stored under inside that repo.
local_gguf = "Llama-2-7b-hf-finetune-MOM_Summary-Points/llama-2-7b-hf-finetune-mom_summary-points.Q2_K.gguf"
target_repo = "Mr-TD/Llama-2-7b-hf-finetune-MOM"
remote_name = "Llama-2-7b-MOM_Summary.Q2_K.gguf"

api.upload_file(
    path_or_fileobj=local_gguf,
    repo_id=target_repo,
    path_in_repo=remote_name,
    repo_type="model",
)