# Fine-Tuning Llama 3 and Using It Locally

https://www.datacamp.com/tutorial/llama3-fine-tuning-locally

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
%%capture
# Upgrade the libraries used below: model/dataset loading, LoRA + 4-bit
# quantization, the SFT trainer, and Weights & Biases logging.
%pip install -U transformers
%pip install -U datasets
%pip install -U accelerate
%pip install -U peft
%pip install -U trl
%pip install -U bitsandbytes
%pip install -U wandb

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [None]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
import wandb
# Kaggle secrets client: API tokens are read from the secret store instead of
# being written into the notebook.
user_secrets = UserSecretsClient()

# Log in to the Hugging Face Hub with the token stored under "HF_TOKEN_LLAMA3.1".
hf_token = user_secrets.get_secret("HF_TOKEN_LLAMA3.1")
login(token = hf_token)

# Log in to Weights & Biases and open a run for this training job.
wb_token = user_secrets.get_secret("wandb")
wandb.login(key=wb_token)
# NOTE(review): the project name mentions a medical dataset, but the dataset
# configured later in this notebook is 'Adun/katunyou01' — confirm the name.
run = wandb.init(
    project='Fine-tune Llama 3 8B on Medical Dataset',
    job_type="training",
    anonymous="allow"
)

In [None]:
# Numeric precision and attention implementation used when loading the model.
torch_dtype = torch.float16
attn_implementation = "eager"

In [None]:
# Identifier of the base checkpoint passed to from_pretrained().
base_model = 'meta-llama/Meta-Llama-3.1-8B-Instruct'

# Identifier of the training dataset passed to load_dataset().
dataset_name = 'Adun/katunyou01'

# Output directory for the trained LoRA adapter (under /kaggle/working).
lora_model = "/kaggle/working/TrainModel/Meta-Llama-3.1-8B-Instruct-katunyou01"

In [None]:
# QLoRA config
# 4-bit NF4 quantization with double quantization enabled; the compute dtype
# is the notebook-wide `torch_dtype`.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
# The base checkpoint is loaded with `bnb_config` applied; device placement is
# delegated to the library via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer        = AutoTokenizer.from_pretrained(base_model)
# NOTE(review): setup_chat_format comes from TRL and returns a (model,
# tokenizer) pair; presumably it installs its own chat template / special
# tokens on top of the Instruct checkpoint's — confirm this override is
# intended. The merge section of this notebook repeats the same call before
# loading the adapter.
model, tokenizer = setup_chat_format(model, tokenizer)

In [None]:
# LoRA adapter settings: rank 8, alpha 16, 5% dropout, no bias terms,
# attached to the seven projection layers listed in `target_modules`.
# NOTE(review): prepare_model_for_kbit_training is imported at the top of the
# notebook but never called on the 4-bit model — confirm that is intentional.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        'up_proj',
        'down_proj',
        'gate_proj',
        'k_proj',
        'q_proj',
        'v_proj',
        'o_proj',
    ],
)

# Wrap the loaded model with the adapter and report the trainable parameters.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [None]:
# Load every split of the dataset as a single Dataset object.
dataset = load_dataset(dataset_name, split="all")

# Shuffle deterministically and keep at most 247 rows. The count is clamped to
# the dataset size so select() cannot fail with an out-of-range index when the
# dataset holds fewer rows.
sample_size = min(247, len(dataset))
dataset = dataset.shuffle(seed=42).select(range(sample_size))

def format_chat_template(row):
    """Render one dataset row as a chat transcript and store it in ``row["text"]``.

    The row's ``"input"`` becomes the user turn and its ``"output"`` the
    assistant turn; the notebook-level ``tokenizer`` formats the two turns
    into a single untokenized string. The same ``row`` mapping is returned
    with the new ``"text"`` key added.
    """
    conversation = [
        {"role": "user", "content": row["input"]},
        {"role": "assistant", "content": row["output"]},
    ]
    row["text"] = tokenizer.apply_chat_template(conversation, tokenize=False)
    return row

# Add the rendered "text" column to every row, using 4 worker processes.
dataset = dataset.map(format_chat_template, num_proc=4)

# Display one formatted sample (row index 3) as a sanity check.
dataset['text'][3]

In [None]:
# Hold out 10% of the rows for evaluation. The split is seeded, like the
# shuffle that precedes it, so the same rows land in the eval set on every run
# and evaluation losses stay comparable between runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
dataset

In [None]:
# Hyper-parameters for the fine-tuning run; `lora_model` is used as output_dir.
# NOTE(review): newer transformers releases name the `evaluation_strategy`
# argument `eval_strategy` — confirm against the installed version, since the
# install cell upgrades to the latest release.
training_arguments = TrainingArguments(
    output_dir=lora_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,  # with a per-device batch of 1, gradients are accumulated over 2 batches
    optim="paged_adamw_32bit",
    num_train_epochs=10,
    evaluation_strategy="steps",
    eval_steps=0.2,  # NOTE(review): a float below 1 is presumably read as a fraction of total training steps — confirm
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,  # both mixed-precision training flags are switched off
    bf16=False,
    group_by_length=True
)
# report_to="wandb"

In [None]:
# Build the supervised fine-tuning trainer over the "text" column of the
# train/test splits, without sequence packing.
# NOTE(review): `model` was already wrapped by get_peft_model() in the LoRA
# cell and the same `peft_config` is passed again here — confirm the installed
# TRL version does not try to apply the adapter a second time.
# NOTE(review): newer TRL releases presumably expect max_seq_length,
# dataset_text_field and packing on an SFTConfig rather than on the trainer —
# confirm against the installed version.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)

In [None]:
# Run the fine-tuning loop with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Close the Weights & Biases run opened during setup.
wandb.finish()
# Switch `use_cache` on in the model config before the generation test below.
model.config.use_cache = True

In [None]:
# Quick generation test with the fine-tuned model.
torch.cuda.empty_cache()
messages = [
    {
        "role": "user",
        "content": "คุณเป็นใคร"
    }
]

# Render the conversation with the generation prompt appended so the model
# continues with the assistant turn.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")
outputs = model.generate(**inputs, max_length=512, num_return_sequences=1)

# Decode only the tokens produced after the prompt. Splitting the full decoded
# text on the word "assistant" raised IndexError when the word was missing and
# cut the reply short whenever the reply itself contained that word.
prompt_length = inputs["input_ids"].shape[1]
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(text)

# Save model

In [None]:
# Write the trained model held by the trainer to the `lora_model` directory.
trainer.model.save_pretrained(lora_model)
# trainer.model.push_to_hub(lora_model, use_temp_dir=False)

In [None]:
# Archive the saved adapter directory into a single zip under /kaggle/working.
!zip -r /kaggle/working/Meta-Llama-3.1-8B-Instruct-katunyou01.zip /kaggle/working/TrainModel/Meta-Llama-3.1-8B-Instruct-katunyou01

Download the zipped model archive created above.

# Merging Llama 3 + LoRA

In [None]:
%%capture
# Upgrade the libraries needed to reload the base model and merge the adapter.
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U accelerate
%pip install -U peft
%pip install -U trl

In [None]:
# Base checkpoint the adapter was trained on.
base_model = 'meta-llama/Meta-Llama-3.1-8B-Instruct'

# Directory of the trained adapter checkpoint, mounted as a Kaggle input.
# NOTE(review): checkpoint-1110 is presumably the last step of the training
# section's run — confirm before merging.
lora_model = "/kaggle/input/llama3.1-lora-adaptor-katunyouai/transformers/default/1/checkpoint-1110"

In [None]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
# Kaggle secrets client used to fetch API tokens in this section.
user_secrets = UserSecretsClient()

# Log in to the Hugging Face Hub with the token stored under "HF_TOKEN_LLAMA3.1".
hf_token = user_secrets.get_secret("HF_TOKEN_LLAMA3.1")
login(token = hf_token)

In [None]:
import torch
from peft import PeftModel
from trl import setup_chat_format
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig

# Precision and attention settings, mirroring the training section.
torch_dtype = torch.float16
attn_implementation = "eager"

# Tokenizer of the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Base weights in half precision; no quantization config is passed here.
print("load base model")
base_model_reload = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch_dtype,
    device_map="auto",
    trust_remote_code=True,
)

# Repeat the chat-format setup that was applied before training (presumably so
# the tokenizer/embedding layout matches what the adapter expects — confirm).
print("setup chat format")
base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)

# Attach the adapter from `lora_model`, then merge it into the base model and
# drop the adapter wrapper.
print("load lora")
model = PeftModel.from_pretrained(base_model_reload, lora_model)
print("merge")
model = model.merge_and_unload()

In [None]:
# model.save_pretrained("JuniorThap/Meta-Llama-3.1-8B-Instruct-QLoRA-KatuntyouAI-v1")
# tokenizer.save_pretrained("JuniorThap/Meta-Llama-3.1-8B-Instruct-QLoRA-KatuntyouAI-v1")

In [None]:
# from huggingface_hub import login
# login(token=user_secrets.get_secret("HF_TOKEN"))
# model.push_to_hub("JuniorThap/Meta-Llama-3.1-8B-Instruct-QLoRA-KatuntyouAI-v1")
# tokenizer.push_to_hub("JuniorThap/Meta-Llama-3.1-8B-Instruct-QLoRA-KatuntyouAI-v1")

In [None]:
from huggingface_hub import HfApi
from huggingface_hub import login

# Log in with the token stored under "HF_TOKEN" (a different secret from the
# "HF_TOKEN_LLAMA3.1" one used earlier in this section).
login(token=user_secrets.get_secret("HF_TOKEN"))

# Upload a local folder to the Hub repository with the matching name.
# NOTE(review): the folder must already exist on disk — the save_pretrained
# calls that would write the merged model and tokenizer there are commented
# out in an earlier cell. Confirm they were run before this upload.
api = HfApi()
api.upload_folder(folder_path="/kaggle/working/JuniorThap/Meta-Llama-3.1-8B-Instruct-QLoRA-KatuntyouAI-v1", repo_id="JuniorThap/Meta-Llama-3.1-8B-Instruct-QLoRA-KatuntyouAI-v1")

In [None]:
# Single-turn test conversation for the merged model.
messages = [
    {"role": "user", "content": "คุณคือ?"},
]

# Render the conversation with the generation prompt appended.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Text-generation pipeline around the merged model and its tokenizer.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Sample up to 128 new tokens and print the generated text.
outputs = pipe(
    prompt,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
)
print(outputs[0]["generated_text"])

# Converting the HF Model to Llama.cpp GGUF

In [None]:
# Fetch the llama.cpp sources into /content; the conversion cells below call
# its scripts through the relative path llama.cpp/.
%cd /content
!git clone https://github.com/ggerganov/llama.cpp

### Convert GGUF FP16

In [None]:
# Convert the Hugging Face model directory on Google Drive to a GGUF file with
# f16 output type, written next to the source model.
!python llama.cpp/convert_hf_to_gguf.py /content/drive/MyDrive/BaseModel/llama-3-typhoon-v1.5x-8b-instruct \
  --outfile /content/drive/MyDrive/BaseModel/llama-3-typhoon-v1.5x-8b-instruct/llama-3-typhoon-v1.5x-8b-instruct-gguf-fp16.gguf \
  --outtype f16

### Convert GGUF Q8_0

In [None]:
!python llama.cpp/convert_hf_to_gguf.py https://huggingface.co/scb10x/llama-3-typhoon-v1.5x-8b-instruct \
  --outfile llama-3-typhoon-v1.5x-8b-instruct-gguf-q8.gguf \
  --outtype q8_0

In [None]:
# %cd /content
# !git clone --depth=1 https://github.com/ggerganov/llama.cpp.git
# Build llama.cpp with CUDA enabled from inside the checkout.
%cd llama.cpp
# Patch the Makefile so the linker also searches /usr/local/nvidia/lib64 when
# resolving -lcuda.
!sed -i 's|MK_LDFLAGS   += -lcuda|MK_LDFLAGS   += -L/usr/local/nvidia/lib64 -lcuda|' Makefile
# Standard output of the build is discarded; anything on stderr still shows.
# NOTE(review): this relies on the checkout still shipping a Makefile with the
# LLAMA_CUDA switch — confirm against the cloned llama.cpp revision.
!LLAMA_CUDA=1 make -j > /dev/null

In [None]:
!python convert.py /kaggle/input/fine-tuned-adapter-to-full-model/llama-3-8b-instruct-pantip/ \
    --outfile /kaggle/working/llama-3-8b-instruct-pantip-f16.gguf \
    --outtype f16 \
    --vocab-type bpe

# Quantizing the GGUF Q4 model

In [None]:
# Quantize an FP16 GGUF file to Q4_K_M with the llama-quantize binary; the
# output file is written to the current working directory.
# NOTE(review): the input here is a 70B FP16 GGUF on Google Drive, which no
# cell in this notebook produces (the conversion cells target the 8B model) —
# confirm the intended input file.
!/content/llama.cpp/llama-quantize \
/content/drive/MyDrive/BaseModel/llama-3-typhoon-v1.5x-70b-instruct/llama-3-typhoon-v1.5x-70b-instruct-gguf-fp16.gguf \
llama-3-typhoon1.5x-70-instruct-Q4_K_M.gguf \
Q4_K_M

In [None]:
# %cd /kaggle/working/
# !./llama.cpp/quantize /kaggle/input/hf-llm-to-gguf/llama-3-8b-chat-doctor.gguf llama-3-8b-chat-doctor-Q4_K_M.gguf Q4_K_M

### Push to Huggingface

In [None]:
import os

from huggingface_hub import login, HfApi

# Read the access token from the HF_TOKEN environment variable instead of
# hard-coding it in the notebook (an empty string is rejected by login()).
# When the variable is unset or empty, token=None makes login() ask for the
# token interactively.
hf_token = os.environ.get("HF_TOKEN") or None
login(token=hf_token)

# Upload one local file to the model repository under a .gguf file name.
# NOTE(review): the local path below has no .gguf extension and does not match
# the output name of any quantize cell in this notebook — confirm it points at
# the quantized file before running.
api = HfApi()
api.upload_file(
    path_or_fileobj="llama-3-typhoon-v1.5x-8b-instruct-pantip",
    path_in_repo="llama-3-typhoon-v1.5x-8b-instruct-pantip-Q4_K_M.gguf",
    repo_id="Adun/llama-3-typhoon-v1.5x-8b-instruct-gguf",
    repo_type="model",
)
