In [None]:
# Install the notebook's dependencies (Colab).
# Each package is installed exactly once; %pip is used so the install targets
# the environment of the running kernel.
#
# Base stack. trl is pinned here so it is not installed unpinned first and
# then downgraded by the --no-deps line below.
%pip install -q streamlit torch datasets huggingface_hub transformers "trl<0.9.0"
# Unsloth from source, with its Colab extras.
%pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Installed without dependency resolution so these packages do not pull in
# replacements for what the lines above (or the runtime image) already provide.
%pip install -q --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

Collecting streamlit
  Downloading streamlit-1.35.0-py2.py3-none-any.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Collecting trl
  Downloading trl-0.9.4-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.7/226.7 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6

In [None]:
# Interactive Hugging Face login: calling notebook_login() renders the
# credential widget that appears as this cell's output.
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import json
import os

import streamlit as st
import torch
from datasets import load_dataset
from huggingface_hub import login
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported

# ---------------------------------------------------------------------------
# User settings: every value a reader may want to tune lives in this section.
# ---------------------------------------------------------------------------

# Base model and loading options
base_model = "unsloth/mistral-7b-v0.3-bnb-4bit"
finetuned_model_info = "finetuned_model"
max_seq_length = 2048
load_in_4bit = True

# LoRA configuration
r = 16
lora_alpha = 16
lora_dropout = 0.0
bias = "none"

# Training dataset
dataset_info = "AnonY0324/orca-math-word-problems-200k"
split = "train"
input_field = "prompt"

# Optimisation hyper-parameters
batch_size = 2
gradient_accumulation_steps = 4
warmup_steps = 5
max_steps = 60
num_train_epochs = 1
learning_rate = 2e-4
logging_steps = 1
optim = "adamw_8bit"
weight_decay = 0.01
lr_scheduler_type = "linear"
seed = 3407
output_dir = "outputs"

# Hugging Face account.
# SECURITY: the access token must never be written into the notebook. It is
# read from the HF_TOKEN environment variable; when that variable is unset the
# value is None and login() falls back to prompting for the token.
# A token that has ever been committed in a notebook must be revoked.
hugging_face_username = "UKV"
hugging_face_token = os.environ.get("HF_TOKEN")

# Export options
online_save = ["local_save"]
save_methods = ["merged_16bit", "merged_4bit", "lora"]

# Login to Hugging Face
login(token=hugging_face_token)

def load_model(base_model, max_seq_length, load_in_4bit):
    """Load a base model and its tokenizer through Unsloth.

    Args:
        base_model: Model identifier forwarded as ``model_name``.
        max_seq_length: Sequence length forwarded to the loader.
        load_in_4bit: Forwarded unchanged as ``load_in_4bit``.

    Returns:
        The ``(model, tokenizer)`` pair produced by
        ``FastLanguageModel.from_pretrained``.
    """
    loader_kwargs = {
        "model_name": base_model,
        "max_seq_length": max_seq_length,
        "dtype": None,  # no explicit dtype is requested; the loader decides
        "load_in_4bit": load_in_4bit,
    }
    loaded_model, loaded_tokenizer = FastLanguageModel.from_pretrained(**loader_kwargs)
    return loaded_model, loaded_tokenizer

def get_peft_model(_model, r, lora_alpha, bias, lora_dropout=0):
    """Attach LoRA adapters to ``_model`` via Unsloth.

    Args:
        _model: Base model to wrap.
        r: LoRA rank.
        lora_alpha: LoRA alpha value.
        bias: Bias mode forwarded unchanged (the notebook uses ``"none"``).
        lora_dropout: Dropout applied to the LoRA layers. Defaults to ``0``,
            the value this function previously hard-coded, so existing calls
            behave exactly as before.

    Returns:
        The object returned by ``FastLanguageModel.get_peft_model``.
    """
    return FastLanguageModel.get_peft_model(
        _model,
        r=r,
        # Adapters are attached to the attention and MLP projection layers.
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        bias=bias,
        use_gradient_checkpointing="unsloth",
        use_rslora=False,
        loftq_config=None,
    )
def load_dataset_train(dataset_info, split="train", eos_token=None):
    """Load a dataset and add an Alpaca-style ``text`` column for training.

    Args:
        dataset_info: Dataset identifier passed to ``load_dataset``.
        split: Dataset split to load. Defaults to ``"train"``, the value that
            was previously hard-coded.
        eos_token: String appended to every formatted example. When ``None``
            (the default) it is read from the module-level ``tokenizer``,
            which therefore must exist before the function is called.

    Returns:
        The result of mapping the loaded dataset with the prompt formatter;
        the formatter returns a ``"text"`` list built from the
        ``"instruction"``, ``"input"`` and ``"output"`` columns.
    """
    # NOTE(review): the template intentionally keeps its leading newline and
    # the 4-space indentation of every line, exactly as in the original, so
    # the text the model is trained on does not change. Confirm whether that
    # whitespace is wanted before altering it.
    alpaca_prompt = """
    Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
    ### Instruction:
    {}
    ### Input:
    {}
    ### Response:
    {}
    """
    if eos_token is None:
        # Backward-compatible fallback to the notebook-level tokenizer.
        eos_token = tokenizer.eos_token

    def formatting_prompts_func(examples):
        """Build the ``text`` column for one batch of examples."""
        texts = [
            # The EOS token is appended so every example is explicitly terminated.
            alpaca_prompt.format(instruction, context, response) + eos_token
            for instruction, context, response in zip(
                examples["instruction"], examples["input"], examples["output"]
            )
        ]
        return {"text": texts}

    dataset_train = load_dataset(dataset_info, split=split)
    return dataset_train.map(formatting_prompts_func, batched=True)
def setup_trainer(_model, _tokenizer, _dataset_train, _training_args, _dataset_text_field, _max_seq_length):
    """Create an ``SFTTrainer`` from the given components.

    Args:
        _model: Model to fine-tune.
        _tokenizer: Tokenizer handed to the trainer.
        _dataset_train: Training dataset.
        _training_args: Training arguments forwarded as ``args``.
        _dataset_text_field: Name of the dataset column holding the text.
        _max_seq_length: Maximum sequence length forwarded to the trainer.

    Returns:
        The constructed ``SFTTrainer``.
    """
    trainer_kwargs = {
        "model": _model,
        "tokenizer": _tokenizer,
        "train_dataset": _dataset_train,
        "dataset_text_field": _dataset_text_field,
        "max_seq_length": _max_seq_length,
        "dataset_num_proc": 2,
        "packing": False,
        "args": _training_args,
    }
    return SFTTrainer(**trainer_kwargs)

# Nested record of the run's settings, grouped by concern: model loading,
# LoRA, dataset and training arguments. Values come from the user settings
# defined above; fp16/bf16 are derived from is_bfloat16_supported().
config = dict(
    hugging_face_username=hugging_face_username,
    model_config=dict(
        base_model=base_model,
        finetuned_model=f"{hugging_face_username}/{finetuned_model_info}",
        max_seq_length=max_seq_length,
        load_in_4bit=load_in_4bit,
    ),
    lora_config=dict(
        r=r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        bias=bias,
        use_gradient_checkpointing="unsloth",
        use_rslora=False,
    ),
    training_dataset=dict(
        name=dataset_info,
        split=split,
        input_field=input_field,
    ),
    training_config=dict(
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=warmup_steps,
        max_steps=max_steps,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        # Exactly one of the two precision flags is enabled.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=logging_steps,
        optim=optim,
        weight_decay=weight_decay,
        lr_scheduler_type=lr_scheduler_type,
        seed=seed,
        output_dir=output_dir,
    ),
)

# ---------------------------------------------------------------------------
# Training pipeline: load model -> attach LoRA -> prepare data -> train.
# ---------------------------------------------------------------------------
model_cfg = config["model_config"]
lora_cfg = config["lora_config"]

# 1) Base model and tokenizer.
model, tokenizer = load_model(
    model_cfg["base_model"],
    model_cfg["max_seq_length"],
    model_cfg["load_in_4bit"],
)

# 2) Wrap the base model with LoRA adapters.
model = get_peft_model(
    model,
    r=lora_cfg["r"],
    lora_alpha=lora_cfg["lora_alpha"],
    bias=lora_cfg["bias"],
)

# 3) Dataset with the formatted "text" column (needs `tokenizer` from step 1).
dataset_train = load_dataset_train(config["training_dataset"]["name"])

# 4) Training arguments come straight from the config, so the config is the
#    single source of truth (including the fp16/bf16 flags computed there).
#    max_steps is set, so it takes precedence over num_train_epochs.
training_args = TrainingArguments(**config["training_config"])

# 5) Build the trainer through the shared helper rather than repeating the
#    SFTTrainer construction inline. "text" is the column written by
#    load_dataset_train's formatter.
trainer = setup_trainer(
    model,
    tokenizer,
    dataset_train,
    training_args,
    "text",
    model_cfg["max_seq_length"],
)

# Last expression of the cell, so the TrainOutput is displayed.
trainer.train()



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Mistral patching release 2024.6
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Unsloth: Will load unsloth/mistral-7b-v0.3-bnb-4bit as a legacy tokenizer.
Unsloth 2024.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Downloading data:   0%|          | 0.00/82.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/200035 [00:00<?, ? examples/s]

Map:   0%|          | 0/200035 [00:00<?, ? examples/s]

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/200035 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 200,035 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.0197
2,1.031
3,0.8884
4,0.8103
5,0.8121
6,0.6993
7,0.5495
8,0.527
9,0.5605
10,0.5447


TrainOutput(global_step=60, training_loss=0.5107344165444374, metrics={'train_runtime': 772.0776, 'train_samples_per_second': 0.622, 'train_steps_per_second': 0.078, 'total_flos': 1.1520269080117248e+16, 'train_loss': 0.5107344165444374, 'epoch': 0.0023995680777460057})

In [None]:
save_path = "finetuned_model"

# (repo-name suffix, extra keyword arguments) for each GGUF export.
# The first entry passes no quantization_method, so the library default is used.
gguf_variants = [
    ("_f8", {}),
    ("_f16", {"quantization_method": "f16"}),
    ("_q4_k_m", {"quantization_method": "q4_k_m"}),
]

# SECURITY: the access token is taken from the notebook's `hugging_face_token`
# setting instead of being repeated as a string literal in every call.
for suffix, extra_kwargs in gguf_variants:
    # Each variant is attempted independently so that one failed conversion
    # does not prevent the remaining uploads.
    try:
        model.push_to_hub_gguf(
            save_path + suffix,
            tokenizer,
            token=hugging_face_token,
            **extra_kwargs,
        )
    except RuntimeError as e:
        print(f"Quantization failed with error: {e}")
        print("Ensure llama.cpp is correctly compiled and accessible.")

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which will take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 4.1G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.26 out of 12.67 RAM for saving.


 59%|█████▉    | 19/32 [00:01<00:00, 15.69it/s]We will save to Disk and not RAM now.
100%|██████████| 32/32 [01:21<00:00,  2.55s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving finetuned_model_f8/pytorch_model-00001-of-00003.bin...
Unsloth: Saving finetuned_model_f8/pytorch_model-00002-of-00003.bin...
Unsloth: Saving finetuned_model_f8/pytorch_model-00003-of-00003.bin...
Done.


Unsloth: Converting mistral model. Can use fast conversion = True.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GUUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['fast_quantized'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at finetuned_model_f8 into f16 GGUF format.
The output location will be ./finetuned_model_f8-unsloth.F16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: finetuned_model_f8
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Set model parameters
INFO:hf-to-gguf:gguf: context length = 32768
INFO:hf-to-gguf:gguf: embedding length = 4096
INFO:hf-to-gguf:gguf: feed forward length = 14336
INFO:hf-to-gguf:gguf: head count = 32
INFO:hf-to-gguf:gguf: key-value head count = 8
INFO:hf-to-gguf:gguf: rope theta = 1

In [None]:
# Fetch and build llama.cpp (used for GGUF conversion).
# Clone only when the directory is missing so the cell can be re-run.
!test -d llama.cpp || git clone --recursive https://github.com/ggerganov/llama.cpp
# Limit parallel jobs to the number of CPUs; a bare `-j` places no limit on them.
!cd llama.cpp && make clean && make all -j$(nproc)

Cloning into 'llama.cpp'...
remote: Enumerating objects: 27298, done.[K
remote: Counting objects: 100% (8432/8432), done.[K
remote: Compressing objects: 100% (450/450), done.[K
remote: Total 27298 (delta 8198), reused 7997 (delta 7982), pack-reused 18866[K
Receiving objects: 100% (27298/27298), 48.92 MiB | 14.64 MiB/s, done.
Resolving deltas: 100% (19532/19532), done.
Submodule 'kompute' (https://github.com/nomic-ai/kompute.git) registered for path 'kompute'
Cloning into '/content/llama.cpp/kompute'...
remote: Enumerating objects: 9090, done.        
remote: Counting objects: 100% (225/225), done.        
remote: Compressing objects: 100% (137/137), done.        
remote: Total 9090 (delta 99), reused 173 (delta 78), pack-reused 8865        
Receiving objects: 100% (9090/9090), 17.58 MiB | 36.96 MiB/s, done.
Resolving deltas: 100% (5706/5706), done.
Submodule path 'kompute': checked out '4565194ed7c32d1d2efa32ceab4d3c6cae006306'
I ccache not found. Consider installing it for faster 

In [None]:
# Install bitsandbytes and transformers in a single pip run. %pip is used so
# the install targets the environment of the running kernel.
%pip install -q bitsandbytes transformers


Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->bitsandbytes)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->bitsandbytes)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (41