In [1]:
%%capture
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
!pip install unsloth

In [2]:
import unsloth
from unsloth import FastLanguageModel
import torch

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
max_seq_length = 8192
# dtype = 'torch.float16'
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=max_seq_length,
    # device_map="cuda:0",
    dtype=torch.float16,
    load_in_4bit=load_in_4bit,
)


==((====))==  Unsloth 2025.5.6: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = True,
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.5.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [5]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(example):
    return {
        "text": alpaca_prompt.format(
            "Provide a helpful response to the user's mental health concern.",
            example["question"],
            example["answer"]
        ) + EOS_TOKEN
    }

from datasets import load_dataset

# Load the dataset
dataset = load_dataset("json", data_files="data_1.json")

# Apply formatting function
dataset = dataset.map(formatting_prompts_func)




Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/3195 [00:00<?, ? examples/s]

In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset['train'],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 80,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/3195 [00:00<?, ? examples/s]

In [7]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
7.117 GB of memory reserved.


In [8]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,195 | Num Epochs = 1 | Total steps = 80
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.0428
2,1.991
3,2.026
4,1.8546
5,1.5738
6,1.4966
7,1.4639
8,1.2259
9,1.0936
10,1.0491


In [10]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Provide a helpful response to the user's query on gst law.", # instruction
        "What are the consequences if an Input Service Distributor distributes credit in excess to a recipient, and how is the excess credit recovered from the recipient along with interest?", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 164, use_cache = True)
tokenizer.batch_decode(outputs)

["<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nProvide a helpful response to the user's query on gst law.\n\n### Input:\nWhat are the consequences if an Input Service Distributor distributes credit in excess to a recipient, and how is the excess credit recovered from the recipient along with interest?\n\n### Response:\nThe consequences if an Input Service Distributor distributes credit in excess to a recipient are as follows:\n\n* The Input Service Distributor is required to recover the excess credit from the recipient along with interest.\n* The Input Service Distributor shall issue a debit note to the recipient for the excess credit distributed.\n* The recipient shall be liable to pay the amount of excess credit distributed along with interest.\n\nThe excess credit is recovered from the recipient along with interest as follows:\n\n* Th

In [19]:
! git clone https://github.com/ggerganov/llama.cpp.git
! cd llama.cpp
! make

Cloning into 'llama.cpp'...
remote: Enumerating objects: 51077, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 51077 (delta 8), reused 12 (delta 7), pack-reused 51050 (from 1)[K
Receiving objects: 100% (51077/51077), 116.66 MiB | 15.60 MiB/s, done.
Resolving deltas: 100% (36944/36944), done.
make: *** No targets specified and no makefile found.  Stop.


In [20]:
model.save_pretrained_gguf("dir", tokenizer, quantization_method = "q4_k_m")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.26 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [02:02<00:00,  3.82s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving dir/pytorch_model-00001-of-00004.bin...
Unsloth: Saving dir/pytorch_model-00002-of-00004.bin...
Unsloth: Saving dir/pytorch_model-00003-of-00004.bin...
Unsloth: Saving dir/pytorch_model-00004-of-00004.bin...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...


RuntimeError: Unsloth: The file ('llama.cpp/llama-quantize' or 'llama.cpp/llama-quantize.exe' if you are on Windows WSL) or 'llama.cpp/quantize' does not exist.
But we expect this file to exist! Maybe the llama.cpp developers changed the name or check extension of the llama-quantize file.

In [10]:
model.save_pretrained("GST_BOT_1") # Local saving

In [None]:
!ls -lh GST_BOT_1/


total 161M
-rw-r--r-- 1 root root  857 May 12 08:45 adapter_config.json
-rw-r--r-- 1 root root 161M May 12 08:45 adapter_model.safetensors
-rw-r--r-- 1 root root 5.0K May 12 08:45 README.md


In [None]:
!zip -r lora_model.zip GST_BOT_1/
from google.colab import files
files.download("GST_BOT_1")



  adding: GST_BOT_1/ (stored 0%)
  adding: GST_BOT_1/adapter_model.safetensors (deflated 8%)
  adding: GST_BOT_1/adapter_config.json (deflated 56%)
  adding: GST_BOT_1/README.md (deflated 66%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(GST_BOT_1) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!

inputs = tokenizer(
[
    alpaca_prompt.format(
        "Provide a helpful response to the user's query on gst law.", # instruction
        "What are the consequences if an Input Service Distributor distributes credit in excess to a recipient, and how is the excess credit recovered from the recipient along with interest?",  # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

["<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nProvide a helpful response to the user's query on gst law.\n\n### Input:\nWhat are the consequences if an Input Service Distributor distributes credit in excess to a recipient, and how is the excess credit recovered from the recipient along with interest?\n\n### Response:\nThe consequences if an Input Service Distributor distributes credit in excess to a recipient are that the excess credit shall be recovered from the recipient along with interest. The Input Service Distributor shall issue an invoice in respect of the excess credit distributed, and the recipient shall pay the amount of such excess credit along with interest as determined by"]

In [None]:
import shutil

# Zip the directory
shutil.make_archive("GST_BOT_1", 'zip', "GST_BOT_1")


'/content/GST_BOT_1.zip'

In [None]:
from google.colab import files
files.download("GST_BOT_1.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>