In [19]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048
dtype = None 
load_in_4bit = True

model , tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length= max_seq_length,
    dtype=dtype,
    load_in_4bit= load_in_4bit
)

==((====))==  Unsloth 2025.3.18: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce GTX 1660 SUPER. Num GPUs = 1. Max memory: 5.606 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [2]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None
)

Unsloth 2025.3.18 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [8]:
template = """"Below is an instruction that describes a task, paired with an input that
provides further context. Write a response that appropriately completes the request.

### Instruction:
Company database: {}

### Input:
SQL Prompt: {}

### Response:
SQL:{}

Explanation: {}

"""

EOS_TOKEN = tokenizer.eos_token


def prompt_template(data):
    company_databases = data['sql_context']
    prompts = data['sql_prompt']
    sqls = data['sql']
    explanations = data['sql_explanation']
    texts = []

    for company_database, prompt, sql, explanation in zip(company_databases, prompts, sqls, explanations):
        text = template.format(company_database,prompt,sql,explanation) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

from datasets import load_dataset

dataset = load_dataset("gretelai/synthetic_text_to_sql", split="train")
dataset = dataset.map(prompt_template, batched = True)

Map: 100%|██████████| 100000/100000 [00:01<00:00, 81373.72 examples/s]


In [12]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer, 
    train_dataset= dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        fp16= not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs"
    )
)

Unsloth: Tokenizing ["text"] (num_proc=2): 100%|██████████| 100000/100000 [00:24<00:00, 4070.69 examples/s]


In [13]:
trainer_states = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 2 x 1) = 4
 "-____-"     Trainable parameters = 11,272,192/1,000,000,000 (1.13% trained)


Step,Training Loss
1,1.9889
2,1.7363
3,1.7945
4,1.7054
5,1.5453
6,1.4076
7,1.2498
8,1.2202
9,0.978
10,1.0543


Unsloth: Will smartly offload gradients to save VRAM!


In [14]:
# model.save_pretrained_gguf("dir", tokenizer, quantization_method = "f16")
model.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_16bit",)


# There is currently an error on save_pretrained_gguf so use 
# model.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_16bit",)

# #Then go to the console and 
# git clone --recursive https://github.com/ggerganov/llama.cpp
# make clean -C llama.cpp
# make all -j -C llama.cpp
# pip install gguf protobuf

# python llama.cpp/convert_hf_to_gguf.py merged_model --outfile output_model --outtype f16

# To Upload to Ollama
# Create a modelfile and put "FROM ./output_model"
# Then on the same directory "ollama create trained_model -f Modelfile"

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 10.61 out of 31.25 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 16/16 [00:00<00:00, 43.48it/s]


Unsloth: Saving tokenizer... Done.
Done.


In [None]:
# Comments: Model is outputting bad. Probably due to the model.
# TODO: Use Jupyter Notebook instead of Local since GPU cant support stronger models.
# TODO: Explore Collaboratory or RunPOd

NameError: name 'dataset' is not defined