### Unsloth

In [None]:
from unsloth import FastLanguageModel  # loader for text LLMs (FastVisionModel is the vision counterpart)
import torch
import json


# Loading configuration -- both names are reused by later cells.
# Sequence length requested from the loader.
max_seq_length = 2048
# Request 4-bit quantization to reduce memory usage. Can be False.
load_in_4bit = True

# Fetch the DeepSeek-R1-Distill-Qwen-14B checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
)

# Switch the model to inference mode for the generation cell that follows.
FastLanguageModel.for_inference(model)

In [None]:
# System prompt for the account-mapping task: tells the model to pick exactly
# one "global_account_name" from the numbered list at the end and to answer
# with a single JSON object that also carries a "confidence_status" flag.
# The literal starts with a newline because the text begins on the line after
# the opening quotes.
system_msg = """
(1) GOAL
You are an advanced accountant AI language model with specialized expertise in accounting mapping German item.
Your task:
- Read the user's "item_name".
- Choose one best match from the provided list "global_account_name" options.
- Determine how confident you are in that classification.
---------------
(2) RETURN FORMAT
You must return only one JSON object with this exact structure:
{
  "global_account_name": "<best match or 'Unknown'>",
  "confidence_status": "<'Sure' or 'Not Sure'>"
}
---------------
(3) WARNINGS
- Output nothing except the JSON object (no extra text, no commentary).
- If you feel truly uncertain about the correct classification, return "Unknown" for "global_account_name".
- Assign "confidence_status" to "Sure" if your confidence is at least 95%, else "Not Sure".
---------------
(4) CONTEXT DUMP
Below is the list of possible "global_account_name" options from which you must choose:
1. Wartungskosten für Hard- und Software
2. Bezugsnebenkosten
3. Buchführungskosten
4. Werbekosten
5. Erhaltene Skonti 19% Vorsteuer
6. Sonstiger Betriebsbedarf
7. Nebenkosten des Geldverkehrs
8. Rechts- und Beratungskosten
9. Bürobedarf
10. Telefon
11. Löhne
12. Verpackungsmaterial
13. Dekoration
14. Sonstige Abgaben
15. Beiträge
16. Versicherungen
17. Reinigung
18. Pauschale Steuer für Aushilfen
19. Aushilfslöhne
20. Gehälter
"""

In [None]:
# Fixed part of the system prompt; the numbered account list is appended below.
system_message = '(1) GOAL\nYou are an advanced accountant AI language model with specialized expertise in accounting mapping German item.\nYour task:\n- Read the user\"s \"item_name\".\n- Choose one best match from the provided list \"global_account_name\" options.\n- Determine how confident you are in that classification.\n---------------\n(2) RETURN FORMAT\nYou must return only one JSON object with this exact structure:\n{\n  \"global_account_name\": \"<best match or \"Unknown\">\",\n  \"confidence_status\": \"<\"Sure\" or \"Not Sure\">\"\n}\n---------------\n(3) WARNINGS\n- Output nothing except the JSON object (no extra text, no commentary).\n- If you feel truly uncertain about the correct classification, return \"Unknown\" for \"global_account_name\".\n- Assign \"confidence_status\" to \"Sure\" if your confidence is at least 95%, else \"Not Sure\".\n---------------\n(4) CONTEXT DUMP\nBelow is the list of possible \"global_account_name\" options from which you must choose:\n'

# Candidate accounts arrive as a single '| '-delimited string.
account_name_list = "Wartungskosten für Hard- und Software| Bezugsnebenkosten| ABCS"
account_names = account_name_list.split('| ')

# One "<n>. <name>" line per account, numbered from 1, each ending in a newline.
account_names_formatted_string = "".join(
    f"{position}. {label}\n"
    for position, label in enumerate(account_names, start=1)
)

system_message = system_message + account_names_formatted_string
print(system_message)

In [None]:
# Build the chat for one classification request. The system prompt is passed
# as plain text: JSON-encoding it (json.dumps) would hand the model a quoted
# string with literal "\n" and "\"" escape sequences instead of real line
# breaks and quotes.
messages = [
    {"role": "system", "content": system_msg},
    {"role": "user", "content": "Logistikpauschale"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add for generation
    return_tensors="pt",
).to("cuda")

# NOTE(review): temperature / min_p are sampling parameters; confirm whether
# sampling is actually enabled for this call or whether greedy decoding is
# what is wanted for a classification task.
outputs = model.generate(input_ids=inputs, max_new_tokens=1500, use_cache=True, temperature=0.1, min_p=0.1)
tokenizer.batch_decode(outputs)

In [None]:
# from unsloth.chat_templates import get_chat_template
#
# tokenizer = get_chat_template(
#     tokenizer,
#     chat_template = "deepseek-v2",
# )
#
# def formatting_prompts_func(examples):
#     convos = examples["conversations"]
#     texts = [
#         tokenizer.apply_chat_template(
#             convo, tokenize = False, add_generation_prompt = False
#         )
#         for convo in convos
#     ]
#     return { "text" : texts, }

In [None]:
# Training prompt template with two positional `{}` slots: the first receives
# the question, the second the response text placed right after "<think>".
# NOTE(review): the instruction text is medical-specific -- confirm that it
# matches the dataset this notebook trains on.
train_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.
Please answer the following medical question.

### Question:
{}

### Response:
<think>{}"""

# End-of-sequence marker taken from the tokenizer; formatting_prompts_func
# appends it to every rendered training text.
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN


def _first_content(convo, role, fallback_index):
    """Return the content of the first message with `role`, else of the message at `fallback_index`."""
    for message in convo:
        if message.get("role") == role:
            return message["content"]
    # No role information matched: fall back to the positional layout.
    return convo[fallback_index]["content"]


def formatting_prompts_func(examples, prompt_style=None, eos_token=None):
    """Render every conversation of a batch into one training string.

    Args:
        examples: Batch mapping with a "conversations" entry. Each
            conversation is a list of message dicts with a "content" field
            and, when available, a "role" field.
        prompt_style: Template with two positional ``{}`` slots (question,
            response). Defaults to the notebook-level ``train_prompt_style``.
        eos_token: Marker appended to every rendered text. Defaults to the
            notebook-level ``EOS_TOKEN``.

    Returns:
        ``{"text": [...]}`` with one rendered string per conversation.

    Raises:
        IndexError: If a conversation has no message for a needed role and is
            too short for the positional fallback.
    """
    if prompt_style is None:
        prompt_style = train_prompt_style
    if eos_token is None:
        eos_token = EOS_TOKEN

    texts = []
    for convo in examples["conversations"]:
        # Select turns by role so that a leading system message (if present)
        # is not mistaken for the question; positions 0/1 are the fallback.
        user = _first_content(convo, "user", 0)
        assistant = _first_content(convo, "assistant", 1)
        # Exactly two values for the template's two slots. Passing the builtin
        # `input` as an extra first argument would shift every slot by one.
        texts.append(prompt_style.format(user, assistant) + eos_token)
    return {"text": texts}

In [None]:
from datasets import load_dataset
# Load the "train" split of mlabonne/FineTome-100k; later cells reformat it
# and hand it to the trainer.
dataset = load_dataset("mlabonne/FineTome-100k", split = "train")

In [None]:
from unsloth.chat_templates import standardize_sharegpt

# Standardize the conversation records (presumably into role/content messages
# -- the formatter reads "content"), then add the rendered "text" column by
# running formatting_prompts_func over the data in batches.
dataset = standardize_sharegpt(dataset).map(formatting_prompts_func, batched=True)

In [None]:
# Inspect the message list of the sample at index 5.
dataset[5]["conversations"]

In [None]:
# Show the rendered training text of the first sample. Indexing the row first
# avoids going through the whole "text" column just to read a single entry.
dataset[0]["text"]

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# The checkpoint loaded by this notebook is 4-bit quantized, so trainable LoRA
# adapters have to be attached before it can be fine-tuned (the later cells
# that save and reload "lora_model" rely on them as well).
# The hyperparameters below are starting values -- tune as needed.
# NOTE(review): the guard assumes an adapter-wrapped model exposes
# `peft_config`; it keeps the cell re-runnable without stacking adapters.
if not hasattr(model, "peft_config"):
    model = FastLanguageModel.get_peft_model(
        model,
        r = 16,
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj"],
        lora_alpha = 16,
        lora_dropout = 0,
        bias = "none",
        use_gradient_checkpointing = "unsloth",
        random_state = 3407,
    )

# The model was put into inference mode right after loading; switch it back
# to training mode before building the trainer.
FastLanguageModel.for_training(model)

# Supervised fine-tuning on the pre-rendered "text" column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    # data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 1,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 30,
        learning_rate = 2e-4,
        # Use bf16 when the hardware supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

In [None]:
# from unsloth.chat_templates import train_on_responses_only
#
# trainer = train_on_responses_only(
#     trainer,
#     instruction_part="<|im_start|>user<|im_sep|>",
#     response_part="<|im_start|>assistant<|im_sep|>",
# )

In [None]:
# Decode the token ids of sample 5 from the trainer's dataset back into text
# to check what the model will be trained on.
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

In [None]:
# space = tokenizer(" ", add_special_tokens = False).input_ids[0]
# tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

In [None]:
# @title Show current memory stats
# Baseline figures (in GB) taken before training; the final-stats cell
# subtracts start_gpu_memory from the peak to estimate the training overhead.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run fine-tuning; the returned stats (metrics such as train_runtime) are
# read by the final memory/time report.
trainer_stats = trainer.train()

In [None]:
# @title Show final memory and time stats
# Peak reserved GPU memory (GB) and the part above the pre-training baseline.
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# The same two figures as a share of the device's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Wall-clock training time as reported by the trainer.
train_runtime = trainer_stats.metrics['train_runtime']
print(f"{train_runtime} seconds used for training.")
print(f"{round(train_runtime/60, 2)} minutes used for training.")

print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:
from unsloth.chat_templates import get_chat_template  # not called in this cell; import kept so the name stays available

# Keep the chat template that ships with the loaded tokenizer. Do not override
# it with get_chat_template(tokenizer, chat_template="phi-4"): that template
# targets a different model family than the DeepSeek-R1-Distill-Qwen
# checkpoint this notebook loads, and because the tokenizer is saved to
# "lora_model" afterwards the mismatched template would be persisted with it.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(
    input_ids = inputs, max_new_tokens = 64, use_cache = True, temperature = 1.5, min_p = 0.1
)
tokenizer.batch_decode(outputs)

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
from transformers import TextStreamer

# Make sure the model is in inference mode before generating.
FastLanguageModel.for_inference(model)

messages = [{"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"}]

# Tokenize the chat (with the generation prompt appended) and move it to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

# Stream tokens to the cell output as they are generated, skipping the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
    temperature=1.5,
    min_p=0.1,
)

In [None]:
# Write the model and its tokenizer to the local "lora_model" directory; the
# reload cell reads them back from the same path.
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")
# Optional upload to the Hugging Face Hub (disabled):
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

Now, if you want to load the LoRA adapters we just saved for inference, change `False` to `True`:

In [None]:
# Optional: reload the model and tokenizer saved under "lora_model".
# Disabled by default; flip the condition to use it.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        # `dtype` is never defined in this notebook, so passing the bare name
        # raised NameError once this branch was enabled. None lets the loader
        # pick the dtype itself.
        dtype = None,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Describe a tall tower in the capital of France."},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# Stream the answer token by token, without echoing the prompt.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
    use_cache = True, temperature = 1.5, min_p = 0.1
)

You can also use Hugging Face's `AutoPeftModelForCausalLM` (from the `peft` library). Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [None]:
# Fallback loader that uses peft/transformers directly instead of Unsloth.
# Disabled by default; flip the condition to use it.
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer

    # Load model and tokenizer from the local "lora_model" directory.
    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model",  # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit=load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
# Each export step sits behind its own on/off switch; only the last one in
# this cell (pushing the LoRA adapters to the Hub) is currently enabled.

# Merge to 16bit
if False:
    model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")
if False:
    model.push_to_hub_merged("hf/model", tokenizer, save_method="merged_16bit", token="")

# Merge to 4bit
if False:
    model.save_pretrained_merged("model", tokenizer, save_method="merged_4bit")
if False:
    model.push_to_hub_merged("hf/model", tokenizer, save_method="merged_4bit", token="")

# Just LoRA adapters
if False:
    model.save_pretrained_merged("model", tokenizer, save_method="lora")
if True:
    model.push_to_hub_merged("hf/model", tokenizer, save_method="lora", token="")

In [None]:
# GGUF export. Each step sits behind its own on/off switch; only the first
# Hub push below is currently enabled.
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!

# Save to 8bit Q8_0 (local save, no quantization_method given)
if False:
    model.save_pretrained_gguf("model", tokenizer)
# Enabled push -- note that it requests "q4_k_m", not Q8_0.
if True:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="q4_k_m", token="")

# Save to 16bit GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="f16")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="f16", token="")

# Save to q4_k_m GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_k_m")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="q4_k_m", token="")

# Save to multiple GGUF options - much faster if you want multiple!
if False:
    model.push_to_hub_gguf(
        "hf/model",  # Change hf to your username!
        tokenizer,
        quantization_method=["q4_k_m", "q8_0", "q5_k_m"],
        token="",  # Get a token at https://huggingface.co/settings/tokens
    )