In [None]:
!pip install wandb -q

In [None]:
%%capture
# Install Unsloth and its dependencies; %%capture hides the pip output of this cell.
import os
# Colab detection: look for the substring "COLAB_" in the concatenated names
# of all environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

# NOTE(review): this runs on both branches and repeats the package list of the
# Colab branch above, but without the "datasets>=3.4.1" constraint — confirm
# whether it is still needed.
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer

### Unsloth

In [None]:
import huggingface_hub
from google.colab import userdata
import json

token = userdata.get('HF_TOKEN_Inventors')
huggingface_hub.login(token)
%env HF_HUB_ENABLE_HF_TRANSFER=True

env: HF_HUB_ENABLE_HF_TRANSFER=True


In [None]:
import wandb

# Authenticate against Weights & Biases with the key stored in the Colab
# secret 'wandb_API'. The call is the last expression of the cell, so its
# return value is displayed.
wandb.login(
    key=userdata.get('wandb_API'),
)

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmoh_you1990[0m ([33mmohammed-majeed[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
from unsloth import FastLanguageModel
import torch

# Base checkpoint to fine-tune; the name is reused for the W&B run and the Hub repos.
model_name = "tiiuae/Falcon3-10B-Instruct"

# Loading options (max_seq_length is passed to the trainer as well).
max_seq_length = 2048  # Free choice; RoPE scaling is handled internally by Unsloth.
dtype = None           # None = auto-detect. Float16 for Tesla T4 / V100, Bfloat16 for Ampere+.
load_in_4bit = True    # 4-bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.2: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/29.9k [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/4.89G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/947M [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.89G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/365k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.78M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update a small fraction of all parameters (typically 1 to 10%; with the attention-only setup below it is about 0.13%).

In [None]:
# Attach LoRA adapters to the attention projections only; the MLP projections
# (gate_proj, up_proj, down_proj) are not targeted.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank: any number > 0 (suggested 8, 16, 32, 64, 128)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_alpha=16,
    lora_dropout=0,  # any value is supported, but 0 is the optimized path
    bias="none",     # any value is supported, but "none" is the optimized path
    # "unsloth" uses 30% less VRAM and fits 2x larger batch sizes;
    # accepted values are True or "unsloth" (for very long context).
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,   # rank-stabilized LoRA is supported, not used here
    loftq_config=None,  # LoftQ is supported, not used here
)

Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.6.2 patched 40 layers with 40 QKV layers, 40 O layers and 0 MLP layers.


<a name="Data"></a>
### Data Prep

In [None]:
# Prompt template with four positional `{}` slots, filled in this order:
# SYSTEM, INSTRUCTIONS, USER COMMAND, OUTPUT.
# NOTE(review): a newline follows the OUTPUT slot, so formatting the template
# with an empty output produces a prompt that ends in "OUTPUT:\n\n" — confirm
# this is the intended prompt shape for generation.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

SYSTEM:
{}

INSTRUCTIONS:
{}

USER COMMAND:
{}

OUTPUT:
{}
"""

# End-of-sequence token of the loaded tokenizer.
EOS_TOKEN = tokenizer.eos_token # Must be appended to every training text
def formatting_prompts_func(examples):
    """Render a batch of dataset rows into complete training texts.

    Args:
        examples: Mapping with the columns "SYSTEM", "INSTRUCTIONS",
            "USER COMMAND" and "OUTPUT", each holding one value per row.

    Returns:
        A dict with a single key "text" whose value is a list containing, for
        each row, the filled-in `alpaca_prompt` followed by `EOS_TOKEN`.
    """
    columns = (
        examples["SYSTEM"],
        examples["INSTRUCTIONS"],
        examples["USER COMMAND"],
        examples["OUTPUT"],
    )
    # The EOS token must terminate every text, otherwise generation never stops.
    rendered = [alpaca_prompt.format(*row) + EOS_TOKEN for row in zip(*columns)]
    return {"text": rendered}


In [None]:
from datasets import load_dataset

# Download every split of the behavior-tree dataset from the Hugging Face Hub.
dataset = load_dataset("Inventors-Hub/SwarmChat-BehaviorTree-Dataset", token=token)

# Add the rendered "text" column to the 'train' and 'validation' splits
# (both are assumed to exist in the dataset), processing rows in batches.
train_dataset, eval_dataset = (
    dataset[split_name].map(formatting_prompts_func, batched=True)
    for split_name in ("train", "validation")
)


README.md:   0%|          | 0.00/3.39k [00:00<?, ?B/s]

train_data.jsonl:   0%|          | 0.00/2.95M [00:00<?, ?B/s]

validation_data.jsonl:   0%|          | 0.00/845k [00:00<?, ?B/s]

test_data.jsonl:   0%|          | 0.00/422k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1443 [00:00<?, ? examples/s]

Map:   0%|          | 0/412 [00:00<?, ? examples/s]

<a name="Train"></a>
### Train the model


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

train_args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 examples per optimizer step and device
    # Use a ratio rather than a fractional step count:
    warmup_ratio=0.1,
    num_train_epochs=3,

    # Evaluate and checkpoint once per epoch. Both strategies have to match
    # because load_best_model_at_end=True is set below. No `eval_steps` is
    # given: it only applies when eval_strategy="steps".
    eval_strategy="epoch",
    save_strategy="epoch",

    # Log the training loss every 10 optimizer steps.
    logging_strategy="steps",
    logging_steps=10,

    learning_rate=1e-4,
    # Use bf16 where the GPU supports it, otherwise fall back to fp16.
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    weight_decay=0.01,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    seed=3407,
    report_to="wandb",

    # Best-model selection: after training, reload the checkpoint with the
    # lowest eval_loss. No early-stopping callback is registered, so all
    # epochs are always run.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",  # column produced by formatting_prompts_func
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=train_args,
)


Unsloth: Tokenizing ["text"]:   0%|          | 0/1443 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"]:   0%|          | 0/412 [00:00<?, ? examples/s]

In [None]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
# Convert bytes to GB (1024**3 bytes each), rounded to three decimals:
# peak memory reserved so far, and the total memory of device 0.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA L4. Max memory = 22.161 GB.
9.805 GB of memory reserved.


In [None]:
# Start the W&B run explicitly so it gets the desired project, job type and name.
wandb.init(project="SwarmChat-12-6", job_type="training", anonymous='allow', name=f'{model_name}-3-epochs')

try:
    trainer_stats = trainer.train()
except BaseException:
    # Training failed or was interrupted (BaseException also covers
    # KeyboardInterrupt): close the run as failed instead of leaving it open,
    # then let the original error propagate.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,443 | Num Epochs = 3 | Total steps = 543
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 13,107,200/10,000,000,000 (0.13% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss
1,0.2449,0.231941
2,0.2083,0.203662
3,0.2099,0.197724


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


0,1
eval/loss,█▂▁
eval/runtime,█▃▁
eval/samples_per_second,▁▆█
eval/steps_per_second,▁▆█
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/grad_norm,▇█▅▂▂▃▄▃▂▂▂▁▂▂▃▁▂▂▁▃▃▂▂▂▂▁▂▃▂▂▂▃▃▂▃▂▂▂▁▂
train/learning_rate,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,██▇▆▅▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,0.19772
eval/runtime,151.8852
eval/samples_per_second,2.713
eval/steps_per_second,1.356
total_flos,1.45521360639744e+17
train/epoch,3.0
train/global_step,543.0
train/grad_norm,0.2314
train/learning_rate,0.0
train/loss,0.2099


<a name="Inference"></a>
### Inference


In [None]:
# Text for the SYSTEM slot of `alpaca_prompt`.
# NOTE(review): the name `sys` would shadow the standard-library module of the
# same name if `import sys` were ever used in this notebook.
sys = """<<SYS>>You are a helpful, respectful, and honest AI assistant. Your task is to generate well-structured XML code for behavior trees based on the provided instructions.<</SYS>>"""
# Text for the INSTRUCTIONS slot: the list of allowed behaviors, one per line.
istr= """It is CRITICAL to use only the following behaviors structured as a dictionary:
say: Action Node: Speak the provided message using text-to-speech if it hasn't been spoken before. Args: message (str): The message to be spoken. Returns: Always returns SUCCESS, indicating the action was executed.
flocking: Action Node: Adjust the agent's move vector by blending alignment and separation forces from nearby agents. Returns: Always returns SUCCESS, indicating the action was executed.
align_with_swarm: Action Node: Align the agent's move vector with the average movement of nearby agents. Returns: Always returns SUCCESS, indicating the action was executed.
is_obstacle_detected: Condition node: Determine if any obstacles are detected in the vicinity of the agent. Returns: SUCCESS if an obstacle is detected, FAILURE otherwise.
avoid_obstacle: Action node: Execute an action to avoid detected obstacles. Returns: Always returns SUCCESS, indicating the action was executed.
is_target_detected: Condition node: Check if the target is within a detectable distance from the agent's position. Returns: SUCCESS if the target is within 20 units of distance, FAILURE otherwise.
is_target_reached: Condition node: Check if the agent has reached the target. Returns: SUCCESS if the target is within 15 units of distance, FAILURE otherwise.
change_color: Change the agent's color to 'white', 'green', or 'red'. Args: color (str): Color name. Returns: Always returns SUCCESS, indicating the action was executed.
is_agent_in_nest: Condition node: Determine if the agent is in the nest. Returns: SUCCESS if the agent is in the nest, FAILURE otherwise.
agent_movement_freeze: Action node: Freeze the agent's movement, typically to indicate a stop in activity. Returns: Always returns SUCCESS, indicating the action was executed.
continue_movement_agent: Action node: Continue the agent's movement after it has been previously frozen. Returns: Always returns SUCCESS, indicating the action was executed.
wander: Action node: Perform a wandering action where the agent moves randomly within the environment. Returns: Always returns SUCCESS, indicating the action was executed.
is_path_clear: Condition node: Check if the path ahead of the agent is clear of obstacles. Returns: SUCCESS if no obstacles are detected ahead, FAILURE if obstacles are present.
is_line_formed: Condition node: Determine if the agent has formed a line with a reference point at the center of the window. Returns: SUCCESS if the line is formed with the center, FAILURE otherwise.
form_line: Action node: Direct the agent to form a line towards the center of the window. This function adjuststhe agent's position to align it with the center. Returns: Always returns SUCCESS,
indicating the action was executed. to construct behavior tree in XML format to the following command, including in the behaviour tree a behaviour that is not in the provided dictionary can result in damage to the agents, and potentially humans, therefore you are not allowed to do so, AVOID AT ALL COSTS.
"""
# Text for the USER COMMAND slot: the task ("form a line") plus the required XML output format.
user= """generate behavior tree to "form a line". Take a step back and think deeply about the behavior you need for this command. Take another step back and think of the xml structure and the behavior you used.
The output MUST follow this XML structure exactly, including:
- A root element with <root BTCPP_format and main_tree_to_execute attributes.
- A <BehaviorTree> element with an inner structure of Sequences, Fallback, Conditions, and Actions.
- A <TreeNodesModel> section listing all node models.
- No additional text or commentary outside the XML.
Output only the XML behavior tree without extra text."""


In [None]:
# Switch the model to Unsloth's inference mode (native 2x faster inference).
FastLanguageModel.for_inference(model)

# Fill the template with the system text, the instructions and the user
# command; the OUTPUT slot stays empty so the model generates it.
# NOTE(review): with an empty output the prompt ends in "OUTPUT:\n\n" because
# of the template's trailing newline — confirm this matches the training texts.
inputs = tokenizer(
    [alpaca_prompt.format(sys, istr, user, "")],
    return_tensors="pt",
).to("cuda")

# Generate up to 256 new tokens, then decode the full sequences
# (prompt included); the decoded list is shown as the cell output.
outputs = model.generate(**inputs, max_new_tokens=256, use_cache=True)
tokenizer.batch_decode(outputs)

['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\nSYSTEM:\n<<SYS>>You are a helpful, respectful, and honest AI assistant. Your task is to generate well-structured XML code for behavior trees based on the provided instructions.<</SYS>>\n\nINSTRUCTIONS:\nIt is CRITICAL to use only the following behaviors structured as a dictionary:\nsay: Action Node: Speak the provided message using text-to-speech if it hasn\'t been spoken before. Args: message (str): The message to be spoken. Returns: Always returns SUCCESS, indicating the action was executed.\nflocking: Action Node: Adjust the agent\'s move vector by blending alignment and separation forces from nearby agents. Returns: Always returns SUCCESS, indicating the action was executed.\nalign_with_swarm: Action Node: Align the agent\'s move vector with the average movement of nearby agents. Returns: Always returns SUCCESS, indicatin

You can also use a `TextStreamer` to stream the output, so you can watch the generation token by token instead of waiting for the whole sequence to finish!

In [None]:
from transformers import TextStreamer

# Switch the model to Unsloth's inference mode (native 2x faster inference).
FastLanguageModel.for_inference(model)

# Same prompt as in the previous cell: system text, instructions and user
# command, with the OUTPUT slot left empty for the model to fill.
# NOTE(review): with an empty output the prompt ends in "OUTPUT:\n\n" because
# of the template's trailing newline — confirm this matches the training texts.
inputs = tokenizer(
    [alpaca_prompt.format(sys, istr, user, "")],
    return_tensors="pt",
).to("cuda")

# Print tokens as they are produced (up to 512 new tokens); the returned
# tensor is discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=512)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

SYSTEM:
<<SYS>>You are a helpful, respectful, and honest AI assistant. Your task is to generate well-structured XML code for behavior trees based on the provided instructions.<</SYS>>

INSTRUCTIONS:
It is CRITICAL to use only the following behaviors structured as a dictionary:
say: Action Node: Speak the provided message using text-to-speech if it hasn't been spoken before. Args: message (str): The message to be spoken. Returns: Always returns SUCCESS, indicating the action was executed.
flocking: Action Node: Adjust the agent's move vector by blending alignment and separation forces from nearby agents. Returns: Always returns SUCCESS, indicating the action was executed.
align_with_swarm: Action Node: Align the agent's move vector with the average movement of nearby agents. Returns: Always returns SUCCESS, indicating the action w

<a name="Save/Push Model"></a>
### Save/Push Model

In [None]:
# Create (or reuse) the Hub repository for the fine-tuned model and tokenizer.
# The repository is named after the part of `model_name` that follows the "/".
repo_name = f'Inventors-Hub/{model_name.split("/")[1]}-BehaviorTree-3-epochs'
repo_url = huggingface_hub.create_repo(
    repo_name,
    token=token,
    repo_type="model",
    exist_ok=True,  # do not fail when the repository already exists
)
print("Repository URL:", repo_url)

# Upload the model first, then the tokenizer, to the same repository.
model.push_to_hub(repo_name, token=token)
tokenizer.push_to_hub(repo_name, token=token)

Repository URL: https://huggingface.co/Inventors-Hub/Falcon3-10B-Instruct-BehaviorTree-3-epochs


Uploading...:   0%|          | 0.00/52.5M [00:00<?, ?B/s]

Saved model to https://huggingface.co/Inventors-Hub/Falcon3-10B-Instruct-BehaviorTree-3-epochs


README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

In [None]:
# Create (or reuse) a separate Hub repository for the GGUF exports.
repo_name_gguf = f'Inventors-Hub/{model_name.split("/")[1]}-BehaviorTree-3-epochs-GGUF'
repo_url = huggingface_hub.create_repo(repo_name_gguf, token=token, repo_type="model", exist_ok=True)
print("Quantized model repository URL:", repo_url)

# Export both GGUF variants in ONE call. Calling push_to_hub_gguf once per
# format repeats the 16-bit merge of the LoRA weights and the llama.cpp setup
# each time; passing a list of quantization methods performs them only once.
model.push_to_hub_gguf(
    repo_name_gguf,
    tokenizer,
    quantization_method=["q4_k_m", "f16"],
    token=token,
)


Quantized model repository URL: https://huggingface.co/Inventors-Hub/Falcon3-10B-Instruct-BehaviorTree-3-epochs-GGUF


Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 20.6G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 32.69 out of 52.96 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 68%|██████▊   | 27/40 [00:00<00:00, 51.33it/s]
We will save to Disk and not RAM now.
100%|██████████| 40/40 [00:18<00:00,  2.18it/s]


Unsloth: Saving tokenizer... Done.
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at Inventors-Hub/Falcon3-10B-Instruct-BehaviorTree-3-epochs-GGUF into bf16 GGUF format.
The output location will be /content/Inventors-Hub/Falcon3-10B-Instruct-BehaviorTree-3-epochs-GGUF/unsloth.BF16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: Falcon3-10B-Instruct-BehaviorTree-3-epochs-GGUF
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading

Uploading...:   0%|          | 0.00/6.29G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/Inventors-Hub/Falcon3-10B-Instruct-BehaviorTree-3-epochs-GGUF
Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 32.26 out of 52.96 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 40/40 [02:30<00:00,  3.77s/it]


Unsloth: Saving tokenizer... Done.
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['f16'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at Inventors-Hub/Falcon3-10B-Instruct-BehaviorTree-3-epochs-GGUF into f16 GGUF format.
The output location will be /content/Inventors-Hub/Falcon3-10B-Instruct-BehaviorTree-3-epochs-GGUF/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: Falcon3-10B-Instruct-BehaviorTree-3-epochs-GGUF
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model

Uploading...:   0%|          | 0.00/20.6G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/Inventors-Hub/Falcon3-10B-Instruct-BehaviorTree-3-epochs-GGUF
