In [None]:
import os
from huggingface_hub import login

# Authenticate against the Hugging Face Hub using the token exported in the
# environment. A missing HF_TOKEN raises KeyError on the lookup below.
hf_token = os.environ["HF_TOKEN"]
login(token=hf_token)

  from .autonotebook import tqdm as notebook_tqdm
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
# Load Dataset
import json
import os

# Directory that is scanned for *.json files when the dataset list is built.
DATA_DIR = "/workspace/human_dataset"

def load_data_from_json_files(directory):
    """Parse every ``*.json`` file directly inside ``directory``.

    Files are visited in sorted filename order so the returned list is
    reproducible across runs and machines (``os.listdir`` alone returns
    entries in arbitrary order).

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        A list with one element per JSON file: that file's parsed content.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        json.JSONDecodeError: If a ``.json`` file contains invalid JSON.
    """
    master_list = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".json"):
            continue
        filepath = os.path.join(directory, filename)
        # Skip sub-directories (or other non-files) that merely end in ".json".
        if not os.path.isfile(filepath):
            continue
        with open(filepath, "r", encoding="utf-8") as f:
            master_list.append(json.load(f))
    return master_list

# Build the in-memory dataset: one list element per JSON file found under DATA_DIR.
dataset = load_data_from_json_files(DATA_DIR)

In [None]:
from unsloth import FastModel

# Hugging Face Hub repository ids.
# NOTE(review): model_id is not used in this cell; presumably it names the base
# checkpoint the adapter was trained from — confirm.
model_id = "google/gemma-3-4b-it"
lora_id = "IamJunhee/Gemma3-Agricsense_lora"

# Load from the adapter repository id (lora_id, not model_id) with 4-bit weights.
model, tokenizer = FastModel.from_pretrained(
    model_name=lora_id,
    max_seq_length=20000,
    full_finetuning=False,
    load_in_4bit=True,
    load_in_8bit=False,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from trl import SFTConfig

args = SFTConfig(
    # --- Output / Hub ---
    output_dir=lora_id,                 # local output directory, named after the Hub repository id
    push_to_hub=True,                   # upload the result to the Hub
    save_strategy="epoch",              # write a checkpoint at the end of every epoch
    logging_steps=5,                    # log metrics every 5 steps
    # NOTE(review): None leaves the reporting backend to the library default rather
    # than selecting tensorboard; pass "none" to disable reporting — confirm.
    report_to=None,

    # --- Schedule / optimisation ---
    num_train_epochs=10,                # number of passes over the training set
    learning_rate=2e-4,                 # learning rate, based on QLoRA paper
    # NOTE(review): the plain "constant" scheduler appears to ignore warmup; use
    # "constant_with_warmup" if the warmup_ratio below is meant to apply — confirm.
    lr_scheduler_type="constant",
    warmup_ratio=0.03,                  # warmup ratio based on QLoRA paper
    max_grad_norm=0.3,                  # max gradient norm based on QLoRA paper
    optim="adamw_torch_fused",          # fused AdamW optimizer

    # --- Memory / precision ---
    per_device_train_batch_size=1,      # one sample per device per step
    gradient_accumulation_steps=4,      # accumulate gradients over 4 batches per optimizer update
    bf16=True,                          # train in bfloat16
    gradient_checkpointing=True,        # trade compute for memory
    gradient_checkpointing_kwargs={"use_reentrant": False},  # non-reentrant checkpointing

    # --- Dataset handling (required by the custom collator) ---
    dataset_text_field="",              # dummy field; the collator builds the text itself
    dataset_kwargs={"skip_prepare_dataset": True},  # leave samples untouched for the collator
    remove_unused_columns=False,        # keep every column so the collator can read it
    max_seq_length=20000,
    dataset_num_proc=4,
)

In [None]:
from PIL import Image
import numpy as np
import base64
from io import BytesIO

def load_and_process_image(b64: str) -> Image.Image:
    """Decode a base64-encoded image file and return it as a PIL image.

    Images with more than one band are returned exactly as PIL opened them.
    Single-band images are rebuilt as a 3-channel uint8 RGB image in which the
    pixel value is spread across the three channels.

    Args:
        b64: Base64 text of the encoded image file.

    Returns:
        The opened image, or the RGB re-encoding for single-band input.
    """
    image_data_bytes = BytesIO(base64.b64decode(b64))
    image = Image.open(image_data_bytes)
    channels = len(image.getbands())
    
    if channels == 1:
        img = np.array(image)
        height, width = img.shape
        three_channel_array = np.zeros((height, width, 3), dtype=np.uint8)
    
        if img.dtype == np.uint8:
            # Stretch 8-bit values to the 0..65535 range so that 8-bit input
            # goes through the same packing as wider input.
            img = img.astype(np.uint16)
            img = ((img / 255) * 65535).astype(np.uint16)
        
        # Spread the pixel value over three uint8 channels.
        # The notes below assume values in 0..65535 — TODO confirm for other PIL modes.
        # Channel 0: value // 1024 (0..63), doubled.
        three_channel_array[:, :, 0] = (img // 1024) * 2
        # NOTE(review): (img // 32) * 8 exceeds 255 once the value reaches 1024, so
        # what is stored depends on the cast into the uint8 array; presumably the
        # intent is to keep bits 5-9 of the value — confirm.
        three_channel_array[:, :, 1] = (img // 32) * 8
        # Channel 2: lowest 5 bits of the value, scaled by 8.
        three_channel_array[:, :, 2] = (img % 32) * 8
        image = Image.fromarray(three_channel_array, "RGB")
    
    return image

def process_vision_info(messages: list[dict]) -> list[Image.Image]:
    """Collect and decode every image referenced in a conversation.

    Each message's ``content`` (wrapped in a list when it is not one already)
    is scanned for dict elements that either carry an ``"image"`` key or have
    ``"type" == "image"``. Matching elements are decoded with
    ``load_and_process_image`` and converted to RGB.

    Args:
        messages: Conversation turns; each is a dict with an optional ``content``.

    Returns:
        The decoded RGB images, in the order they appear in ``messages``.
    """
    payloads = []
    for message in messages:
        parts = message.get("content", [])
        if not isinstance(parts, list):
            parts = [parts]

        for part in parts:
            # Only dict elements can describe an image.
            if not isinstance(part, dict):
                continue
            if "image" not in part and part.get("type") != "image":
                continue
            # NOTE(review): without a "base64" key the whole element dict is passed
            # on to the decoder — confirm whether that case is expected to occur.
            payloads.append(part["base64"] if "base64" in part else part)

    return [load_and_process_image(payload).convert("RGB") for payload in payloads]

In [None]:
# Create a data collator to encode text and image pairs
def collate_fn(examples):
    """Turn a list of conversation samples into a padded training batch.

    For every sample the chat template is rendered to text and the referenced
    images are decoded; the module-level ``tokenizer`` (called with both
    ``text`` and ``images``) then produces the padded tensors. ``labels`` is a
    copy of ``input_ids`` in which padding and image-related token ids are
    replaced by -100 so they do not contribute to the loss.

    Args:
        examples: Samples, each a dict with a ``"messages"`` conversation.

    Returns:
        The processor output with an added ``"labels"`` entry.
    """
    texts = []
    images = []
    for example in examples:
        image_inputs = process_vision_info(example["messages"])
        text = tokenizer.apply_chat_template(
            example["messages"], add_generation_prompt=False, tokenize=False
        )
        texts.append(text.strip())
        images.append(image_inputs)

    # Tokenize the texts and process the images
    batch = tokenizer(text=texts, images=images, return_tensors="pt", padding=True)

    # The labels start as a copy of the input ids.
    labels = batch["input_ids"].clone()

    inner_tokenizer = tokenizer.tokenizer
    # Keep this a scalar: comparing the tensor against a one-element *list*
    # does not compare elementwise (the expression collapses to a plain
    # False), which previously left the begin-of-image token unmasked.
    boi_token_id = inner_tokenizer.convert_tokens_to_ids(
        inner_tokenizer.special_tokens_map["boi_token"]
    )

    # Token ids excluded from the loss.
    # NOTE(review): 262144 is hard-coded; presumably the image placeholder
    # token id of this model's vocabulary — confirm against the tokenizer.
    ignored_token_ids = (inner_tokenizer.pad_token_id, boi_token_id, 262144)
    for token_id in ignored_token_ids:
        labels[labels == token_id] = -100

    batch["labels"] = labels
    return batch

In [None]:
# Dry-run the collator on every sample to measure its token length, and drop
# samples that are too long or that the collator cannot process at all.
# NOTE(review): the cut-off here is 25000 while max_seq_length elsewhere in the
# notebook is 20000 — confirm the difference is intentional.
indices_to_del = []

for i, sample in enumerate(dataset):
    try:
        batch = collate_fn([sample])
        token_len = batch["input_ids"].shape[1]
    except Exception as e:
        print(f"{i} : error ({e}) -> will be deleted")
        indices_to_del.append(i)
        continue

    if token_len > 25000:
        indices_to_del.append(i)
        print(f"{i} : {token_len} -> will be deleted")
    else:
        print(f"{i} : {token_len}")

# Remove the flagged samples in place so the same list object is kept.
rejected = set(indices_to_del)
dataset[:] = [sample for i, sample in enumerate(dataset) if i not in rejected]

print(f"Deletion complete. New dataset size: {len(dataset)}")

In [None]:
from trl import SFTTrainer

# Build the trainer around the filtered dataset and the custom multimodal collator.
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    data_collator=collate_fn,
    tokenizer=tokenizer,
)

# NOTE(review): looks redundant with the constructor argument; presumably it
# guards against the trainer substituting its own collator — confirm before removing.
trainer.data_collator = collate_fn

[2025-03-29 07:24:16,844] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Run the fine-tuning loop using the settings in `args` (checkpoints are written
# according to save_strategy; push_to_hub=True is set in the config).
trainer.train()

# Explicitly save the final model once training has finished.
trainer.save_model()

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.
It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `flash_attention_2`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
5,12.1102
