In [2]:
import json
import os
import glob

# Directory that is scanned for "log_*.json" dialogue logs; the per-round
# "<scene>_run<idx>_r<round>_ego.jpg" images are looked up in the same place.
LOG_DIR = "./dataset_logs"
# JSON file the assembled SFT dataset is written to.
OUTPUT_FILE = "vlm_sft_dataset.json"


def _round_image_path(log_dir, scene_id, run_idx, round_idx):
    """Return the expected ego-camera image path for one round of a run."""
    # e.g. "<log_dir>/0010_run0_r0_ego.jpg"
    return os.path.join(log_dir, f"{scene_id}_run{run_idx}_r{round_idx}_ego.jpg")


def _build_conversation(data, log_dir):
    """Turn one parsed log dict into a list of chat messages.

    Images are attached in round order (r0, r1, ...). The round counter only
    advances when an image was actually attached, so a message is never left
    pointing at a file that does not exist on disk.
    """
    scene_id = data.get("scene", "")
    run_idx = data.get("run_idx", 0)

    conversation = []
    round_idx = 0

    for message in data.get("dialogue", []):
        role = message.get("role")
        content = message.get("content")

        if role == "system":
            # System prompt stays text-only.
            conversation.append({"role": "system", "content": content})

            # Follow it with the current camera view, but only when that image
            # exists (same guard as the user branch below).
            img_path = _round_image_path(log_dir, scene_id, run_idx, round_idx)
            if os.path.exists(img_path):
                conversation.append(
                    {
                        "role": "user",
                        "content": [
                            {"type": "image", "image": img_path},
                            {
                                "type": "text",
                                "text": "This is the current camera view.",
                            },
                        ],
                    }
                )
                round_idx += 1

        elif role == "user":
            img_path = _round_image_path(log_dir, scene_id, run_idx, round_idx)
            if os.path.exists(img_path):
                # Multimodal user turn: image first, then the original text.
                user_turn = {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": img_path},
                        {"type": "text", "text": content},
                    ],
                }
                round_idx += 1
            else:
                # Image missing for this round: keep the turn text-only.
                user_turn = {"role": "user", "content": content}
            conversation.append(user_turn)

        elif role == "assistant":
            conversation.append({"role": "assistant", "content": content})

        # Messages with any other role are dropped.

    return conversation


def create_sft_dataset(log_dir, output_file=None):
    """Build a multimodal SFT dataset from the dialogue logs in ``log_dir``.

    Every ``log_*.json`` file in ``log_dir`` becomes one conversation (a list
    of ``{"role", "content"}`` messages). Logs are processed in sorted filename
    order so the output is reproducible across runs and filesystems.

    Args:
        log_dir: Directory containing the ``log_*.json`` files and the
            ``<scene>_run<idx>_r<round>_ego.jpg`` images.
        output_file: Path of the JSON file to write. Defaults to the
            module-level ``OUTPUT_FILE`` when omitted.

    Returns:
        The list of conversations that was written to ``output_file``.

    A log that cannot be read or parsed is reported on stdout and skipped;
    it does not abort the whole run.
    """
    if output_file is None:
        output_file = OUTPUT_FILE

    sft_data = []

    # glob order is filesystem-dependent; sort for a deterministic dataset.
    json_files = sorted(glob.glob(os.path.join(log_dir, "log_*.json")))

    print(f"Found {len(json_files)} log files. Processing...")

    for json_path in json_files:
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            sft_data.append(_build_conversation(data, log_dir))
        except Exception as e:
            # Deliberately best-effort: one malformed log must not lose the rest.
            print(f"Error processing {json_path}: {e}")

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(sft_data, f, indent=4)

    print(f"Successfully processed {len(sft_data)} dialogues.")
    print(f"Saved to {output_file}")

    return sft_data


# Build the dataset from LOG_DIR when this cell/script is run directly
# (skipped if the code is imported as a module).
if __name__ == "__main__":
    create_sft_dataset(LOG_DIR)

Found 20 log files. Processing...
Successfully processed 20 dialogues.
Saved to vlm_sft_dataset.json


## Load Model

In [6]:
%pip install -U transformers
%pip install -U peft bitsandbytes accelerate datasets trl qwen-vl-utils
%pip install torch torchvision torchaudio

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting torchvision
  Downloading torchvision-0.24.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (5.9 kB)
Collecting torchaudio
  Downloading torchaudio-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (6.9 kB)
Downloading torchvision-0.24.1-cp312-cp312-manylinux_2_28_x86_64.whl (8.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.0/8.0 MB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading torchaudio-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: torchvision, torchaudio
Successfully installed torchaudio-2.9.1 torchvision-0.24.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig

MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"

# Per-image pixel budget passed to the processor. Lowering MAX_PIXELS shrinks
# the image inputs and saves VRAM significantly.
# NOTE(review): the 28*28 factor follows the Qwen2-VL convention — confirm it is
# the intended unit for the Qwen3-VL processor.
MIN_PIXELS = 256 * 28 * 28
MAX_PIXELS = 640 * 28 * 28

# Configure 4-bit quantization to fit in VRAM: NF4 weights with double
# quantization, bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load Processor
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=MIN_PIXELS,
    max_pixels=MAX_PIXELS,
)

# Load Model
# `dtype` replaces the deprecated `torch_dtype` keyword (the previous run of
# this cell printed "`torch_dtype` is deprecated! Use `dtype` instead!").
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=bnb_config,
    dtype=torch.bfloat16,
)

video_preprocessor_config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.72G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/269 [00:00<?, ?B/s]