# Qwen2.5-VL + Sensor Training Walkthrough

이 노트북은 `model_with_sensor.py` 기반 학습 파이프라인을 한 단계씩 확인하기 위한 참고용 스크립트입니다.
각 셀을 순서대로 실행하면서 데이터 로더, 캐시 생성, 최종 학습 루프가 모두 동작하는지 점검할 수 있습니다.


In [2]:
# ✅ 프로젝트 경로 및 기본 환경 설정
import os
from pathlib import Path
import torch

PROJECT_ROOT = Path(r"/home/najo/NAS/VLA/Insertion_VLA")
os.chdir(PROJECT_ROOT)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Project root: {{PROJECT_ROOT}}')
print(f'Using device: {{device}}')
if device.type == 'cuda':
    print(f'CUDA device name: {torch.cuda.get_device_name(device)}')


Project root: {PROJECT_ROOT}
Using device: {device}
CUDA device name: NVIDIA GeForce RTX 3090


## 1. 학습/캐시 구성 파라미터
필요에 따라 경로와 하이퍼파라미터를 수정하세요. 데이터 루트는 실제 JSON/이미지 파일이 존재하는 위치여야 합니다.


In [12]:
from datetime import datetime
from pathlib import Path

DATA_ROOT = Path('/home/najo/NAS/VLA/dataset')  # ⚠️ 데이터 경로를 실제 위치로 수정하세요.
CACHE_DIR = Path('/home/najo/NAS/VLA/dataset/cache/qwen_vl_features')
VL_MODEL_NAME = 'Qwen/Qwen2.5-VL-3B-Instruct'

# 학습 하이퍼파라미터
BATCH_SIZE = 1
VAL_RATIO = 0.05
NUM_EPOCHS = 1
GRAD_ACCUM_STEPS = 8
BASE_LR = 1e-4
VL_LR = 1e-5
VISION_LR = 5e-6
MIN_LR = 1e-6
WARMUP_RATIO = 0.03
HOLD_RATIO = 0.02
SCHED_ON = 'step'  # 'step' 또는 'epoch'

# VL 백본 파인튜닝 옵션
FINETUNE_VL = 'lora'  # {'none', 'lora', 'full'}
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
UNFREEZE_LAST_N = 2

# 캐시 설정
CACHE_MODE = 'auto'  # {'auto', 'strict', 'off'}
CACHE_MAX_GB = 20.0
CACHE_WARMUP_SAMPLES = 16  # 캐시 리허설용 샘플 수

RUN_ID = datetime.now().strftime('notebook_%m%d_%H%M')
CHECKPOINT_PATH = Path('./checkpoints') / f'{RUN_ID}_latest.pt'
CHECKPOINT_PATH.parent.mkdir(exist_ok=True, parents=True)
print(f'Checkpoints will be saved to: {CHECKPOINT_PATH}')


Checkpoints will be saved to: checkpoints/notebook_1028_0144_latest.pt


## 2. 데이터셋 로딩
Meca500 / ZED JSON 경로를 자동으로 수집한 뒤 `insertionMeca500Dataset`으로 불러옵니다.
데이터 구조가 다르다면 이 셀을 수정해 주세요.


In [13]:
from Total_Dataset import insertionMeca500Dataset, collate_fn
from torch.utils.data import ConcatDataset, random_split, DataLoader, Subset

assert DATA_ROOT.exists(), f'Data root not found: {DATA_ROOT}'
meca_jsons = sorted(DATA_ROOT.glob('OCT_insertion/Captures*/Captures*_precise_9views.json'))
zed_jsons = sorted(DATA_ROOT.glob('part*/ZED_Captures_*th/ZED_Captures_*th_precise_8views.json'))

if not meca_jsons:
    raise FileNotFoundError('Meca500 JSON 경로를 찾을 수 없습니다. DATA_ROOT를 확인하세요.')
if not zed_jsons:
    raise FileNotFoundError('ZED JSON 경로를 찾을 수 없습니다. DATA_ROOT를 확인하세요.')

def build_insertion_dataset(json_paths, horizon=8):
    return ConcatDataset([insertionMeca500Dataset(json_path=str(p), horizon=horizon) for p in json_paths])

meca_ds = build_insertion_dataset(meca_jsons)
zed_ds = build_insertion_dataset(zed_jsons)
full_dataset = ConcatDataset([meca_ds, zed_ds])
print(f'Total samples: {len(full_dataset)} (Meca: {len(meca_ds)}, ZED: {len(zed_ds)})')

val_len = max(1, int(len(full_dataset) * VAL_RATIO)) if len(full_dataset) > 1 else 0
train_len = len(full_dataset) - val_len
if val_len > 0 and train_len > 0:
    train_dataset, val_dataset = random_split(full_dataset, [train_len, val_len])
else:
    train_dataset, val_dataset = full_dataset, None

loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=4,
    collate_fn=collate_fn,
    pin_memory=(device.type == 'cuda'),
)
if loader_kwargs['num_workers'] > 0:
    loader_kwargs['persistent_workers'] = True
    loader_kwargs['prefetch_factor'] = max(2, BATCH_SIZE)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = None
if val_dataset is not None and len(val_dataset) > 0:
    val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

example_batch = next(iter(train_loader))
print(f"Loaded example batch -> actions shape: {example_batch['actions'].shape}")


Total samples: 9832 (Meca: 1953, ZED: 7879)
Loaded example batch -> actions shape: torch.Size([1, 8, 7])


## 3. 모델 초기화 및 캐시 설정
`Not_freeze_QwenVLAWithSensor`를 그대로 사용합니다. 필요한 경우 센서 모듈 활성화나 파인튜닝 모드를 변경하세요.


In [14]:
from model_with_sensor import Not_freeze_QwenVLAWithSensor
from A5st_VLA_TRAIN_VL_Lora import unwrap_model

model = Not_freeze_QwenVLAWithSensor(
    vl_model_name=VL_MODEL_NAME,
    action_dim=7,
    horizon=8,
    hidden_dim=1024,
    finetune_vl=FINETUNE_VL,
    lora_r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    unfreeze_last_n=UNFREEZE_LAST_N,
    sensor_enabled=False,
    fusion_strategy='none',
    cache_dir=str(CACHE_DIR),
).to(device)

module_ref = unwrap_model(model)
module_ref.set_cache_limit(CACHE_MAX_GB)
if CACHE_MODE == 'strict':
    module_ref.set_cache(True)
    module_ref.set_strict_cache(True)
elif CACHE_MODE == 'off':
    module_ref.set_cache(False)
    module_ref.set_strict_cache(False)
else:
    module_ref.set_cache(True)
    module_ref.set_strict_cache(False)

trainable_params = sum(p.numel() for p in module_ref.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in module_ref.parameters())
print(f'Trainable params: {trainable_params/1e6:.2f}M / Total: {total_params/1e6:.2f}M')


🚀 Loading Trainable Qwen-VL-Sensor Model
   VL Fine-tuning: lora
   Sensor Enabled: False
   Fusion Strategy: none
🧠 Trying attn_implementation=flash_attention_2 with dtype=torch.bfloat16...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Successfully loaded with flash_attention_2 (torch.bfloat16)
✅ QwenActionExpertWithSensor initialized with fusion strategy: none
💡 Applying LoRA fine-tuning...
Trainable params: 76.68M / Total: 3831.30M


## 4. (선택) VL 캐시 리허설
대규모 학습 전에 소수의 샘플로 캐시 생성이 정상 동작하는지 확인합니다.


In [16]:
from Make_VL_cache import build_vl_cache_distributed_optimized

if CACHE_MODE != 'off':
    warmup_count = min(CACHE_WARMUP_SAMPLES, len(train_dataset))
    cache_subset = train_dataset if warmup_count == len(train_dataset) else Subset(train_dataset, range(warmup_count))
    print(f'Building cache for {len(cache_subset)} samples...')
    build_vl_cache_distributed_optimized(
        model=module_ref,
        dataset=cache_subset,
        device=device,
        batch_size=BATCH_SIZE,
        max_cache_gb=CACHE_MAX_GB,
    )
else:
    print('Cache mode is OFF — skipping cache warmup.')


Building cache for 16 samples...


Building VL cache:   0%|          | 0/16 [00:00<?, ?it/s]

Building VL cache:   0%|          | 0/16 [00:02<?, ?it/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:3! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)

## 5. 옵티마이저 & 스케줄러 구성
학습 스크립트와 동일한 weight decay 분할 로직을 재사용합니다.


In [None]:
import math
from torch.optim import AdamW
from 5st_VLA_TRAIN_VL_Lora import build_trapezoid_scheduler

def wd_filter(name, param):
    if param.ndim == 1:
        return False
    if name.endswith('.bias'):
        return False
    return True

ae_named = list(module_ref.action_expert.named_parameters())
vl_named = list(module_ref.vl_model.named_parameters())

ae_decay = [p for n, p in ae_named if wd_filter(n, p) and p.requires_grad]
ae_n_decay = [p for n, p in ae_named if not wd_filter(n, p) and p.requires_grad]
vl_decay = [p for n, p in vl_named if wd_filter(n, p) and p.requires_grad]
vl_n_decay = [p for n, p in vl_named if not wd_filter(n, p) and p.requires_grad]
vision_decay, vision_n_decay = [], []
for n, p in vl_named:
    if not p.requires_grad:
        continue
    if any(key in n for key in ('vision', 'visual', 'vision_tower')):
        (vision_decay if wd_filter(n, p) else vision_n_decay).append(p)

param_groups = [
    {'params': ae_decay, 'lr': BASE_LR, 'weight_decay': 0.01},
    {'params': ae_n_decay, 'lr': BASE_LR, 'weight_decay': 0.0},
]
if FINETUNE_VL == 'lora':
    param_groups += [
        {'params': vl_decay, 'lr': VL_LR, 'weight_decay': 0.01},
        {'params': vl_n_decay, 'lr': VL_LR, 'weight_decay': 0.0},
    ]
elif FINETUNE_VL == 'full':
    param_groups += [
        {'params': vision_decay, 'lr': VISION_LR, 'weight_decay': 0.01},
        {'params': vision_n_decay, 'lr': VISION_LR, 'weight_decay': 0.0},
        {'params': vl_decay, 'lr': VL_LR, 'weight_decay': 0.01},
        {'params': vl_n_decay, 'lr': VL_LR, 'weight_decay': 0.0},
    ]

optimizer = AdamW(param_groups, betas=(0.9, 0.999), eps=1e-8)

steps_per_epoch = math.ceil(len(train_loader) / max(1, GRAD_ACCUM_STEPS))
total_steps = max(1, steps_per_epoch * NUM_EPOCHS)
scheduler = build_trapezoid_scheduler(
    optimizer,
    total_steps=total_steps,
    base_lr=BASE_LR,
    min_lr=MIN_LR,
    warmup_ratio=WARMUP_RATIO,
    hold_ratio=HOLD_RATIO,
)
print(f'Scheduler total steps: {total_steps}')


## 6. 학습 루프 실행
`Train` 함수를 그대로 호출하되 W&B 로깅은 비활성화하여 노트북 실험에 집중합니다.


In [None]:
from 5st_VLA_TRAIN_VL_Lora import Train

Train(
    model=model,
    data_loader=train_loader,
    optimizer=optimizer,
    num_epochs=NUM_EPOCHS,
    grad_accum_steps=GRAD_ACCUM_STEPS,
    device=device,
    save_path=str(CHECKPOINT_PATH),
    scheduler=scheduler,
    sched_on=SCHED_ON,
    val_loader=val_loader,
    start_epoch=0,
    enable_wandb=False,
)
print('Training run finished.')


## 7. 추론/검증 샘플 확인
학습 후 모델이 배치 하나를 처리하는지 간단히 점검합니다.


In [None]:
model.eval()
with torch.no_grad():
    sample = next(iter(train_loader))
    actions = sample['actions'].to(device, dtype=torch.bfloat16)
    preds, _ = model(
        text_inputs=sample['instruction'],
        image_inputs=sample['images'],
        z_chunk=actions,
        sensor_data=sample.get('sensor_data'),
        cache_keys=sample['cache_keys'],
    )
print(f'Predicted actions shape: {preds.shape}')
