In [1]:
import os
import sys
import json
import torch
import cv2
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim

# Make CUDA kernel launches synchronous so an error is reported at the call
# that actually failed (debugging aid).
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Make the local VideoMAE checkout importable, then pull in its model factory.
sys.path.append("/home/ollo/VideoMAE")
from modeling_finetune import vit_base_patch16_224

# ✅ Ego4D Dataset
class Ego4DDataset(Dataset):
    """Video-classification dataset driven by an annotation JSON file.

    The annotation file must be a JSON object with an ``"annotations"`` list.
    For each entry, ``"video_url"`` is joined onto ``video_root`` to locate the
    clip and the first element of ``"label"`` is used as the class index.

    Args:
        annotation_file: Path to the annotation JSON (read as UTF-8).
        video_root: Directory that ``video_url`` values are relative to.
        transform: Optional callable applied to every decoded RGB frame.
            When omitted, raw frames are converted to channel-first tensors
            so they can still be stacked.
        num_frames: Number of frames sampled per clip.
        num_classes: Number of classes; labels must satisfy
            ``0 <= label < num_classes``.
    """

    def __init__(self, annotation_file, video_root, transform=None, num_frames=16,
                 num_classes=57):
        with open(annotation_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        self.annotations = data["annotations"]
        self.video_root = video_root
        self.transform = transform
        self.num_frames = num_frames
        self.num_classes = num_classes

    def __len__(self):
        """Return the number of annotated clips."""
        return len(self.annotations)

    def _read_frames(self, video_path):
        """Decode up to ``num_frames`` evenly spaced frames from ``video_path``.

        Stops at the first frame that cannot be read, so the returned list may
        be shorter than ``num_frames`` (or empty).
        """
        frames = []
        cap = cv2.VideoCapture(video_path)
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            for i in range(self.num_frames):
                frame_id = int(i * total_frames / self.num_frames)
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id)
                ret, frame = cap.read()
                if not ret:
                    print(f"⚠️ フレーム取得失敗: {frame_id} @ {video_path}")
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                if self.transform is not None:
                    frame = self.transform(frame)
                else:
                    # torch.stack only accepts tensors; move channels first so
                    # the later permute yields the same layout as with a
                    # ToTensor-style transform.
                    frame = torch.from_numpy(frame).permute(2, 0, 1)
                frames.append(frame)
        finally:
            # Release the capture even if decoding or the transform raised.
            cap.release()
        return frames

    def __getitem__(self, idx):
        """Return ``(video_tensor, label)`` for the clip at ``idx``.

        ``video_tensor`` is the stacked frames permuted to ``[C, T, H, W]``
        (assuming each frame tensor is ``[C, H, W]``). Clips that yield fewer
        than ``num_frames`` frames are padded by repeating the last frame.

        Raises:
            FileNotFoundError: The clip file does not exist.
            ValueError: The label is outside ``[0, num_classes)``.
            RuntimeError: No frame could be decoded.
        """
        try:
            item = self.annotations[idx]
            video_path = os.path.join(self.video_root, item["video_url"])

            if not os.path.exists(video_path):
                raise FileNotFoundError(f"動画ファイルが見つかりません: {video_path}")

            label = int(item["label"][0])
            if not (0 <= label < self.num_classes):
                raise ValueError(f"❌ 無効なラベル値: {label}（範囲外）")

            frames = self._read_frames(video_path)
            if not frames:
                raise RuntimeError(f"❌ フレームが取得できません: {video_path}")

            # Pad short clips by repeating the last decoded frame.
            frames.extend([frames[-1]] * (self.num_frames - len(frames)))

            video_tensor = torch.stack(frames).permute(1, 0, 2, 3)  # [C, T, H, W]
            return video_tensor, label

        except Exception as e:
            # Log the failing index (useful inside DataLoader workers), then
            # let the original exception propagate.
            print(f"❌ __getitem__ エラー at idx={idx}: {e}")
            raise

# ✅ トレーニング関数
def train_model():
    """Fine-tune a VideoMAE ViT-B classifier on the annotated clips and save it.

    Builds train/validation loaders from the two annotation JSON files, loads
    the pretrained checkpoint without its classification head, trains for a
    fixed number of epochs with AdamW and cross-entropy, prints the summed
    training loss and the validation accuracy after every epoch, and finally
    writes the model ``state_dict`` to ``finetuned_model.pth`` inside the
    annotation directory.
    """
    # Size of the classification head; also the exclusive upper bound used for
    # the label sanity check in the training loop.
    num_classes = 57

    # Paths
    annotation_dir = "/home/ollo/videomae-clean"
    video_root = "/srv/shared/data/ego4d/short_clips/verb_annotation_simple"
    checkpoint_path = "/home/ollo/VideoMAE/checkpoints/vit_b_hybrid_pt_800e_k710_ft.pth"

    train_json = os.path.join(annotation_dir, "20250512_annotations_train.json")
    val_json = os.path.join(annotation_dir, "20250512_annotations_val.json")

    # NOTE(review): frames are only resized and scaled to [0, 1]; no mean/std
    # normalization is applied. Confirm whether the pretrained weights expect
    # normalized input.
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])

    # Datasets and loaders
    train_dataset = Ego4DDataset(train_json, video_root, transform)
    val_dataset = Ego4DDataset(val_json, video_root, transform)

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=4)

    # Build the model and move it to the device before loading weights
    # (original author's note: done early to avoid meta tensors).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = vit_base_patch16_224(all_frames=16, img_size=224, use_checkpoint=True,
                                 num_classes=num_classes)
    model.to(device)

    # Debug aid: report any parameter still living on the meta device.
    for name, param in model.named_parameters():
        if param.device.type == 'meta':
            print(f"❌ Meta tensor detected in: {name}")

    # Load the pretrained checkpoint.
    # NOTE: torch.load unpickles the file; only load checkpoints from a
    # trusted source.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")

    # The weights may be nested under "module" or "model", or stored flat.
    if "module" in checkpoint:
        state_dict = checkpoint["module"]
    elif "model" in checkpoint:
        state_dict = checkpoint["model"]
    else:
        state_dict = checkpoint

    # Drop the pretrained head so the freshly initialised head is kept.
    new_state_dict = {k: v for k, v in state_dict.items() if not k.startswith("head.")}
    missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)
    print("🔍 missing keys:", missing_keys)
    print("🔍 unexpected keys:", unexpected_keys)

    optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.05)
    criterion = nn.CrossEntropyLoss()
    num_epochs = 5

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        for videos, labels in train_loader:
            videos, labels = videos.to(device), labels.to(device)

            # Skip batches with out-of-range labels instead of letting the
            # loss computation fail on them.
            if labels.min() < 0 or labels.max() >= num_classes:
                print(f"❌ 不正なラベル検出: {labels}")
                continue

            optimizer.zero_grad()
            outputs = model(videos)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # total_loss is the sum of per-batch losses for this epoch.
        print(f"🔁 Epoch {epoch+1}/{num_epochs} | Train Loss: {total_loss:.4f}")

        # Validation
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for videos, labels in val_loader:
                videos, labels = videos.to(device), labels.to(device)
                outputs = model(videos)
                preds = torch.argmax(outputs, dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

        # An empty validation set would otherwise raise ZeroDivisionError.
        acc = 100.0 * correct / total if total else 0.0
        print(f"✅ Val Accuracy: {acc:.2f}%")

    # Save the fine-tuned weights.
    save_path = os.path.join(annotation_dir, "finetuned_model.pth")
    torch.save(model.state_dict(), save_path)
    print(f"💾 モデル保存完了: {save_path}")

# Run training when this code is executed directly.
if __name__ == "__main__":
    train_model()


  from .autonotebook import tqdm as notebook_tqdm
  @register_model
  @register_model
  @register_model
  @register_model
  @register_model


🔍 missing keys: ['head.weight', 'head.bias']
🔍 unexpected keys: []
❌ __getitem__ エラー at idx=1619: ❌ 無効なラベル値: 57（範囲外）
❌ __getitem__ エラー at idx=54216: ❌ 無効なラベル値: 57（範囲外）




ValueError: Caught ValueError in DataLoader worker process 1.
Original Traceback (most recent call last):
  File "/home/ollo/.cache/pypoetry/virtualenvs/videomae-clean-Ug0YGy1k-py3.11/lib/python3.11/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/ollo/.cache/pypoetry/virtualenvs/videomae-clean-Ug0YGy1k-py3.11/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ollo/.cache/pypoetry/virtualenvs/videomae-clean-Ug0YGy1k-py3.11/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
            ~~~~~~~~~~~~^^^^^
  File "/var/tmp/ipykernel_2010762/2674311221.py", line 42, in __getitem__
    raise ValueError(f"❌ 無効なラベル値: {label}（範囲外）")
ValueError: ❌ 無効なラベル値: 57（範囲外）


In [2]:
import os
import sys
import json
import torch
import cv2
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim

# Make CUDA kernel launches synchronous so an error is reported at the call
# that actually failed (debugging aid).
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Make the local VideoMAE checkout importable, then pull in its model factory.
sys.path.append("/home/ollo/VideoMAE")
from modeling_finetune import vit_base_patch16_224

# ✅ Ego4D Dataset
class Ego4DDataset(Dataset):
    """Video-classification dataset driven by an annotation JSON file.

    The annotation file must be a JSON object with an ``"annotations"`` list.
    For each entry, ``"video_url"`` is joined onto ``video_root`` to locate the
    clip and the first element of ``"label"`` is used as the class index.

    Args:
        annotation_file: Path to the annotation JSON (read as UTF-8).
        video_root: Directory that ``video_url`` values are relative to.
        transform: Optional callable applied to every decoded RGB frame.
            When omitted, raw frames are converted to channel-first tensors
            so they can still be stacked.
        num_frames: Number of frames sampled per clip.
        num_classes: Number of classes; labels must satisfy
            ``0 <= label < num_classes``.
    """

    def __init__(self, annotation_file, video_root, transform=None, num_frames=16,
                 num_classes=58):
        with open(annotation_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        self.annotations = data["annotations"]
        self.video_root = video_root
        self.transform = transform
        self.num_frames = num_frames
        self.num_classes = num_classes

    def __len__(self):
        """Return the number of annotated clips."""
        return len(self.annotations)

    def _read_frames(self, video_path):
        """Decode up to ``num_frames`` evenly spaced frames from ``video_path``.

        Stops at the first frame that cannot be read, so the returned list may
        be shorter than ``num_frames`` (or empty).
        """
        frames = []
        cap = cv2.VideoCapture(video_path)
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            for i in range(self.num_frames):
                frame_id = int(i * total_frames / self.num_frames)
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id)
                ret, frame = cap.read()
                if not ret:
                    print(f"⚠️ フレーム取得失敗: {frame_id} @ {video_path}")
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                if self.transform is not None:
                    frame = self.transform(frame)
                else:
                    # torch.stack only accepts tensors; move channels first so
                    # the later permute yields the same layout as with a
                    # ToTensor-style transform.
                    frame = torch.from_numpy(frame).permute(2, 0, 1)
                frames.append(frame)
        finally:
            # Release the capture even if decoding or the transform raised.
            cap.release()
        return frames

    def __getitem__(self, idx):
        """Return ``(video_tensor, label)`` for the clip at ``idx``.

        ``video_tensor`` is the stacked frames permuted to ``[C, T, H, W]``
        (assuming each frame tensor is ``[C, H, W]``). Clips that yield fewer
        than ``num_frames`` frames are padded by repeating the last frame.

        Raises:
            FileNotFoundError: The clip file does not exist.
            ValueError: The label is outside ``[0, num_classes)``.
            RuntimeError: No frame could be decoded.
        """
        try:
            item = self.annotations[idx]
            video_path = os.path.join(self.video_root, item["video_url"])

            if not os.path.exists(video_path):
                raise FileNotFoundError(f"動画ファイルが見つかりません: {video_path}")

            label = int(item["label"][0])
            if not (0 <= label < self.num_classes):
                raise ValueError(f"❌ 無効なラベル値: {label}（範囲外）")

            frames = self._read_frames(video_path)
            if not frames:
                raise RuntimeError(f"❌ フレームが取得できません: {video_path}")

            # Pad short clips by repeating the last decoded frame.
            frames.extend([frames[-1]] * (self.num_frames - len(frames)))

            video_tensor = torch.stack(frames).permute(1, 0, 2, 3)  # [C, T, H, W]
            return video_tensor, label

        except Exception as e:
            # Log the failing index (useful inside DataLoader workers), then
            # let the original exception propagate.
            print(f"❌ __getitem__ エラー at idx={idx}: {e}")
            raise

# ✅ トレーニング関数
def train_model():
    """Fine-tune a VideoMAE ViT-B classifier on the annotated clips and save it.

    Builds train/validation loaders from the two annotation JSON files, loads
    the pretrained checkpoint without its classification head, trains for a
    fixed number of epochs with AdamW and cross-entropy, prints the summed
    training loss and the validation accuracy after every epoch, and finally
    writes the model ``state_dict`` to ``finetuned_model.pth`` inside the
    annotation directory.
    """
    # Size of the classification head; also the exclusive upper bound used for
    # the label sanity check in the training loop.
    num_classes = 58

    # Paths
    annotation_dir = "/home/ollo/videomae-clean"
    video_root = "/srv/shared/data/ego4d/short_clips/verb_annotation_simple"
    checkpoint_path = "/home/ollo/VideoMAE/checkpoints/vit_b_hybrid_pt_800e_k710_ft.pth"

    train_json = os.path.join(annotation_dir, "20250512_annotations_train.json")
    val_json = os.path.join(annotation_dir, "20250512_annotations_val.json")

    # NOTE(review): frames are only resized and scaled to [0, 1]; no mean/std
    # normalization is applied. Confirm whether the pretrained weights expect
    # normalized input.
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])

    # Datasets and loaders
    train_dataset = Ego4DDataset(train_json, video_root, transform)
    val_dataset = Ego4DDataset(val_json, video_root, transform)

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=4)

    # Build the model and move it to the device before loading weights.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = vit_base_patch16_224(all_frames=16, img_size=224, use_checkpoint=True,
                                 num_classes=num_classes)
    model.to(device)

    # Optional debug aid: report any parameter still living on the meta device.
    for name, param in model.named_parameters():
        if param.device.type == 'meta':
            print(f"❌ Meta tensor detected: {name}")

    # Load the pretrained checkpoint (its head is discarded below).
    # NOTE: torch.load unpickles the file; only load checkpoints from a
    # trusted source.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    # The weights may be nested under "module" or "model", or stored flat.
    if "module" in checkpoint:
        state_dict = checkpoint["module"]
    elif "model" in checkpoint:
        state_dict = checkpoint["model"]
    else:
        state_dict = checkpoint

    # Drop the pretrained head so the freshly initialised head is kept.
    new_state_dict = {k: v for k, v in state_dict.items() if not k.startswith("head.")}
    missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)
    print("🔍 missing keys:", missing_keys)
    print("🔍 unexpected keys:", unexpected_keys)

    optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.05)
    criterion = nn.CrossEntropyLoss()
    num_epochs = 5

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        for videos, labels in train_loader:
            videos, labels = videos.to(device), labels.to(device)

            # Skip batches with out-of-range labels instead of letting the
            # loss computation fail on them.
            if labels.min() < 0 or labels.max() >= num_classes:
                print(f"❌ 不正なラベル検出: {labels}")
                continue

            optimizer.zero_grad()
            outputs = model(videos)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # total_loss is the sum of per-batch losses for this epoch.
        print(f"🔁 Epoch {epoch+1}/{num_epochs} | Train Loss: {total_loss:.4f}")

        # Validation
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for videos, labels in val_loader:
                videos, labels = videos.to(device), labels.to(device)
                outputs = model(videos)
                preds = torch.argmax(outputs, dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

        # An empty validation set would otherwise raise ZeroDivisionError.
        acc = 100.0 * correct / total if total else 0.0
        print(f"✅ Val Accuracy: {acc:.2f}%")

    # Save the fine-tuned weights.
    save_path = os.path.join(annotation_dir, "finetuned_model.pth")
    torch.save(model.state_dict(), save_path)
    print(f"💾 モデル保存完了: {save_path}")

# Run training when this code is executed directly.
if __name__ == "__main__":
    train_model()


🔍 missing keys: ['head.weight', 'head.bias']
🔍 unexpected keys: []


KeyboardInterrupt: 