In [7]:
import os

# Ask CUDA to run kernels synchronously so an error is reported at the call
# that caused it rather than at some later API call.
os.environ.update({"CUDA_LAUNCH_BLOCKING": "1"})


In [10]:
import sys
import os
import torch
import json
import cv2

# Enable synchronous CUDA error reporting (important). The variable has to be
# in the environment before the first CUDA call of the process to matter.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Make the VideoMAE sources importable. `__file__` is not defined inside a
# Jupyter kernel, so the repository path is spelled out explicitly; adjust it
# to your environment.
source_dir = "/home/ollo/VideoMAE"
if source_dir not in sys.path:  # re-running the cell must not pile up duplicates
    sys.path.append(source_dir)

from modeling_finetune import vit_base_patch16_224


def load_model():
    """Build the VideoMAE ViT-B/16 model, load the pretrained backbone and smoke-test it.

    The checkpoint's classification head is discarded so that the freshly
    initialised 57-class head is kept; every remaining tensor is loaded
    (non-strictly) and then compared against the checkpoint. The model is
    exercised with a random clip on the CPU (forward only) and afterwards on
    the selected device (forward + backward).

    Returns:
        The model, in eval mode on the selected device with gradients cleared,
        when every check passes; ``None`` (after printing the reason) otherwise.
    """
    checkpoint_path = "/home/ollo/VideoMAE/checkpoints/vit_b_hybrid_pt_800e_k710_ft.pth"

    print("🔍 Checkpoint を読み込み中...")
    # NOTE(review): torch.load unpickles the file -- only use trusted checkpoints.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    print(f"✅ 読み込み成功。キー: {list(checkpoint.keys())}")

    state_dict = checkpoint["module"]

    print("🔧 モデル構築中...")
    model = vit_base_patch16_224(
        all_frames=16,
        img_size=224,
        use_checkpoint=True,
        num_classes=57
    )
    print("✅ モデル構築完了")

    # The checkpoint's output layer (head) is not wanted, so drop its weights.
    for del_key in ["head.weight", "head.bias"]:
        if del_key in state_dict:
            del state_dict[del_key]
            print(f"⚠️ {del_key} を削除")

    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
    print("✅ 重み読み込み完了")
    print("  missing_keys:", missing_keys)
    print("  unexpected_keys:", unexpected_keys)

    # Optional sanity check: every loaded tensor should match the checkpoint.
    # state_dict() is built once instead of twice per key.
    model_state = model.state_dict()
    for key, value in model_state.items():
        if "head" in key:
            continue
        if key not in state_dict:
            # Not provided by the checkpoint, so it keeps its initial value.
            print(f"{key}: 差分計算スキップ（初期化済み）")
            continue
        diff = (value - state_dict[key]).abs().sum()
        print(f"{key}: 差分 {diff}")

    # Verify a forward pass on the CPU before touching the GPU.
    model.eval()
    dummy = torch.randn(1, 3, 16, 224, 224)
    try:
        with torch.no_grad():  # inference only: no autograd graph needed here
            out = model(dummy)
        print(f"✅ CPU forward 成功 → 出力 shape: {out.shape}")
    except Exception as e:
        print(f"❌ CPU forward 失敗: {e}")
        return None

    # Move to the GPU when one is available (falls back to the CPU otherwise).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    try:
        model.to(device)
        print(f"✅ model.to({device}) 成功")
    except Exception as e:
        print(f"❌ model.to({device}) 失敗: {e}")
        return None

    # Final test: forward/backward on the selected device.
    _input = torch.randn(1, 3, 16, 224, 224).to(device)
    try:
        _output = model(_input)
        loss = _output.sum()
        loss.backward()
        print("✅ GPU上で forward/backward 成功")
        print(f"出力 shape: {_output.shape}")
    except Exception as e:
        print("❌ GPU上で forward/backward 失敗:", e)
        return None

    # Do not hand back a model that still carries the smoke-test gradients.
    model.zero_grad()
    return model


# Entry point: run the smoke test when this cell/script executes as __main__.
if __name__ == "__main__":
    load_model()
    # check_annotation_video()  # <- enable only when needed


🔍 Checkpoint を読み込み中...
✅ 読み込み成功。キー: ['module']
🔧 モデル構築中...
✅ モデル構築完了
⚠️ head.weight を削除
⚠️ head.bias を削除
✅ 重み読み込み完了
  missing_keys: ['head.weight', 'head.bias']
  unexpected_keys: []
patch_embed.proj.weight: 差分 0.0
patch_embed.proj.bias: 差分 0.0
blocks.0.norm1.weight: 差分 0.0
blocks.0.norm1.bias: 差分 0.0
blocks.0.attn.q_bias: 差分 0.0
blocks.0.attn.v_bias: 差分 0.0
blocks.0.attn.qkv.weight: 差分 0.0
blocks.0.attn.proj.weight: 差分 0.0
blocks.0.attn.proj.bias: 差分 0.0
blocks.0.norm2.weight: 差分 0.0
blocks.0.norm2.bias: 差分 0.0
blocks.0.mlp.fc1.weight: 差分 0.0
blocks.0.mlp.fc1.bias: 差分 0.0
blocks.0.mlp.fc2.weight: 差分 0.0
blocks.0.mlp.fc2.bias: 差分 0.0
blocks.1.norm1.weight: 差分 0.0
blocks.1.norm1.bias: 差分 0.0
blocks.1.attn.q_bias: 差分 0.0
blocks.1.attn.v_bias: 差分 0.0
blocks.1.attn.qkv.weight: 差分 0.0
blocks.1.attn.proj.weight: 差分 0.0
blocks.1.attn.proj.bias: 差分 0.0
blocks.1.norm2.weight: 差分 0.0
blocks.1.norm2.bias: 差分 0.0
blocks.1.mlp.fc1.weight: 差分 0.0
blocks.1.mlp.fc1.bias: 差分 0.0
blocks.1.mlp.fc2.weight

In [11]:
import os
import sys
import json
import torch
import cv2
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim

# Pinpoint CUDA errors (important): with synchronous launches an error is
# reported at the call that caused it. Must be set before the first CUDA call.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Add the VideoMAE sources to the import path -- once, even if the cell is re-run.
videomae_dir = "/home/ollo/VideoMAE"
if videomae_dir not in sys.path:
    sys.path.append(videomae_dir)
from modeling_finetune import vit_base_patch16_224

# ✅ Ego4D Dataset
class Ego4DDataset(Dataset):
    """Clip-level video dataset described by an annotation JSON file.

    The file must contain a top-level ``"annotations"`` list. Each entry needs
    ``"video_url"`` (a path relative to ``video_root``) and ``"label"`` (a
    sequence whose first element is the class index; single label assumed).
    """

    def __init__(self, annotation_file, video_root, transform=None, num_frames=16,
                 num_classes=None):
        """Read the annotation file and remember the sampling settings.

        Args:
            annotation_file: Path of the JSON annotation file.
            video_root: Directory that the ``"video_url"`` entries are relative to.
            transform: Optional callable applied to every RGB frame. It must
                return equally shaped tensors so the frames can be stacked.
                When omitted, frames are converted to float CHW tensors in [0, 1].
            num_frames: Number of frames sampled per clip.
            num_classes: When given, labels outside ``[0, num_classes)`` raise
                ``ValueError`` instead of reaching the loss function.
        """
        with open(annotation_file, "r") as f:
            data = json.load(f)

        self.annotations = data["annotations"]
        self.video_root = video_root
        self.transform = transform
        self.num_frames = num_frames
        self.num_classes = num_classes

    def __len__(self):
        """Return the number of annotated clips."""
        return len(self.annotations)

    def _read_label(self, item):
        """Return the first label of ``item`` as an int, range-checked if configured."""
        label = int(item["label"][0])  # single-label assumption
        if self.num_classes is not None and not 0 <= label < self.num_classes:
            raise ValueError(
                f"label {label} is outside [0, {self.num_classes}) for {item['video_url']}"
            )
        return label

    def _load_frames(self, video_path):
        """Sample ``num_frames`` evenly spaced frames; pad with the last one if short."""
        frames = []
        cap = cv2.VideoCapture(video_path)
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            for i in range(self.num_frames):
                frame_id = int(i * total_frames / self.num_frames)
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id)
                ret, frame = cap.read()
                if not ret:
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                if self.transform is not None:
                    frame = self.transform(frame)
                else:
                    # Without a transform the ndarray could not be stacked:
                    # HWC uint8 -> CHW float in [0, 1].
                    frame = torch.from_numpy(frame).permute(2, 0, 1).float().div(255.0)
                frames.append(frame)
        finally:
            # Release the capture even if decoding or the transform raised.
            cap.release()

        if not frames:
            # Previously this surfaced as an IndexError from frames[-1].
            raise RuntimeError(f"No frames could be read from video: {video_path}")

        # Pad with the last frame when the video yielded fewer frames.
        frames.extend([frames[-1]] * (self.num_frames - len(frames)))
        return frames

    def __getitem__(self, idx):
        """Return ``(video_tensor, label)`` for clip ``idx``.

        Raises:
            ValueError: The label is out of range (only when ``num_classes`` is set).
            RuntimeError: Not a single frame could be decoded.
        """
        item = self.annotations[idx]
        label = self._read_label(item)  # validated before any video decoding
        video_path = os.path.join(self.video_root, item["video_url"])

        frames = self._load_frames(video_path)
        video_tensor = torch.stack(frames).permute(1, 0, 2, 3)  # [C, T, H, W]
        return video_tensor, label

# ✅ トレーニング関数
def train_model(
    annotation_dir="/home/ollo/videomae-clean",
    video_root="/srv/shared/data/ego4d/short_clips/verb_annotation_simple",
    checkpoint_path="/home/ollo/VideoMAE/checkpoints/vit_b_hybrid_pt_800e_k710_ft.pth",
    num_classes=57,
    num_epochs=5,
    batch_size=4,
    num_workers=4,
):
    """Fine-tune the VideoMAE ViT-B/16 backbone on the Ego4D clips.

    Args:
        annotation_dir: Directory holding the train/val annotation JSON files;
            the fine-tuned weights are also written here.
        video_root: Directory the annotation ``video_url`` entries are relative to.
        checkpoint_path: Pretrained checkpoint; its ``"module"`` entry is loaded
            without the classification head.
        num_classes: Size of the new classification head.
        num_epochs: Number of training epochs.
        batch_size: Batch size of both data loaders.
        num_workers: Worker processes of both data loaders.

    Raises:
        ValueError: A training batch contains a label outside ``[0, num_classes)``.
    """
    train_json = os.path.join(annotation_dir, "20250512_annotations_train.json")
    val_json = os.path.join(annotation_dir, "20250512_annotations_val.json")

    # NOTE(review): no mean/std normalisation is applied to the frames --
    # confirm whether the pretrained backbone expects normalised input.
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])

    # DataLoader
    train_dataset = Ego4DDataset(train_json, video_root, transform)
    val_dataset = Ego4DDataset(val_json, video_root, transform)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              num_workers=num_workers)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                            num_workers=num_workers)

    # Build the model and load the pretrained weights (head excluded).
    model = vit_base_patch16_224(all_frames=16, img_size=224, use_checkpoint=True,
                                 num_classes=num_classes)
    # NOTE(review): torch.load unpickles the file -- only use trusted checkpoints.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    state_dict = checkpoint["module"]
    for k in ["head.weight", "head.bias"]:
        if k in state_dict:
            del state_dict[k]
    model.load_state_dict(state_dict, strict=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.05)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        for videos, labels in train_loader:
            # Validate on the CPU before anything reaches the GPU: a class
            # index outside [0, num_classes) would otherwise only show up as
            # an opaque "device-side assert triggered" CUDA error.
            bad = (labels < 0) | (labels >= num_classes)
            if bad.any():
                raise ValueError(
                    f"labels outside [0, {num_classes}): {labels[bad].tolist()}"
                )

            videos, labels = videos.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(videos)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Report the mean per-batch loss so the number does not scale with dataset size.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"🔁 Epoch {epoch+1}/{num_epochs} | Train Loss: {mean_loss:.4f}")

        # Validation
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for videos, labels in val_loader:
                videos, labels = videos.to(device), labels.to(device)
                outputs = model(videos)
                preds = torch.argmax(outputs, dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

        # An empty validation set must not abort training with ZeroDivisionError.
        acc = 100.0 * correct / total if total else float("nan")
        print(f"✅ Val Accuracy: {acc:.2f}%")

    # Save the fine-tuned weights.
    save_path = os.path.join(annotation_dir, "finetuned_model.pth")
    torch.save(model.state_dict(), save_path)
    print(f"💾 モデル保存完了: {save_path}")

# Entry point: start training when this cell/script executes as __main__.
if __name__ == "__main__":
    train_model()


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [13]:
import os
import sys
import json
import torch
import cv2
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim

# Pinpoint CUDA errors (important): with synchronous launches an error is
# reported at the call that caused it. Must be set before the first CUDA call.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Add the VideoMAE sources to the import path -- once, even if the cell is re-run.
videomae_dir = "/home/ollo/VideoMAE"
if videomae_dir not in sys.path:
    sys.path.append(videomae_dir)
from modeling_finetune import vit_base_patch16_224

# ✅ Ego4D Dataset
class Ego4DDataset(Dataset):
    """Clip-level video dataset described by an annotation JSON file.

    The file must contain a top-level ``"annotations"`` list. Each entry needs
    ``"video_url"`` (a path relative to ``video_root``) and ``"label"`` (a
    sequence whose first element is the class index).
    """

    def __init__(self, annotation_file, video_root, transform=None, num_frames=16,
                 num_classes=57):
        """Read the annotation file and remember the sampling settings.

        Args:
            annotation_file: Path of the JSON annotation file.
            video_root: Directory that the ``"video_url"`` entries are relative to.
            transform: Optional callable applied to every RGB frame. It must
                return equally shaped tensors so the frames can be stacked.
                When omitted, frames are converted to float CHW tensors in [0, 1].
            num_frames: Number of frames sampled per clip.
            num_classes: Labels must lie in ``[0, num_classes)``; defaults to
                the previously hard-coded 57.
        """
        with open(annotation_file, "r") as f:
            data = json.load(f)

        self.annotations = data["annotations"]
        self.video_root = video_root
        self.transform = transform
        self.num_frames = num_frames
        self.num_classes = num_classes

    def __len__(self):
        """Return the number of annotated clips."""
        return len(self.annotations)

    def _load_frames(self, video_path):
        """Sample ``num_frames`` evenly spaced frames; pad with the last one if short."""
        frames = []
        cap = cv2.VideoCapture(video_path)
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            for i in range(self.num_frames):
                frame_id = int(i * total_frames / self.num_frames)
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id)
                ret, frame = cap.read()
                if not ret:
                    print(f"⚠️ フレーム取得失敗: {frame_id} @ {video_path}")
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                if self.transform is not None:
                    frame = self.transform(frame)
                else:
                    # Without a transform the ndarray could not be stacked:
                    # HWC uint8 -> CHW float in [0, 1].
                    frame = torch.from_numpy(frame).permute(2, 0, 1).float().div(255.0)
                frames.append(frame)
        finally:
            # Release the capture even if decoding or the transform raised.
            cap.release()

        if not frames:
            raise RuntimeError(f"❌ フレームが取得できません: {video_path}")

        # Pad with the last frame when the video yielded fewer frames.
        frames.extend([frames[-1]] * (self.num_frames - len(frames)))
        return frames

    def __getitem__(self, idx):
        """Return ``(video_tensor, label)`` for clip ``idx``.

        Any failure is logged together with ``idx`` and then re-raised.

        Raises:
            FileNotFoundError: The video file does not exist.
            ValueError: The label is outside ``[0, num_classes)``.
            RuntimeError: Not a single frame could be decoded.
        """
        try:
            item = self.annotations[idx]
            video_path = os.path.join(self.video_root, item["video_url"])

            if not os.path.exists(video_path):
                raise FileNotFoundError(f"動画ファイルが見つかりません: {video_path}")

            label = int(item["label"][0])
            if not (0 <= label < self.num_classes):
                raise ValueError(f"❌ 無効なラベル値: {label}（範囲外）")

            frames = self._load_frames(video_path)
            video_tensor = torch.stack(frames).permute(1, 0, 2, 3)  # [C, T, H, W]
            return video_tensor, label

        except Exception as e:
            print(f"❌ __getitem__ エラー at idx={idx}: {e}")
            raise

# ✅ トレーニング関数
def train_model(
    annotation_dir="/home/ollo/videomae-clean",
    video_root="/srv/shared/data/ego4d/short_clips/verb_annotation_simple",
    checkpoint_path="/home/ollo/VideoMAE/checkpoints/vit_b_hybrid_pt_800e_k710_ft.pth",
    num_classes=57,
    num_epochs=5,
    batch_size=4,
    num_workers=4,
):
    """Fine-tune the VideoMAE ViT-B/16 backbone on the Ego4D clips.

    Training batches containing a label outside ``[0, num_classes)`` are
    reported and skipped rather than passed to the loss.

    Args:
        annotation_dir: Directory holding the train/val annotation JSON files;
            the fine-tuned weights are also written here.
        video_root: Directory the annotation ``video_url`` entries are relative to.
        checkpoint_path: Pretrained checkpoint. Its weights are taken from the
            ``"module"`` or ``"model"`` entry, or from the checkpoint itself.
        num_classes: Size of the new classification head.
        num_epochs: Number of training epochs.
        batch_size: Batch size of both data loaders.
        num_workers: Worker processes of both data loaders.
    """
    train_json = os.path.join(annotation_dir, "20250512_annotations_train.json")
    val_json = os.path.join(annotation_dir, "20250512_annotations_val.json")

    # NOTE(review): no mean/std normalisation is applied to the frames --
    # confirm whether the pretrained backbone expects normalised input.
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])

    # DataLoader
    train_dataset = Ego4DDataset(train_json, video_root, transform)
    val_dataset = Ego4DDataset(val_json, video_root, transform)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              num_workers=num_workers)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                            num_workers=num_workers)

    # Build the model with a fresh head of `num_classes` outputs.
    model = vit_base_patch16_224(all_frames=16, img_size=224, use_checkpoint=True,
                                 num_classes=num_classes)

    # Load the checkpoint.
    # NOTE(review): torch.load unpickles the file -- only use trusted checkpoints.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")

    # Accept the common checkpoint layouts.
    if "module" in checkpoint:
        state_dict = checkpoint["module"]
    elif "model" in checkpoint:
        state_dict = checkpoint["model"]
    else:
        state_dict = checkpoint

    # Leave the head out: it stays randomly initialised and is retrained.
    new_state_dict = {k: v for k, v in state_dict.items() if not k.startswith("head.")}

    missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)
    print("🔍 missing keys:", missing_keys)
    print("🔍 unexpected keys:", unexpected_keys)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"📦 Moving model to {device}...")
    model.to(device)

    optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.05)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        for videos, labels in train_loader:
            # Check on the CPU, before the transfer: this avoids a GPU sync and
            # uses the same class count the head was built with.
            if labels.min() < 0 or labels.max() >= num_classes:
                print(f"❌ 不正なラベル検出: {labels}")
                continue

            videos, labels = videos.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(videos)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Mean over the batches actually trained on (skipped ones excluded).
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"🔁 Epoch {epoch+1}/{num_epochs} | Train Loss: {mean_loss:.4f}")

        # Validation
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for videos, labels in val_loader:
                videos, labels = videos.to(device), labels.to(device)
                outputs = model(videos)
                preds = torch.argmax(outputs, dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

        # An empty validation set must not abort training with ZeroDivisionError.
        acc = 100.0 * correct / total if total else float("nan")
        print(f"✅ Val Accuracy: {acc:.2f}%")

    # Save the fine-tuned weights.
    save_path = os.path.join(annotation_dir, "finetuned_model.pth")
    torch.save(model.state_dict(), save_path)
    print(f"💾 モデル保存完了: {save_path}")

# Entry point: start training when this cell/script executes as __main__.
if __name__ == "__main__":
    train_model()


🔍 missing keys: ['head.weight', 'head.bias']
🔍 unexpected keys: []
📦 Moving model to cuda...


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
