In [2]:
import os
import sys
import json
import torch
import cv2
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm

# CUDA同期エラー特定
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# VideoMAE モデルのパス
sys.path.append("/home/ollo/VideoMAE")
from modeling_finetune import vit_base_patch16_224

# ✅ Ego4D Dataset
class Ego4DDataset(Dataset):
    def __init__(self, annotation_file, video_root, transform=None, num_frames=16):
        with open(annotation_file, "r") as f:
            data = json.load(f)

        self.annotations = data["annotations"]
        self.video_root = video_root
        self.transform = transform
        self.num_frames = num_frames

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        try:
            item = self.annotations[idx]
            video_path = os.path.join(self.video_root, item["video_url"])

            if not os.path.exists(video_path):
                raise FileNotFoundError(f"動画ファイルが見つかりません: {video_path}")

            label = int(item["label"][0])
            if not (0 <= label < 58):
                raise ValueError(f"❌ 無効なラベル値: {label}（範囲外）")

            cap = cv2.VideoCapture(video_path)
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            frames = []

            for i in range(self.num_frames):
                frame_id = int(i * total_frames / self.num_frames)
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id)
                ret, frame = cap.read()
                if not ret:
                    print(f"⚠️ フレーム取得失敗: {frame_id} @ {video_path}")
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                if self.transform:
                    frame = self.transform(frame)
                frames.append(frame)
            cap.release()

            if not frames:
                raise RuntimeError(f"❌ フレームが取得できません: {video_path}")

            while len(frames) < self.num_frames:
                frames.append(frames[-1])

            video_tensor = torch.stack(frames).permute(1, 0, 2, 3)  # [C, T, H, W]
            return video_tensor, label

        except Exception as e:
            print(f"❌ __getitem__ エラー at idx={idx}: {e}")
            raise

# ✅ トレーニング関数
def train_model():
    annotation_dir = "/home/ollo/videomae-clean"
    video_root = "/srv/shared/data/ego4d/short_clips/verb_annotation_simple"
    checkpoint_path = "/home/ollo/VideoMAE/checkpoints/vit_b_hybrid_pt_800e_k710_ft.pth"
    train_json = os.path.join(annotation_dir, "20250512_annotations_train.json")
    val_json = os.path.join(annotation_dir, "20250512_annotations_val.json")

    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])

    train_dataset = Ego4DDataset(train_json, video_root, transform)
    val_dataset = Ego4DDataset(val_json, video_root, transform)

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=4)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = vit_base_patch16_224(
        all_frames=16,
        img_size=224,
        use_checkpoint=True,
        num_classes=58,
        use_flash_attn=True  # ✅ Flash Attention 有効（modeling_finetune.pyで対応している前提）
    )
    model.to(device)

    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    state_dict = checkpoint.get("module") or checkpoint.get("model") or checkpoint

    new_state_dict = {k: v for k, v in state_dict.items() if not k.startswith("head.")}
    model.load_state_dict(new_state_dict, strict=False)

    optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.05)
    criterion = nn.CrossEntropyLoss()
    scaler = GradScaler()  # ✅ AMPスケーラー

    num_epochs = 5

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        loop = tqdm(train_loader, desc=f"🚂 Epoch {epoch+1}/{num_epochs} [Train]", leave=False)
        for videos, labels in loop:
            videos, labels = videos.to(device), labels.to(device)

            if labels.min() < 0 or labels.max() >= 58:
                print(f"❌ 不正なラベル検出: {labels}")
                continue

            optimizer.zero_grad()
            with autocast():  # ✅ 混合精度
                outputs = model(videos)
                loss = criterion(outputs, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()
            loop.set_postfix(loss=loss.item())

        print(f"🔁 Epoch {epoch+1}/{num_epochs} | Train Loss: {total_loss:.4f}")

        # ✅ 検証
        model.eval()
        correct, total = 0, 0
        loop = tqdm(val_loader, desc=f"🧪 Epoch {epoch+1}/{num_epochs} [Val]", leave=False)
        with torch.no_grad():
            for videos, labels in loop:
                videos, labels = videos.to(device), labels.to(device)
                with autocast():
                    outputs = model(videos)
                preds = torch.argmax(outputs, dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)
                loop.set_postfix(acc=f"{100.0 * correct / total:.2f}%")

        acc = 100.0 * correct / total
        print(f"✅ Val Accuracy: {acc:.2f}%")

    # ✅ モデル保存
    save_path = os.path.join(annotation_dir, "finetuned_model_amp_flash.pth")
    torch.save(model.state_dict(), save_path)
    print(f"💾 モデル保存完了: {save_path}")

# ✅ 実行
if __name__ == "__main__":
    train_model()


TypeError: VisionTransformer.__init__() got an unexpected keyword argument 'use_flash_attn'

In [4]:
import os
import sys
import json
import torch
import cv2
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm

# CUDA同期エラー特定
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# VideoMAE モデルのパス
sys.path.append("/home/ollo/VideoMAE")
from modeling_finetune import vit_base_patch16_224

# ✅ Ego4D Dataset
class Ego4DDataset(Dataset):
    def __init__(self, annotation_file, video_root, transform=None, num_frames=16):
        with open(annotation_file, "r") as f:
            data = json.load(f)

        self.annotations = data["annotations"]
        self.video_root = video_root
        self.transform = transform
        self.num_frames = num_frames

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        try:
            item = self.annotations[idx]
            video_path = os.path.join(self.video_root, item["video_url"])

            if not os.path.exists(video_path):
                raise FileNotFoundError(f"動画ファイルが見つかりません: {video_path}")

            label = int(item["label"][0])
            if not (0 <= label < 58):
                raise ValueError(f"❌ 無効なラベル値: {label}（範囲外）")

            cap = cv2.VideoCapture(video_path)
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            frames = []

            for i in range(self.num_frames):
                frame_id = int(i * total_frames / self.num_frames)
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id)
                ret, frame = cap.read()
                if not ret:
                    print(f"⚠️ フレーム取得失敗: {frame_id} @ {video_path}")
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                if self.transform:
                    frame = self.transform(frame)
                frames.append(frame)
            cap.release()

            if not frames:
                raise RuntimeError(f"❌ フレームが取得できません: {video_path}")

            while len(frames) < self.num_frames:
                frames.append(frames[-1])

            video_tensor = torch.stack(frames).permute(1, 0, 2, 3)  # [C, T, H, W]
            return video_tensor, label

        except Exception as e:
            print(f"❌ __getitem__ エラー at idx={idx}: {e}")
            raise

# ✅ トレーニング関数
def train_model():
    annotation_dir = "/home/ollo/videomae-clean"
    video_root = "/srv/shared/data/ego4d/short_clips/verb_annotation_simple"
    checkpoint_path = "/home/ollo/VideoMAE/checkpoints/vit_b_hybrid_pt_800e_k710_ft.pth"
    train_json = os.path.join(annotation_dir, "20250512_annotations_train.json")
    val_json = os.path.join(annotation_dir, "20250512_annotations_val.json")

    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])

    train_dataset = Ego4DDataset(train_json, video_root, transform)
    val_dataset = Ego4DDataset(val_json, video_root, transform)

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=4)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = vit_base_patch16_224(
        all_frames=16,
        img_size=224,
        use_checkpoint=True,
        num_classes=58
        # ❌ use_flash_attn は削除（未使用）
    )
    model.to(device)

    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    state_dict = checkpoint.get("module") or checkpoint.get("model") or checkpoint
    new_state_dict = {k: v for k, v in state_dict.items() if not k.startswith("head.")}
    model.load_state_dict(new_state_dict, strict=False)

    optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.05)
    criterion = nn.CrossEntropyLoss()
    scaler = GradScaler()  # ✅ AMPスケーラー

    num_epochs = 5

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        loop = tqdm(train_loader, desc=f"🚂 Epoch {epoch+1}/{num_epochs} [Train]", leave=False)
        for videos, labels in loop:
            videos, labels = videos.to(device), labels.to(device)

            if labels.min() < 0 or labels.max() >= 58:
                print(f"❌ 不正なラベル検出: {labels}")
                continue

            optimizer.zero_grad()
            with autocast():  # ✅ 混合精度
                outputs = model(videos)
                loss = criterion(outputs, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()
            loop.set_postfix(loss=loss.item())

        print(f"🔁 Epoch {epoch+1}/{num_epochs} | Train Loss: {total_loss:.4f}")

        # ✅ 検証
        model.eval()
        correct, total = 0, 0
        loop = tqdm(val_loader, desc=f"🧪 Epoch {epoch+1}/{num_epochs} [Val]", leave=False)
        with torch.no_grad():
            for videos, labels in loop:
                videos, labels = videos.to(device), labels.to(device)
                with autocast():
                    outputs = model(videos)
                preds = torch.argmax(outputs, dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)
                loop.set_postfix(acc=f"{100.0 * correct / total:.2f}%")

        acc = 100.0 * correct / total
        print(f"✅ Val Accuracy: {acc:.2f}%")

    # ✅ モデル保存
    save_path = os.path.join(annotation_dir, "finetuned_model_amp.pth")
    torch.save(model.state_dict(), save_path)
    print(f"💾 モデル保存完了: {save_path}")

# ✅ 実行
if __name__ == "__main__":
    train_model()


  return F.conv3d(
                                                                                       

KeyboardInterrupt: 