In [1]:
ls /home/ollo/videomae-clean/serch/modeling_finetune.py


ls: cannot access '/home/ollo/videomae-clean/serch/modeling_finetune.py': No such file or directory


In [4]:
!pip install av




In [1]:
import os
import sys
import json
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torchvision.io as io
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm

# ✅ CUDA debugging setting (turn it off when it is not needed)
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# ✅ Add the VideoMAE checkout to sys.path (only once, so re-running the cell
# does not append duplicates)
video_mae_path = '/home/ollo/VideoMAE'
if video_mae_path not in sys.path:
    sys.path.append(video_mae_path)
    print(f"PYTHONPATHに追加: {video_mae_path}")

# ✅ Import the VideoMAE model
# Must stay below the sys.path change above; presumably `modeling_finetune`
# lives in `video_mae_path` — verify against the checkout.
from modeling_finetune import vit_base_patch16_224

# ✅ Dataset for Ego4D (marked "FlashAttention-compatible" by the original author)
class Ego4DFlashDataset(Dataset):
    """Multi-label video clip dataset driven by a JSON annotation file.

    The annotation file must be a JSON object with an ``"annotations"`` list.
    Each entry is read for two keys:

    * ``"video_url"`` - path of the clip, joined onto ``video_root``.
    * ``"label"``     - a single class index or a list of class indices.

    Args:
        annotation_file: Path to the JSON annotation file.
        video_root: Directory that ``video_url`` values are relative to.
        transform: Optional callable applied to every sampled frame
            individually (each frame is a float ``[C, H, W]`` tensor in [0, 1]).
        num_frames: Number of frames sampled from every clip.
        num_classes: Length of the multi-hot target vector.
    """

    def __init__(self, annotation_file, video_root, transform=None, num_frames=16, num_classes=58):
        with open(annotation_file, "r") as f:
            data = json.load(f)
        self.annotations = data["annotations"]
        self.video_root = video_root
        self.transform = transform
        self.num_frames = num_frames
        self.num_classes = num_classes

    def __len__(self):
        """Return the number of annotated clips."""
        return len(self.annotations)

    def _sample_frames(self, video):
        """Return ``num_frames`` evenly spaced frames from a ``[T, H, W, C]`` tensor."""
        frame_count = video.shape[0]
        # linspace over [0, T-1] repeats indices when T < num_frames, so short
        # clips are stretched by duplicating frames; no explicit padding needed.
        indices = torch.linspace(0, frame_count - 1, self.num_frames).long()
        return video[indices]

    def _build_target(self, label):
        """Turn a class index (or list of indices) into a multi-hot float vector."""
        labels = label if isinstance(label, list) else [label]
        target = torch.zeros(self.num_classes)
        for class_idx in labels:
            # Indices outside [0, num_classes) are ignored rather than raising.
            if 0 <= class_idx < self.num_classes:
                target[class_idx] = 1.0
        return target

    def __getitem__(self, idx):
        """Load one clip and its multi-hot target.

        Returns:
            ``(video, target)`` where ``video`` is a float tensor laid out as
            ``[C, num_frames, H, W]`` and ``target`` has shape ``[num_classes]``.

        Raises:
            FileNotFoundError: The clip file does not exist.
            ValueError: The clip decoded to zero frames.
        """
        ann = self.annotations[idx]
        video_path = os.path.join(self.video_root, ann["video_url"])
        if not os.path.exists(video_path):
            raise FileNotFoundError(f"❌ 動画ファイルが見つかりません: {video_path}")

        video, _, _ = io.read_video(video_path, pts_unit='sec')  # [T, H, W, C]
        if video.shape[0] == 0:
            # Fail with the offending path instead of an opaque arithmetic error.
            raise ValueError(f"❌ 動画からフレームを読み込めませんでした: {video_path}")

        video = self._sample_frames(video)  # [num_frames, H, W, C]
        video = video.permute(0, 3, 1, 2).float() / 255.0  # [T, C, H, W]

        if self.transform:
            video = torch.stack([self.transform(frame) for frame in video])  # [T, C, H, W]

        video = video.permute(1, 0, 2, 3)  # [C, T, H, W]

        return video, self._build_target(ann["label"])

# ✅ Training function
def train_model():
    """Fine-tune a VideoMAE ViT-B model on the Ego4D clips and save its weights.

    Steps performed:
      1. Build train/val ``Ego4DFlashDataset`` loaders from the two annotation
         JSON files, resizing every frame to 224x224.
      2. Create ``vit_base_patch16_224`` and load the pretrained checkpoint,
         dropping every ``head.*`` weight so the classifier head is trained
         from its fresh initialisation.
      3. Train for ``num_epochs`` with AdamW and ``BCEWithLogitsLoss``; mixed
         precision is used only when a CUDA device is available.
      4. After every epoch, report the mean training loss per batch and the
         validation accuracy. The accuracy is element-wise over every
         (sample, class) slot using a 0.5 threshold on the sigmoid output.
      5. Save the final ``state_dict`` next to the annotation files.

    Takes no arguments and returns nothing; all paths and hyper-parameters are
    the local constants below.
    """
    annotation_dir = "/home/ollo/videomae-clean"
    video_root = "/srv/shared/data/ego4d/short_clips/verb_annotation_simple"
    checkpoint_path = "/home/ollo/VideoMAE/checkpoints/vit_b_hybrid_pt_800e_k710_ft.pth"
    train_json = os.path.join(annotation_dir, "20250512_annotations_train.json")
    val_json = os.path.join(annotation_dir, "20250512_annotations_val.json")

    # Shared by the datasets and the model so the two cannot drift apart.
    num_classes = 58
    num_frames = 16
    num_epochs = 5

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
    ])

    train_dataset = Ego4DFlashDataset(
        train_json, video_root, transform, num_frames=num_frames, num_classes=num_classes
    )
    val_dataset = Ego4DFlashDataset(
        val_json, video_root, transform, num_frames=num_frames, num_classes=num_classes
    )

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=4, pin_memory=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # torch.cuda.amp only applies to CUDA; disable it explicitly on CPU instead
    # of relying on its warn-and-disable fallback.
    use_amp = device.type == "cuda"

    model = vit_base_patch16_224(
        all_frames=num_frames,
        img_size=224,
        use_checkpoint=True,
        num_classes=num_classes
    ).to(device)

    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    # The weights may be nested under "module" or "model", or be the top-level dict.
    state_dict = checkpoint.get("module") or checkpoint.get("model") or checkpoint
    # Skip the pretrained classifier head; strict=False tolerates the gap.
    backbone_state = {k: v for k, v in state_dict.items() if not k.startswith("head.")}
    model.load_state_dict(backbone_state, strict=False)

    optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.05)
    criterion = nn.BCEWithLogitsLoss()
    scaler = GradScaler(enabled=use_amp)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        loop = tqdm(train_loader, desc=f"🚂 Epoch {epoch+1}/{num_epochs} [Train]", leave=False)
        for videos, targets in loop:
            videos, targets = videos.to(device), targets.to(device)

            optimizer.zero_grad()
            with autocast(enabled=use_amp):
                outputs = model(videos)
                loss = criterion(outputs, targets)

            # Scale the loss so fp16 gradients do not underflow; step() unscales
            # before the optimizer update.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            batch_loss = loss.item()
            total_loss += batch_loss
            loop.set_postfix(loss=batch_loss)

        # Report the mean per batch so the number does not depend on dataset size.
        mean_loss = total_loss / max(len(train_loader), 1)
        print(f"[Epoch {epoch+1}] 📉 Train Loss: {mean_loss:.4f}")

        # Validation
        model.eval()
        correct, total = 0, 0
        loop = tqdm(val_loader, desc=f"🧪 Epoch {epoch+1}/{num_epochs} [Val]", leave=False)
        with torch.no_grad():
            for videos, targets in loop:
                videos, targets = videos.to(device), targets.to(device)
                with autocast(enabled=use_amp):
                    outputs = model(videos)
                preds = (torch.sigmoid(outputs) > 0.5).float()
                correct += (preds == targets).sum().item()
                total += targets.numel()
                loop.set_postfix(acc=f"{100.0 * correct / total:.2f}%")

        # Guard against an empty validation set instead of dividing by zero.
        acc = 100.0 * correct / total if total else 0.0
        print(f"[Epoch {epoch+1}] ✅ Val Accuracy: {acc:.2f}%")

    save_path = os.path.join(annotation_dir, "videomae_finetuned_flash.pth")
    torch.save(model.state_dict(), save_path)
    print(f"💾 モデル保存完了: {save_path}")

# ✅ Execution block: start training when this code runs as the main module
if __name__ == "__main__":
    train_model()


PYTHONPATHに追加: /home/ollo/VideoMAE


  from .autonotebook import tqdm as notebook_tqdm
  @register_model
  @register_model
  @register_model
  @register_model
  @register_model
  return F.conv3d(
                                                                                      

KeyboardInterrupt: 