In [1]:
from InceptionI3d import InceptionI3d
import cv2
import glob
import h5py
import os
from tqdm import tqdm
import torch
import torchvision.transforms as transforms
from PIL import Image

In [2]:
class args:
    """Plain namespace holding the numeric settings for this notebook.

    Used as a bag of class attributes (never instantiated). None of these
    values is referenced by the cells visible here.
    """

    # Cap used for clip sampling; same value as extract_frames' default.
    max_frames = 20
    # NOTE(review): the meaning of the two sizes below is not established by
    # this notebook -- confirm against the downstream consumer.
    num_features_logits = 1024
    num_features = 400
    

# Build the I3D network and load the pretrained RGB checkpoint.
# NOTE(review): 400 is presumably the class count the checkpoint was trained
# with -- confirm against InceptionI3d's signature.
i3d = InceptionI3d(400, in_channels=3)
# Deserialize onto the CPU first so loading does not depend on the GPU layout
# the checkpoint was saved with; the model is moved to the GPU right after.
i3d.load_state_dict(torch.load('rgb_imagenet.pt', map_location='cpu'))
i3d.cuda()
# The model is only used for feature extraction: switch to inference mode so
# normalization layers use their stored statistics (and do not update them)
# and any dropout is disabled.
i3d.eval()
print()




In [3]:
# Per-channel normalization constants. Applied after ToTensor (which yields
# values in [0, 1] for 8-bit images), (x - 0.5) / 0.5 rescales to [-1, 1].
mean = [0.5] * 3
std = [0.5] * 3

# Preprocessing applied to every PIL frame before it is stacked into a clip:
# resize the shorter side to 256, take the central 224x224 crop, convert to a
# float tensor and normalize each channel.
_i3d_preprocessing_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
]
data_transform_i3d = transforms.Compose(_i3d_preprocessing_steps)

def extract_frames(video_path, fps=1, max_frames=20, clip_len=16):
    """Decode a video and cut it into fixed-length clips of preprocessed frames.

    Every frame of the video is decoded, then clips of ``clip_len`` consecutive
    frames are taken starting every ``video_fps // fps`` frames (i.e. roughly
    ``fps`` clip starts per second of video). Each frame is preprocessed with
    ``data_transform_i3d``.

    Args:
        video_path: Path of the video file, passed to ``cv2.VideoCapture``.
        fps: Desired number of clip starts per second of video.
        max_frames: Maximum number of *clips* returned (despite the name).
        clip_len: Number of consecutive frames in each clip.

    Returns:
        A tensor of shape ``(num_clips, 3, clip_len, H, W)`` with
        ``num_clips <= max_frames``, or ``None`` (after printing a warning)
        when the video has fewer than ``clip_len`` decodable frames or no clip
        could be built.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        video_fps = cap.get(cv2.CAP_PROP_FPS)
        # Stride between clip starts, in frames. Clamp to at least 1: when
        # 0 < video_fps < fps the floor division yields 0, and a zero step
        # would make range() below raise ValueError.
        frame_interval = max(1, int(video_fps // fps)) if video_fps > 0 else 1

        # Load all frames (the total count is needed to place the clips).
        all_frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # OpenCV decodes to BGR; PIL / the transform expect RGB.
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            all_frames.append(Image.fromarray(frame))
    finally:
        # Release the decoder even if reading or color conversion fails.
        cap.release()

    total_frames = len(all_frames)
    if total_frames < clip_len:
        print("⚠️ Video too short for motion extraction.")
        return None

    # Every start index leaves room for a full clip, so no length check is
    # needed inside the loop. Keep at most `max_frames` clips.
    last_start = total_frames - clip_len
    start_indices = range(0, last_start + 1, frame_interval)[:max(max_frames, 0)]

    motion_clips = []
    for start in start_indices:
        clip = all_frames[start: start + clip_len]
        # Stack along dim=1 so the clip is laid out as (3, clip_len, H, W).
        clip_tensor = torch.stack([data_transform_i3d(f) for f in clip], dim=1)
        motion_clips.append(clip_tensor)

    if len(motion_clips) == 0:
        print("⚠️ Not enough clips extracted.")
        return None

    return torch.stack(motion_clips)

In [4]:
# Sort so the processing order is reproducible (glob order is arbitrary).
video_paths = sorted(glob.glob(os.path.join('../../Dataset/MSVD/raw/', '*.avi')))

# Inference only: make sure the model is in eval mode regardless of what
# earlier cells did to it (idempotent).
i3d.eval()

# NOTE: mode 'w' truncates any existing feature file.
# no_grad() avoids building an autograd graph for every forward pass.
with h5py.File('../data/MSVD/features/MSVD_motion_clip.hdf5', 'w') as f, torch.no_grad():
    for video_path in tqdm(video_paths, desc="Extracting features"):
        video_name = os.path.splitext(os.path.basename(video_path))[0]
        frames = extract_frames(video_path)
        if frames is None:
            # extract_frames returns None for videos that are too short or
            # unreadable; skip them instead of crashing the whole run.
            tqdm.write(f"Skipping {video_name}: no motion clips extracted.")
            continue
        motion_features = []
        # One clip per forward pass to keep GPU memory usage small.
        for i in range(frames.shape[0]):
            feat = i3d.extract_features(frames[i].unsqueeze(0).cuda()).cpu()
            motion_features.append(feat.squeeze())
        motion_features = torch.stack(motion_features)
        # One dataset per video, keyed by the file name without extension.
        f.create_dataset(video_name, data=motion_features.numpy())

Extracting features: 100%|██████████| 1970/1970 [45:19<00:00,  1.38s/it] 
