In [1]:
from tqdm import tqdm
import torch
import h5py
import torch.nn.functional as F

In [2]:
# Read every per-video feature array fully into memory, keyed by its HDF5 key.
img, motion = {}, {}

with h5py.File('../data/MSVD/features/MSVD_visual_clip.hdf5', 'r') as fs:
    # `[()]` materialises the dataset as an in-memory array before the file closes.
    img.update((key, fs[key][()]) for key in fs.keys())

with h5py.File('../data/MSVD/features/MSVD_motion_clip.hdf5', 'r') as fs:
    motion.update((key, fs[key][()]) for key in fs.keys())

In [3]:
# Fixed sequence length: the feature-building cell below truncates or zero-pads
# each video's visual and motion features to this many frames.
max_frame = 20

In [4]:
def _pad_or_truncate(features, length):
    """Return `features` with exactly `length` rows.

    Args:
        features: 2-D tensor of shape (num_frames, feat_dim).
        length: Number of rows the result must have.

    Returns:
        The first `length` rows when `features` has at least that many,
        otherwise `features` followed by zero rows of the same dtype.
    """
    num_frames, feat_dim = features.shape
    if num_frames >= length:
        return features[:length]
    padding = torch.zeros((length - num_frames, feat_dim), dtype=features.dtype)
    return torch.cat([features, padding], dim=0)


def _build_edge_feats(visual_feature, motion_feature, num_frames):
    """Build the edge tensor for one video.

    With `out` the returned tensor and i, j in [0, num_frames):
        out[i, j, j]     = motion_feature[j, i]
        out[i, j, j + 1] = softmax over j of cos(visual[j], visual[j + 1])
    Every other entry is zero.

    Args:
        visual_feature: (num_frames, visual_dim) tensor, already padded/truncated.
        motion_feature: (num_frames, motion_dim) tensor, already padded/truncated.
        num_frames: Fixed sequence length of both inputs.

    Returns:
        Tensor of shape (num_frames, num_frames, motion_dim) in torch's default
        floating dtype.

    Raises:
        IndexError: if motion_dim < num_frames, because the last axis is indexed
            with frame positions 0..num_frames-1.
    """
    motion_dim = motion_feature.shape[1]
    if motion_dim < num_frames:
        raise IndexError(
            f"motion feature dim ({motion_dim}) must be >= number of frames ({num_frames})"
        )

    # Similarity between each frame and its successor, normalised across the sequence.
    # NOTE(review): zero-padded frames take part in this softmax too — confirm
    # that padded transitions are meant to receive probability mass.
    cos = F.cosine_similarity(visual_feature[:-1], visual_feature[1:], dim=-1)
    cos_softmax = F.softmax(cos, dim=0)

    edge_feats = torch.zeros(num_frames, num_frames, motion_dim)
    idx = torch.arange(num_frames)

    # NOTE(review): only the first `num_frames` motion channels are read, and only
    # the first `num_frames` positions of the last axis are written; the rest of
    # that axis stays zero. Confirm this layout is intended.
    # Cast so the indexed assignments do not depend on the input dtype.
    diagonal = motion_feature[:, :num_frames].T.to(edge_feats.dtype)
    edge_feats[:, idx, idx] = diagonal  # [i, j, j] = motion_feature[j, i]
    edge_feats[:, idx[:-1], idx[1:]] = cos_softmax.to(edge_feats.dtype)
    return edge_feats


# Fail before the output file is truncated if any video lacks motion features.
missing_motion = [video_id for video_id in img if video_id not in motion]
if missing_motion:
    raise KeyError(
        f"{len(missing_motion)} video(s) have no motion features, e.g. {missing_motion[0]!r}"
    )

with h5py.File("../data/MSVD/features/MSVD_edge_clip.hdf5", "w") as fs:
    for video_id in tqdm(img.keys()):
        # Bring both modalities to the same fixed number of frames.
        visual_feature = _pad_or_truncate(torch.from_numpy(img[video_id]), max_frame)
        motion_feature = _pad_or_truncate(torch.from_numpy(motion[video_id]), max_frame)

        edge_feats = _build_edge_feats(visual_feature, motion_feature, max_frame)

        # h5py stores NumPy arrays; convert explicitly instead of relying on coercion.
        fs.create_dataset(video_id, data=edge_feats.numpy())


100%|██████████| 1970/1970 [00:38<00:00, 50.78it/s] 
