In [40]:
import json
import os
import random

import torch
from torchvision.io import VideoReader
from transformers import AutoTokenizer, AutoProcessor, AutoModel, AutoModelForCausalLM

In [9]:
class VatexDataset(torch.utils.data.Dataset):
    """VATEX video-captioning dataset yielding ``(frames, caption)`` pairs.

    Every file found under ``videos_path`` is paired with each of its first
    ``captions_per_video`` English captions, so the dataset holds
    ``num_videos * captions_per_video`` samples. Indices are caption-major:
    ``idx % num_videos`` picks the video and ``idx // num_videos`` picks the
    caption.

    Args:
        videos_path: Directory walked recursively for video files.
        json_path: JSON file containing a list of records, each with a
            ``"videoID"`` key and an ``"enCap"`` list of captions.
        transforms: Optional callable applied to every sampled frame.
        sample_frames: Number of frames drawn per video (one per
            equal-length temporal segment). Defaults to 8.
        captions_per_video: Number of captions each video contributes to the
            dataset length. Defaults to 10.

    Raises:
        ValueError: If ``sample_frames`` or ``captions_per_video`` is < 1.
    """

    def __init__(self, videos_path, json_path, transforms=None,
                 sample_frames=8, captions_per_video=10):
        if sample_frames < 1:
            raise ValueError("sample_frames must be >= 1")
        if captions_per_video < 1:
            raise ValueError("captions_per_video must be >= 1")

        self.videos_path = videos_path
        self.transforms = transforms
        self.sample_frames = sample_frames
        self.captions_per_video = captions_per_video

        # Keep paths relative to `videos_path` so files found in
        # sub-directories still resolve in __getitem__; sort them because
        # os.walk order is arbitrary and would make idx -> sample unstable.
        self.video_names = sorted(
            os.path.relpath(os.path.join(root, file), videos_path)
            for root, _, files in os.walk(videos_path)
            for file in files
        )

        # videoID -> list of English captions.
        with open(json_path) as json_file:
            self.video_captions = {
                record["videoID"]: record["enCap"]
                for record in json.load(json_file)
            }

    def __len__(self):
        """Return the number of (video, caption) pairs."""
        return len(self.video_names) * self.captions_per_video

    def __getitem__(self, idx):
        """Return ``(frames, caption)`` for sample ``idx``.

        ``frames`` is a list of ``sample_frames`` frame payloads (the
        ``"data"`` entry of each decoded frame), passed through
        ``transforms`` when one was given.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))`` or
                the video has fewer captions than ``captions_per_video``.
            KeyError: If the video's ID has no entry in the captions file.
            RuntimeError: If no frame can be decoded for a segment.
        """
        total = len(self)
        if idx < 0:
            idx += total
        # Explicit bounds check: gives a proper IndexError (also for an empty
        # dataset) instead of a ZeroDivisionError or a wrapped-around caption.
        if not 0 <= idx < total:
            raise IndexError("VatexDataset index out of range")

        num_videos = len(self.video_names)
        video_idx = idx % num_videos
        caption_idx = idx // num_videos

        video_name = self.video_names[video_idx]
        # The video ID is the file name up to its first dot.
        video_id = os.path.basename(video_name).split(".")[0]
        caption = self.video_captions[video_id][caption_idx]

        frames = self._sample_frames(os.path.join(self.videos_path, video_name))
        if self.transforms is not None:
            frames = [self.transforms(frame) for frame in frames]

        return (frames, caption)

    def _sample_frames(self, video_path):
        """Decode one randomly timed frame from each equal-length segment."""
        reader = VideoReader(video_path, "video")
        duration = reader.get_metadata()["video"]["duration"][0]
        step = duration / self.sample_frames

        frames = []
        for i in range(self.sample_frames):
            timestamp = random.uniform(i * step, (i + 1) * step)
            frame = self._frame_at(reader, timestamp)
            if frame is None:
                # The random timestamp landed after the last decodable frame
                # (typically in the tail of the final segment); fall back to
                # the start of the segment.
                frame = self._frame_at(reader, i * step)
            if frame is None:
                # Never let StopIteration escape __getitem__: the legacy
                # sequence-iteration protocol would read it as end-of-data.
                raise RuntimeError(
                    f"could not decode a frame from {video_path!r} "
                    f"at or after {i * step:.3f}s"
                )
            frames.append(frame["data"])
        return frames

    @staticmethod
    def _frame_at(reader, timestamp):
        """Seek to ``timestamp`` and return the next frame, or None at end of stream."""
        reader.seek(timestamp)
        return next(reader, None)

In [10]:
# Build the train and validation splits, then report how many
# (video, caption) samples each one exposes.
data_train = VatexDataset(
    videos_path="dataset/vatex_train_videos",
    json_path="dataset/vatex_train_captions.json",
)
data_val = VatexDataset(
    videos_path="dataset/vatex_val_videos",
    json_path="dataset/vatex_val_captions.json",
)
print(len(data_train), len(data_val), sep="\n")

229750
26500


In [None]:
# Smoke-test a single sample. Show the frame shapes and the caption rather
# than dumping the raw frame tensors into the cell output.
first_frames, first_caption = data_train[0]
[tuple(frame.shape) for frame in first_frames], first_caption

In [2]:
# Scratch check of the frame-sampling logic on one training clip: split the
# clip into `sample_frames` equal segments and grab one frame at a random
# timestamp inside each segment.
video_path = os.path.join("dataset/vatex_train_videos", "__NrybzYzUg_000415_000425.mp4")
reader = VideoReader(video_path, "video")

sample_frames = 8
duration = reader.get_metadata()["video"]["duration"][0]
step = duration / sample_frames

frames = []
for i in range(sample_frames):
    timestamp = random.uniform(i * step, (i + 1) * step)
    reader.seek(timestamp)
    frame = next(reader, None)
    if frame is None:
        # The random timestamp landed after the last decodable frame, so the
        # reader is exhausted; retry from the start of this segment instead
        # of failing with StopIteration.
        reader.seek(i * step)
        frame = next(reader)
    frames.append(frame["data"])

In [41]:
# Pretrained checkpoints: X-CLIP for the video side, GPT-2 for the text side.
XCLIP_CHECKPOINT = "microsoft/xclip-base-patch32"
GPT2_CHECKPOINT = "gpt2"

xclip_processor = AutoProcessor.from_pretrained(XCLIP_CHECKPOINT)
xclip_model = AutoModel.from_pretrained(XCLIP_CHECKPOINT)

gpt_tokenizer = AutoTokenizer.from_pretrained(GPT2_CHECKPOINT)
gpt_model = AutoModelForCausalLM.from_pretrained(GPT2_CHECKPOINT)

In [36]:
# Embed the sampled frames with X-CLIP. This is an inference-only forward
# pass, so run it under no_grad to avoid building an autograd graph.
inputs = xclip_processor(videos=frames, return_tensors="pt")
with torch.no_grad():
    video_features = xclip_model.get_video_features(**inputs)
video_features.shape

torch.Size([1, 512])

In [48]:
# Sanity-check GPT-2 generation from a short prompt.
inputs = gpt_tokenizer("My name Jeff", return_tensors="pt")
# GPT-2 has no pad token; pass the EOS id explicitly (the value generate()
# would otherwise fall back to with a warning) to keep the output clean.
output = gpt_model.generate(
    **inputs,
    max_length=20,
    pad_token_id=gpt_tokenizer.eos_token_id,
)
gpt_tokenizer.decode(output[0], skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"My name Jeff, I'm a professional photographer. I'm a professional photographer. I'm a professional"

In [50]:
# encode() yields only the token-id tensor for the prompt (no attention mask).
inputs = gpt_tokenizer.encode("My name Jeff", return_tensors="pt")
inputs

tensor([[3666, 1438, 5502]])

In [51]:
# Calling the tokenizer directly returns a mapping with both `input_ids` and
# `attention_mask`, i.e. the form that can be unpacked into generate(**inputs).
inputs = gpt_tokenizer("My name Jeff", return_tensors="pt")
inputs

{'input_ids': tensor([[3666, 1438, 5502]]), 'attention_mask': tensor([[1, 1, 1]])}