In [None]:
from decord import VideoReader
import numpy as np
import torch
from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel

device = "cuda" if torch.cuda.is_available() else "cpu"

# load pretrained processor, tokenizer, and model
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = VisionEncoderDecoderModel.from_pretrained("Neleac/timesformer-gpt2-video-captioning").to(device)

In [52]:


# load video
video_path = "videos/output_5000_5149.mp4"
container = VideoReader(video_path)

# extract evenly spaced frames from video
clip_len = model.config.encoder.num_frames
frames = container.get_batch(range(0, len(container) // 2, len(container) // (1 * clip_len))).asnumpy()
frames = [frame for frame in frames[:-1]]
# frames_batch = frames_batch[:-1]

# generate caption
gen_kwargs = {
    "min_length": 10, 
    "max_length": 20, 
    "num_beams": 8,
}
pixel_values = image_processor(frames, return_tensors="pt").pixel_values.to(device)
tokens = model.generate(pixel_values, **gen_kwargs)
caption = tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
print(caption) 

A group of children are learning how to walk on a mat.


In [4]:
# cut video into 10-second 

from loguru import logger
from decord import VideoReader
import cv2

vr = VideoReader("videoplayback.mp4")
frame_width, frame_height = vr[0].shape[1], vr[0].shape[0]
fps = vr.get_avg_fps()
logger.info(f"Frame width: {frame_width}, Frame height: {frame_height}, FPS: {fps}")
# this is the efficient way to obtain a long list of frames

def write_video(frames, file_name):
    frame_height, frame_width = frames[0].shape[:2]
    out = cv2.VideoWriter(file_name, cv2.VideoWriter_fourcc(*'h264'), int(fps), (frame_width, frame_height))
    for frame in frames:
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        out.write(frame)
    out.release()


for start_frame in range(5000, 6000, int(fps * 5)):
    end_frame = min(start_frame + int(fps * 5), len(vr))
    frames = vr.get_batch(range(start_frame, end_frame))
    # write frames to a video file
    frames = frames.asnumpy()
    write_video(frames, f"videos/output_{start_frame}_{end_frame}.mp4")


[32m2024-05-23 16:23:36.514[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mFrame width: 1280, Frame height: 720, FPS: 29.97002997002997[0m
