In [11]:
import numpy as np
from decord import VideoReader, cpu, gpu
import torch

from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification_my_model
from huggingface_hub import hf_hub_download


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    converted_len = int(clip_len * frame_sample_rate)
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices


# video clip consists of 300 frames (10 seconds at 30 FPS)
file_path = hf_hub_download(
    repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
)
vr = VideoReader(file_path, num_threads=1, ctx=gpu(0))

# sample 16 frames
vr.seek(0)
indices = sample_frame_indices(clip_len=16, frame_sample_rate=4, seg_len=len(vr))
buffer = vr.get_batch(indices).asnumpy()

# create a list of NumPy arrays
video = [buffer[i] for i in range(buffer.shape[0])]

feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
model = VideoMAEForVideoClassification_my_model.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")

inputs = feature_extractor(video, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# model predicts one of the 400 Kinetics-400 classes
predicted_label = logits.argmax(-1).item()
print(model.config.id2label[predicted_label])

DECORDError: [06:29:53] /github/workspace/src/video/video_reader.cc:167: CUDA not enabled. Requested context GPU(0).

In [1]:
import numpy as np
from decord import VideoReader, cpu
import torch

from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification_my_model
from huggingface_hub import hf_hub_download


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    converted_len = int(clip_len * frame_sample_rate)
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices


# video clip consists of 300 frames (10 seconds at 30 FPS)
file_path = hf_hub_download(
    repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
)
vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))

# sample 16 frames
vr.seek(0)
indices = sample_frame_indices(clip_len=16, frame_sample_rate=4, seg_len=len(vr))
buffer = vr.get_batch(indices).asnumpy()

# create a list of NumPy arrays
video = [buffer[i] for i in range(buffer.shape[0])]

feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
model = VideoMAEForVideoClassification_my_model.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")

inputs = feature_extractor(video, return_tensors="pt")
with torch.no_grad():
    feat = outputs = model(**inputs)

Some weights of the model checkpoint at MCG-NJU/videomae-base-finetuned-kinetics were not used when initializing VideoMAEForVideoClassification_my_model: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing VideoMAEForVideoClassification_my_model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VideoMAEForVideoClassification_my_model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
torch.Tensor(val)

tensor([[-3.8003e-02, -2.8643e-01,  3.2709e-01, -1.8406e-01, -2.9160e-01,
          1.4196e-01,  1.9110e-01, -1.9731e-01, -4.7445e-01, -1.6631e-01,
         -3.8745e-01, -3.7608e-01, -4.8089e-02,  8.8254e-02,  6.9711e-02,
          4.9308e-01,  2.9021e-01, -3.5002e-01, -1.1945e-01, -4.0916e-01,
         -4.3320e-01,  3.9060e-01, -7.4332e-01, -1.1595e-01, -1.3681e-01,
          1.9175e-01, -1.3013e-01, -1.8015e-01,  1.0839e-01, -7.1339e-01,
          1.8250e-01, -3.3422e-01,  2.7725e-01,  9.4549e-02, -4.1194e-01,
         -2.9150e-01, -1.9264e-01,  3.4785e-01, -6.1796e-01,  2.6410e-01,
          1.7060e-01,  1.1051e-01,  2.5019e-01,  2.5462e-01,  4.2740e-01,
          3.1047e-01,  3.8930e-01, -1.8246e-02, -7.0095e-01, -1.5145e-01,
         -1.1871e-01,  4.9051e-01, -1.6864e-01,  2.0900e-01, -3.4250e-01,
          5.4452e-01, -5.2436e-01,  2.9382e-01,  3.9429e-01,  5.5121e-01,
         -9.3023e-02, -2.8334e-01,  6.4558e-01,  3.3604e-01,  1.7834e-01,
         -3.0248e-01, -6.9875e-02,  2.