In [1]:
import tensorflow as tf
import cv2
import numpy as np
from transformers import ViTFeatureExtractor, TFAutoModel

In [2]:
# Avoid OOM errors by setting GPU Memory Consumption Growth
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
print(gpus)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
# Load pre-trained Vision Transformer model and feature extractor
model_name = "google/vit-base-patch16-224-in21k"
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)
model = TFAutoModel.from_pretrained(model_name)

All PyTorch model weights were used when initializing TFViTModel.

All the weights of TFViTModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFViTModel for predictions without further training.


In [4]:
def extract_frames(video_path):
    """Extract frames from the video."""
    cap = cv2.VideoCapture(video_path)
    frames = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()
    return frames

In [5]:
def preprocess_frames(frames):
    """Preprocess frames to the format expected by ViT model."""
    inputs = feature_extractor(images=frames, return_tensors="tf")
    return inputs

In [6]:
def extract_features_in_batches(frames, batch_size=10):
    """Extract features from frames using ViT model in batches."""
    num_frames = len(frames)
    all_features = []
    for start in range(0, num_frames, batch_size):
        end = min(start + batch_size, num_frames)
        batch_frames = frames[start:end]
        inputs = preprocess_frames(batch_frames)
        outputs = model(inputs['pixel_values'])
        all_features.append(outputs.last_hidden_state)
    return tf.concat(all_features, axis=0)

In [7]:
video_path = 'doraemon.mp4'
frames = extract_frames(video_path)

In [8]:
features = extract_features_in_batches(frames)

In [17]:
FEATURES = features

In [9]:
print(features.shape)

(737, 197, 768)


In [None]:
# 1 -> 197 
#       1 -> [768]

In [10]:
type(features)

tensorflow.python.framework.ops.EagerTensor

In [11]:
from transformers import TFBertModel, BertConfig

In [22]:
# Load a pretrained Vision Transformer (ViT) model
model_name = "google/vit-base-patch16-224-in21k"
transformer_model = TFAutoModel.from_pretrained(model_name)

All PyTorch model weights were used when initializing TFViTModel.

All the weights of TFViTModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFViTModel for predictions without further training.


In [29]:
# Add batch dimension to the feature tensor
new_features = tf.expand_dims(features, axis=0)  # Shape: (1, 737, 197, 768)

In [30]:
new_features.shape

TensorShape([1, 737, 197, 768])

In [31]:
# Reshape features to match transformer input (batch_size, sequence_length, hidden_size)
# new_features = tf.reshape(new_features, (new_features.shape[0], -1, new_features.shape[-1]))  # Shape: (1, 737*197, 768)

In [32]:
new_features.shape

TensorShape([1, 737, 197, 768])

In [33]:
NEW_FEATURES = new_features

In [34]:
# Pass the features through the transformer model
outputs = transformer_model(new_features)

ValueError: Exception encountered when calling layer "patch_embeddings" "                 f"(type TFViTPatchEmbeddings).

Make sure that the channel dimension of the pixel values match with the one set in the configuration.

Call arguments received by layer "patch_embeddings" "                 f"(type TFViTPatchEmbeddings):
  • pixel_values=tf.Tensor(shape=(1, 737, 197, 768), dtype=float32)
  • interpolate_pos_encoding=None
  • training=False