# The VRAM footprint and Time cost of generating an embedding and attention map

In [None]:
import sys
import os
import torch
import numpy as np
from tqdm import tqdm
import time

# Get the absolute path of the project root
project_root = os.path.abspath("..")  # Adjust path if your notebook is deeper in directories

# Add project root to sys.path
sys.path.append(project_root)

os.environ['XFORMERS_DISABLED'] = 'False'
print(os.environ.get("XFORMERS_DISABLED"))

# Now you can import dinov2
import dinov2
from dinov2.eval.setup import build_model_for_eval
from dinov2.configs import load_and_merge_config
from dinov2.utils.visualize import load_and_preprocess_video, print_video_model_stats, get_last_self_attn, get_model_output, get_model_output_vram_efficient

device = "cuda"
device = torch.device(device if torch.cuda.is_available() else "cpu")
print(device)


TARGET_SIZE = 896 # For a 64*64 video, we resize it to 896*896, and since the dinov2 patch size is 14, we will get a 64*64 attention map.

True




cuda


In [2]:
def main(video_path, model, device='cuda'):
    # Load and preprocess video
    print(f"Loading video from {video_path}...")
    video_prenorm, video_normalized, fps = load_and_preprocess_video(
        video_path, 
        target_size=TARGET_SIZE, 
        patch_size=model.patch_size
    )  # 448 is multiples of patch_size (14)
    
    video_normalized = video_normalized[:10]
    # Print video and model stats
    T, C, H, W, patch_size, embedding_dim, patch_num = print_video_model_stats(video_normalized, model)
    H_p, W_p = int(H/patch_size), int(W/patch_size)
    
    # Helper function to measure memory and time
    def measure_execution(name, func, *args, **kwargs):
        # For PyTorch CUDA tensors
        if device.type == 'cuda':
            # Record starting memory
            torch.cuda.synchronize()
            start_mem = torch.cuda.memory_allocated() / (1024 ** 2)  # MB
            start_time = time.time()
            
            # Execute function
            result = func(*args, **kwargs)
            
            # Record ending memory and time
            torch.cuda.synchronize()
            end_time = time.time()
            end_mem = torch.cuda.memory_allocated() / (1024 ** 2)  # MB
            
            # Print results
            print(f"\n{'-'*50}")
            print(f"{name} Performance Metrics:")
            print(f"Time: {(end_time - start_time)*1000:.2f} ms")
            print(f"VRAM: Current usage: {end_mem:.2f} MB")
            print(f"VRAM: Peak increment: {end_mem - start_mem:.2f} MB")
            
            # Try to explicitly free memory for better measurement
            if device == 'cuda':
                torch.cuda.empty_cache()
                
            return result
        
        # For CPU or other devices
        else:
            start_time = time.time()
            result = func(*args, **kwargs)
            print(f"{name} Time: {(time.time() - start_time)*1000:.2f} ms")
            return result
    
    # Measure embeddings generation
    print("\nGenerating embeddings...")
    cls_token_emb, patch_token_embs = measure_execution(
        "Embedding Generation", 
        get_model_output,
        model, 
        video_normalized
    )
    
    # Clear cache between measurements if using GPU
    if device == 'cuda':
        torch.cuda.empty_cache()
    
    # Allow some time between measurements
    time.sleep(1)
    
    # Measure attention map generation
    print("\nGenerating attention maps...")
    last_self_attention = measure_execution(
        "Attention Map Generation", 
        get_last_self_attn,
        model, 
        video_normalized
    )
    
    
    
    
    
    
    
    
    
    # Return all results
    return {
        'video_info': {
            'prenorm': video_prenorm,
            'normalized': video_normalized,
            'fps': fps,
            'dims': (T, C, H, W),
            'patch_size': patch_size,
            'patch_dims': (H_p, W_p)
        },
        'model_output': {
            'cls_token': cls_token_emb,
            'patch_tokens': patch_token_embs,
            'attention': last_self_attention
        },
        'embedding_dim': embedding_dim
    }

In [3]:
model_size = "s"
base_dir = "./data"
# exp_name = "natural"
# video_name = "dog_first5sec"
# video_name = "dog_cat_first5sec"

exp_name = "ms_pacman"
video_name = "ms_pacman"

video_path = f"{base_dir}/{exp_name}/videos/{video_name}.mp4"
# Use `dinov2_vitb14_pretrain`
conf = load_and_merge_config(f'eval/vit{model_size}14_reg4_pretrain')
model = build_model_for_eval(conf, f'../dinov2/checkpoints/dinov2_vit{model_size}14_reg4_pretrain.pth')

main(video_path, model, device)


Loading video from ./data/ms_pacman/videos/ms_pacman.mp4...
Video FPS: 20.00, Total Frames: 128, Duration: 6.40 seconds
Input tensor shape: Batch=10, Channels=3, Height=896, Width=896
Patch size: 14
Embedding dimension: 384
Number of patches of each image: 4096

Generating embeddings...

--------------------------------------------------
Embedding Generation Performance Metrics:
Time: 711.98 ms
VRAM: Current usage: 2444.27 MB
VRAM: Peak increment: 8.12 MB

Generating attention maps...


100%|██████████| 10/10 [00:03<00:00,  3.23it/s]



--------------------------------------------------
Attention Map Generation Performance Metrics:
Time: 5066.27 ms
VRAM: Current usage: 2444.27 MB
VRAM: Peak increment: 0.00 MB


{'video_info': {'prenorm': tensor([[[[0.6824, 0.6824, 0.6824,  ..., 0.7255, 0.7255, 0.7255],
            [0.6824, 0.6824, 0.6824,  ..., 0.7255, 0.7255, 0.7255],
            [0.6824, 0.6824, 0.6824,  ..., 0.7255, 0.7255, 0.7255],
            ...,
            [0.0078, 0.0078, 0.0078,  ..., 0.0118, 0.0118, 0.0118],
            [0.0078, 0.0078, 0.0078,  ..., 0.0118, 0.0118, 0.0118],
            [0.0078, 0.0078, 0.0078,  ..., 0.0118, 0.0118, 0.0118]],
  
           [[0.3059, 0.3059, 0.3059,  ..., 0.2941, 0.2941, 0.2941],
            [0.3059, 0.3059, 0.3059,  ..., 0.2941, 0.2941, 0.2941],
            [0.3059, 0.3059, 0.3059,  ..., 0.2941, 0.2941, 0.2941],
            ...,
            [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
            [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
            [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
  
           [[0.3098, 0.3098, 0.3098,  ..., 0.3137, 0.3137, 0.3137],
            [0.3098, 0.3098, 0.3098,  ..., 0.3137

Exp shows that generating attn map is much slower than generating CLS token embedding.

The reason underlying is unknown. It needs math.


--------------------------------------------------
Attention Map Generation Performance Metrics:
Time: 5326.52 ms
VRAM: Current usage: 2444.27 MB
VRAM: Peak increment: 8.12 MB

Generating embeddings...

--------------------------------------------------
Embedding Generation Performance Metrics:
Time: 568.71 ms
VRAM: Current usage: 2444.27 MB
VRAM: Peak increment: 0.00 MB




