# Visualize the last attention layer of DINOv2
This is the same visualization method used in DINOv1. I modified the ViT slightly to support the API used in DINOv1; the approach is inspired by [Tobias Ziegler](https://gitlab.com/ziegleto-machine-learning/dino).

Meanwhile, to remove the artifacts, please use a model equipped with registers; see this [paper](https://arxiv.org/abs/2309.16588). The artifacts are not observed in DINOv1, whereas they have a large impact on DINOv2.


The results are still slightly worse compared to DINOv1.

In [67]:
import sys
import os
import torch
import numpy as np
from tqdm import tqdm

# Resolve the project root (the parent of the current working directory) so the
# local `dinov2` package can be imported from this notebook.
project_root = os.path.abspath("..")  # Adjust path if your notebook is deeper in directories

# Make the project root importable.
# NOTE(review): re-running this cell appends the same path again; harmless, but sys.path grows.
sys.path.append(project_root)

# We must disable xformers since there's no way to get the attention map from xformers attention layers.
# This is set before the dinov2 imports below (presumably the flag is read at import time; confirm).
os.environ['XFORMERS_DISABLED'] = 'True'
print(os.environ.get("XFORMERS_DISABLED"))

# Now dinov2 can be imported (sys.path and XFORMERS_DISABLED are both set above).
import dinov2
from dinov2.eval.setup import build_model_for_eval
from dinov2.configs import load_and_merge_config
from dinov2.utils.visualize import load_and_preprocess_video, print_video_model_stats, get_last_self_attn, get_attention_map, colorize_attention_map, save_np_array_as_video

# Prefer CUDA and fall back to CPU when no GPU is available.
device = "cuda"
device = torch.device(device if torch.cuda.is_available() else "cpu")
print(device)


# Percentile used to threshold the attention map; the output file names below
# report the kept fraction as `top_{100-PERCENTILE}_percent`.
# Values that worked well per video:
# For pong, 97
# For quadruped, 95
PERCENTILE = 95
# Side length (in pixels) the video is resized to before it is fed to the model.
TARGET_SIZE = 896 # For a 64*64 video, we resize it to 896*896, and since the dinov2 patch size is 14, we will get a 64*64 attention map.

True
cuda


In [68]:
def _upsample_patch_mask(patch_mask, patch_size, height, width):
    """Expand a per-patch mask of shape (T, H_p, W_p) to a (T, height, width) uint8 pixel mask.

    Every patch value is replicated over its ``patch_size`` x ``patch_size``
    pixel block (nearest-neighbour, blocky result). Pixels beyond the last
    full patch, if any, stay 0.
    """
    patch_mask = np.asarray(patch_mask)
    pixel_mask = np.zeros((patch_mask.shape[0], height, width), dtype=np.uint8)
    blocks = np.repeat(np.repeat(patch_mask, patch_size, axis=1), patch_size, axis=2)
    pixel_mask[:, :blocks.shape[1], :blocks.shape[2]] = blocks
    return pixel_mask


def main(video_path, model, output_path, device='cuda', masked_output_path = "", thresholded_output_path=""):
    """Visualize the model's last-layer self-attention over a video and save the results.

    Always writes the colorized attention-map video to ``output_path``. When
    ``thresholded_output_path`` and/or ``masked_output_path`` is given, an
    attention mask is derived from the module-level ``PERCENTILE`` and used to
    write the optional videos; statistics about the masked-out weights are
    printed in that case.

    Args:
        video_path: Path of the input video.
        model: Model exposing ``patch_size`` and ``num_register_tokens``.
        output_path: Destination of the colorized attention-map video.
        device: Currently unused by this function; kept so existing callers
            that pass it keep working.
        masked_output_path: If non-empty, destination of the original video
            multiplied by the pixel-level attention mask. It no longer
            requires ``thresholded_output_path`` to be set as well.
        thresholded_output_path: If non-empty, destination of the colorized
            attention map with values outside the mask zeroed.

    Returns:
        None. All results are written to disk or printed.
    """
    # The video is resized to the module-level TARGET_SIZE before being fed to the model.
    video_prenorm, video_normalized, fps = load_and_preprocess_video(
        video_path, target_size=TARGET_SIZE, patch_size=model.patch_size
    )
    # Only T, H, W and patch_size are needed here; the remaining stats are just printed by the helper.
    T, _, H, W, patch_size, _, _ = print_video_model_stats(video_normalized, model)
    H_p, W_p = H // patch_size, W // patch_size

    last_self_attention = get_last_self_attn(model, video_normalized)
    normalized_attention_map = get_attention_map(
        last_self_attention,
        height_in_patches=H_p,
        width_in_patches=W_p,
        num_register_tokens=model.num_register_tokens,
    )  # (T, H_p, W_p)

    colorized_attn_map = colorize_attention_map(normalized_attention_map, patch_size=patch_size)
    save_np_array_as_video(colorized_attn_map, output_path=output_path, fps=fps)

    # Nothing else to do unless one of the optional outputs was requested.
    if not (thresholded_output_path or masked_output_path):
        return

    from dinov2.utils.visualize import find_percentile_threshold, generate_attention_mask
    threshold = find_percentile_threshold(normalized_attention_map, percentile=PERCENTILE)
    attention_mask = generate_attention_mask(normalized_attention_map, threshold=threshold)

    if thresholded_output_path:
        thresholded_attn_map = normalized_attention_map * attention_mask
        colorized_thresholded_attn_map = colorize_attention_map(thresholded_attn_map, patch_size=patch_size)
        save_np_array_as_video(colorized_thresholded_attn_map, output_path=thresholded_output_path, fps=fps)

    # Statistics about the filtered weights. np.sum over a float mask returns a
    # float, which the `:,` format would render as e.g. "498,073.0", so cast to int.
    total_elements = attention_mask.size
    filtered_elements = int(total_elements - np.sum(attention_mask))
    filtered_percentage = (filtered_elements / total_elements) * 100

    print(f"Threshold value: {threshold:.4f}")
    print(f"Filtered out {filtered_elements:,} of {total_elements:,} attention weights")
    print(f"Filtered out {filtered_percentage:.2f}% of attention weights")

    if masked_output_path:
        from dinov2.utils.visualize import min_max_normalize

        # Blow the patch-level mask up to pixel resolution (vectorized instead of a
        # Python loop over every frame and patch).
        pixel_mask = _upsample_patch_mask(attention_mask[:T, :H_p, :W_p], patch_size, H, W)

        # (T, C, H, W) -> (T, H, W, C); the trailing axis of the mask broadcasts over channels.
        frames = video_prenorm.permute(0, 2, 3, 1).detach().cpu().numpy()
        masked_video = min_max_normalize(frames * pixel_mask[..., np.newaxis])
        save_np_array_as_video(masked_video, output_path=masked_output_path, fps=fps)
    


In [None]:
# Experiment selection. Earlier runs used exp_name = "natural" with
# video_name = "dog_first5sec" or video_name = "dog_cat_first5sec".
exp_name = "ms_pacman"
video_name = "ms_pacman"

# Model variant letter (fills the `vit{model_size}14_reg4` names below) and data root.
model_size = "s"
base_dir = "./data"

# Input video plus the three output videos: the full attention map, the masked
# original video, and the thresholded attention map.
video_path = f"{base_dir}/{exp_name}/videos/{video_name}.mp4"
output_path = f"{base_dir}/{exp_name}/outputs/{video_name}_attn_{model_size}.mp4"
masked_output_path = f"{base_dir}/{exp_name}/outputs/{video_name}_top_{100-PERCENTILE}_percent_attn_masked_{model_size}.mp4"
thresholded_output_path = f"{base_dir}/{exp_name}/outputs/{video_name}_top_{100-PERCENTILE}_percent_attn_{model_size}.mp4"

# Load the `vit{model_size}14_reg4_pretrain` eval config and build the model from the local checkpoint.
conf = load_and_merge_config(f'eval/vit{model_size}14_reg4_pretrain')
model = build_model_for_eval(conf, f'../dinov2/checkpoints/dinov2_vit{model_size}14_reg4_pretrain.pth')

main(
    video_path,
    model,
    output_path,
    device=device,
    masked_output_path=masked_output_path,
    thresholded_output_path=thresholded_output_path,
)


Video FPS: 20.00, Total Frames: 128, Duration: 6.40 seconds
Input tensor shape: Batch=128, Channels=3, Height=896, Width=896
Patch size: 14
Embedding dimension: 384
Number of patches of each image: 4096


100%|██████████| 128/128 [00:36<00:00,  3.56it/s]


Video saved to ./data/breakout/outputs/breakout_attn_s.mp4
Video saved to ./data/breakout/outputs/breakout_top_5_percent_attn_s.mp4
Threshold value: 0.0695
Filtered out 498,073.0 of 524,288 attention weights
Filtered out 95.00% of attention weights
Video saved to ./data/breakout/outputs/breakout_top_5_percent_attn_masked_s.mp4


A comparison of the attention maps computed on the original video and on the masked video shows that the DINO representation space preserves even very subtle signals: the ball and paddles are almost completely masked out in the masked video, yet the attention map still contains them.