# Visualize the last attention layer of DINOv2
This notebook uses the same attention-visualization method as DINOv1:

https://gitlab.com/ziegleto-machine-learning/dino/-/blob/main/visualize_attention.py

The resulting attention maps are noticeably worse than those of DINOv1; the paper [Vision Transformers Need Registers](https://arxiv.org/pdf/2309.16588) may explain why.

In [None]:
import sys
import os
import torch
import numpy as np
from tqdm import tqdm

# Resolve the parent of the current working directory as the project root, so
# the local `dinov2` package can be imported from this notebook.
project_root = os.path.abspath("..")  # Adjust path if your notebook is deeper in directories

# Add project root to sys.path (appended, so already-installed packages with
# the same name earlier on the path still take precedence).
sys.path.append(project_root)

# We must disable xformers since there's no way to get attention map from xformers attention layers.
# NOTE(review): this is set before `dinov2` is imported below — presumably the
# flag is read at import time; confirm before reordering these statements.
os.environ['XFORMERS_DISABLED'] = 'True'
print(os.environ.get("XFORMERS_DISABLED"))

# Now you can import dinov2
import dinov2
from dinov2.eval.setup import build_model_for_eval
from dinov2.configs import load_and_merge_config
# Wildcard import: presumably the source of the video/attention helpers called
# later in this notebook (e.g. load_preprocess_video, attention_visualize).
from dinov2.utils.visualize import *

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda"
device = torch.device(device if torch.cuda.is_available() else "cpu")
print(device)




In [None]:
def main(video_path, model, output_path, device='cuda'):
    """Render the last-layer self-attention of ``model`` over a video.

    Every frame of the video is passed through the model one at a time, the
    last self-attention maps are gathered as NumPy arrays on the CPU,
    colorized with ``attention_visualize`` and written to ``output_path``
    using the fps value returned by ``load_preprocess_video``.

    Args:
        video_path: Path of the input video, forwarded to
            ``load_preprocess_video``.
        model: Model exposing ``patch_size`` and
            ``get_last_selfattention(frame)``.
        output_path: Destination forwarded to ``save_np_array_as_video``.
        device: Currently unused; kept so existing callers keep working.
            Neither the model nor the frames are moved by this function.

    Raises:
        ValueError: If the video yields no frames.
    """
    # target_size=None: no explicit target size is requested; the model's
    # patch size is forwarded to the loader.
    # NOTE(review): presumably the loader makes the frame size a multiple of
    # the patch size — confirm in load_preprocess_video.
    _, input_tensor, fps = load_preprocess_video(
        video_path, target_size=None, patch_size=model.patch_size
    )
    num_frames, _, height, width, patch_size, _, _ = print_video_model_stats(
        input_tensor, model
    )

    # Fail early with a clear message instead of letting np.vstack raise on
    # an empty list further down.
    if num_frames == 0:
        raise ValueError(f"No frames to process in video: {video_path!r}")

    attention_per_frame = []
    with torch.no_grad():
        for frame_idx in tqdm(range(num_frames), desc="Processing Frames", unit="frame"):
            # The model is fed one frame at a time, with a leading batch axis.
            frame = input_tensor[frame_idx].unsqueeze(0)
            attention = model.get_last_selfattention(frame)
            attention_per_frame.append(attention.detach().cpu().numpy())

    # Stack the per-frame maps along the first axis; per the original author's
    # note the result is (B, num_heads, num_tokens, num_tokens).
    last_selfattentions = np.vstack(attention_per_frame)

    # Colorize the attention maps of the whole video in one call.
    colorized_attn_maps = attention_visualize(
        last_selfattentions,
        height_in_patches=int(height / patch_size),
        width_in_patches=int(width / patch_size),
        patch_size=patch_size,
    )

    save_np_array_as_video(colorized_attn_maps, output_path=output_path, fps=fps)


In [None]:
# Alternative (kept for reference, not executed): build the ViT backbone
# directly and load the checkpoint by hand instead of using the eval config.
# from dinov2.models.vision_transformer import vit_small, vit_large

# model = vit_large(
#             patch_size=14,
#             img_size=526,
#             init_values=1.0,
#             #ffn_layer="mlp",
#             block_chunks=0
#     )

# model.load_state_dict(torch.load('../dinov2/checkpoints/dinov2_vitl14_pretrain.pth'))
# for p in model.parameters():
#     p.requires_grad = False
# model.to(device)
# model.eval()

# Model size letter; it selects the eval config, the checkpoint file and is
# embedded in the output file name.
model_size = "b"

# Input clip and output location, relative to the current working directory.
# A previously used clip (machine-specific absolute paths), for reference:
#   video_path  = "/home/lyk/Projects/dinov2/notebooks/data/videos/crane_video.mp4"
#   output_path = "/home/lyk/Projects/dinov2/notebooks/data/outputs/attn_video_crane.mp4"
video_path = "./data/videos/dinov2_dog.mp4"
output_path = f"./data/outputs/attn_dinov2_dog_{model_size}.mp4"


# Build the `dinov2_vit{model_size}14_pretrain` model from its eval config
# and the matching checkpoint.
conf = load_and_merge_config(f'eval/vit{model_size}14_pretrain')
model = build_model_for_eval(conf, f'../dinov2/checkpoints/dinov2_vit{model_size}14_pretrain.pth')

# Run the attention visualization over the whole clip and write the result.
main(video_path, model, output_path, device)
