Remember to install FFmpeg and the extra codecs first:

```sh
sudo apt install ffmpeg libavcodec-extra
```

In [None]:
# # Download model checkpoints:
# import torch

# # dinov2_vits14_reg_lc = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitb14_reg_lc')
# dinov2_vits14_reg_lc = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitb14_lc')

# # Then move the downloaded checkpoint (torch.hub caches it under ~/.cache/torch/hub/checkpoints by default) to dinov2/checkpoints

In [None]:

import sys
import os
import torch
import torch.nn.functional as F
import numpy as np
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt

# Forget any previously imported top-level `dinov2` module so that the
# import below is resolved again against the updated sys.path.
sys.modules.pop('dinov2', None)

# Absolute path of the project root (one directory above this notebook);
# adjust the relative path if the notebook lives deeper in the tree.
project_root = os.path.abspath("..")

# Insert at the front (rather than append) so this path is searched first.
sys.path.insert(0, project_root)

# Import dinov2 and report which copy was actually picked up.
import dinov2
print("Currently using dinov2 from:", dinov2.__file__)



from dinov2.eval.setup import build_model_for_eval
from dinov2.configs import load_and_merge_config
from dinov2.utils.visualize import print_video_model_stats, load_and_preprocess_video, get_model_output, \
    two_stage_pca, compute_cosine_similarity, save_triple_image, plot_videos, plot_distance_chart

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

Load dinov2 (with registers) model

In [None]:
# Backbone selection: ViT size letter plus the register suffix.
# "_reg4" selects the register variant; without registers the checkpoint
# would be ../dinov2/checkpoints/dinov2_vit{model_size}14_pretrain.pth.
model_size = "s"
with_registers = "_reg4"

# The config name and the checkpoint file are derived from the same two settings.
config_name = f'eval/vit{model_size}14{with_registers}_pretrain'
checkpoint_path = f'../dinov2/checkpoints/dinov2_vit{model_size}14{with_registers}_pretrain.pth'

conf = load_and_merge_config(config_name)
model = build_model_for_eval(conf, checkpoint_path)

# NOTE(review): THRESHOLD is not referenced by any cell visible here.
THRESHOLD = 0.2


In [None]:
from numpy import ndarray


def compute_distance_in_feature_space(video1, video2, model, device, threshold=0.6):
    """
    Computes a per-frame distance between two videos from their CLS token embeddings.

    Each cosine similarity `sim` is mapped to `(1 - sim) / 2`, so a similarity
    of 1 becomes a distance of 0 and a similarity of -1 becomes a distance of 1.

    Parameters:
        video1: The first video, passed to `get_model_output`.
        video2: The second video, passed to `get_model_output`.
        model: The model used to embed both videos.
        device: Currently unused; kept so existing callers keep working.
        threshold (float): Currently unused; kept so existing callers keep working.

    Returns:
        list: One normalized distance per similarity returned by
        `compute_cosine_similarity` (presumably one per frame — confirm).
    """
    # Kept for its side effect (presumably printing the video/model stats);
    # the values it returns are not needed here.
    print_video_model_stats(video1, model)

    # Only the CLS token embeddings are compared; the patch token embeddings
    # are discarded. Per the original note each CLS embedding is (B, D) with
    # B the video length — TODO confirm against get_model_output.
    cls_token_emb1, _ = get_model_output(model, video1)
    cls_token_emb2, _ = get_model_output(model, video2)

    similarities = compute_cosine_similarity(cls_token_emb1, cls_token_emb2)
    return [(1 - sim) / 2 for sim in similarities]

def plot(img1: np.ndarray, img2: np.ndarray, distance: float, output_path=None):
    """
    Plots two images side by side with the computed distance displayed as a title.

    Parameters:
        img1 (np.ndarray): The first image (original).
        img2 (np.ndarray): The second image (covered).
        distance (float): The computed distance between the two images.
        output_path (str, optional): Path to save the plot. If None (or any
            other falsy value), the plot is displayed instead.
    """
    # One row, two columns: the images sit next to each other.
    fig, axes = plt.subplots(1, 2, figsize=(8, 12))

    for ax, image, title in zip(axes, (img1, img2), ("Original", "Covered")):
        ax.imshow(image)
        ax.set_title(title)
        ax.axis('off')

    # Work on the explicit figure handle instead of pyplot's implicit
    # "current figure" so the result does not depend on global state.
    fig.suptitle(f"Distance: {distance:.6f}", fontsize=16)
    fig.tight_layout()

    # Save or display the figure
    if output_path:
        fig.savefig(output_path, dpi=150, bbox_inches='tight')
        plt.close(fig)  # release the figure once it has been written
    else:
        plt.show()


In [None]:
# Input videos and output location. Both names currently point at the same
# file; the second video is altered by the hook below.
base_dir = "./data/pong/videos"
video1_name, video2_name = "pong", "pong"
video1_path = os.path.join(base_dir, f"{video1_name}.avi")
video2_path = os.path.join(base_dir, f"{video2_name}.avi")
output_dir = "./data/pong/plots/"
os.makedirs(output_dir, exist_ok=True)

from dinov2.utils.visualize import colorize_area


def colorize_video2_hook(video):
    """Color a 4x4 area starting at location (35, 17) with [0, 255, 0] (green if RGB)."""
    return colorize_area(video, color=[0, 255, 0], starting_location=(35, 17), width=4, height=4)


# 448 is a multiple of the patch size (14).
video1_prenorm, video1_normalized, fps = load_and_preprocess_video(
    video1_path, target_size=448*1, patch_size=model.patch_size)
video2_prenorm, video2_normalized, fps = load_and_preprocess_video(
    video2_path, target_size=448*1, patch_size=model.patch_size, hook_function=colorize_video2_hook)

# Distances are computed on the pre-normalization frames.
distance_list = compute_distance_in_feature_space(video1_prenorm, video2_prenorm, model, device)

video1_prenorm = video1_prenorm.permute(0, 2, 3, 1).cpu().numpy()  # Change to (T, H, W, C)
video2_prenorm = video2_prenorm.permute(0, 2, 3, 1).cpu().numpy()  # Change to (T, H, W, C)
T, H, W, C = video1_prenorm.shape

# Preview at most the first four frames.
for t in range(min(T, 4)):
    frame1, frame2 = video1_prenorm[t], video2_prenorm[t]
    distance = distance_list[t]

    plot(frame1, frame2, distance, output_path=None)
    print(f"Distance at frame {t}: {distance}")



In [None]:


# Both artifacts are named after the second video and written to output_dir.
comparison_video_path = os.path.join(output_dir, f"{video2_name}.mp4")
distance_chart_path = os.path.join(output_dir, f"{video2_name}.png")

plot_videos(video1_prenorm, video2_prenorm, distance_list, output_path=comparison_video_path, fps=fps)
plot_distance_chart(distance_list, title="Frame-by-Frame Distance", output_path=distance_chart_path)

# Summarize the whole clip with the mean of the per-frame distances.
avg_distance = np.mean(distance_list)
print(f"Average distance: {avg_distance:.6f}")

## Model Analysis: Size S with Registers

### Distance Measurements Between Original and Modified Videos

| Video 1 | Video 2 | Distance AVG (normalized img) | Distance AVG (unnormalized img) |
|---------|---------|-------------------------------|--------------------------------|
| Original | No ball | -- | 0.0039 |
| Original | No left paddle | -- | 0.0046 |
| Original | No right paddle | -- | 0.0067 |
| Original | No white bar | -- | 0.0409 |
| Original | Add a small 4×4 white area | -- | 0.0146 |
| Original | Add a small 4×4 red area | -- | 0.0026 |
| Original | Add a small 4×4 green area | -- | 0.0071 |
| Original | Add a small 2×2 white area | -- | 0.0068 |
| Original | Add one white pixel | -- | 0.0081 |
| Original | Add one red pixel | -- | 0.0006 |
| Original | Add one green pixel | -- | 0.0007 |

**Note:** The base distance resulting from MP4 conversion is 0.0112.

- Adding a small pixel region can change the embedding less than removing the ball (one red pixel: 0.0006 < 0.0039; 4×4 red area: 0.0026 < 0.0039)
- The importance ranking of the game elements, from most to least significant:
  1. White bar (0.0409)
  2. Right paddle (0.0067)
  3. Left paddle (0.0046)
  4. Ball (0.0039) ~= Small pixel regions (0.0006–0.0146, depending on size and color)

