Remember to install FFmpeg and the extra codec libraries before running this notebook:

```sh
sudo apt install ffmpeg libavcodec-extra
```

In [1]:
import sys
import os
import torch
import numpy as np

# Resolve the project root as the parent of the current working directory.
# Adjust the relative path if the notebook lives deeper in the directory tree.
project_root = os.path.abspath("..")

# Make the in-repo `dinov2` package importable. The membership check keeps
# the cell idempotent: re-running it no longer appends duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

# Project imports (these rely on the sys.path entry added above)
import dinov2
from dinov2.eval.setup import build_model_for_eval
from dinov2.configs import load_and_merge_config
# Explicit names instead of `import *`, so it is clear where each pipeline
# helper used later in the notebook comes from.
from dinov2.utils.visualize import (
    get_patch_embeddings,
    load_preprocess_video,
    print_video_model_stats,
    save_triple_video,
    two_stage_pca,
)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)



cuda


In [2]:

# Main pipeline execution
def main(video_path, model, threshold=0.6, device='cuda',
         output_path='/home/lyk/Projects/dinov2/notebooks/data/outputs/triple_output.mp4'):
    """Run the video pipeline: load, embed, two-stage PCA, save the result.

    Args:
        video_path: Path of the input video; passed to ``load_preprocess_video``.
        model: Model handed to ``print_video_model_stats`` and
            ``get_patch_embeddings``. Its ``patch_size`` attribute is
            forwarded to ``save_triple_video``.
        threshold: Forwarded to ``two_stage_pca`` as ``threshold``.
        device: Kept so existing callers continue to work.
            NOTE(review): this argument is not used anywhere in the body;
            confirm whether the helpers handle device placement themselves.
        output_path: Destination passed to ``save_triple_video``. Defaults to
            the location that was previously hard-coded in this function.

    Returns:
        None. The result is whatever ``save_triple_video`` writes.
    """
    # 448 is a multiple of patch_size (14)
    raw_tensor, input_tensor = load_preprocess_video(video_path, target_size=448)
    print_video_model_stats(input_tensor, model)

    patch_embeddings = get_patch_embeddings(model, input_tensor)
    (
        reduced_fg_patch_embeddings,
        nums_of_fg_patches,
        masks,
        reduced_patch_embeddings,
    ) = two_stage_pca(patch_embeddings, threshold=threshold)

    save_triple_video(
        raw_tensor,
        reduced_fg_patch_embeddings,
        nums_of_fg_patches,
        masks,
        reduced_patch_embeddings,
        # presumably the number of patches per frame — TODO confirm
        patch_embeddings.shape[1],
        model.patch_size,
        output_path=output_path,
    )


In [3]:
# Notes: use as large a model as possible; the video should be longer.

# Build the `dinov2_vitb14_pretrain` model from its eval config and checkpoint
conf = load_and_merge_config('eval/vitb14_pretrain')
model = build_model_for_eval(conf, '../dinov2/checkpoints/dinov2_vitb14_pretrain.pth')

# Input clip (swap in the commented line to use the crane video instead)
video_path = "/home/lyk/Projects/dinov2/notebooks/data/videos/01_dog.mp4"
# video_path = "/home/lyk/Projects/dinov2/notebooks/data/videos/crane_video.mp4"

# Threshold forwarded to main()
threshold = 0.6

main(video_path, model, threshold=threshold, device=device)

Preprocessed video tensor shape: torch.Size([7, 3, 448, 448])
Input tensor shape: Batch=7, Channels=3, Height=448, Width=448
Patch size: 14
Embedding dimension: 768
Number of patches of each image: 1024
Num of foreground patches of image 0: 581
Num of foreground patches of image 1: 583
Num of foreground patches of image 2: 582
Num of foreground patches of image 3: 581
Num of foreground patches of image 4: 579
Num of foreground patches of image 5: 578
Num of foreground patches of image 6: 579
Total num of foreground patches: 4063
Explained variance ratio by PCA components: [0.1660797  0.10861541 0.09618893]
Triple video saved to /home/lyk/Projects/dinov2/notebooks/data/outputs/triple_output.mp4
