In [1]:
import os

# Point the DINOV3_LOCATION environment variable at the local DINOv3 checkout.
# NOTE: this is a machine-specific absolute path; change it to your own clone
# before running the notebook.
os.environ.update(
    DINOV3_LOCATION=r"C:\Users\Jan Magne\OneDrive - Akershus fylkeskommune\dinov3"
)

In [2]:
# Check that the environment variable was set correctly
# (prints None if it is missing from the process environment).

print("DINOV3_LOCATION:", os.environ.get("DINOV3_LOCATION"))

DINOV3_LOCATION: C:\Users\Jan Magne\OneDrive - Akershus fylkeskommune\dinov3


In [3]:
#importere nødvendige biblioteker

import os
import pickle
import torch
from PIL import Image
import numpy as np
from scipy import signal
import torchvision.transforms.functional as TF
import cv2


In [4]:
# Take the DINOv3 repository location from the environment variable and
# stop early if it is missing.

DINOV3_LOCATION = os.environ.get("DINOV3_LOCATION")

if DINOV3_LOCATION is None:
    raise ValueError("DINOV3_LOCATION environment variabel er ikke satt. Se tidligere steg.")

print("DINOv3 location set to:", DINOV3_LOCATION)

DINOv3 location set to: C:\Users\Jan Magne\OneDrive - Akershus fylkeskommune\dinov3


In [5]:
# Report, one value per line: the installed PyTorch version, whether CUDA is
# usable from this process, and the CUDA version PyTorch was built against.
print(
    torch.__version__,
    torch.cuda.is_available(),
    torch.version.cuda,
    sep="\n",
)

2.8.0+cu129
True
12.9


In [6]:
# Smallest backbone: a good starting point because it needs the fewest resources.
MODEL_DINOV3_VITS = "dinov3_vits16"

# Other backbones you can try.
# Remember to download the weights for whichever model you select.
MODEL_DINOV3_VITSP = "dinov3_vits16plus"
MODEL_DINOV3_VITB = "dinov3_vitb16"
MODEL_DINOV3_VITL = "dinov3_vitl16"
MODEL_DINOV3_VITHP = "dinov3_vith16plus"
MODEL_DINOV3_VIT7B = "dinov3_vit7b16"

# Backbone used by the rest of the notebook (currently ViT-B/16).
# Change this single line to switch to another model.
MODEL_NAME = MODEL_DINOV3_VITB

# Build the model from the local repository checkout (source="local" makes
# torch.hub use the hubconf found in DINOV3_LOCATION instead of GitHub).
# NOTE(review): pretrained=True asks the entry point for pretrained weights;
# where they are read from is decided by the repository's hubconf - confirm
# they are available locally if no network download is wanted.
model = torch.hub.load(DINOV3_LOCATION, MODEL_NAME, source="local", pretrained=True)

# Inference only: switch to evaluation mode and move the model to the GPU.
model.eval().cuda()


# Smoke test: push one random 224x224 RGB batch through the model and report
# the output shape, whether it contains NaNs, and its value range.
print("\nTesting model with dummy input...")
dummy_input = torch.randn(1, 3, 224, 224).cuda()
try:
    with torch.no_grad():
        test_output = model(dummy_input)
        print(f"Model test successful. Output shape: {test_output.shape}")
        print(f"Output has NaN: {torch.isnan(test_output).any()}")
        print(f"Output range: {test_output.min():.4f} to {test_output.max():.4f}")
except Exception as err:
    # Best effort: report the problem without aborting the notebook run.
    print(f"Model test failed: {err}")
    print("There might be an issue with the model or checkpoint loading.")


Testing model with dummy input...
Model test successful. Output shape: torch.Size([1, 768])
Output has NaN: False
Output range: -2.1267 to 2.0255


In [7]:
# Constants for the patch size and the image size

PATCH_SIZE = 16  # each patch covers 16x16 pixels

# Height (in pixels) that images are scaled to, written as a whole number of
# patch rows: 96 rows * 16 px = 1536 px.
# A lighter alternative is 48 rows (768 px).
IMAGE_SIZE = 96 * PATCH_SIZE

In [8]:
# Used further down to resize images/masks so that their dimensions line up with the patch grid.
def resize_transform(mask_image: Image, image_size: int = IMAGE_SIZE, patch_size: int = PATCH_SIZE) -> torch.Tensor:
    """Resize an image so both sides are whole multiples of ``patch_size``.

    The target height is ``image_size`` rounded down to a multiple of
    ``patch_size``; the target width follows the original aspect ratio and is
    rounded down to a multiple of ``patch_size`` as well.

    Args:
        mask_image: Image whose ``.size`` is ``(width, height)``; it is passed
            to ``TF.resize``.
        image_size: Target height in pixels before rounding.
        patch_size: Side length of one square patch in pixels.

    Returns:
        The resized image converted with ``TF.to_tensor``.
    """
    src_width, src_height = mask_image.size
    patch_rows = int(image_size / patch_size)
    patch_cols = int((src_width * image_size) / (src_height * patch_size))
    target_hw = (patch_rows * patch_size, patch_cols * patch_size)
    resized = TF.resize(mask_image, target_hw)
    return TF.to_tensor(resized)

In [10]:

# Load the pickled patch classifier (presumably the foreground/background
# classifier trained on ViT-B features - confirm it matches MODEL_NAME).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('fg_classifier_VITB.pkl', 'rb') as clf_file:
    clf = pickle.load(clf_file)

# Number of self-attention layers in each DINOv3 ViT model
MODEL_TO_NUM_LAYERS = {
    **dict.fromkeys((MODEL_DINOV3_VITS, MODEL_DINOV3_VITSP, MODEL_DINOV3_VITB), 12),
    MODEL_DINOV3_VITL: 24,
    MODEL_DINOV3_VITHP: 32,
    MODEL_DINOV3_VIT7B: 40,
}

# Layer count of the model selected through MODEL_NAME
n_layers = MODEL_TO_NUM_LAYERS[MODEL_NAME]

# Per-channel (R, G, B) ImageNet statistics used to normalise model inputs
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

In [11]:


def show_video_with_foreground_mask(model, clf, resize_transform, 
                                   IMAGENET_MEAN, IMAGENET_STD, PATCH_SIZE, n_layers,
                                   apply_median_filter=True, show_original=True, 
                                   fps_limit=30, window_size=(1200, 600)):

    
    def process_frame_for_segmentation(frame):
        """Process a single frame and return foreground mask"""
        # Convert BGR to RGB
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame_pil = Image.fromarray(frame_rgb)
        
        # Preprocess
        frame_resized = resize_transform(frame_pil)
        frame_normalized = TF.normalize(frame_resized, mean=IMAGENET_MEAN, std=IMAGENET_STD)
        
        # Extract features
        with torch.inference_mode():
            with torch.autocast(device_type='cuda', dtype=torch.float32):
                feats = model.get_intermediate_layers(
                    frame_normalized.unsqueeze(0).cuda(), 
                    n=range(n_layers), 
                    reshape=True, 
                    norm=True
                )
                x = feats[-1].squeeze().detach().cpu()
                dim = x.shape[0]
                x = x.view(dim, -1).permute(1, 0)
        
        # Get foreground scores
        h_patches, w_patches = [int(d / PATCH_SIZE) for d in frame_resized.shape[1:]]
        fg_score = clf.predict_proba(x)[:, 1].reshape(h_patches, w_patches)
        
        # Apply median filter if requested
        if apply_median_filter:
            fg_score = signal.medfilt2d(fg_score, kernel_size=3)
        
        return fg_score, frame_resized
    
    # Open video
    cap = cv2.VideoCapture(0)
    
    if not cap.isOpened():
        print("Error: Could not open webcam")
        return
    
    # Get video properties
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    original_fps = cap.get(cv2.CAP_PROP_FPS)
    
    print(f"Video: {total_frames} frames at {original_fps:.1f} FPS")
    print("Controls: 'q' to quit, 'space' to pause/resume, 's' to save frame")
    
    # Calculate frame delay for FPS limiting
    frame_delay = max(1, int(1000 / min(fps_limit, original_fps)))
    
    paused = False
    frame_count = 0
    
    while True:
        if not paused:
            ret, frame = cap.read()
            if not ret:
                print("End of video reached")
                break
            
            frame_count += 1
        
        try:
            # Process frame for segmentation
            fg_score, frame_resized = process_frame_for_segmentation(frame)
            
            # Convert tensors to numpy for display
            frame_np = (frame_resized.permute(1, 2, 0).numpy() * 255).astype(np.uint8)
            
            # Create colorized mask
            fg_score_norm = (fg_score * 255).astype(np.uint8)
            mask_colored = cv2.applyColorMap(fg_score_norm, cv2.COLORMAP_HOT)
            
            # Resize mask to match frame size
            frame_height, frame_width = frame_np.shape[:2]
            mask_resized = cv2.resize(mask_colored, (frame_width, frame_height))
            
            if show_original:
                # Show original and mask side by side
                frame_bgr = cv2.cvtColor(frame_np, cv2.COLOR_RGB2BGR)
                combined = np.hstack([frame_bgr, mask_resized])
                
                # Add text overlay
                cv2.putText(combined, f"Frame: {frame_count}/{total_frames}", 
                           (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
                cv2.putText(combined, "Original", (10, frame_height - 10), 
                           cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
                cv2.putText(combined, "Foreground Mask", (frame_width + 10, frame_height - 10), 
                           cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
                
                display_frame = combined
            else:
                # Show only the mask
                cv2.putText(mask_resized, f"Frame: {frame_count}/{total_frames}", 
                           (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
                display_frame = mask_resized
            
            # Resize for display if needed
            display_height, display_width = display_frame.shape[:2]
            if display_width > window_size[0] or display_height > window_size[1]:
                scale = min(window_size[0] / display_width, window_size[1] / display_height)
                new_width = int(display_width * scale)
                new_height = int(display_height * scale)
                display_frame = cv2.resize(display_frame, (new_width, new_height))
            
            # Show frame
            cv2.imshow('Foreground Segmentation', display_frame)
            
        except Exception as e:
            print(f"Error processing frame {frame_count}: {e}")
            continue
        
        # Handle keyboard input
        key = cv2.waitKey(frame_delay) & 0xFF
        
        if key == ord('q'):
            print("Quitting...")
            break
        elif key == ord(' '):  # Space bar
            paused = not paused
            print("Paused" if paused else "Resumed")
        elif key == ord('s'):  # Save frame
            filename = f"frame_{frame_count:06d}_segmentation.png"
            cv2.imwrite(filename, display_frame)
            print(f"Saved {filename}")
    
    # Cleanup
    cap.release()
    cv2.destroyAllWindows()
    print("Video display ended")


# Start the live webcam preview using the model, classifier and constants
# configured earlier in the notebook.
show_video_with_foreground_mask(
    model=model,
    clf=clf,
    resize_transform=resize_transform,
    IMAGENET_MEAN=IMAGENET_MEAN,
    IMAGENET_STD=IMAGENET_STD,
    PATCH_SIZE=PATCH_SIZE,
    n_layers=n_layers,
)

Video: -1 frames at 30.0 FPS
Controls: 'q' to quit, 'space' to pause/resume, 's' to save frame
Quitting...
Video display ended
