# 1. Notebook Introduction
This is a utility notebook that performs the following operations:
- First, it loads one of the largest Depth Anything V3 models (DA3NESTED-GIANT-LARGE) and runs it on a single sample image to verify that the model works correctly.
- After that, the notebook loads the original CamVid dataset and generates the additional data used in this project (depth maps, confidence maps, segmentation masks...).

# 2. Imports

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm

import torch
import torchvision.transforms.functional as TF
from torchvision.utils import save_image

from depth_anything_3.api import DepthAnything3

# 3. Loading model
Downloads and instantiates the Depth Anything V3 model.
Caution: the download is almost 7 GB!

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Fetch the pretrained checkpoint from the Hugging Face Hub, move it to the
# selected device and call eval() on it before running any inference.
model = DepthAnything3.from_pretrained("depth-anything/DA3NESTED-GIANT-LARGE").to(device=device)
model.eval()
print('Model loaded!')

# 4. Toy Example
This is just a toy example to verify that the model's prediction works correctly.

In [None]:
# Single sample image used for the sanity check, wrapped in a list because it
# is handed as-is to model.inference in the next cell.
# NOTE(review): absolute, machine-specific path — point it at your local CamVid copy.
toy_image = ['/home/alumno/Desktop/datos/Computer Vision/depth-anything-3/CamVid/train/0001TP_009210.png']

In [None]:
# Run the model on the sample image with a processing resolution of 960.
toy_pred = model.inference(toy_image, process_res=960)

# Print the shape of the first depth map, then of the first confidence map.
print(toy_pred.depth[0].shape, toy_pred.conf[0].shape, sep='\n')

In [None]:
# Print the container type of the first depth map, then of the first confidence map.
print(type(toy_pred.depth[0]), type(toy_pred.conf[0]), sep='\n')

In [None]:
# Show whether PyTorch can see a CUDA device in this session.
torch.cuda.is_available()

In [None]:
# Wrap the predicted depth map in a tensor and prepend the batch and channel
# axes in one indexing step: (H, W) -> (1, 1, H, W).
depth_tensor = torch.from_numpy(toy_pred.depth[0])[None, None]

# Resize to 720x960, then drop the two leading singleton axes again:
# (1, 1, 720, 960) -> (720, 960).
resized = TF.resize(depth_tensor, size=[720, 960], antialias=True)[0, 0]

# NumPy version of the resized map, used by the plotting and saving cells.
resized_numpy = resized.numpy()

In [None]:
# Display the resized depth map using the 'Spectral' colormap.
plt.imshow(resized_numpy, cmap='Spectral')

In [None]:
# Round-trip check of the storage format: write the resized depth map to a
# .npy file in the current working directory (preserves exact float values)...
np.save('depth_map.npy', resized_numpy)

# ...and read it straight back so the next cell can plot the stored version.
loaded = np.load('depth_map.npy')

In [None]:
# Display the depth map that was read back from 'depth_map.npy'.
plt.imshow(loaded, cmap='Spectral')

# 5. Dataset Generation

## 5.1 Depth maps generation

### Train split

In [None]:
# Train split: folder with the input images, root folder for the generated
# data, and the sub-folders (relative to OUTPUT_DIR) that receive the depth
# and confidence maps.
INPUT_DIR = '../CamVid/train/'
OUTPUT_DIR = '../CamVid/train_labels/'
DEPTH_MAP_DIR = 'train_depths/'
CONFS_DIR = 'train_confs/'
# Keep only the PNG frames, in a deterministic order. A bare os.listdir()
# returns every entry in arbitrary order, including stray ones such as
# '.ipynb_checkpoints' that cannot be fed to the model.
IMAGES = sorted(f for f in os.listdir(INPUT_DIR) if f.endswith('.png'))

In [None]:
# np.save does not create missing folders, so make sure both targets exist.
os.makedirs(OUTPUT_DIR + DEPTH_MAP_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR + CONFS_DIR, exist_ok=True)

for img in tqdm(IMAGES):
    # One image per call; the model returns a depth map and a confidence map.
    pred = model.inference(
        [INPUT_DIR + img],
        process_res=960
    )

    # Swap only the file extension: str.replace('png', 'npy') would also
    # rewrite a 'png' that happens to appear elsewhere in the file name.
    npy_name = os.path.splitext(img)[0] + '.npy'

    # Depth and confidence go through exactly the same resize-and-save steps.
    for prediction, sub_dir in ((pred.depth[0], DEPTH_MAP_DIR), (pred.conf[0], CONFS_DIR)):
        # Convert numpy array to tensor and add batch and channel
        # dimensions: (H, W) -> (1, 1, H, W)
        tensor = torch.from_numpy(prediction).unsqueeze(0).unsqueeze(0)

        # Resize to 720x960
        resized = TF.resize(tensor, size=[720, 960], antialias=True)

        # Remove batch and channel dimensions, (1, 1, H, W) -> (H, W), and
        # store the result as a .npy file (keeps the float values as they are).
        np.save(OUTPUT_DIR + sub_dir + npy_name, resized.squeeze(0).squeeze(0).numpy())

### Val split

In [None]:
# Validation split: folder with the input images, root folder for the
# generated data, and the sub-folders (relative to OUTPUT_DIR) that receive
# the depth and confidence maps.
INPUT_DIR = '../CamVid/val/'
OUTPUT_DIR = '../CamVid/val_labels/'
DEPTH_MAP_DIR = 'val_depths/'
CONFS_DIR = 'val_confs/'
# Keep only the PNG frames, in a deterministic order. A bare os.listdir()
# returns every entry in arbitrary order, including stray ones such as
# '.ipynb_checkpoints' that cannot be fed to the model.
IMAGES = sorted(f for f in os.listdir(INPUT_DIR) if f.endswith('.png'))

In [None]:
# np.save does not create missing folders, so make sure both targets exist.
os.makedirs(OUTPUT_DIR + DEPTH_MAP_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR + CONFS_DIR, exist_ok=True)

for img in tqdm(IMAGES):
    # One image per call; the model returns a depth map and a confidence map.
    pred = model.inference(
        [INPUT_DIR + img],
        process_res=960
    )

    # Swap only the file extension: str.replace('png', 'npy') would also
    # rewrite a 'png' that happens to appear elsewhere in the file name.
    npy_name = os.path.splitext(img)[0] + '.npy'

    # Depth and confidence go through exactly the same resize-and-save steps.
    for prediction, sub_dir in ((pred.depth[0], DEPTH_MAP_DIR), (pred.conf[0], CONFS_DIR)):
        # Convert numpy array to tensor and add batch and channel
        # dimensions: (H, W) -> (1, 1, H, W)
        tensor = torch.from_numpy(prediction).unsqueeze(0).unsqueeze(0)

        # Resize to 720x960
        resized = TF.resize(tensor, size=[720, 960], antialias=True)

        # Remove batch and channel dimensions, (1, 1, H, W) -> (H, W), and
        # store the result as a .npy file (keeps the float values as they are).
        np.save(OUTPUT_DIR + sub_dir + npy_name, resized.squeeze(0).squeeze(0).numpy())

### Test split

In [None]:
# Test split: folder with the input images, root folder for the generated
# data, and the sub-folders (relative to OUTPUT_DIR) that receive the depth
# and confidence maps.
INPUT_DIR = '../CamVid/test/'
OUTPUT_DIR = '../CamVid/test_labels/'
DEPTH_MAP_DIR = 'test_depths/'
CONFS_DIR = 'test_confs/'
# Keep only the PNG frames, in a deterministic order. A bare os.listdir()
# returns every entry in arbitrary order, including stray ones such as
# '.ipynb_checkpoints' that cannot be fed to the model.
IMAGES = sorted(f for f in os.listdir(INPUT_DIR) if f.endswith('.png'))

In [None]:
# np.save does not create missing folders, so make sure both targets exist.
os.makedirs(OUTPUT_DIR + DEPTH_MAP_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR + CONFS_DIR, exist_ok=True)

for img in tqdm(IMAGES):
    # One image per call; the model returns a depth map and a confidence map.
    pred = model.inference(
        [INPUT_DIR + img],
        process_res=960
    )

    # Swap only the file extension: str.replace('png', 'npy') would also
    # rewrite a 'png' that happens to appear elsewhere in the file name.
    npy_name = os.path.splitext(img)[0] + '.npy'

    # Depth and confidence go through exactly the same resize-and-save steps.
    for prediction, sub_dir in ((pred.depth[0], DEPTH_MAP_DIR), (pred.conf[0], CONFS_DIR)):
        # Convert numpy array to tensor and add batch and channel
        # dimensions: (H, W) -> (1, 1, H, W)
        tensor = torch.from_numpy(prediction).unsqueeze(0).unsqueeze(0)

        # Resize to 720x960
        resized = TF.resize(tensor, size=[720, 960], antialias=True)

        # Remove batch and channel dimensions, (1, 1, H, W) -> (H, W), and
        # store the result as a .npy file (keeps the float values as they are).
        np.save(OUTPUT_DIR + sub_dir + npy_name, resized.squeeze(0).squeeze(0).numpy())

## 5.2 Save Segmentation Masks as NumPy Arrays
Using precomputed segmentation masks (class-ID arrays instead of RGB images) speeds up MultiTaskUnet training.

In [None]:
class SegmentationLUT:
    """
    Ultra-fast RGB to class_id conversion using 3D Lookup Table.
    Creates a 256x256x256 LUT that maps any RGB value directly to class_id.

    The class dictionary is a CSV file with one header line followed by
    ``name,r,g,b`` rows; the class ID of a row is its zero-based position
    among the data rows.

    Attributes set by the constructor:
        rgb_to_class_id: dict mapping (r, g, b) tuples to class IDs
        class_id_to_name: dict mapping class IDs to class names
        num_classes: number of distinct RGB colors in the dictionary
        lut: int64 array of shape (256, 256, 256) indexed as lut[r, g, b]
    """
    def __init__(self, class_dict_path, unknown_id=0):
        """
        Build the lookup table from a class dictionary file.

        Args:
            class_dict_path: path to the CSV class dictionary
            unknown_id: class ID stored for every RGB value that is not
                listed in the dictionary. The default (0) keeps the historic
                behavior, where unlisted colors are indistinguishable from
                the first class; pass e.g. -1 to make them detectable.

        Raises:
            ValueError: if a non-blank data row does not have exactly four
                comma-separated fields or its r/g/b fields are not integers.
        """
        # Load class dictionary
        self.rgb_to_class_id = {}
        self.class_id_to_name = {}

        with open(class_dict_path) as f:
            next(f, None)  # Skip header (the default avoids StopIteration on an empty file)
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines (typically a trailing newline at the end of
                    # the file) carry no class and must not consume an ID.
                    continue
                name, r, g, b = line.split(",")
                idx = len(self.class_id_to_name)
                self.rgb_to_class_id[(int(r), int(g), int(b))] = idx
                self.class_id_to_name[idx] = name

        self.num_classes = len(self.rgb_to_class_id)

        # Create 3D LUT: 256x256x256 -> class_id
        # Every entry starts as `unknown_id`; only the listed colors are overwritten below.
        print("Creating 3D Lookup Table for segmentation masks...")
        self.lut = np.zeros((256, 256, 256), dtype=np.int64)
        if unknown_id != 0:
            self.lut.fill(unknown_id)

        # Fill LUT with class mappings
        for (r, g, b), class_id in self.rgb_to_class_id.items():
            self.lut[r, g, b] = class_id

        print(f"LUT created. {len(self.rgb_to_class_id)} classes mapped.")

    def rgb_to_labels(self, seg_rgb):
        """
        Convert RGB segmentation mask to class labels using LUT.

        Args:
            seg_rgb: numpy array of shape (H, W, 3) with uint8 RGB values

        Returns:
            labels: numpy array of shape (H, W) with int64 class IDs
        """
        # Direct indexing into LUT - one lookup per pixel, no Python-level loop
        return self.lut[seg_rgb[:, :, 0], seg_rgb[:, :, 1], seg_rgb[:, :, 2]]

In [None]:
def preprocess_segmentation_masks(seg_dir, output_dir, class_dict_path):
    """
    Pre-convert all RGB segmentation masks to class ID numpy arrays.
    Run this ONCE before training.

    Args:
        seg_dir: folder containing the RGB segmentation masks; only files
            whose name ends in '.png' are processed
        output_dir: folder that receives one '.npy' file per mask (created
            if it does not exist)
        class_dict_path: path to the class dictionary passed to SegmentationLUT

    Returns:
        None. The class-ID arrays are written to ``output_dir`` as int16.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Create LUT
    lut = SegmentationLUT(class_dict_path)

    seg_files = sorted(f for f in os.listdir(seg_dir) if f.endswith('.png'))

    print(f"Pre-processing {len(seg_files)} segmentation masks...")
    for filename in tqdm(seg_files):
        # Load RGB mask; the context manager closes the file handle right
        # away instead of leaving it to the garbage collector.
        seg_path = os.path.join(seg_dir, filename)
        with Image.open(seg_path) as mask_image:
            seg_rgb = np.array(mask_image.convert('RGB'), dtype=np.uint8)

        # Convert to class IDs
        seg_labels = lut.rgb_to_labels(seg_rgb)

        # Save as numpy array. Only the extension is swapped: str.replace
        # would also rewrite a '.png' occurring in the middle of the name.
        output_path = os.path.join(output_dir, os.path.splitext(filename)[0] + '.npy')
        np.save(output_path, seg_labels.astype(np.int16))  # int16 saves space

    print(f"Saved preprocessed masks to {output_dir}")

In [None]:
# Class dictionary shared by all three splits.
CLASS_DICT_PATH = '../CamVid/class_dict.csv'

# Folders holding the RGB segmentation masks of each split.
TRAIN_SEG_DIR = '../CamVid/train_labels/train_seg/'
VAL_SEG_DIR = '../CamVid/val_labels/val_seg/'
TEST_SEG_DIR = '../CamVid/test_labels/test_seg/'

# Convert every split in turn (train, val, test), pairing each RGB mask folder
# with the folder that receives the class-ID arrays.
for seg_dir, npy_dir in (
    (TRAIN_SEG_DIR, '../CamVid/train_labels/train_seg_npy/'),
    (VAL_SEG_DIR, '../CamVid/val_labels/val_seg_npy/'),
    (TEST_SEG_DIR, '../CamVid/test_labels/test_seg_npy/'),
):
    preprocess_segmentation_masks(seg_dir, npy_dir, CLASS_DICT_PATH)