In [1]:
import os
from PIL import Image
import numpy as np
from torch.utils.data import Dataset
import random

In [2]:
class DatasetImages():

    def __init__(self, dataset_path: str):
        self.dataset_path = dataset_path
        self.images_path = os.path.join(dataset_path, "images")
        self.depth_npy_path = os.path.join(dataset_path, "depth")
        self.images_dict = {}
        self.depth_npy_dict = {}

        self.load_images()
        self.load_depth_npy()

    def load_images(self):
        image_filenames = os.listdir(self.images_path)
        for image_name in image_filenames:
            image_file = os.path.join(self.images_path, image_name)
            self.images_dict[image_name] = Image.open(image_file)

    def load_depth_npy(self):
        depth_npy_filenames = os.listdir(self.depth_npy_path)
        for depth_npy_name in depth_npy_filenames:
            depth_npy_file = os.path.join(self.depth_npy_path, depth_npy_name)
            self.depth_npy_dict[depth_npy_name] = np.load(depth_npy_file)

In [3]:
dataset = DatasetImages("DATASET_DEVOIR")

In [4]:
dataset.images_dict

{'21-12-03-18-50-31_Zivid_acquisition_color.png': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1944x1200>,
 '21-12-03-18-50-34_Zivid_acquisition_color.png': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1944x1200>,
 '21-12-03-18-50-37_Zivid_acquisition_color.png': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1944x1200>,
 '21-12-03-18-50-39_Zivid_acquisition_color.png': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1944x1200>,
 '21-12-03-18-50-55_Zivid_acquisition_color.png': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1944x1200>,
 '21-12-03-18-52-27_Zivid_acquisition_color.png': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1944x1200>,
 '21-12-03-18-52-30_Zivid_acquisition_color.png': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1944x1200>,
 '21-12-03-18-52-32_Zivid_acquisition_color.png': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1944x1200>,
 '21-12-03-18-52-35_Zivid_acquisition_color.png': <PIL.PngImagePlugin.PngImageFi

In [5]:
dataset.depth_npy_dict

{'21-12-03-18-50-31_Zivid_acquisition_rawDepth.npy': array([[[nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan],
         ...,
         [nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan]],
 
        [[nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan],
         ...,
         [nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan]],
 
        [[nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan],
         ...,
         [nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan]],
 
        ...,
 
        [[nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan],
         ...,
         [nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan]],
 
        [[nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan],
         ...,
         [nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan]],
 
        [[nan, nan, nan],
         [nan, nan, nan],
         [nan, n

In [6]:
from peft import LoraConfig, TaskType

lora_config = LoraConfig(
    r=16,
    task_type=TaskType.FEATURE_EXTRACTION,
    target_modules=["query", "value"],
    lora_alpha=32,
    lora_dropout=0.05
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from transformers import AutoImageProcessor, AutoModelForDepthEstimation
import torch
import numpy as np

image_processor = AutoImageProcessor.from_pretrained("depth-anything/Depth-Anything-V2-Small-hf")
model = AutoModelForDepthEstimation.from_pretrained("depth-anything/Depth-Anything-V2-Small-hf")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [8]:
from peft import get_peft_model

lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()

trainable params: 294,912 || all params: 25,080,001 || trainable%: 1.1759


In [9]:
class DepthDataset(Dataset):
    def __init__(self, pairs, images_path, depth_path, image_processor):
        self.pairs = pairs
        self.images_path = images_path
        self.depth_path = depth_path
        self.image_processor = image_processor

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        img_name, depth_name = self.pairs[idx]
        image = Image.open(os.path.join(self.images_path, img_name))
        depth = np.load(os.path.join(self.depth_path, depth_name))
        inputs = self.image_processor(images=image, return_tensors="pt")
        labels = torch.from_numpy(depth).float()
        return {
            'pixel_values': inputs['pixel_values'].squeeze(0),
            'labels': labels
        }

In [10]:
image_files = sorted(os.listdir(dataset.images_path))
depth_files = sorted(os.listdir(dataset.depth_npy_path))
all_pairs = list(zip(image_files, depth_files))
random.shuffle(all_pairs)
split_idx = int(0.8 * len(all_pairs))
train_pairs = all_pairs[:split_idx]
eval_pairs = all_pairs[split_idx:]
train_dataset = DepthDataset(train_pairs, dataset.images_path, dataset.depth_npy_path, image_processor)
eval_dataset = DepthDataset(eval_pairs, dataset.images_path, dataset.depth_npy_path, image_processor)

In [11]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    remove_unused_columns=False,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-3,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    fp16=True,
    num_train_epochs=5,
    load_best_model_at_end=True,
)

In [12]:
def collate_fn(batch):
    pixel_values = torch.stack([item['pixel_values'] for item in batch])
    labels = torch.stack([item['labels'] for item in batch])
    return {'pixel_values': pixel_values, 'labels': labels}

In [13]:
trainer = Trainer(
    model=lora_model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=image_processor,
    data_collator=collate_fn,
)

  return t.to(


In [15]:
trainer.train()

TypeError: DepthAnythingForDepthEstimation.forward() got an unexpected keyword argument 'input_ids'