# Acknowledgment

This implementation is fully based on the following code:

- code: https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DINOv2/Train_a_linear_classifier_on_top_of_DINOv2_for_semantic_segmentation.ipynb
- Author: NielsRogge

# Description
- Task: depth estimation
- Dataset: NYUd
- Model: DINOv2-base
- Evaluation: Linear-1

# NYUd dataset download

In [None]:
!pip3 install kaggle
# NOTE: the Kaggle API must be set up before the download cell below can run
# (presumably this means configuring the Kaggle API token -- TODO confirm).

In [3]:
# Download the NYU Depth v2 archive from Kaggle; the recorded output shows it
# is saved as nyu-depth-v2.zip (about 4.10 GB).
!kaggle datasets download soumikrakshit/nyu-depth-v2

Dataset URL: https://www.kaggle.com/datasets/soumikrakshit/nyu-depth-v2
License(s): unknown
Downloading nyu-depth-v2.zip to /root
100%|█████████████████████████████████████▉| 4.10G/4.10G [05:48<00:00, 13.2MB/s]
100%|██████████████████████████████████████| 4.10G/4.10G [05:48<00:00, 12.6MB/s]


In [None]:
!unzip -o filename.zip

# Transform the original dataset into a Hugging Face DatasetDict

In [29]:
import os
from datasets import Dataset, DatasetDict
from PIL import Image
import pandas as pd

In [30]:
def load_sample(example):
    """Open the colour and depth images referenced by one dataset row.

    Args:
        example: mapping with 'color_image' and 'depth_image' entries holding
            the paths of the two image files.

    Returns:
        dict with the same two keys, each mapped to the object returned by
        `Image.open` for the corresponding path.

    Raises:
        KeyError: if either key is missing (checked before any file is opened).
    """
    # Resolve both paths first so a missing key fails before touching the disk.
    paths = {key: example[key] for key in ('color_image', 'depth_image')}
    return {key: Image.open(path) for key, path in paths.items()}

In [31]:
base_dir_data = './nyu_data/data'

# The index CSVs are read without a header row (header=None), so the two
# columns are named explicitly: colour image path, depth image path.
train_csv = (
    pd.read_csv(os.path.join(base_dir_data, 'nyu2_train.csv'), header=None)
    .set_axis(['color_image', 'depth_image'], axis=1)
)

test_csv = (
    pd.read_csv(os.path.join(base_dir_data, 'nyu2_test.csv'), header=None)
    .set_axis(['color_image', 'depth_image'], axis=1)
)

In [32]:
base_dir = './nyu_data/'


def _with_base_dir(path):
    """Return `path` prefixed with `base_dir`, unless it already has that prefix."""
    # The guard keeps this cell idempotent: the frames are updated in place, so
    # without it every re-run would prepend `base_dir` once more and break the paths.
    return path if path.startswith(base_dir) else os.path.join(base_dir, path)


# Rewrite the CSV paths in both index frames so they include `base_dir`.
for _frame in (train_csv, test_csv):
    for _column in ('color_image', 'depth_image'):
        _frame[_column] = _frame[_column].apply(_with_base_dir)

In [33]:
# Wrap each pandas index frame in a Hugging Face `Dataset` (train first, then test).
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_csv, test_csv)
)

In [34]:
train_dataset

Dataset({
    features: ['color_image', 'depth_image'],
    num_rows: 50688
})

In [35]:
# Run load_sample over every row of both splits. Per the recorded
# `train_dataset[0]` output, each image column then holds a
# {'bytes': None, 'path': ...} record rather than raw pixels.
train_dataset = train_dataset.map(load_sample)
test_dataset = test_dataset.map(load_sample)

Map:   0%|          | 0/50688 [00:00<?, ? examples/s]

Map:   0%|          | 0/654 [00:00<?, ? examples/s]

In [37]:
train_dataset[0]

{'color_image': {'bytes': None,
  'path': '/root/nyu_data/data/nyu2_train/living_room_0038_out/37.jpg'},
 'depth_image': {'bytes': None,
  'path': '/root/nyu_data/data/nyu2_train/living_room_0038_out/37.png'}}

In [38]:
# Use only part of the original train set: 22,000 shuffled samples, split
# into 20,000 for training and 2,000 for validation.
subset_train = train_dataset.shuffle(seed=42).select(range(22000))
# Seed the split as well as the shuffle; otherwise the train/val membership
# changes on every run and results cannot be reproduced.
split_dataset = subset_train.train_test_split(test_size=2000, seed=42)

trains = split_dataset['train']
vals = split_dataset['test']

dataset = DatasetDict({
    'train': trains,
    'val': vals,
})

dataset

DatasetDict({
    train: Dataset({
        features: ['color_image', 'depth_image'],
        num_rows: 20000
    })
    val: Dataset({
        features: ['color_image', 'depth_image'],
        num_rows: 2000
    })
})

# Installations and Imports

In [10]:
# Set to True to install the third-party packages this notebook imports;
# left False so the installs are skipped on normal runs.
Install = False
if Install:
    !pip3 install evaluate
    !pip3 install transformers
    !pip3 install datasets
    !pip3 install albumentations

In [11]:
import os
# Point the HF_ENDPOINT environment variable at the hf-mirror.com mirror.
# NOTE(review): `datasets` is already imported in an earlier cell; confirm the
# setting is still honoured when applied this late.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

In [39]:
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import evaluate
import numpy as np
from PIL import Image
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from transformers import Dinov2Model, Dinov2PreTrainedModel
from transformers.modeling_outputs import SemanticSegmenterOutput
from datasets import load_dataset
import albumentations as A
from datasets import load_from_disk
from datasets import Dataset, DatasetDict

# Functions and Classes

In [40]:
def generate_id2label(num_labels):
    """Build a placeholder id -> label-name mapping.

    Args:
        num_labels: number of class ids to generate (ids run 0..num_labels-1).

    Returns:
        dict mapping each integer id `i` to the string "label{i}".
    """
    return {class_id: f"label{class_id}" for class_id in range(num_labels)}

def get_transform():
    """Create the albumentations pipelines used for training and validation.

    Both pipelines resize to 448x448 and normalise with the same per-channel
    mean/std (constants given on a 0-255 scale and divided by 255). The
    training pipeline additionally applies a horizontal flip with p=0.5.

    Returns:
        (train_transform, val_transform) tuple of `A.Compose` objects.
    """
    channel_mean = (np.array([123.675, 116.280, 103.530]) / 255).tolist()
    channel_std = (np.array([58.395, 57.120, 57.375]) / 255).tolist()

    def make_resize():
        return A.Resize(width=448, height=448)

    def make_normalize():
        # Hand each Normalize its own list copies, as separate `.tolist()` calls would.
        return A.Normalize(mean=list(channel_mean), std=list(channel_std))

    train_transform = A.Compose([
        make_resize(),
        A.HorizontalFlip(p=0.5),
        make_normalize(),
    ])
    val_transform = A.Compose([
        make_resize(),
        make_normalize(),
    ])

    return train_transform, val_transform

def collate_fn(inputs):
    """Collate a list of 4-element samples into one batch dictionary.

    Args:
        inputs: sequence of samples indexable as
            (pixel tensor, label tensor, original image, original depth map).

    Returns:
        dict with stacked "pixel_values" and "labels" tensors (new leading
        batch dimension) and the untouched originals as Python lists under
        "original_images" and "original_segmentation_maps".
    """
    def column(position):
        # Gather the `position`-th field of every sample, in batch order.
        return [sample[position] for sample in inputs]

    return {
        "pixel_values": torch.stack(column(0), dim=0),
        "labels": torch.stack(column(1), dim=0),
        "original_images": column(2),
        "original_segmentation_maps": column(3),
    }

def train_model(model, train_dataloader, metric, id2label, learning_rate=1e-5, epochs=3, device=None):
    """Train the model with its `dinov2.*` parameters frozen and log metrics.

    Args:
        model: module whose forward accepts `(pixel_values, labels=...)` and
            returns an object exposing `.loss` and `.logits`.
        train_dataloader: iterable of batches with "pixel_values" and "labels".
        metric: object providing `add_batch(predictions=, references=)` and
            `compute(num_labels=, ignore_index=, reduce_labels=)`.
        id2label: mapping whose length is passed to the metric as `num_labels`.
        learning_rate: AdamW learning rate.
        epochs: number of passes over `train_dataloader`.
        device: target device; "cuda" if available, otherwise "cpu", when None.

    Returns:
        list of dicts (epoch, batch_idx, mean_iou, mean_accuracy), one entry
        for every 100th batch of every epoch (starting at batch 0).
    """
    # Freeze every parameter whose name starts with "dinov2".
    for param_name, weight in model.named_parameters():
        if param_name.startswith("dinov2"):
            weight.requires_grad = False

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    model.to(device)
    model.train()

    history = []
    for epoch in range(epochs):
        print("Epoch:", epoch)
        for step, batch in enumerate(tqdm(train_dataloader)):
            inputs = batch["pixel_values"].to(device)
            targets = batch["labels"].to(device)

            outputs = model(inputs, labels=targets)
            loss = outputs.loss

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Accumulate per-pixel class predictions for the running metric.
            with torch.no_grad():
                class_map = outputs.logits.argmax(dim=1)
                metric.add_batch(
                    predictions=class_map.detach().cpu().numpy(),
                    references=targets.detach().cpu().numpy(),
                )

            # Only report on every 100th batch.
            if step % 100 != 0:
                continue

            scores = metric.compute(
                num_labels=len(id2label),
                ignore_index=0,
                reduce_labels=False,
            )
            history.append({
                'epoch': epoch,
                'batch_idx': step,
                'mean_iou': scores['mean_iou'],
                'mean_accuracy': scores['mean_accuracy'],
            })

            print("Loss:", loss.item())
            print("Mean IOU:", scores["mean_iou"])
            print("Mean Accuracy:", scores["mean_accuracy"])
            print("----------------------------------")

    return history

def val_eval(model, val_dataloader, metric, id2label, device=None):
    """Evaluate the model on a validation loader and print the final metrics.

    Args:
        model: module whose forward accepts `(pixel_values, labels=...)` and
            returns an object exposing `.logits`.
        val_dataloader: iterable of batches with "pixel_values" and "labels".
        metric: object providing `add_batch` and `compute` (same usage as in
            training).
        id2label: mapping whose length is passed to the metric as `num_labels`.
        device: target device; "cuda" if available, otherwise "cpu", when None.

    Returns:
        None. The mean IoU and mean accuracy are printed.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    model.to(device)
    model.eval()

    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            inputs = batch["pixel_values"].to(device)
            targets = batch["labels"].to(device)

            # Per-pixel class prediction: argmax over the class dimension.
            class_map = model(inputs, labels=targets).logits.argmax(dim=1)

            metric.add_batch(
                predictions=class_map.detach().cpu().numpy(),
                references=targets.detach().cpu().numpy(),
            )

    final_metrics = metric.compute(
        num_labels=len(id2label),
        ignore_index=0,
        reduce_labels=False,
    )

    print(f"Final Mean IOU: {final_metrics['mean_iou']}")
    print(f"Final Mean Accuracy: {final_metrics['mean_accuracy']}")

In [41]:
class DepthDataset(torch.utils.data.Dataset):
  """Map-style PyTorch dataset yielding (image, depth target, originals).

  The base class is spelled out as `torch.utils.data.Dataset` on purpose: the
  notebook's import cell runs `from datasets import Dataset, DatasetDict`
  after `from torch.utils.data import Dataset`, so the bare name `Dataset`
  refers to the Hugging Face class by the time this cell executes.

  Args:
    dataset: indexable collection whose items expose
      item['color_image']['path'] and item['depth_image']['path'].
    transform: callable invoked as transform(image=..., mask=...) that returns
      a mapping with 'image' and 'mask' entries.
  """

  def __init__(self, dataset, transform):
    self.dataset = dataset
    self.transform = transform

  def __len__(self):
    """Return the number of samples in the wrapped dataset."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Load, transform and return one sample.

    Returns:
      (image, target, original_image, original_depth_map), where `image` is a
      channel-first tensor, `target` is a LongTensor built from the transformed
      depth map, and the two originals are the untransformed numpy arrays.
    """
    item = self.dataset[idx]
    # Read the pixels inside a context manager so the file handles are closed
    # immediately instead of lingering until garbage collection.
    with Image.open(item['color_image']['path']) as color_file:
      original_image = np.array(color_file)
    with Image.open(item['depth_image']['path']) as depth_file:
      original_depth_map = np.array(depth_file)

    transformed = self.transform(image=original_image, mask=original_depth_map)
    image, target = torch.tensor(transformed['image']), torch.LongTensor(transformed['mask'])

    # A 2-D image has no channel axis; replicate it to three channels.
    if image.dim() == 2:
        image = image.unsqueeze(2)
        image = image.expand(-1, -1, 3)

    # Move the channel axis first: (H, W, C) -> (C, H, W).
    image = image.permute(2, 0, 1)

    return image, target, original_image, original_depth_map

In [42]:
class LinearClassifier(torch.nn.Module):
    """Per-token linear head implemented as a 1x1 convolution.

    Args:
        in_channels: feature size of each token embedding.
        tokenW: number of tokens along the width of the token grid.
        tokenH: number of tokens along the height of the token grid.
        num_labels: number of output channels (classes) per token.
    """

    def __init__(self, in_channels, tokenW=32, tokenH=32, num_labels=256):
        super().__init__()

        self.in_channels = in_channels
        self.width = tokenW
        self.height = tokenH
        # A 1x1 convolution applies the same linear map to every grid position.
        self.classifier = torch.nn.Conv2d(in_channels, num_labels, kernel_size=1)

    def forward(self, embeddings):
        """Map flat token embeddings to a (batch, num_labels, height, width) grid."""
        # Lay the tokens out on the (height, width) grid, then move channels first.
        token_grid = embeddings.reshape(-1, self.height, self.width, self.in_channels)
        channels_first = token_grid.permute(0, 3, 1, 2)
        return self.classifier(channels_first)

class Dinov2ForDepthEstimation(Dinov2PreTrainedModel):
  """DINOv2 backbone with a linear per-pixel head over 256 depth classes.

  Each patch token is concatenated with the CLS token (hence the
  `hidden_size * 2` input width of the head), classified by a 1x1
  convolution, and the class map is upsampled to the input resolution.

  NOTE(review): the head is built for a fixed 32x32 token grid, so the
  backbone must emit exactly 1024 patch tokens per image -- confirm this
  matches the input size and patch size in use.
  """

  def __init__(self, config):
    super().__init__(config)

    self.dinov2 = Dinov2Model(config)
    self.classifier = LinearClassifier(config.hidden_size * 2, 32, 32, 256)

  def forward(self, pixel_values, output_hidden_states=False, output_attentions=False, labels=None):
    """Run the backbone and head; compute the loss when labels are given.

    Args:
      pixel_values: image batch; its last two dimensions define the size the
        logits are upsampled to.
      output_hidden_states: forwarded to the backbone.
      output_attentions: forwarded to the backbone.
      labels: optional integer class map with the same spatial size as
        `pixel_values`; class 0 is ignored by the loss.

    Returns:
      SemanticSegmenterOutput with `loss` (None when no labels were passed),
      `logits` of shape (batch, 256, H, W), `hidden_states` and `attentions`.
    """
    outputs = self.dinov2(pixel_values,
                            output_hidden_states=output_hidden_states,
                            output_attentions=output_attentions)

    # Token 0 is the CLS token; the rest are patch tokens. Broadcast the CLS
    # token to every patch and concatenate along the feature dimension.
    patch_embeddings = outputs.last_hidden_state[:, 1:, :]
    cls_token = outputs.last_hidden_state[:, 0, :].unsqueeze(1).repeat(1, patch_embeddings.size(1), 1)
    patch_embeddings = torch.cat((patch_embeddings, cls_token), dim=2)

    logits = self.classifier(patch_embeddings)
    # Upsample to the input resolution so the logits line up pixel-for-pixel
    # with the labels used by the loss and by the metric code. The previous
    # version upsampled to 4x the input and resized the labels through an
    # undefined name (`depth_labels`), which raised NameError on every call.
    logits = torch.nn.functional.interpolate(logits, size=pixel_values.shape[2:], mode="bilinear", align_corners=False)

    loss = None
    if labels is not None:
      loss_fct = torch.nn.CrossEntropyLoss(ignore_index=0)
      # No squeeze(): it would drop the batch dimension for a batch of one and
      # make the (N, C, H, W) / (N, H, W) shapes inconsistent.
      loss = loss_fct(logits, labels)

    return SemanticSegmenterOutput(
        loss=loss,
        logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

# Hyperparameters

In [43]:
# 256 classes, the same number the classifier head in
# Dinov2ForDepthEstimation is hard-coded to output.
num_labels = 256
id2label = generate_id2label(num_labels)

# Optimisation settings passed to the DataLoaders and to train_model below.
batch_size = 16
learning_rate = 1e-5
epochs = 5

# Dataset Processing

In [45]:
train_transform, val_transform = get_transform()

# Wrap the Hugging Face splits so each item is loaded and transformed on access.
trainset = DepthDataset(dataset["train"], transform=train_transform)
valset = DepthDataset(dataset["val"], transform=val_transform)

# Only the training loader shuffles; both use collate_fn to build batch dicts.
train_dataloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(valset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
# Smoke test: pull one batch to check that loading and collation work.
# Note that this displays the entire batch dictionary as the cell output.
next(iter(train_dataloader))

# Train and Evaluation

### Base Model

In [None]:
# Build the depth model from the `facebook/dinov2-base` checkpoint, passing
# the 256-entry id2label mapping defined in the hyperparameter cell.
model = Dinov2ForDepthEstimation.from_pretrained("facebook/dinov2-base", id2label=id2label, num_labels=len(id2label))

In [26]:
# Two separate metric objects: `metric` is passed to train_model and
# `metric_val` to val_eval, so their accumulated batches never mix.
metric = evaluate.load("mean_iou")
metric_val = evaluate.load("mean_iou")

In [None]:
# Train the model; `results` receives the metric history that train_model
# records every 100 batches. device=None lets train_model pick cuda or cpu.
results = train_model(model, train_dataloader, metric, id2label, learning_rate=learning_rate, epochs=epochs, device=None)

In [None]:
# Evaluate on the validation split; val_eval prints the final mean IoU and
# mean accuracy and returns nothing.
val_eval(model, val_dataloader, metric_val, id2label, device=None)

In [17]:
# Persist the training metric history: one record per line, written as its
# string representation.
with open("./depth_nyud_base_linear1.txt", "w") as file:
    file.writelines(f"{item}\n" for item in results)

### Small Model

In [None]:
# model = Dinov2ForDepthEstimation.from_pretrained("facebook/dinov2-small", id2label=id2label, num_labels=len(id2label))

In [14]:
# metric = evaluate.load("mean_iou")
# metric_val = evaluate.load("mean_iou")

In [None]:
# results = train_model(model, train_dataloader, metric, id2label, learning_rate=learning_rate, epochs=epochs, device=None)

In [None]:
# val_eval(model, val_dataloader, metric_val, id2label, device=None)

In [17]:
# with open("./depth_nyud_small_linear1.txt", "w") as file:
#     for item in results:
#        file.write(f"{item}\n")