In [1]:
import os
import random
import numpy as np
from PIL import Image
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from transformers import (
    AutoImageProcessor, 
    AutoModelForDepthEstimation, 
    TrainingArguments, 
    Trainer
)
from peft import LoraConfig, get_peft_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class DepthDataset(Dataset):
    def __init__(self, pairs_list, images_path, depth_npy_path, image_processor):
        self.pairs_list = pairs_list  # Liste des paires (image_filename, depth_filename)
        self.images_path = images_path
        self.depth_npy_path = depth_npy_path
        self.image_processor = image_processor

    def __len__(self):
        return len(self.pairs_list)

    def __getitem__(self, idx):
        image_name, depth_npy_name = self.pairs_list[idx]
        
        # Chargement à la demande
        image_file = os.path.join(self.images_path, image_name)
        depth_npy_file = os.path.join(self.depth_npy_path, depth_npy_name)
        
        image = Image.open(image_file).convert("RGB")
        depth_numpy = np.load(depth_npy_file)
        
        # CORRECTION ICI : Gestion des canaux de profondeur
        # Si la depth a 3 canaux (H, W, 3), on ne garde que le premier (H, W)
        if len(depth_numpy.shape) == 3:
            depth_numpy = depth_numpy[:, :, 0]
        
        # Prétraitement de l'image
        inputs = self.image_processor(images=image, return_tensors="pt")
        
        # Conversion en Tensor
        depth_tensor = torch.from_numpy(depth_numpy).float()
        
        # On a maintenant une forme (H, W). 
        # On ajoute (Batch, Channel) pour obtenir (1, 1, H, W) requis par interpolate
        depth_tensor = depth_tensor.unsqueeze(0).unsqueeze(0)
        
        # Récupération de la taille cible
        target_size = inputs['pixel_values'].shape[-2:]
        
        # Interpolation
        depth_resized = F.interpolate(depth_tensor, size=target_size, mode='nearest')
        
        # On retire les dimensions pour revenir à (H, W) pour les labels
        depth_resized = depth_resized.squeeze()

        return {
            'pixel_values': inputs['pixel_values'].squeeze(0),
            'labels': depth_resized
        }

In [3]:
# 2. Chargement du Modèle et Processor
model_id = "depth-anything/Depth-Anything-V2-Small-hf"
image_processor = AutoImageProcessor.from_pretrained(model_id)
model = AutoModelForDepthEstimation.from_pretrained(model_id)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [4]:
# 4. Préparation des données (comme dans ton code original)
# Assure-toi que 'dataset' (ta classe DatasetImages) est bien instancié avant
dataset_path = "DATASET_DEVOIR"
images_path = os.path.join(dataset_path, "images")
depth_npy_path = os.path.join(dataset_path, "depth")
image_files = sorted(os.listdir(images_path))
depth_files = sorted(os.listdir(depth_npy_path))
# Filtrer pour s'assurer que les fichiers correspondent bien si nécessaire
all_pairs = list(zip(image_files, depth_files))
random.shuffle(all_pairs)

split_idx = int(0.8 * len(all_pairs))
train_pairs = all_pairs[:split_idx]
eval_pairs = all_pairs[split_idx:]

train_dataset = DepthDataset(train_pairs, images_path, depth_npy_path, image_processor)
eval_dataset = DepthDataset(eval_pairs, images_path, depth_npy_path, image_processor)

In [5]:
# Afficher les noms de toutes les couches (modules) du modèle
print("Noms de toutes les couches du modèle :")
for name, module in model.named_modules():
    if name:  # Éviter la racine vide
        print(name)

Noms de toutes les couches du modèle :
backbone
backbone.embeddings
backbone.embeddings.patch_embeddings
backbone.embeddings.patch_embeddings.projection
backbone.embeddings.dropout
backbone.encoder
backbone.encoder.layer
backbone.encoder.layer.0
backbone.encoder.layer.0.norm1
backbone.encoder.layer.0.attention
backbone.encoder.layer.0.attention.attention
backbone.encoder.layer.0.attention.attention.query
backbone.encoder.layer.0.attention.attention.key
backbone.encoder.layer.0.attention.attention.value
backbone.encoder.layer.0.attention.output
backbone.encoder.layer.0.attention.output.dense
backbone.encoder.layer.0.attention.output.dropout
backbone.encoder.layer.0.layer_scale1
backbone.encoder.layer.0.drop_path
backbone.encoder.layer.0.norm2
backbone.encoder.layer.0.mlp
backbone.encoder.layer.0.mlp.fc1
backbone.encoder.layer.0.mlp.activation
backbone.encoder.layer.0.mlp.fc2
backbone.encoder.layer.0.layer_scale2
backbone.encoder.layer.1
backbone.encoder.layer.1.norm1
backbone.encoder.la

In [6]:
# 3. Configuration LoRA Correcte pour la Vision
# On cible tous les modules linéaires du Transformer pour un meilleur apprentissage
# On retire 'task_type' pour éviter l'erreur "input_ids"
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "key", "value", "dense", "fc1", "fc2"], 
    lora_dropout=0.05,
    bias="none",
)

In [7]:
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()

trainable params: 1,327,104 || all params: 26,112,193 || trainable%: 5.0823


In [8]:
# 5. Trainer Personnalisé pour gérer la Loss et les NaNs
class DepthTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        predicted_depth = outputs.predicted_depth
        
        # L'output du modèle peut être légèrement différent de la taille d'entrée (padding)
        # On s'assure que la prédiction matche les labels
        if predicted_depth.shape[-2:] != labels.shape[-2:]:
            predicted_depth = F.interpolate(
                predicted_depth.unsqueeze(1), 
                size=labels.shape[-2:], 
                mode='bilinear', 
                align_corners=False
            ).squeeze(1)

        # Masquage des valeurs invalides (NaNs ou inf)
        # On suppose que la profondeur valide est > 0 et n'est pas NaN
        valid_mask = ~torch.isnan(labels) & ~torch.isinf(labels) & (labels > 0)
        
        if valid_mask.sum() == 0:
            return torch.tensor(0.0, device=predicted_depth.device, requires_grad=True)

        # Calcul de la Loss (L1 Loss est souvent mieux pour la profondeur que MSE)
        loss = F.l1_loss(predicted_depth[valid_mask], labels[valid_mask])
        
        return (loss, outputs) if return_outputs else loss

In [9]:
# 6. Arguments d'entraînement
args = TrainingArguments(
    output_dir="output_depth_lora",
    remove_unused_columns=False, # Important pour garder 'labels'
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4, # Un peu plus bas pour LoRA
    per_device_train_batch_size=4, # Ajuste selon ta VRAM (128 est énorme pour des images)
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4, # Simule un batch plus grand
    fp16=True,
    num_train_epochs=50,
    logging_steps=10,
    label_names=["labels"], # Indique au Trainer de ne pas supprimer cette colonne
)

In [10]:
# Fonction de collation simple
def collate_fn(batch):
    pixel_values = torch.stack([item['pixel_values'] for item in batch])
    labels = torch.stack([item['labels'] for item in batch])
    return {'pixel_values': pixel_values, 'labels': labels}

In [None]:
# 7. Lancement
trainer = DepthTrainer(
    model=lora_model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
)

trainer.train()

  return t.to(


Epoch,Training Loss,Validation Loss
1,No log,329.557007
2,No log,325.189728
