# Import des libraries

In [1]:
!pip install -q transformers

In [2]:
!pip install pycocotools



# Importation

In [3]:
from torch.utils.data import DataLoader, Subset
import torchvision.transforms as T
from pycocotools.coco import COCO
from torchvision.datasets import CocoDetection
from transformers import AutoImageProcessor, DeformableDetrForObjectDetection
import numpy as np
import random

# Préparation du dataset

In [4]:
!curl -L https://github.com/ultralytics/yolov5/releases/download/v1.0/coco128.zip -o coco128.zip
!unzip -q coco128.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 6819k  100 6819k    0     0  5728k      0  0:00:01  0:00:01 --:--:-- 5728k
replace coco128/LICENSE? [y]es, [n]o, [A]ll, [N]one, [r]ename: A


```
coco128/
├── images/
│   ├── train2017/
│   │   ├── 000000000001.jpg
│   │   ├── 000000000002.jpg
│   │   ├── ...
│   └── val2017/
│       ├── 000000000001.jpg
│       ├── 000000000002.jpg
│       ├── ...
├── annotations/
│   ├── instances_train2017.json
│   ├── instances_val2017.json


In [5]:
import shutil
import os
import random
from pathlib import Path

# Dossiers
images_dir = Path("/content/coco128/images/train2017")
labels_dir = Path("/content/coco128/labels/train2017")

# Créer des dossiers pour val2017
val_images_dir = Path("/content/coco128/images/val2017")
val_labels_dir = Path("/content/coco128/labels/val2017")
os.makedirs(val_images_dir, exist_ok=True)
os.makedirs(val_labels_dir, exist_ok=True)

# Sélectionner 20% des images pour la validation
images = list(images_dir.glob("*.jpg"))
random.shuffle(images)
val_images = images[:int(len(images) * 0.2)]
train_images = images[int(len(images) * 0.2):]

# Déplacer les images et leurs annotations correspondantes
for img_path in val_images:
    shutil.move(img_path, val_images_dir / img_path.name)
    label_path = labels_dir / f"{img_path.stem}.txt"
    if label_path.exists():
        shutil.move(label_path, val_labels_dir / f"{img_path.stem}.txt")

# Vérifier
print(f"Nombre d'images dans train2017 : {len(train_images)}")
print(f"Nombre d'images dans val2017 : {len(val_images)}")


Nombre d'images dans train2017 : 103
Nombre d'images dans val2017 : 25


In [6]:
print(f"Train images: {len(train_images)}")
print(f"Validation images: {len(val_images)}")

# Vérification de l'existence des fichiers
for img in train_images:
    if not img.exists():
        print(f"Image missing: {img}")
for img in val_images:
    if not img.exists():
        print(f"Image missing: {img}")


Train images: 103
Validation images: 25
Image missing: /content/coco128/images/train2017/000000000612.jpg
Image missing: /content/coco128/images/train2017/000000000061.jpg
Image missing: /content/coco128/images/train2017/000000000077.jpg
Image missing: /content/coco128/images/train2017/000000000072.jpg
Image missing: /content/coco128/images/train2017/000000000389.jpg
Image missing: /content/coco128/images/train2017/000000000049.jpg
Image missing: /content/coco128/images/train2017/000000000472.jpg
Image missing: /content/coco128/images/train2017/000000000536.jpg
Image missing: /content/coco128/images/train2017/000000000562.jpg
Image missing: /content/coco128/images/train2017/000000000110.jpg
Image missing: /content/coco128/images/train2017/000000000042.jpg
Image missing: /content/coco128/images/train2017/000000000605.jpg
Image missing: /content/coco128/images/train2017/000000000397.jpg
Image missing: /content/coco128/images/train2017/000000000359.jpg
Image missing: /content/coco128/imag

In [7]:
train_images = list(Path("/content/coco128/images/train2017").glob("*.jpg"))
val_images = list(Path("/content/coco128/images/val2017").glob("*.jpg"))


In [8]:
import os
import json
from PIL import Image
from pathlib import Path


# Dictionnaire COCO de sortie
coco_dict_train = {
    "images": [],
    "annotations": [],
    "categories": []
}
coco_dict_val = {
    "images": [],
    "annotations": [],
    "categories": []
}

# Mapping YOLO class_id → label name
# Tu peux adapter cela avec les vrais noms si tu les as
coco_class_names = [
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck",
    "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
    "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra",
    "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
    "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
    "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
    "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
    "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
    "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
    "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
    "toothbrush"
]
  # À adapter
for idx, name in enumerate(coco_class_names):
    coco_dict_train["categories"].append({
        "id": idx,
        "name": name,
        "supercategory": "object"
    })
    coco_dict_val["categories"].append({
        "id": idx,
        "name": name,
        "supercategory": "object"
    })

# Annotation pour l'entraînement et la validation
annotation_id_train = 1
annotation_id_val = 1

# Traiter les images d'entraînement
for image_id, image_path in enumerate(sorted(train_images)):
    img = Image.open(image_path)
    width, height = img.size

    coco_dict_train["images"].append({
        "id": image_id,
        "file_name": image_path.name,
        "width": width,
        "height": height
    })

    label_path = labels_dir / f"{image_path.stem}.txt"
    if not label_path.exists():
        continue

    with open(label_path, "r") as f:
        for line in f.readlines():
            parts = line.strip().split()
            if len(parts) != 5:
                continue
            class_id, x_center, y_center, w, h = map(float, parts)
            class_id = int(class_id)

            # Convertir en format COCO (bbox = [x_min, y_min, width, height])
            x_min = (x_center - w / 2) * width
            y_min = (y_center - h / 2) * height
            bbox_width = w * width
            bbox_height = h * height

            coco_dict_train["annotations"].append({
                "id": annotation_id_train,
                "image_id": image_id,
                "category_id": class_id,
                "bbox": [x_min, y_min, bbox_width, bbox_height],
                "area": bbox_width * bbox_height,
                "iscrowd": 0
            })
            annotation_id_train += 1

# Traiter les images de validation
for image_id, image_path in enumerate(sorted(val_images)):
    img = Image.open(image_path)
    width, height = img.size

    coco_dict_val["images"].append({
        "id": image_id,
        "file_name": image_path.name,
        "width": width,
        "height": height
    })

    label_path = val_labels_dir / f"{image_path.stem}.txt"
    if not label_path.exists():
        continue

    with open(label_path, "r") as f:
        for line in f.readlines():
            parts = line.strip().split()
            if len(parts) != 5:
                continue
            class_id, x_center, y_center, w, h = map(float, parts)
            class_id = int(class_id)

            # Convertir en format COCO (bbox = [x_min, y_min, width, height])
            x_min = (x_center - w / 2) * width
            y_min = (y_center - h / 2) * height
            bbox_width = w * width
            bbox_height = h * height

            coco_dict_val["annotations"].append({
                "id": annotation_id_val,
                "image_id": image_id,
                "category_id": class_id,
                "bbox": [x_min, y_min, bbox_width, bbox_height],
                "area": bbox_width * bbox_height,
                "iscrowd": 0
            })
            annotation_id_val += 1

# Sauvegarder les fichiers d'annotations
train_output_path = "/content/coco128/annotations/instances_train2017.json"
val_output_path = "/content/coco128/annotations/instances_val2017.json"

os.makedirs(os.path.dirname(train_output_path), exist_ok=True)
os.makedirs(os.path.dirname(val_output_path), exist_ok=True)

with open(train_output_path, "w") as f:
    json.dump(coco_dict_train, f, indent=2)

with open(val_output_path, "w") as f:
    json.dump(coco_dict_val, f, indent=2)

print(f"Annotations COCO sauvegardées pour l'entraînement dans {train_output_path}")
print(f"Annotations COCO sauvegardées pour la validation dans {val_output_path}")

Annotations COCO sauvegardées pour l'entraînement dans /content/coco128/annotations/instances_train2017.json
Annotations COCO sauvegardées pour la validation dans /content/coco128/annotations/instances_val2017.json


Les étapes précèdentes sont là afin que le dataset possède le format voulu.

In [9]:
import torch
class CocoWrapper(torch.utils.data.Dataset):
    def __init__(self, coco_dataset, processor):
        self.coco_dataset = coco_dataset
        self.processor = processor

    def __getitem__(self, idx):
        # Charger l'image et les annotations à partir du dataset COCO
        image, annotation_list = self.coco_dataset[idx]

        image_id = self.coco_dataset.ids[idx]

        # Formatage des annotations pour le modèle
        formatted_target = {
            "image_id": image_id,
            "annotations": annotation_list  # Dictionnaire d'annotations COCO
        }

        # Utilisation du processor pour traiter l'image et les annotations
        encoding = self.processor(images=image, annotations=formatted_target, return_tensors="pt")

        return encoding, formatted_target

    def __len__(self):
        return len(self.coco_dataset)





In [15]:
from transformers import AutoImageProcessor, AutoModelForObjectDetection

processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr")
model = AutoModelForObjectDetection.from_pretrained("SenseTime/deformable-detr")

num_classes = model.config.num_labels
print(f"Model expects {num_classes} classes.")

coco_class_names = coco_class_names[:num_classes]

coco_dataset = CocoDetection(
    root='/content/coco128/images/train2017',
    annFile='/content/coco128/annotations/instances_train2017.json',
    transform=None
)

wrapped_dataset = CocoWrapper(coco_dataset,processor)
indices = torch.randperm(len(wrapped_dataset)).tolist()
train_dataset = Subset(wrapped_dataset, indices[:-50])
valid_dataset = Subset(wrapped_dataset, indices[-50:])





Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


config.json:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/161M [00:00<?, ?B/s]

Some weights of the model checkpoint at SenseTime/deformable-detr were not used when initializing DeformableDetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DeformableDetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DeformableDetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model expects 91 classes.
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!


In [16]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [17]:
from PIL import Image
import torch
import torchvision.transforms.functional as F

def collate_fn(batch):
    print(f"Batch size: {len(batch)}")  # Vérifier la taille du lot
    # Unpack the batch - each element is a tuple of (encoding, target)
    encodings, targets = list(zip(*batch))

    # Vérifier les clés dans un élément de 'targets'
    print(f"Clés dans un target : {targets[0].keys()}")

    # Resize images to a uniform size
    #max_width = max(item['pixel_values'][0].shape[2] for item in encodings)
    #max_height = max(item['pixel_values'][0].shape[1] for item in encodings)

        # Exemple de redimensionnement
    max_width = 800  # Taille maximale réduite
    max_height = 800

    resized_pixel_values = []
    for item in encodings:
        # Resize using torchvision.transforms.functional.resize
        resized_image = F.resize(item['pixel_values'][0], (max_height, max_width))
        resized_pixel_values.append(resized_image)

    pixel_values = torch.stack(resized_pixel_values)

    # Create a list of image_ids
    image_ids = [item['image_id'] for item in targets]

    # Modification pour extraire les labels correctement
    labels = [
    {
        "class_labels": torch.tensor([ann['category_id'] for ann in target['annotations']]),  # Liste des IDs de catégories
        "boxes": torch.tensor([ann['bbox'] for ann in target['annotations']])  # Liste des boîtes de délimitation (bbox)
    }
    for target in targets
]

    # Créer un dictionnaire de lot
    batch = {
        'pixel_values': pixel_values,
        'image_id': torch.tensor(image_ids),  # Ensure image_ids are in tensor format
        'labels': labels  # Ajouter les labels s'ils existent
    }

    # Déplacer les tensors sur le device
    batch['pixel_values'] = batch['pixel_values'].to(device)
    batch['image_id'] = batch['image_id'].to(device)  # added for moving image_id if it is a tensor

    # # Déplacer les labels sur le device si ce sont des tensors
    # # This isn't needed with this fix, as labels will be a list of lists, but
    # # If in the future, the labels are tensors, use:
    # if isinstance(batch['labels'], torch.Tensor):
    #     batch['labels'] = batch['labels'].to(device)

    return batch








train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

print(f"Train Loader Size: {len(train_loader)}")
print(f"Valid Loader Size: {len(valid_loader)}")

# Lancer un batch d'exemple pour voir ce qui se passe
for batch in train_loader:
    print("Exemple de batch: ", batch)
    break  # Arrêter après le premier batch pour voir les données


Train Loader Size: 53
Valid Loader Size: 50
Batch size: 1
Clés dans un target : dict_keys(['image_id', 'annotations'])
Exemple de batch:  {'pixel_values': tensor([[[[-0.2856, -0.2856, -0.2856,  ...,  2.2489,  2.2489,  2.2489],
          [-0.2856, -0.2856, -0.2856,  ...,  2.2489,  2.2489,  2.2489],
          [-0.2856, -0.2856, -0.2856,  ...,  2.2489,  2.2489,  2.2489],
          ...,
          [-1.7069, -1.7069, -1.7069,  ..., -2.0266, -2.0266, -2.0266],
          [-1.7069, -1.7069, -1.7069,  ..., -2.0437, -2.0437, -2.0437],
          [-1.7069, -1.7069, -1.7069,  ..., -2.0772, -2.0772, -2.0772]],

         [[-0.8452, -0.8452, -0.8452,  ...,  2.3936,  2.3936,  2.3936],
          [-0.8452, -0.8452, -0.8452,  ...,  2.3936,  2.3936,  2.3936],
          [-0.8452, -0.8452, -0.8452,  ...,  2.3936,  2.3936,  2.3936],
          ...,
          [-1.8431, -1.8431, -1.8431,  ..., -1.9424, -1.9424, -1.9424],
          [-1.8431, -1.8431, -1.8431,  ..., -1.9599, -1.9599, -1.9599],
          [-1.8431, -

# Mise en place du modèle

In [18]:

model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

# Train

In [19]:
for epoch in range(5):
    model.train()
    running_loss = 0.0

    for batch in train_loader:
        # Move only the tensors to the device
        pixel_values = batch['pixel_values'].to(device)
        # The labels are a list of lists, and can't be moved to a device.
        labels = batch['labels']

        # Only pass the expected arguments to the model
        outputs = model(pixel_values=pixel_values, labels=labels)

        # Access the loss correctly for AutoModelForObjectDetection
        loss = outputs.loss
        # If the above doesn't work, try:
        # loss = outputs.logits.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"📊 Epoch {epoch + 1} - Loss: {running_loss / len(train_loader):.4f}")
    lr_scheduler.step()

print("\n✅ Entraînement terminé")




Batch size: 1
Clés dans un target : dict_keys(['image_id', 'annotations'])
Batch size: 1
Clés dans un target : dict_keys(['image_id', 'annotations'])
Batch size: 1
Clés dans un target : dict_keys(['image_id', 'annotations'])
Batch size: 1
Clés dans un target : dict_keys(['image_id', 'annotations'])
Batch size: 1
Clés dans un target : dict_keys(['image_id', 'annotations'])
Batch size: 1
Clés dans un target : dict_keys(['image_id', 'annotations'])
Batch size: 1
Clés dans un target : dict_keys(['image_id', 'annotations'])
Batch size: 1
Clés dans un target : dict_keys(['image_id', 'annotations'])
Batch size: 1
Clés dans un target : dict_keys(['image_id', 'annotations'])
Batch size: 1
Clés dans un target : dict_keys(['image_id', 'annotations'])
Batch size: 1
Clés dans un target : dict_keys(['image_id', 'annotations'])
Batch size: 1
Clés dans un target : dict_keys(['image_id', 'annotations'])
Batch size: 1
Clés dans un target : dict_keys(['image_id', 'annotations'])
Batch size: 1
Clés dans u

# Sauvegarde

In [20]:
torch.save(model.state_dict(), 'deformable_detr.pth')