In [1]:
!pip install transformers



In [3]:
from transformers import DetrFeatureExtractor, DetrForObjectDetection

2025-07-26 08:25:09.348516: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753518309.500570      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753518309.547936      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
import torch

In [7]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = 'facebook/detr-resnet-101-dc5'

In [10]:
feature_extractor = DetrFeatureExtractor.from_pretrained(model_name)
model = DetrForObjectDetection.from_pretrained(model_name).to(DEVICE)

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
import torch.nn as nn

num_classes = 3

model.class_labels_classifier = nn.Linear(model.config.d_model, num_classes + 1)  # +1 for images without mines
model.config.num_labels = num_classes

In [12]:
from transformers import DetrImageProcessor

id2label = {
    0: "Explosives (All sorts of UXO, grenades, etc. that does not fall into AP/AV mine classes)", 
    1: "Anti-personnel mine", 
    2: "Anti-vehicle mine"
}
label2id = {v: k for k, v in id2label.items()}

processor = DetrImageProcessor.from_pretrained(model_name)
model.config.id2label = id2label
model.config.label2id = label2id

In [13]:
def validate(model, val_dataloader):
  model.eval()
  running_loss = 0.0
  for batch in val_dataloader:
    pixel_values = batch["pixel_values"].to(DEVICE)
    pixel_mask = batch["pixel_mask"].to(DEVICE)
    labels = [{k: v.to(DEVICE) for k, v in t.items()} for t in batch["labels"]]

    model_output = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels)
    loss = model_output["loss"]
    running_loss += loss.item()
  if DEVICE == "cuda":
    torch.cuda.empty_cache()
  return running_loss / len(val_dataloader)

In [99]:
def train(model, num_epoch, train_dataloader, val_dataloader, optimizer, scheduler, device):
  model.train()
  for epoch in range(num_epoch):
    for batch in train_dataloader:
      optimizer.zero_grad()

      pixel_values = batch["pixel_values"].to(device)
      pixel_mask = batch["pixel_mask"].to(device)
      labels = [{k: v.to(device) for k, v in t.items()} for t in batch["labels"]]

      model_output = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels)
      loss = model_output["loss"]

      loss.backward()
      optimizer.step()
    scheduler.step()
    torch.cuda.empty_cache()
    print(f"Epoch: {epoch}, Training loss: {float(loss.item())}")
    val_loss = validate(model, val_dataloader)
    print(f"Epoch: {epoch}, Validation loss: {val_loss}")
    model.train()

In [15]:
from torch.optim.lr_scheduler import StepLR, ExponentialLR

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
scheduler = StepLR(optimizer, step_size=10, gamma=0.5)
# scheduler = ExponentialLR(optimizer, gamma=0.9)

In [1]:
import pandas as pd
import os

In [2]:
folder_path = "/kaggle/input/uadamage-demining-competition/train/annotations"

In [3]:
csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

# Read and combine them into one DataFrame
df_list = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]
combined_df = pd.concat(df_list, ignore_index=True)

print(combined_df.head())

                                            image_id label     x     y width  \
0  0fe78fe9d92a221824bb46e63bae648dab551506f4cd5c...     0  3759  3741   133   
1  ef87e4060b04f09fbc665c32042598b231a6d8a5e78b12...     1  2292  1805    67   
2  1b7a46422e95fe21c97a5a7c92c2cbccb554411f80ca62...     0  3434  2032    55   
3  b829bfb91340ab2d3af2123e4c14860451a6aa0dd8451a...     0  1594  1561    36   
4  cb535576553cb2927691603948b420fe541ac5ff644b5b...     1  2185  1207    57   

  height image_width image_height  
0    115        5280         3956  
1     76        4000         3000  
2     56        3840         2160  
3    128        3227         3226  
4     57        4000         3000  


In [7]:
combined_df.head()

Unnamed: 0,image_id,label,x,y,width,height,image_width,image_height
0,0fe78fe9d92a221824bb46e63bae648dab551506f4cd5c...,0,3759,3741,133,115,5280,3956
1,ef87e4060b04f09fbc665c32042598b231a6d8a5e78b12...,1,2292,1805,67,76,4000,3000
2,1b7a46422e95fe21c97a5a7c92c2cbccb554411f80ca62...,0,3434,2032,55,56,3840,2160
3,b829bfb91340ab2d3af2123e4c14860451a6aa0dd8451a...,0,1594,1561,36,128,3227,3226
4,cb535576553cb2927691603948b420fe541ac5ff644b5b...,1,2185,1207,57,57,4000,3000


In [21]:
image_folder = "/kaggle/input/uadamage-demining-competition/train/images"

In [33]:
import numpy as np

In [62]:
max_image_width = combined_df['image_width'].max()
max_image_height = combined_df['image_height'].max()

In [64]:
max_image_width

13911

In [65]:
max_image_height

13182

In [63]:
min_image_width = combined_df['image_width'].min()
min_image_height = combined_df['image_height'].min()

In [81]:
from torch.utils.data import Dataset
from PIL import Image

class CocoFormatDataset(Dataset):
    def __init__(self, dataframe, image_folder, processor):
        self.image_folder = image_folder
        self.processor = processor
        self.data = dataframe.groupby("image_id")
        self.ids = list(self.data.groups.keys())

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        image_id = self.ids[idx]
        group = self.data.get_group(image_id)

        image_path = f"{self.image_folder}/{image_id}.jpg"
        image = Image.open(image_path).convert("RGB")

        annotations = []
        for _, row in group.iterrows():
            x, y, w, h = float(row["x"]), float(row["y"]), float(row["width"]), float(row["height"])
            category_id = int(row["label"])
            annotations.append({
                "bbox": [x, y, w, h],
                "area": w * h,
                "category_id": category_id
            })

        target = {
            "image_id": idx,
            "annotations": annotations
        }

        # Encode inputs
        size = {
            "shortest_edge": 640,
            "longest_edge": 13911
        }
        encoding = self.processor(images=image, annotations=target, return_tensors="pt", size={'height': 640, 'width': 640})
        encoding = {
            k: (v.squeeze(0) if isinstance(v, torch.Tensor) else v)
            for k, v in encoding.items()
        }
        return encoding


In [93]:
def move_label_to_device(label, device):
    # label can be a dict or list of dicts, but here assume dict
    if isinstance(label, list):
        # If label is a list of dicts, move each dict
        return [{k: v.to(device) for k, v in l.items()} for l in label]
    else:
        return {k: v.to(device) for k, v in label.items()}


In [102]:
from torch.utils.data import DataLoader, random_split

dataset = CocoFormatDataset(combined_df, image_folder, processor)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=lambda x: {
    "pixel_values": torch.stack([i["pixel_values"] for i in x]),
    "pixel_mask": torch.stack([i["pixel_mask"] for i in x]),
    "labels": [move_label_to_device(i["labels"][0] if isinstance(i["labels"], list) else i["labels"], DEVICE) for i in x]
})
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=lambda x: {
    "pixel_values": torch.stack([i["pixel_values"] for i in x]),
    "pixel_mask": torch.stack([i["pixel_mask"] for i in x]),
    "labels": [move_label_to_device(i["labels"][0] if isinstance(i["labels"], list) else i["labels"], DEVICE) for i in x]
})


In [96]:
model.to('cuda')

DetrForObjectDetection(
  (model): DetrModel(
    (backbone): DetrConvModel(
      (conv_encoder): DetrConvEncoder(
        (model): FeatureListNet(
          (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
          (bn1): DetrFrozenBatchNorm2d()
          (act1): ReLU(inplace=True)
          (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
          (layer1): Sequential(
            (0): Bottleneck(
              (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (bn1): DetrFrozenBatchNorm2d()
              (act1): ReLU(inplace=True)
              (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
              (bn2): DetrFrozenBatchNorm2d()
              (drop_block): Identity()
              (act2): ReLU(inplace=True)
              (aa): Identity()
              (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      

In [97]:
num_epoches = 1

train(model, num_epoch=num_epoches, train_dataloader=train_loader, val_dataloader=val_loader, optimizer=optimizer, scheduler=scheduler, device=DEVICE)



Epoch: 0, Training loss: 4.455962657928467


TypeError: validate() takes 2 positional arguments but 3 were given

In [103]:
num_epoches = 5

train(model, num_epoch=num_epoches, train_dataloader=train_loader, val_dataloader=val_loader, optimizer=optimizer, scheduler=scheduler, device=DEVICE)



Epoch: 0, Training loss: 3.6609911918640137
Epoch: 0, Validation loss: 3.8208777886028438
Epoch: 1, Training loss: 3.4690310955047607
Epoch: 1, Validation loss: 3.6939996711967527
Epoch: 2, Training loss: 3.6269371509552
Epoch: 2, Validation loss: 4.119842100513074
Epoch: 3, Training loss: 4.2913103103637695
Epoch: 3, Validation loss: 5.0156597333361015
Epoch: 4, Training loss: 3.8866004943847656
Epoch: 4, Validation loss: 3.694206962289736


In [104]:
num_epoches = 5

train(model, num_epoch=num_epoches, train_dataloader=train_loader, val_dataloader=val_loader, optimizer=optimizer, scheduler=scheduler, device=DEVICE)

Epoch: 0, Training loss: 3.7004446983337402
Epoch: 0, Validation loss: 3.800423446551774


KeyboardInterrupt: 

In [None]:
num_epoches = 10

train(model, num_epoch=num_epoches, train_dataloader=train_loader, val_dataloader=val_loader, optimizer=optimizer, scheduler=scheduler, device=DEVICE)

Epoch: 0, Training loss: 3.6952943801879883
Epoch: 0, Validation loss: 6.474891093350196
Epoch: 1, Training loss: 3.6181132793426514
Epoch: 1, Validation loss: 7.253440162008123
Epoch: 2, Training loss: 3.660289764404297
Epoch: 2, Validation loss: 8.138101078743158
