In [1]:
import kagglehub
# Fetch the dataset through kagglehub and keep the path it returns.
radmilasegen_dataset_path = kagglehub.dataset_download(
    "radmilasegen/dataset"
)
print("Data source import complete.")

Data source import complete.


In [2]:
import os
from glob import glob

# Dataset root. Prefer the Kaggle input mount; when it is absent (e.g. running
# outside Kaggle) fall back to the copy fetched by kagglehub in the first cell.
# NOTE(review): the fallback assumes the download contains a "new_ds" folder,
# as the previously commented-out path (.../versions/1/new_ds) suggested.
KAGGLE_BASE_PATH = "/kaggle/input/dataset/new_ds"
if os.path.isdir(KAGGLE_BASE_PATH):
    BASE_PATH = KAGGLE_BASE_PATH
else:
    BASE_PATH = os.path.join(radmilasegen_dataset_path, "new_ds")

# Text file listing the class names, one per line (read by load_classes).
CLASSES_PATH = os.path.join(BASE_PATH, "merged_classes.txt")

In [3]:
import os
import random
import cv2
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from glob import glob

import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
from torch.utils.data import Dataset, DataLoader

In [4]:
# Side length in pixels of the square that RGB and depth images are resized to.
IMG_SIZE = 256

In [5]:
def load_classes(path=CLASSES_PATH):
    """Read class names from a text file, one name per line.

    Surrounding whitespace is stripped from every line, and lines that are
    empty after stripping are dropped.

    Args:
        path: Location of the class list file.

    Returns:
        List of class-name strings in file order.
    """
    names = []
    with open(path, "r") as handle:
        for raw_line in handle.readlines():
            name = raw_line.strip()
            if name:
                names.append(name)
    return names

In [6]:
def read_label(label_path):
    """Parse one label file into a list of object dictionaries.

    Each useful line holds five whitespace-separated fields: an integer class
    id followed by four floats (x center, y center, width, height). Lines with
    any other number of fields are skipped.

    The class name is looked up in the module-level ``CLASSES`` list, which
    must be defined before this function is called.

    Args:
        label_path: Path to the label text file.

    Returns:
        List of dicts with keys ``cls_id``, ``class_name``, ``x_center``,
        ``y_center``, ``width`` and ``height``.

    Raises:
        IndexError: If a class id is negative or not smaller than
            ``len(CLASSES)``.
        ValueError: If a field cannot be converted to ``int``/``float``.
    """
    objects = []
    num_known = len(CLASSES)
    with open(label_path, "r") as f:
        for line_no, line in enumerate(f, start=1):
            parts = line.split()
            if len(parts) != 5:
                continue
            cls_id = int(parts[0])
            # A negative id would silently index CLASSES from the end and
            # attach the wrong class name, so reject it explicitly.
            if not 0 <= cls_id < num_known:
                raise IndexError(
                    f"{label_path}:{line_no}: class id {cls_id} is outside "
                    f"the valid range 0..{num_known - 1}"
                )
            x, y, w, h = map(float, parts[1:])
            objects.append({
                "cls_id": cls_id,
                "class_name": CLASSES[cls_id],
                "x_center": x,
                "y_center": y,
                "width": w,
                "height": h
            })
    return objects

In [7]:
def collect_samples(base_path=BASE_PATH):
    """Gather every usable RGB / depth / label triple under ``base_path``.

    Only sub-folders whose name consists of digits and which contain all of
    ``images``, ``depth`` and ``labels`` directories are scanned. For each
    ``*.png`` in ``images`` the depth map with the same file name and the label
    file with the same stem (``.txt`` extension) must exist, and the label file
    must yield at least one object via ``read_label``; otherwise the image is
    skipped.

    Args:
        base_path: Dataset root directory.

    Returns:
        List of dicts with keys ``rgb``, ``depth``, ``label`` (file paths) and
        ``objects`` (the parsed label entries).
    """
    samples = []
    subfolders = sorted(f for f in os.listdir(base_path) if f.isdigit())

    for folder in subfolders:
        img_dir = os.path.join(base_path, folder, "images")
        depth_dir = os.path.join(base_path, folder, "depth")
        label_dir = os.path.join(base_path, folder, "labels")

        if not (os.path.isdir(img_dir) and os.path.isdir(depth_dir) and os.path.isdir(label_dir)):
            continue

        for img_path in sorted(glob(os.path.join(img_dir, "*.png"))):
            fname = os.path.basename(img_path)
            # Swap only the extension; str.replace would also rewrite a
            # ".png" that happens to appear inside the file name itself.
            stem = os.path.splitext(fname)[0]
            depth_path = os.path.join(depth_dir, fname)
            label_path = os.path.join(label_dir, stem + ".txt")

            if not (os.path.exists(depth_path) and os.path.exists(label_path)):
                continue

            objs = read_label(label_path)
            if not objs:
                continue

            samples.append({
                "rgb": img_path,
                "depth": depth_path,
                "label": label_path,
                "objects": objs
            })
    return samples

In [8]:
def compute_depth_min_max(samples):
    """Compute the global minimum and maximum depth value over all samples.

    Depth maps that cannot be read, are empty, or whose maximum is not
    positive are ignored.

    Args:
        samples: Iterable of dicts with a ``"depth"`` key holding the path of
            a depth image.

    Returns:
        Tuple ``(d_min, d_max)``. If no usable depth map was found the result
        is ``(0.0, 1.0)``; if the maximum does not exceed the minimum,
        ``d_max`` is set to ``d_min + 1.0`` so the range is never empty.
    """
    d_min = float("inf")
    d_max = float("-inf")
    for s in samples:
        depth = cv2.imread(s["depth"], cv2.IMREAD_ANYDEPTH)
        # cv2.imread signals an unreadable file by returning None instead of
        # raising, so check before touching the array.
        if depth is None or depth.size == 0:
            continue
        depth = depth.astype(np.float32)
        m = depth.min()
        M = depth.max()
        if M <= 0:
            continue
        d_min = min(d_min, m)
        d_max = max(d_max, M)

    # Fall back to a well-defined, non-degenerate range.
    if not np.isfinite(d_min):
        d_min = 0.0
    if not np.isfinite(d_max) or d_max <= d_min:
        d_max = d_min + 1.0
    return d_min, d_max

In [9]:
class RGBDDetectionDataset(Dataset):
    """Detection dataset that yields 4-channel RGB-D images with box targets.

    Every sample dict must provide ``"rgb"`` and ``"depth"`` image paths and an
    ``"objects"`` list whose entries carry ``cls_id``, ``x_center``,
    ``y_center``, ``width`` and ``height``. The box fields are multiplied by
    the image size, i.e. they are treated as fractions of the image.

    Depth is normalised with the module-level ``DEPTH_MIN`` / ``DEPTH_MAX``,
    which must be defined before items are requested.
    """

    def __init__(self, samples, img_size=IMG_SIZE):
        """Store the sample list and the square side length used for resizing."""
        self.samples = samples
        self.img_size = img_size

    def __len__(self):
        """Return the number of samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            Tuple ``(rgbd, target)``. ``rgbd`` is a float32 tensor laid out as
            (4, img_size, img_size): RGB scaled by 1/255 plus min-max
            normalised depth. ``target`` is a dict with ``boxes`` (N x 4,
            ``[x1, y1, x2, y2]`` in pixels of the resized image), ``labels``
            (class id + 1, leaving 0 free for the background class) and
            ``image_id``.

        Raises:
            OSError: If the RGB or depth image cannot be read.
        """
        item = self.samples[idx]

        rgb = cv2.imread(item["rgb"])
        # cv2.imread returns None on failure; fail with the offending path.
        if rgb is None:
            raise OSError(f"Could not read RGB image: {item['rgb']}")
        rgb = cv2.cvtColor(rgb, cv2.COLOR_BGR2RGB)
        rgb = cv2.resize(rgb, (self.img_size, self.img_size))
        rgb = rgb.astype(np.float32)

        depth = cv2.imread(item["depth"], cv2.IMREAD_ANYDEPTH)
        if depth is None:
            raise OSError(f"Could not read depth image: {item['depth']}")
        depth = cv2.resize(depth.astype(np.float32), (self.img_size, self.img_size))

        # Min-max normalise depth, then bring it to the same 0..255 scale as
        # RGB so that a single division normalises all four channels.
        d = np.clip(depth, DEPTH_MIN, DEPTH_MAX)
        d_norm = (d - DEPTH_MIN) / (DEPTH_MAX - DEPTH_MIN + 1e-6)
        d_scaled = d_norm * 255.0

        depth_ch = d_scaled[..., None]

        rgbd = np.concatenate([rgb, depth_ch], axis=2)
        rgbd = rgbd / 255.0
        rgbd = torch.from_numpy(rgbd).permute(2, 0, 1).float()

        size = self.img_size
        boxes = []
        labels = []
        for obj in item["objects"]:
            cx = obj["x_center"]
            cy = obj["y_center"]
            w = obj["width"]
            h = obj["height"]

            # Center/size fractions -> corner coordinates in pixels, clipped
            # to the image.
            x1 = float(np.clip((cx - w / 2.0) * size, 0, size - 1))
            y1 = float(np.clip((cy - h / 2.0) * size, 0, size - 1))
            x2 = float(np.clip((cx + w / 2.0) * size, 0, size - 1))
            y2 = float(np.clip((cy + h / 2.0) * size, 0, size - 1))

            # Clipping (or a zero-sized annotation) can collapse a box;
            # torchvision detection models reject boxes without positive
            # width and height during training, so drop them here.
            if x2 <= x1 or y2 <= y1:
                continue

            boxes.append([x1, y1, x2, y2])
            labels.append(obj["cls_id"] + 1)

        # reshape keeps the (N, 4) layout even when no box survived (N == 0).
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        labels = torch.as_tensor(labels, dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": torch.tensor([idx])
        }

        return rgbd, target

In [10]:
def collate_fn(batch):
    """Turn a batch of ``(image, target)`` pairs into two parallel lists.

    The pairs are transposed rather than stacked, so each image and target
    stays a separate list element.
    """
    images, targets = (list(column) for column in zip(*batch))
    return images, targets

In [11]:
def show_n_predictions(model, samples, n = 5, score_thresh=0.5):
    """Run the detector on up to ``n`` random samples and plot the detections.

    Images are preprocessed the same way as in ``RGBDDetectionDataset``.
    Relies on the module-level ``IMG_SIZE``, ``DEPTH_MIN``, ``DEPTH_MAX``,
    ``CLASSES`` and ``device``. The model is switched to eval mode and left
    in that mode.

    Args:
        model: Detection model; called with a list holding one image tensor
            and expected to return dicts with ``boxes``, ``labels``, ``scores``.
        samples: Sequence of sample dicts with ``"rgb"`` and ``"depth"`` paths.
        n: Maximum number of samples to visualise.
        score_thresh: Detections scoring below this value are not drawn.
    """
    model.eval()
    # Honour the requested count (it used to be hard-coded to 5) and never ask
    # random.sample for more items than exist or for a negative amount.
    count = max(0, min(n, len(samples)))
    chosen = random.sample(samples, count)

    for idx, sample in enumerate(chosen):
        print(f"\n============= SAMPLE {idx+1} =============")

        # Read RGB / depth exactly the way the dataset does.
        rgb = cv2.imread(sample["rgb"])
        depth = cv2.imread(sample["depth"], cv2.IMREAD_ANYDEPTH)
        # cv2.imread returns None for unreadable files; skip such samples
        # instead of crashing in the middle of the visualisation.
        if rgb is None or depth is None:
            print(f"Could not read {sample['rgb']} / {sample['depth']}, skipping")
            continue

        rgb = cv2.cvtColor(rgb, cv2.COLOR_BGR2RGB)
        rgb = cv2.resize(rgb, (IMG_SIZE, IMG_SIZE))
        rgb_vis = rgb.copy()

        depth = cv2.resize(depth.astype(np.float32), (IMG_SIZE, IMG_SIZE))
        d = np.clip(depth, DEPTH_MIN, DEPTH_MAX)
        d_norm = (d - DEPTH_MIN) / (DEPTH_MAX - DEPTH_MIN + 1e-6)
        d_scaled = d_norm * 255.0
        depth_ch = d_scaled[..., None]

        rgbd = np.concatenate([rgb.astype(np.float32), depth_ch], axis=2) / 255.0
        rgbd_t = torch.from_numpy(rgbd).permute(2, 0, 1).float().to(device)

        with torch.no_grad():
            outputs = model([rgbd_t])[0]

        boxes = outputs["boxes"].cpu().numpy()
        labels = outputs["labels"].cpu().numpy()
        scores = outputs["scores"].cpu().numpy()

        for box, label, score in zip(boxes, labels, scores):
            if score < score_thresh:
                continue
            x1, y1, x2, y2 = box.astype(int)
            cls_id = int(label) - 1  # back to 0..C-1
            if cls_id < 0 or cls_id >= len(CLASSES):
                continue
            cls_name = CLASSES[cls_id]

            cv2.rectangle(rgb_vis, (x1, y1), (x2, y2), (255, 0, 0), 2)
            cv2.putText(
                rgb_vis,
                f"{cls_name} {score:.2f}",
                (x1, max(0, y1 - 5)),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                (255, 0, 0),
                1,
            )

        plt.figure(figsize=(8, 8))
        plt.title("Predictions")
        plt.imshow(rgb_vis)
        plt.axis("off")
        plt.show()

In [None]:
# Multimodal detector (Faster R-CNN + ResNet-50 FPN): device, data and loaders.

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# Seed the Python, NumPy and PyTorch RNGs so the train/val split and the
# weight initialisation are the same on every "Restart & Run All".
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

CLASSES = load_classes()
num_classes = len(CLASSES)
# Labels produced by the dataset start at 1, so one extra class is added
# for the background.
num_classes_with_bg = num_classes + 1
print("Классы:", CLASSES)

samples = collect_samples()
print("Всего валидных RGB+Depth+Label пар:", len(samples))
# Fail with a clear message instead of an IndexError on samples[0].
if not samples:
    raise RuntimeError(f"No valid RGB+Depth+Label samples found under {BASE_PATH}")
print("Пример sample:", samples[0])

DEPTH_MIN, DEPTH_MAX = compute_depth_min_max(samples)
print("DEPTH_MIN, DEPTH_MAX:", DEPTH_MIN, DEPTH_MAX)

# 80 / 20 train / validation split after shuffling.
random.shuffle(samples)
split = int(0.8 * len(samples))
train_samples = samples[:split]
val_samples   = samples[split:]

train_dataset = RGBDDetectionDataset(train_samples, IMG_SIZE)
val_dataset   = RGBDDetectionDataset(val_samples,   IMG_SIZE)

BATCH_SIZE = 4

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    pin_memory=(device == "cuda"),
    collate_fn=collate_fn
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
    pin_memory=(device == "cuda"),
    collate_fn=collate_fn
)

print("Train size:", len(train_dataset), "Val size:", len(val_dataset))

# ResNet-50 FPN backbone built without pretrained weights.
backbone = resnet_fpn_backbone("resnet50", weights=None, trainable_layers=5)

# Replace the stem convolution with one that accepts 4 input channels
# (RGB + depth); every other hyper-parameter is copied from the original layer.
old_conv = backbone.body.conv1
backbone.body.conv1 = nn.Conv2d(
    4,
    old_conv.out_channels,
    kernel_size=old_conv.kernel_size,
    stride=old_conv.stride,
    padding=old_conv.padding,
    bias=old_conv.bias is not None,
)

# Four input channels -> four mean / std entries for the model's normalisation.
model = FasterRCNN(
    backbone,
    num_classes=num_classes_with_bg,
    image_mean=[0.5] * 4,
    image_std=[0.25] * 4,
)
model = model.to(device)

print("Модель Faster R-CNN RGBD создана.")

EPOCHS = 10
# Optimise only the parameters that require gradients.
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(
    params,
    lr=0.001,
    momentum=0.9,
    weight_decay=1e-4
)

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0
    n_samples = 0

    for images, targets in train_loader:
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # Called with targets in train mode, the model returns a dict of
        # losses; their sum is the quantity that is optimised.
        loss_dict = model(images, targets)
        losses = sum(loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        # Weight by batch size so the epoch figure is a per-image mean even
        # when the last batch is smaller.
        bs = len(images)
        total_loss += losses.item() * bs
        n_samples += bs

    # max(..., 1) avoids a ZeroDivisionError when the loader yields nothing.
    print(f"Epoch {epoch}/{EPOCHS} | train_loss = {total_loss / max(n_samples, 1):.4f}")

torch.save(model.state_dict(), "rgbd_fasterrcnn_resnet50.pth")
print("Модель сохранена: rgbd_fasterrcnn_resnet50.pth")

# Visualise on held-out images: the full sample list is mostly training data,
# which would make the predictions look better than they generalise. Fall back
# to all samples only if the validation split is empty.
eval_samples = val_samples if val_samples else samples
show_n_predictions(model, eval_samples, n = 5, score_thresh=0.6)

Device: cuda
Классы: ['bottle', 'box', 'cola', 'container', 'cube', 'duck', 'pods', 'scissors', 'sphere', 'tape', 'tor']
Всего валидных RGB+Depth+Label пар: 2424
Пример sample: {'rgb': '/kaggle/input/dataset/new_ds/1/images/00000.png', 'depth': '/kaggle/input/dataset/new_ds/1/depth/00000.png', 'label': '/kaggle/input/dataset/new_ds/1/labels/00000.txt', 'objects': [{'cls_id': 5, 'class_name': 'duck', 'x_center': 0.5128571428571428, 'y_center': 0.7398941798941798, 'width': 0.11523809523809515, 'height': 0.31153439153439166}]}
DEPTH_MIN, DEPTH_MAX: 0.0 65535.0
Train size: 1939 Val size: 485




Модель Faster R-CNN RGBD создана.
Epoch 1/10 | train_loss = 0.4456
