In [19]:
import random
import os
import numpy as np
import torch
import torchvision
from torch.utils.data import DataLoader
from torchvision.datasets import CocoDetection
from torchvision.transforms.functional import to_tensor, normalize, hflip, vflip
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.anchor_utils import AnchorGenerator
from torch.amp import autocast, GradScaler
from torch.optim.lr_scheduler import OneCycleLR, ReduceLROnPlateau
from torch import nn
from PIL import Image

# 1) reproducibility & device
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2) dataset paths
IMG_DIR   = "/home/messyas/data/images"
ANN_TRAIN = "/home/messyas/data/annotations/10_class_splits/annotations_train.json"
ANN_VAL   = "/home/messyas/data/annotations/10_class_splits/annotations_val.json"

# 3) transforms
class TrainTransform:
    def __init__(self, base_size=512, scale_range=(500, 700), noise_std=0.02):  # narrower scales and less noise
        self.base_size = base_size
        self.scale_range = scale_range
        self.noise_std = noise_std
        self.mean = [0.485, 0.456, 0.406]
        self.std = [0.229, 0.224, 0.225]

    def __call__(self, image, target):
        # random scale
        s = random.randint(self.scale_range[0], self.scale_range[1])
        image = image.resize((s, s), Image.BILINEAR)
        ow, oh = image.size
        # horizontal flip
        if random.random() < 0.5:
            image = hflip(image)
            for obj in target:
                x, y, w, h = obj["bbox"]
                obj["bbox"][0] = ow - x - w
        # vertical flip
        if random.random() < 0.5:
            image = vflip(image)
            for obj in target:
                x, y, w, h = obj["bbox"]
                obj["bbox"][1] = oh - y - h
        # resize to base
        image = image.resize((self.base_size, self.base_size), Image.BILINEAR)
        tensor = to_tensor(image)
        tensor = tensor + torch.randn_like(tensor) * self.noise_std
        tensor = normalize(tensor, mean=self.mean, std=self.std)
        # adjust boxes
        sx = self.base_size / ow
        sy = self.base_size / oh
        for obj in target:
            x, y, w, h = obj["bbox"]
            obj["bbox"] = [x * sx, y * sy, w * sx, h * sy]
        return tensor, target

class EvalTransform:
    def __init__(self, size=512):
        self.size = size
        self.mean = [0.485, 0.456, 0.406]
        self.std = [0.229, 0.224, 0.225]

    def __call__(self, image, target):
        ow, oh = image.size
        image = image.resize((self.size, self.size), Image.BILINEAR)
        tensor = to_tensor(image)
        tensor = normalize(tensor, mean=self.mean, std=self.std)
        sx = self.size / ow
        sy = self.size / oh
        for obj in target:
            x, y, w, h = obj["bbox"]
            obj["bbox"] = [x * sx, y * sy, w * sx, h * sy]
        return tensor, target

def collate_fn(batch):
    return tuple(zip(*batch))

# 4) DataLoaders
train_loader = DataLoader(
    CocoDetection(IMG_DIR, ANN_TRAIN, transforms=TrainTransform()),
    batch_size=2, shuffle=True, num_workers=4, pin_memory=True,
    collate_fn=collate_fn
)
val_loader = DataLoader(
    CocoDetection(IMG_DIR, ANN_VAL, transforms=EvalTransform()),
    batch_size=4, shuffle=False, num_workers=4, pin_memory=True,
    collate_fn=collate_fn
)

# 5) Model
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
# custom anchors for small objects
model.rpn.anchor_generator = AnchorGenerator(
    sizes=((16,), (32,), (64,), (128,), (256,)),
    aspect_ratios=((0.5, 1.0, 2.0),) * 5
)
# replace head
in_feats = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_feats, 11)
# dropout
orig_cls = model.roi_heads.box_predictor.cls_score
model.roi_heads.box_predictor.cls_score = nn.Sequential(nn.Dropout(0.3), orig_cls)
model.to(device)

# 6) Optimizer and schedulers
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)  # increased LR for faster convergence
EPOCHS = 100
steps_per_epoch = len(train_loader)
onecycle = OneCycleLR(optimizer, max_lr=5e-5, total_steps=EPOCHS * steps_per_epoch, pct_start=0.1)
plateau = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, min_lr=1e-6)
scaler = GradScaler()

# 7) Freeze backbone
for p in model.backbone.body.parameters():
    p.requires_grad = False
unfreeze_epoch = 10
backbone_unfrozen = False

# 8) Target prep
def to_valid(images, targets):
    v_imgs, v_tgts = [], []
    for img, tgt in zip(images, targets):
        boxes, labels = [], []
        for obj in tgt:
            x, y, w, h = obj["bbox"]
            if w > 0 and h > 0:
                boxes.append([x, y, x + w, y + h])
                labels.append(int(obj["category_id"]))
        if boxes:
            v_imgs.append(img.to(device))
            v_tgts.append({
                "boxes": torch.tensor(boxes, dtype=torch.float32, device=device),
                "labels": torch.tensor(labels, dtype=torch.int64, device=device)
            })
    return v_imgs, v_tgts

# 9) Train/val loops
def train_epoch(epoch):
    global backbone_unfrozen
    model.train()
    if epoch == unfreeze_epoch and not backbone_unfrozen:
        for p in model.backbone.body.parameters(): p.requires_grad = True
        backbone_unfrozen = True
        print(f"*** Backbone descongelado na Ã©poca {epoch} ***")
    running, count = 0.0, 0
    for i, (imgs, tgts) in enumerate(train_loader, start=1):
        imgs, tgts = to_valid(imgs, tgts)
        if not tgts: continue
        with autocast("cuda"):
            loss_dict = model(imgs, tgts)
            raw_loss = sum(loss_dict.values())
            # normalize by batch size
            loss = raw_loss / len(imgs)
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), 2.0)
        step_ok = scaler.step(optimizer)
        scaler.update()
        if step_ok is not None:
            onecycle.step()
        running += loss.item()
        count += 1
        if i % 10 == 0:
            print(f"[E{epoch:03d}|B{i:04d}/{steps_per_epoch:04d}] loss={loss.item():.4f}")
    avg = running / count if count else float('nan')
    print(f"[Epoch {epoch:03d}] train avg loss: {avg:.4f}\n")
    torch.cuda.empty_cache()
    return avg

def val_epoch(epoch):
    model.train()
    total, count = 0.0, 0
    with torch.no_grad():
        for imgs, tgts in val_loader:
            imgs, tgts = to_valid(imgs, tgts)
            if not tgts: continue
            with autocast("cuda"):
                total += sum(model(imgs, tgts).values()).item()
            count += 1
    avg = total / count if count else float('nan')
    print(f"[Epoch {epoch:03d}] val avg loss: {avg:.4f}\n")
    plateau.step(avg)
    return avg

# 10) Main loop
best, wait, patience = float('inf'), 0, 30
for ep in range(1, EPOCHS+1):
    tr = train_epoch(ep)
    vl = val_epoch(ep)
    torch.save(model.state_dict(), f"epoch_{ep:03d}.pth")
    if vl < best:
        best, wait = vl, 0
        torch.save(model.state_dict(), "best.pth")
        print("++ best saved\n")
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping")
            break
# 11) final save
torch.save(model.state_dict(), "final.pth")
print("Training complete.")

loading annotations into memory...
Done (t=1.20s)
creating index...
index created!
loading annotations into memory...
Done (t=0.38s)
creating index...
index created!
[E001|B0010/25808] loss=18.0398
[E001|B0020/25808] loss=22.1314
[E001|B0030/25808] loss=24.2947
[E001|B0040/25808] loss=13.9815
[E001|B0050/25808] loss=10.6287
[E001|B0060/25808] loss=10.4198


KeyboardInterrupt: 