# SINet COD10K Detection
Implementazione migliorata di SINet per il rilevamento di oggetti mimetizzati.

In [159]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, models
import os
from PIL import Image
from sklearn.metrics import jaccard_score
import matplotlib.pyplot as plt
import random
import torch.nn.functional as F

In [160]:
# device = torch.device('cuda')
device = torch.device('mps')

In [161]:
class CODDataset(Dataset):
    def __init__(self, image_folder, mask_folder,
                 image_transform=None, mask_transform=None):
        self.image_files = os.listdir(image_folder)
        self.image_folder = image_folder
        self.mask_folder = mask_folder
        self.image_transform = image_transform
        self.mask_transform = mask_transform

    def __getitem__(self, idx):
        img_path = os.path.join(self.image_folder, self.image_files[idx])
        mask_path = os.path.join(self.mask_folder, self.image_files[idx].replace('.jpg', '.png'))

        image = Image.open(img_path).convert("RGB")
        mask = Image.open(mask_path).convert("L")

        if self.image_transform:
            image = self.image_transform(image)
        if self.mask_transform:
            mask = self.mask_transform(mask)

        return image, mask

    def __len__(self):
        return len(self.image_files)

In [162]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        BCE_loss = F.binary_cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        return F_loss.mean() if self.reduction == 'mean' else F_loss.sum()

In [163]:
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1),
    transforms.GaussianBlur(kernel_size=(3, 3)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

mask_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(15),
    transforms.GaussianBlur(kernel_size=(3, 3)),
    transforms.ToTensor()
])


In [164]:
train_dataset = CODDataset(
    image_folder="COD10K-v3/Train/Image",
    mask_folder="COD10K-v3/Train/GT_Object",
    image_transform=image_transform,
    mask_transform=mask_transform
)

test_dataset = CODDataset("COD10K-v3/Test/Image",
                          "COD10K-v3/Test/GT_Object",
                          image_transform=image_transform)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=0, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=0, pin_memory=True)

In [165]:
class SINet(nn.Module):
    def __init__(self):
        super(SINet, self).__init__()
        self.backbone = models.efficientnet_b4(weights=models.EfficientNet_B4_Weights.DEFAULT)
        self.backbone_features = self.backbone.features
        self.edge_attention = nn.Sequential(
            nn.Conv2d(1792, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 1, kernel_size=1),
            nn.Sigmoid()
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(1792 + 256, 512, kernel_size=2, stride=2),
            nn.ReLU(),
            nn.ConvTranspose2d(512, 256, kernel_size=2, stride=2),
            nn.ReLU(),
            nn.ConvTranspose2d(256, 128, kernel_size=2, stride=2),
            nn.ReLU(),
            nn.ConvTranspose2d(128, 64, kernel_size=2, stride=2),
            nn.ReLU(),
            nn.ConvTranspose2d(64, 1, kernel_size=2, stride=2),
            nn.Sigmoid()
        )

    def forward(self, x):
        features = self.backbone_features(x)
        edge_map = self.edge_attention(features)
        edge_map = torch.nn.functional.interpolate(edge_map, size=features.shape[2:], mode='bilinear', align_corners=False)
        combined = torch.cat((features, edge_map.expand(-1, 256, -1, -1)), dim=1)
        output = self.decoder(combined)
        return output

In [166]:
model = SINet().to(device)
criterion = FocalLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001, weight_decay=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5, eta_min=1e-6)

In [167]:
for epoch in range(5):
    model.train()
    total_loss = 0
    for images, masks in train_loader:
        images, masks = images.to(device), masks.to(device)
        preds = model(images)
        preds = nn.functional.interpolate(preds, size=masks.shape[2:], mode='bilinear', align_corners=False)
        loss = criterion(preds, masks)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    scheduler.step()
    print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}')

Epoch 1, Loss: 0.0174
Epoch 2, Loss: 0.0133
Epoch 3, Loss: 0.0125
Epoch 4, Loss: 0.0123
Epoch 5, Loss: 0.0121


In [168]:
torch.save(model.state_dict(), 'sinet_cod_model.pth')

In [169]:
def compute_iou(mask_np, pred_np):
    if mask_np.sum() == 0:
        return None
    return jaccard_score(mask_np, pred_np, zero_division=1)

def visualize_predictions(model, dataloader, num_images=10):
    model.eval()
    iou_scores = []
    with torch.no_grad():
        for i, (image, mask) in enumerate(dataloader):
            if i >= num_images:
                break
            image = image.unsqueeze(0).to(device)
            mask = mask.to(device)
            pred = model(image)
            pred = (pred > 0.5).float()
            mask_np = (mask.squeeze().cpu().numpy() > 0.5).astype(int)
            pred_np = (pred.squeeze().cpu().numpy() > 0.5).astype(int)
            iou = compute_iou(mask_np, pred_np)
            if iou is not None:
                iou_scores.append(iou)
    print(f"Mean IoU (filtered): {sum(iou_scores) / len(iou_scores):.2f}")

visualize_predictions(model, test_loader, num_images=10)

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'PIL.Image.Image'>