In [1]:
import torch
from torch.utils.data import Dataset, DataLoader, Subset
from pycocotools.coco import COCO
import os
from PIL import Image
import torchvision.transforms as transforms

import torch.optim as optim
from tqdm import tqdm

In [2]:
import os
from pycocotools.coco import COCO
from torch.utils.data import Dataset
import torch
from PIL import Image

class LicensePlateDataset(Dataset):
    """COCO-annotated images paired with a fixed-size tensor of corner boxes.

    Each item is ``(image, target)``.  ``target`` is a float32 tensor of shape
    ``(max_bboxes, 4)`` whose rows are ``[x1, y1, x2, y2]`` boxes.  Rows past
    the number of annotations are all zeros, and annotations past
    ``max_bboxes`` are dropped.

    When ``transform`` changes the image size, the boxes are rescaled by the
    same width/height ratios so they keep pointing at the same pixels.  This
    assumes the transform rescales the whole image (e.g. a resize); crops or
    paddings would need their own box handling.

    Args:
        annotations_file: Path of the COCO JSON annotation file.
        img_dir: Directory that holds the image files named in the annotations.
        transform: Optional callable applied to the loaded RGB PIL image.
        max_bboxes: Number of rows in every returned target tensor.
    """

    def __init__(self, annotations_file, img_dir, transform=None, max_bboxes=10):
        self.coco = COCO(annotations_file)
        self.img_dir = img_dir
        self.transform = transform
        self.ids = list(self.coco.imgs.keys())
        self.max_bboxes = max_bboxes

    def __len__(self):
        """Return the number of images listed in the annotation file."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Load image ``idx`` and its padded, image-aligned corner boxes."""
        img_id = self.ids[idx]
        anns = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_id))
        img_info = self.coco.loadImgs(img_id)[0]
        img_path = os.path.join(self.img_dir, img_info['file_name'])

        # convert() returns an independent image, so the file can be closed
        # as soon as the conversion is done.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        original_size = image.size  # PIL reports (width, height)

        # COCO stores [x, y, width, height]; convert to corner format.
        boxes = []
        for ann in anns:
            x, y, width, height = ann['bbox']
            boxes.append([x, y, x + width, y + height])

        if self.transform:
            image = self.transform(image)

        target = self.pad_bounding_boxes(boxes)
        target = self._scale_boxes(
            target, original_size, self._image_size(image, original_size)
        )
        return image, target

    @staticmethod
    def _image_size(image, default):
        """Return ``(width, height)`` of a tensor or PIL image, else ``default``."""
        if torch.is_tensor(image) and image.dim() >= 2:
            return int(image.shape[-1]), int(image.shape[-2])
        size = getattr(image, 'size', None)
        if isinstance(size, tuple) and len(size) == 2:
            return size
        return default

    @staticmethod
    def _scale_boxes(boxes, original_size, new_size):
        """Rescale ``[x1, y1, x2, y2]`` rows from ``original_size`` to ``new_size``."""
        (orig_w, orig_h), (new_w, new_h) = original_size, new_size
        if (orig_w, orig_h) == (new_w, new_h) or orig_w == 0 or orig_h == 0:
            return boxes
        scale_x, scale_y = new_w / orig_w, new_h / orig_h
        # Zero padding rows stay zero under multiplication.
        return boxes * torch.tensor(
            [scale_x, scale_y, scale_x, scale_y], dtype=boxes.dtype
        )

    def pad_bounding_boxes(self, bboxes):
        """Return a ``(max_bboxes, 4)`` float32 tensor: boxes first, zeros after.

        An empty ``bboxes`` yields an all-zero tensor; extra boxes beyond
        ``max_bboxes`` are discarded.
        """
        padded_bboxes = torch.zeros((self.max_bboxes, 4), dtype=torch.float32)
        n_boxes = min(len(bboxes), self.max_bboxes)
        if n_boxes > 0:
            padded_bboxes[:n_boxes] = torch.as_tensor(
                bboxes[:n_boxes], dtype=torch.float32
            )
        return padded_bboxes


In [3]:
# Root folder of the license-plate dataset, given relative to the current directory.
dataset_path = "./datasetPlacas/"

In [4]:
# Index the training annotations and count how many categories they declare.
train_annotations_path = os.path.join(dataset_path, "train", "_annotations.coco.json")
coco = COCO(train_annotations_path)
categories = coco.cats
n_classes = len(categories)
# categories


loading annotations into memory...
Done (t=0.10s)
creating index...
index created!


# Arquitetura da rede

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ResidualBlock(nn.Module):
    """Two 3x3 convolution + batch-norm stages with an additive skip connection.

    The spatial size is preserved (3x3 kernels with padding 1).  When
    ``in_channels`` equals ``out_channels`` the input is added back unchanged
    and the block owns exactly ``conv1``/``bn1``/``conv2``/``bn2``.  When they
    differ, the input is first projected with a 1x1 convolution followed by
    batch-norm so that the addition is shape-compatible.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels of the output feature map.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Only create the projection when it is needed, so same-width blocks
        # keep the same parameters and printed structure as before.
        if in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = None

    def forward(self, x):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + skip(x))``."""
        residual = x if self.shortcut is None else self.shortcut(x)
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        # In-place add avoids allocating one more activation-sized tensor.
        out += residual
        out = self.relu(out)
        return out

class LicensePlateRecognitionNet(nn.Module):
    """Fully convolutional detector producing 8 values per output cell.

    Layout: two 3x3 conv stages (3 -> 64 -> 128 channels), a stack of
    128-channel residual blocks, then three conv stages (256, 512, 1024
    channels) interleaved with four 2x2 max-pools, and a 1x1 detection head.
    The four pools give an overall stride of 16, so an input of shape
    ``(B, 3, H, W)`` yields ``(B, 8, H // 16, W // 16)``.

    Output channels: 0-1 are object / non-object probabilities (softmax over
    the two channels, so they sum to 1 in every cell); 2-7 are the six affine
    parameters, left unbounded.

    Memory note: the residual stack runs before any pooling, i.e. at full
    input resolution with 128 channels, so training memory grows with
    ``H * W * num_residual_blocks``.

    Args:
        num_residual_blocks: Number of ``ResidualBlock(128, 128)`` modules in
            the residual stack (14 by default).
    """

    def __init__(self, num_residual_blocks=14):
        super().__init__()

        self.initial_block = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
        )

        self.residual_blocks = nn.Sequential(
            *[ResidualBlock(128, 128) for _ in range(num_residual_blocks)]
        )

        self.conv_block = nn.Sequential(
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(512, 1024, kernel_size=3, padding=1),
            nn.BatchNorm2d(1024),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # 2 object/non-object scores + 6 affine transformation parameters
        self.detection = nn.Conv2d(1024, 8, kernel_size=1)

    def forward(self, x):
        """Return the ``(B, 8, H // 16, W // 16)`` detection map for images ``x``."""
        out = self.initial_block(x)
        out = self.residual_blocks(out)
        out = self.conv_block(out)
        out = self.detection(out)

        # The first two channels are consumed as probabilities (their log is
        # taken by the loss), so normalise them; raw conv outputs can be
        # negative, which would make that logarithm undefined.
        probabilities = F.softmax(out[:, :2], dim=1)
        return torch.cat((probabilities, out[:, 2:]), dim=1)




In [6]:
# Instantiate the detection network with its default constructor arguments.
model = LicensePlateRecognitionNet()

In [7]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
# Last expression of the cell, so the notebook also displays the returned module.
model.to(device)

Using device: cuda


LicensePlateRecognitionNet(
  (initial_block): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
  )
  (residual_blocks): Sequential(
    (0): ResidualBlock(
      (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): ResidualBlock(
      (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1,

# Função de perda

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class WPODLoss(nn.Module):
    """Per-cell affine-regression loss plus object / non-object log-loss.

    For every cell ``(m, n)`` of the prediction grid, a unit square centred on
    the origin is warped with the predicted affine parameters and its four
    corners are compared (L1 distance) with the annotated corners, after those
    have been normalised relative to that cell.  The affine term is weighted by
    ``object_mask``; the probability term is counted in every cell.  The
    result is summed over all cells and batch elements (no averaging).

    Args:
        alpha: Divisor applied to the normalised annotated points.
        ns: Divisor converting annotated point coordinates into grid units.
        gamma_obj: Stored on the module; ``forward`` does not read it.
    """

    def __init__(self, alpha=7.75, ns=2**4, gamma_obj=0.3):
        super().__init__()
        self.alpha = alpha
        self.ns = ns
        self.gamma_obj = gamma_obj

    @staticmethod
    def _logloss(y, p):
        """Binary cross-entropy of probability ``p`` against label ``y``."""
        return -y * torch.log(p + 1e-10) - (1 - y) * torch.log(1 - p + 1e-10)

    def forward(self, pred, target, object_mask):
        """Compute the summed loss.

        Args:
            pred: ``(B, C>=8, M, N)`` tensor.  Channels 0 and 1 are the object
                and non-object probabilities and must lie in ``[0, 1]``
                (values outside make the logarithm undefined).  Channels 2-5
                are the linear part of the affine map (2 and 5 are clamped at
                zero), channels 6-7 the translation.
            target: ``(B, M, N, 4, 2)`` annotated corner points ``(x, y)``,
                one quadrilateral per cell, in the same order as the
                canonical square: top-left, top-right, bottom-right,
                bottom-left.
            object_mask: ``(B, M, N)`` tensor, 1 where a cell owns an object.

        Returns:
            A scalar tensor.

        Raises:
            ValueError: If the grid or corner dimensions of ``target`` or
                ``object_mask`` do not match ``pred``.
        """
        if pred.dim() != 4 or pred.size(1) < 8:
            raise ValueError(
                f"pred must have shape (B, >=8, M, N), got {tuple(pred.shape)}"
            )
        M, N = pred.shape[-2:]  # feature-map rows and columns
        if target.dim() < 4 or tuple(target.shape[-4:]) != (M, N, 4, 2):
            raise ValueError(
                f"target must end with shape ({M}, {N}, 4, 2) to match pred, "
                f"got {tuple(target.shape)}"
            )
        if object_mask.dim() < 2 or tuple(object_mask.shape[-2:]) != (M, N):
            raise ValueError(
                f"object_mask must end with shape ({M}, {N}) to match pred, "
                f"got {tuple(object_mask.shape)}"
            )

        v1 = pred[:, 0]  # object probability
        v2 = pred[:, 1]  # non-object probability
        v3 = pred[:, 2].clamp(min=0)  # diagonal terms are kept non-negative
        v4 = pred[:, 3]
        v5 = pred[:, 4]
        v6 = pred[:, 5].clamp(min=0)
        v7 = pred[:, 6]
        v8 = pred[:, 7]

        # Corners of the canonical unit square, shape (4, 2).
        q = torch.tensor(
            [[-0.5, -0.5], [0.5, -0.5], [0.5, 0.5], [-0.5, 0.5]],
            device=pred.device,
            dtype=pred.dtype,
        )
        qx, qy = q[:, 0], q[:, 1]

        # T_mn(q): warp all four corners at once; result is (B, M, N, 4, 2).
        warped_x = v3.unsqueeze(-1) * qx + v4.unsqueeze(-1) * qy + v7.unsqueeze(-1)
        warped_y = v5.unsqueeze(-1) * qx + v6.unsqueeze(-1) * qy + v8.unsqueeze(-1)
        Tmn_q = torch.stack((warped_x, warped_y), dim=-1)

        # A_mn(p) = (p / ns - [n, m]) / alpha: x is offset by the column
        # index n, y by the row index m.
        point_scale = (1 / self.alpha) * (1 / self.ns)
        cols = torch.arange(N, device=pred.device).float().view(N, 1)
        rows = torch.arange(M, device=pred.device).float().view(M, 1, 1)
        normalised_x = point_scale * target[..., 0] - cols * (1 / self.alpha)
        normalised_y = point_scale * target[..., 1] - rows * (1 / self.alpha)
        A_p = torch.stack((normalised_x, normalised_y), dim=-1)

        # L1 distance summed over the 4 corners and 2 coordinates.
        f_affine = torch.abs(Tmn_q - A_p).sum(dim=(-2, -1))

        f_probs = self._logloss(object_mask, v1) + self._logloss(1 - object_mask, v2)

        return torch.sum(object_mask * f_affine + f_probs)




In [9]:
# Loss module built with WPODLoss's default constructor arguments.
loss_fn = WPODLoss()

In [10]:
# Adam optimiser over every model parameter, learning rate 0.001.
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [11]:
# Image pre-processing: resize to 128x128, then convert to a tensor.
_preprocessing_steps = [
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    # transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
]
transform = transforms.Compose(_preprocessing_steps)

## Carregando o Dataset

In [12]:
annotations_file = './datasetPlacas/train/_annotations.coco.json'
img_dir = './datasetPlacas/train/'

subset_size = 100  # Desired number of samples in the training subset

dataset = LicensePlateDataset(annotations_file=annotations_file, img_dir=img_dir, transform=transform)

# Never ask for more samples than the dataset holds; out-of-range indices
# would only fail later, while iterating the loader.
train_subset_indices = list(range(min(subset_size, len(dataset))))
train_subset = Subset(dataset, train_subset_indices)

data_loader = DataLoader(train_subset, batch_size=2, shuffle=True, num_workers=1)

print(len(data_loader))


loading annotations into memory...
Done (t=0.10s)
creating index...
index created!
50


In [13]:
from torchvision import transforms
from torch.utils.data import DataLoader

# Pre-processing: resize every image to 256x256 and convert it to a tensor
transform = transforms.Compose([transforms.Resize((256, 256)), transforms.ToTensor()])

# Build the dataset and the DataLoader that batches it
annotations_file = './datasetPlacas/train/_annotations.coco.json'
img_dir = './datasetPlacas/train/'

dataset = LicensePlateDataset(
    annotations_file=annotations_file,
    img_dir=img_dir,
    transform=transform,
    max_bboxes=10,
)
data_loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=0)

# Smoke-test the DataLoader: report the shapes of the first batch only
for images, targets in data_loader:
    print("Images batch shape:", images.shape)
    print("Targets batch shape:", targets.shape)
    break


loading annotations into memory...
Done (t=0.09s)
creating index...
index created!
Images batch shape: torch.Size([4, 3, 256, 256])
Targets batch shape: torch.Size([4, 10, 4])


In [14]:
# Print the shapes of a single batch, then stop iterating.
for images, targets in data_loader:
    for batch_tensor in (images, targets):
        print(batch_tensor.shape)
    break


torch.Size([4, 3, 256, 256])
torch.Size([4, 10, 4])


# Loop de treino

In [15]:
import torch
from tqdm import tqdm

# Hyperparameters
num_epochs = 10
learning_rate = 0.001

# Initialise the model, loss function and optimiser
model = LicensePlateRecognitionNet().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = WPODLoss()


def build_wpod_targets(boxes, grid_h, grid_w, stride):
    """Convert padded corner boxes into per-cell training targets.

    Args:
        boxes: ``(B, max_bboxes, 4)`` tensor of ``[x1, y1, x2, y2]`` rows;
            rows whose coordinates sum to zero are treated as padding.
        grid_h: Number of rows of the prediction grid.
        grid_w: Number of columns of the prediction grid.
        stride: Pixels covered by one grid cell along each axis.

    Returns:
        ``(object_mask, target_points)`` CPU tensors of shapes
        ``(B, grid_h, grid_w)`` and ``(B, grid_h, grid_w, 4, 2)``.  The cell
        containing a box centre gets mask 1 and the box corners in absolute
        pixel coordinates (the loss subtracts the cell index itself).
    """
    batch = boxes.size(0)
    object_mask = torch.zeros((batch, grid_h, grid_w))
    target_points = torch.zeros((batch, grid_h, grid_w, 4, 2))

    # One bulk transfer to Python floats instead of a device sync per element.
    for i, image_boxes in enumerate(boxes.tolist()):
        for x1, y1, x2, y2 in image_boxes:
            if x1 + y1 + x2 + y2 == 0:
                continue
            cx, cy = (x1 + x2) / 2, (y1 + y2) / 2
            # Clamp so a centre on or beyond the image border still maps to a valid cell.
            col = min(max(int(cx / stride), 0), grid_w - 1)
            row = min(max(int(cy / stride), 0), grid_h - 1)
            object_mask[i, row, col] = 1
            target_points[i, row, col] = torch.tensor([
                [x1, y1],
                [x2, y1],
                [x2, y2],
                [x1, y2]
            ])
    return object_mask, target_points


# NOTE(review): the run recorded under this cell stopped with a CUDA
# out-of-memory error on a 6 GB GPU; a smaller batch or input size in the
# DataLoader is probably needed before this loop can complete.
# NOTE(review): boxes are assumed to be expressed in pixels of the tensors fed
# to the model — confirm the dataset rescales them together with the image.
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0

    for images, targets in tqdm(data_loader):
        images = images.to(device)

        # Prediction
        pred = model(images)

        # The target grid must match the prediction's feature map, not the
        # input image size.
        M, N = pred.shape[-2:]
        object_mask, target_points = build_wpod_targets(targets, M, N, loss_fn.ns)
        object_mask = object_mask.to(device)
        target_points = target_points.to(device)

        # Loss computation
        loss = loss_fn(pred, target_points, object_mask)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/len(data_loader)}')

torch.save(model.state_dict(), 'license_plate_detector.pth')


  0%|          | 0/4632 [00:04<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 0 has a total capacity of 6.00 GiB of which 0 bytes is free. Of the allocated memory 5.21 GiB is allocated by PyTorch, and 49.34 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)