<a href="https://colab.research.google.com/github/Light896cart/CV/blob/main/CV/Y/YOLOv1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader

import torchvision
import torchvision.transforms as transforms

import cv2
import math
import xml.etree.ElementTree as ET
from PIL import Image
import matplotlib.pyplot as plt

import os

In [2]:
torch.set_printoptions(precision=5, sci_mode=False)

In [3]:
class DataFurcts(Dataset):
  def __init__(self,path,transform):
    self.path = path
    self.transform = transform
    self.size_img = (448,448)

    xml_files = [xml_files for xml_files in os.listdir(path) if xml_files.endswith('.xml')]
    self.dataset = list(self._XML_data(xml_files,path))  # Изменено на список


  def __getitem__(self, idx):
    sample = self.dataset[idx]['filename']
    img = Image.open(f'{self.path}/{sample}').convert('RGB')

    original_size = img.size

    if self.transform:
      img = self.transform(img)


    # Получаем размеры изображения для нормализации
    width, height = img.shape[1:3]

    # Нормализуем координаты
    bndbox = self.dataset[idx]['bndbox']


    original_bbox = torch.tensor([bndbox['xmin'],bndbox['ymin'],bndbox['xmax'],bndbox['ymax']])
    new_bbox = self.resize_bbox(original_bbox,original_size)

    x = torch.floor((new_bbox['xmin'] + new_bbox['xmax']) / 2 / 64).long()
    y = torch.floor((new_bbox['ymin'] + new_bbox['ymax']) / 2 / 64).long()

    normalized_x = (x % 48) / 48
    normalized_y = (y % 48) / 48

    width_norm = new_bbox['xmax'] - new_bbox['xmin']
    height_norm = new_bbox['ymax'] - new_bbox['ymin']

    cells = torch.zeros(49)
    index = x * 7 + y

    # Проверяем выход за пределы
    if index < 49:
        cells[index] = 1

    normalization = {
        'x': normalized_x,
        'y': normalized_y,
        'width': width_norm,
        'height': height_norm,

    }
    reg = torch.stack(list(normalization.values()), dim=0).view(-1,4)
    # Округление до 4 знаков после запятой
    rounded_tensor = torch.round(reg * 100000) / 100000
    return img, new_bbox, reg,cells


  def __len__(self):
    return len(self.dataset)

  def resize_bbox(self,bbox, original_size):
    original_width, original_height = original_size
    new_width, new_height = self.size_img

    scale_x = new_width / original_width
    scale_y = new_height / original_height

    x_min, y_min, x_max, y_max = bbox

    bndbox_disclosed = {
        'xmin': x_min * scale_x,
        'ymin': y_min * scale_y,
        'xmax': x_max * scale_x,
        'ymax': y_max * scale_y
    }

    return bndbox_disclosed

  def _XML_data(self, xml_files, path):
    for file in xml_files:
        try:
            # Загрузка и парсинг XML файла
            tree = ET.parse(os.path.join(path, file))
            root = tree.getroot()
            # Извлечение данных
            filename = root.find('filename').text

            # Извлечение координат из bndbox
            bndbox = root.find('./object/bndbox')
            if bndbox is None:
                continue  # Переход к следующему файлу, если bndbox отсутствует

            xmin = float(bndbox.find('xmin').text)
            ymin = float(bndbox.find('ymin').text)
            xmax = float(bndbox.find('xmax').text)
            ymax = float(bndbox.find('ymax').text)



            yield {'filename': filename, 'bndbox': {'xmin':torch.tensor(xmin), 'ymin': torch.tensor(ymin), 'xmax': torch.tensor(xmax), 'ymax': torch.tensor(ymax)}}
        except Exception as e:
                print(f"Ошибка {file}: {e}")

In [4]:
transform = transforms.Compose([
            transforms.ToTensor(),  # Преобразует изображение в тензор
            transforms.Resize((448,448)),
            ])

In [5]:
path = '/content/fructs/train_zip/train'
daset = DataFurcts(path,transform)
data_train = DataLoader(daset,shuffle=True,batch_size=8)

In [6]:
class YOLODetection(nn.Module):
    def __init__(self, input_channels, S, C, B, batch_size):
        super(YOLODetection, self).__init__()

        self.batch_size = batch_size
        self.S = S
        self.C = C

        self.backbone = nn.Sequential(
            self._make_layer(input_channels, 64, kernel_size=7, stride=2, padding=3),
            nn.MaxPool2d(kernel_size=2, stride=2),
            self._make_layer(64, 192, kernel_size=3, padding=1),
            nn.MaxPool2d(kernel_size=2, stride=2),
            self._make_segment_3(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            self._make_segment_4(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            self._make_segment_5(),
            nn.Flatten(),
            nn.Linear(in_features=50176, out_features=4096),
        )
        self.reg = nn.Linear(in_features=4096, out_features=4)

        self.clasific = nn.Linear(in_features=4096, out_features=49)



        # self._initialize_weights()

    def _make_layer(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        layers = [
            nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding),
            nn.LeakyReLU()
        ]
        return nn.Sequential(*layers)

    def _make_segment_3(self):
        return nn.Sequential(
            self._make_layer(192, 128, 1),
            self._make_layer(128, 256, 3),
            self._make_layer(256, 256, 1),
            self._make_layer(256, 512, 3, padding=2)
        )

    def _make_segment_4(self):
      layers = nn.ModuleList()

      for _ in range(4):
          layers.append(self._make_layer(512, 256, kernel_size=1, padding=0))
          layers.append(self._make_layer(256, 512, kernel_size=3, padding=0))

      layers.append(self._make_layer(512, 512, 1,padding=2))
      layers.append(self._make_layer(512, 1024, 3,padding=3))

      return nn.Sequential(*layers)

    def _make_segment_5(self):
      layers = nn.ModuleList()

      for _ in range(2):
        layers.append(self._make_layer(1024, 512, kernel_size=1))
        layers.append(self._make_layer(512, 1024, kernel_size=1))

      layers.append(self._make_layer(1024, 1024, kernel_size=3,padding=1))
      layers.append(self._make_layer(1024, 1024, kernel_size=3,stride=2,padding=1))
      return nn.Sequential(*layers)

    def _initialize_weights(self):
      for m in self.modules():
          if isinstance(m, nn.Conv2d):
              nn.init.normal_(m.weight, mean=0.0, std=0.05)  # Инициализация весов по нормальному распределению
              if m.bias is not None:
                  nn.init.zeros_(m.bias)
          elif isinstance(m, nn.Linear):
              nn.init.normal_(m.weight, mean=0.0, std=0.05)  # Инициализация весов по нормальному распределению
              nn.init.zeros_(m.bias)


    # def _transform(self,tenz):
    #   return tenz.view(self.batch_size,5)

    def detection(self, tensor):
      # Получаем размеры батча и количество ячеек
      batch_size = tensor.size(0)
      num_cells = tensor.size(1)

      # Извлекаем уверенность и координаты ячеек
      confidence = tensor[:, :, 0]  # уверенность
      x_cell = tensor[:, :, 1]  # координаты в ячейке x
      y_cell = tensor[:, :, 2]  # координаты в ячейке y
      width_cell = tensor[:, :, 3]  # ширина
      height_cell = tensor[:, :, 4]  # высота

      # Преобразование x и y из ячейки в координаты изображения
      # Пропорциональное распределение по сетке
      grid_x = (x_cell + (torch.arange(num_cells).unsqueeze(0) % 7).float()) * (448 / 7)
      grid_y = (y_cell + (torch.arange(num_cells).unsqueeze(0) // 7).float()) * (448 / 7)

      xmin = grid_x - (width_cell / 2)
      ymin = grid_y - (height_cell / 2)
      xmax = grid_x + (width_cell / 2)
      ymax = grid_y + (height_cell / 2)

      # Объединяем тензоры
      output_tensor = torch.cat((confidence.unsqueeze(1), xmin.unsqueeze(1), ymin.unsqueeze(1), xmax.unsqueeze(1), ymax.unsqueeze(1)), dim=1).permute(0, 2, 1)
      return output_tensor

    def forward(self, x):
        x = self.backbone(x)
        reg = self.reg(x)
        reg_1_2 = torch.sigmoid(self.reg(x)[:, :2])

        reg_3_4 = self.reg(x)[:, 2:]
        reg = torch.cat((reg_1_2, reg_3_4), dim=1)
        clasific = self.clasific(x)
        return reg, clasific

In [7]:
# Первое, проверьте доступность GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = YOLODetection(input_channels=3,S=7,B=1,C=0,batch_size=1).to(device)

In [8]:
print(model)

YOLODetection(
  (backbone): Sequential(
    (0): Sequential(
      (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
      (1): LeakyReLU(negative_slope=0.01)
    )
    (1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (2): Sequential(
      (0): Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): LeakyReLU(negative_slope=0.01)
    )
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Sequential(
        (0): Conv2d(192, 128, kernel_size=(1, 1), stride=(1, 1))
        (1): LeakyReLU(negative_slope=0.01)
      )
      (1): Sequential(
        (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
        (1): LeakyReLU(negative_slope=0.01)
      )
      (2): Sequential(
        (0): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
        (1): LeakyReLU(negative_slope=0.01)
      )
      (3): Sequential(
        (0): Conv2d(256, 512, kerne

In [12]:
def loss_mse_yolos(prediction_S, target_S,prediction_reg,target_reg,lamda_coord, lamda_noobj):
  target_index_x,target_index_y = target_S.nonzero(as_tuple=True)
  target_index_null_x,target_index_null_y = (target_S == 0).nonzero(as_tuple=True)

  # print(f'PREDICTION {target_reg[:,0]}')
  x = (prediction_reg[:,0] - target_reg[:,0])**2
  y = (prediction_reg[:,1] - target_reg[:,1])**2



  local_one = lamda_coord * (x + y)[target_index_x]


  width = (torch.sqrt(prediction_reg[:,2]) - torch.sqrt(target_reg[:,2]))**2
  height = (torch.sqrt(prediction_reg[:,3]) - torch.sqrt(target_reg[:,3]))**2
  local_two = lamda_coord * (width + height)[target_index_x]

  C = (prediction_S - target_S) ** 2
  confidence_one = C[target_index_x]
  confidence_two = lamda_noobj * (C[target_index_null_x])

  loss = torch.mean(local_one) + torch.mean(local_two) + torch.mean(confidence_one) + torch.mean(confidence_two)
  return loss

In [14]:

# criterion_clas = nn.MSE()
optimizer = optim.Adam(model.clasific.parameters(),lr=1e-2)

In [15]:
lamda_coord = 5
lamda_noobj = 0.5

model.train()
for _ in range(50):
    for img, _, label_reg, label_clas in data_train:
      # Обнуление градиентов
        optimizer.zero_grad()
        img = img.to(device)
        label_reg = label_reg.to(device).squeeze(1)
        label_clas = label_clas.to(device)
        output_reg, output_clas = model(img)
        loss = loss_mse_yolos(output_clas,label_clas,output_reg,label_reg,lamda_coord,lamda_noobj)
        # loss = loss_mse_yolo(output_clas,label_clas)
        loss.backward()
        optimizer.step()
        print(loss)

tensor(2924.11060, grad_fn=<AddBackward0>)
tensor(2380.58325, grad_fn=<AddBackward0>)
tensor(2597.07031, grad_fn=<AddBackward0>)
tensor(2642.97241, grad_fn=<AddBackward0>)
tensor(2699.28516, grad_fn=<AddBackward0>)
tensor(3015.30811, grad_fn=<AddBackward0>)
tensor(2190.00244, grad_fn=<AddBackward0>)
tensor(2891.07520, grad_fn=<AddBackward0>)
tensor(2803.61279, grad_fn=<AddBackward0>)
tensor(3049.77026, grad_fn=<AddBackward0>)
tensor(2718.31909, grad_fn=<AddBackward0>)
tensor(2508.51733, grad_fn=<AddBackward0>)
tensor(2904.29102, grad_fn=<AddBackward0>)
tensor(2557.73169, grad_fn=<AddBackward0>)
tensor(2692.32739, grad_fn=<AddBackward0>)
tensor(2434.38745, grad_fn=<AddBackward0>)
tensor(3061.08008, grad_fn=<AddBackward0>)
tensor(3066.53760, grad_fn=<AddBackward0>)
tensor(2729.12549, grad_fn=<AddBackward0>)
tensor(1640.86096, grad_fn=<AddBackward0>)
tensor(2720.44287, grad_fn=<AddBackward0>)
tensor(2426.24561, grad_fn=<AddBackward0>)
tensor(2851.08862, grad_fn=<AddBackward0>)
tensor(2703

KeyboardInterrupt: 

In [None]:
def loss_mse_yolos(prediction_S, target_S,prediction_reg,target_reg,lamda_coord, lamda_noobj):
  target_index_x,target_index_y = target_S.nonzero(as_tuple=True)
  target_index_null_x,target_index_null_y = (target_S == 0).nonzero(as_tuple=True)
  print(f'X {target_index_x}')
  print(f'Y {target_index_y}')

  print()

  # print(f'PREDICTION {target_reg[:,0]}')
  x = (prediction_reg[:,0] - target_reg[:,0])**2
  y = (prediction_reg[:,1] - target_reg[:,1])**2



  local_one = lamda_coord * (x + y)[target_index_x]


  width = (torch.sqrt(prediction_reg[:,2]) - torch.sqrt(target_reg[:,2]))**2
  height = (torch.sqrt(prediction_reg[:,3]) - torch.sqrt(target_reg[:,3]))**2
  local_two = lamda_coord * (width + height)[target_index_x]

  C = (prediction_S - target_S) ** 2
  confidence_one = C[target_index_x]
  confidence_two = lamda_noobj * (C[target_index_null_x])

  print(f'local_one {local_one.shape}')
  print(f'local_two {local_two.shape}')
  print(f'confidence_one {confidence_one.shape}')
  print(f'confidence_two {confidence_two.shape}')

  loss = torch.mean(local_one) + torch.mean(local_two) + torch.mean(confidence_one) + torch.mean(confidence_two)
  print(loss)
  return loss

In [17]:
lamda_coord = 5
lamda_noobj = 0.5

prediction_S = torch.tensor([[1., 0.,0.,0.,0.],
                             [1., 0.,0.,0.,0.],
                             [1., 0.,0.,0.,0.]])

target_S = torch.tensor([[1., 0.,1.,0.,0.],
                         [1., 0.,0.,0.,0.],
                         [1., 0.,0.,0.,0.]])

label_reg = torch.tensor([[0.6, 0.2, 110, 220],[0.4, 0.5, 150, 180],[0.4, 0.5, 150, 180]])

prediction_reg = torch.tensor([[0.4, 0.2, 110, 220],[0.6, 0.5, 150, 180],[0.4, 0.5, 150, 180]])

print(prediction_S.shape)
loss_mse_yolos(prediction_S,target_S,prediction_reg,label_reg,lamda_coord,lamda_noobj)

torch.Size([3, 5])


tensor(0.27727)