In [1]:
from PIL import Image
import os
import cv2
import numpy as np
import math

## PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F 
import torch.utils.data as data 
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

from torchvision import transforms, datasets, models
from torchvision.io import read_image, read_video

## Matplotlib
import matplotlib.pyplot as plt

## Yolo
from ultralytics import YOLO

In [2]:
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)


set_seed(42)

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

### Preparazione dei dati
Il dataset originario era composto da 26067 immagini, mentre per motivi logistici in questo notebook è stato ridotto a 9014 immagini e 11 classi (invece delle 32 originarie). Per carciare le immagini usiamo la funzionalità `ImageLoader` presente in Pytorch che permette di estrarre tutte le immagini presenti in una cartella (e nelle sue sottocartelle) e assegna come lable a tali immagini il nome della cartella stessa.

In [3]:
dataset_path = "../archive"

class ImageFolderWithIndices(datasets.ImageFolder):
    def __getitem__(self, index):
        
        image, label = super().__getitem__(index)

        path = self.imgs[index][0]
        filename = os.path.basename(path)

        str_num = ''
        for elem in filename:
            if elem.isdigit():
                str_num += elem

        num = (int) (str_num)
    
        return image, label, num
    
dataset = ImageFolderWithIndices(root=dataset_path)
#print(dataset.classes)
#print(len(dataset))

'''
    Il dataset viene suddiviso nel modo seguente:
        - 80% training_set
        - 10% validation_set
        - 10% test_set
'''

train_set, val_set, test_set = torch.utils.data.random_split(dataset, [7212, 901, 901])


#### Estrazione dei bounding boxes

In [5]:
''' 
bbox = [x_top_left, y_top_left, width, height, index], dove index è l'indice dell'immagine nella cartella in cui è salvata,
questo serve per ricreare la corrispondenza fra immagine e lable
'''

bounding_boxes = []

entries = sorted(os.listdir(dataset_path))
for folder in entries:
    sub_directory = os.path.join(dataset_path, folder)
    tensor_list = []

    for filename in os.listdir(sub_directory):
        if filename.endswith("groundtruth_rect.txt"):
            path = os.path.join(sub_directory, filename)
            f = open(path, "r")
            lines = f.readlines()
            len_file = len(lines)

            for i in range(len_file):
                line = lines[i].split()
                floates = [float(x) for x in line]
                index = (float)(i)
                floates.append(index)
                coordinates = torch.tensor(floates)
                tensor_list.append(coordinates)
        
    bounding_boxes.append(tensor_list)


#### Preparazione del dataset per l'addestramento di YOLOv8 e ResNet18

In [6]:
dataset_dir = "datasets/dataset"
resnet_dir = "resnet_dir"

# creazione delle directory per Yolov8

image_dirs = {
    "train" : os.path.join(dataset_dir, "images/train"),
    "val": os.path.join(dataset_dir, "images/val"),
    "test": os.path.join(dataset_dir, "images/test")
}

label_dirs = {
    "train": os.path.join(dataset_dir, "labels/train"),
    "val": os.path.join(dataset_dir, "labels/val"),
    "test": os.path.join(dataset_dir, "labels/test")
}

# Creazione delle directory per ResNet50

dirs = {
    "train" : os.path.join(resnet_dir, "train"),
    "val" : os.path.join(resnet_dir, "val"),
    "test" : os.path.join(resnet_dir, "test")
}

# Creazione delle cartelle

for dir_path in image_dirs.values():
    os.makedirs(dir_path, exist_ok=True)
for dir_path in label_dirs.values():
    os.makedirs(dir_path, exist_ok=True)
for dir_path in dirs.values():
    os.makedirs(dir_path, exist_ok=True)

# Salvataggio delle delle immagini e dei bounding boxes

def save_images_and_labels(data, dataset_type):
    image_dir = image_dirs[dataset_type]
    label_dir = label_dirs[dataset_type]
    cropped_image_dir = dirs[dataset_type]

    for image, label, index in data:
        img_save_path = os.path.join(image_dir, f"{index}_{label}.jpg")
        image.save(img_save_path)

        width = image.width
        height = image.height

        bounding_box = bounding_boxes[label][index-1]
        bbox_width = bounding_box[2]
        bbox_height = bounding_box[3]
        
        class_id = label

        label_save_path = os.path.join(label_dir, f"{index}_{label}.txt")
        with open(label_save_path, 'w') as f:

            # normalizziamo il bbox

            x_center = (bounding_box[0] + bbox_width/2) / width
            y_center = (bounding_box[1] + bbox_height/2) / height
            b_width = bbox_width / width
            b_height = bbox_height / height

            f.write(f"{class_id} {x_center} {y_center} {b_width} {b_height}\n")

        # Ritagliamo le immagini tenendoci solo la parte individuata dai bounding boxes

        bbox_width = float(bbox_width)
        bbox_height = float(bbox_height)
        
        x_min, y_min = round(float(bounding_box[0]), 1), round(float(bounding_box[1]), 1)
        x_max = round(float(x_min + bbox_width), 1)
        y_max = round(float(y_min + bbox_height), 1)

        # Check che verifica che non si esca dai bordi dell'immagine o che non ci siano bounding boxes con 
        # altezza o spessore nullo
        if x_min < 0 or y_min < 0 or x_max >= width or y_max >= height or x_max - x_min <= 0 or y_max - y_min <= 0:
            continue
        
        cropped_image = image.crop((x_min, y_min, x_max, y_max))
        cropped_save_path = os.path.join(cropped_image_dir, f"{index}_{label}.jpg")
        cropped_image.save(cropped_save_path)

        

In [7]:
save_images_and_labels(train_set, "train")
save_images_and_labels(test_set, "test")
save_images_and_labels(val_set, "val")

### Yolov8

In [23]:
model = YOLO("best.pt")

In [None]:
result = model.train(data="file.yaml", epochs=15, batch=32)

In [None]:
results = model.predict("datasets/dataset/images/test/10_7.jpg", save=False, show=True)

#### Tracking con Yolov8

In [None]:
res1 = model.track(source="Tracking_test/Ballena/Ballena.mp4", conf=0.3, iou=0.6, show=True)

In [None]:
res2 = model.track(source="Tracking_test/SeaDiver/SeaDiver.mp4", conf=0.3, iou=0.5, show=True)

In [None]:
res3 = model.track(source="Tracking_test/Octopus2/Octopus2.mp4", conf=0.3, iou=0.5, show=True)

#### Addestramento di ResNet50 per l'estrazione delle feature dai bounding boxes

Nelle prime due celle di codice estraiamo le immagini per al fine di calcolare la loro media e deviazione standard per poterle normalizzare in modo che complessivamente la media e la deviazione standard su tutto il dataset siano approssimativamente 0 e 1.

In [8]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

tmp_dataset = datasets.ImageFolder(root=resnet_dir, transform=transform)

In [97]:
# Andiamo a calcolare la media e la deviazione standard per 
# poter poi normalizzare le immagini

dataset_size = len(tmp_dataset)

DATA_MEAN = 0.0
var = 0.0

for i in range(dataset_size):
    image, _ = tmp_dataset[i]
    DATA_MEAN += image.mean(dim=(1, 2))
    var += image.var(dim=(1, 2))
    

DATA_MEAN /= dataset_size
DATA_STD = torch.sqrt(var/dataset_size)

print(f"Mean: {DATA_MEAN}")
print(f"std: {DATA_STD}")

""" 
Mean: tensor([0.2522, 0.4443, 0.4805])
std: tensor([0.1188, 0.1406, 0.1375])
"""

Mean: tensor([0.2522, 0.4443, 0.4805])
std: tensor([0.1188, 0.1406, 0.1375])


In [4]:
test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.2522, 0.4443, 0.4805], [0.1188, 0.1406, 0.1375])
])

train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.2522, 0.4443, 0.4805], [0.1188, 0.1406, 0.1375])
])

train_dataset_path = "resnet_dir/train1/train"
val_dataset_path = "resnet_dir/val1/val"
test_dataset_path = "resnet_dir/test1/test"

class CostumImageDataset(Dataset):
    def __init__(self, image_folder, transform=None):
        self.image_folder = image_folder
        self.image_filenames = os.listdir(image_folder)
        self.transform = transform

    def __len__(self):
        return len(self.image_filenames)
    
    def __getitem__(self, idx):
        img_path = os.path.join(self.image_folder, self.image_filenames[idx])
        image = Image.open(img_path)
        filename = self.image_filenames[idx].split('_')
        category = filename[1].split('.')
        label = int(category[0])

        if self.transform:
            image = self.transform(image)
        
        return (image, label)
    

train_dataset = CostumImageDataset(train_dataset_path, train_transform)
val_dataset = CostumImageDataset(val_dataset_path, test_transform)
test_dataset = CostumImageDataset(val_dataset_path, test_transform)

train_loader = data.DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = data.DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = data.DataLoader(test_dataset, batch_size=64, shuffle=False)

"""

model.fc = nn.Identity()

features = model(image)

"""


'\n\nmodel.fc = nn.Identity()\n\nfeatures = model(image)\n\n'

In [20]:
imgs, _ = next(iter(train_loader))
print("Batch mean", imgs.mean(dim=[0,2,3]))
print("Batch std", imgs.std(dim=[0,2,3]))

Batch mean tensor([-0.1059, -0.0629,  0.0056])
Batch std tensor([1.3584, 1.3395, 1.6535])


In [None]:
def train_resnet18(model, optimizer, data_loader, loss_module, num_epochs):
    model.train()

    # Parallelize training accross multiple GPUs
    # model = torch.nn.DataParallel(model)

    loss = 0.0
    
    for epoch in range(num_epochs):
        for inputs, labels in data_loader:
            # Questo passaggio è strettamente necessrio solo se si usa una gpu
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Forward pass
            preds = model(inputs)
            preds = preds.squeeze(dim=1)

            loss = loss_module(preds, labels)

            # Prima di calcolare i gradienti ci assicuriamo che siano tutti zero
            optimizer.zero_grad()

            # Backward pass
            loss.backward()

            # Aggiornamento dei parametri
            optimizer.step()

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss}')


In [None]:
resnet_model = models.resnet18(pretrained=True)
num_ftrs = resnet_model.fc.in_features
resnet_model.fc = nn.Linear(num_ftrs, 11)
resent_model = resnet_model.to(device)

loss_module = nn.CrossEntropyLoss()
optimizer = optim.SGD(resnet_model.parameters(), lr=0.001, momentum=0.9)
num_epochs = 50
batch_size = 32

train_resnet18(resnet_model, optimizer, train_loader, loss_module, num_epochs)

#### Salvataggio del modello

In [None]:
state_dict = resnet_model.state_dict()
torch.save(state_dict, "my_resnet_model.pt")

"""
# Specify a path
PATH = "state_dict_model.pt"

# Save
torch.save(net.state_dict(), PATH)

# Load
model = Net()
model.load_state_dict(torch.load(PATH, weights_only=True))
model.eval()
"""

# Salva l'intero modello
torch.save(resnet_model, 'entire_model.pt')

#### Valuatazione del modello

In [10]:
def evaluate_model(model, data, loss_module):

    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in data:
            
            #inputs = inputs.to(device)
            #labels = labels.to(device)

            preds = model(inputs)
            loss = loss_module(preds, labels)

            val_loss += loss.item()
            _, predicted = preds.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    val_loss /= len(val_loader)
    accuracy = 100. * correct / total

    return accuracy


In [12]:
model = models.resnet18()
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 11)
model.load_state_dict(torch.load("my_resnet_model.pt", map_location='cpu'))
model.eval()
loss_module = nn.CrossEntropyLoss()

acc = evaluate_model(model, test_loader, loss_module)

print("accuarcy: ", acc)

accuarcy:  100.0


### DeepSORT