## Import delle librerie

In [30]:
import json
import os
import torch
import random
import xml.etree.ElementTree as ET
import torchvision.transforms.functional as FT

import torch
from tqdm import tqdm
from pprint import PrettyPrinter


## Path

In [31]:
# path del dataset
base_dict = '/kaggle/input/our-xview-dataset'

# path della cartella contenente le immagini
img_dict = '/kaggle/input/our-xview-dataset/images'

# path dei file .txt da utilizzare per prelevare rispettivamente le immagini per il train, la validation e il test
# MANCANO -> servono per provare il codice
train_img_path = '/kaggle/input/our-xview-dataset/YOLO_cfg/train.txt' # file contenete i path delle immagini del dataset di train
val_img_path = '/kaggle/input/our-xview-dataset/YOLO_cfg/val.txt'
test_img_path = '/kaggle/input/our-xview-dataset/YOLO_cfg/test.txt'

# path contenente le annotazioni in formato .json
annotations = os.path.join(base_dict, 'COCO_annotations_new.json') # non va bene il file !
class_map = os.path.join(base_dict, 'xView_class_map.json') 

# path di output
output_folder = '/kaggle/working/'

# path file per il training
train_image = os.path.join(output_folder, 'TRAIN_images.json')
train_bbox = os.path.join(output_folder, 'TRAIN_objects.json')
train_label = os.path.join(output_folder, 'TRAIN_label_map.json')

# path file per la validation
val_image = os.path.join(output_folder, 'VAL_images.json')
val_bbox = os.path.join(output_folder, 'VAL_objects.json')
val_label = os.path.join(output_folder, 'VAL_label_map.json')

# path file per il test
test_image = os.path.join(output_folder, 'TEST_images.json')
test_bbox = os.path.join(output_folder, 'TEST_objects.json')
test_label = os.path.join(output_folder, 'TEST_label_map.json')

checkpoint_path = './checkpoint_ssd300.pth.tar'

In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#cudnn.benchmark = True

## Pre-elaborazione del dataset

### creazione del file per reperire gli elementi del train e della validation

def create_data_lists(img_dict, train_img_path, val_img_path, test_img_path, output_folder): # VA MODIFICATA IN BASE AL DATASET
    """
    Create lists of images, the bounding boxes and labels of the objects in these images, and save these to file.
    - param output_folder: folder where the JSONs must be saved
    """
    
    # TRAIN
    train_images = list() #lista per la memorizzazione di tutte le immagini presenti nel dataset di train
    train_objects = list() # lista di bounding_boxes 
    train_label_map = label_map() #richiama la funzione label_map()
    n_objects = 0 # numero di bounding boxes nel train

    # Training data
    # Find IDs of images in training data
    with open(train_img_path) as f:
        ids = f.read().splitlines()

    for id in ids: #ricavo per ogni immagine nel set di training la lista di bbox con le corrispondenti labels
        # VA CAMBIATO IN BASE AL DATASET
        objects = parse_annotation(os.path.join(img_dict, 'Annotations', id + '.xml')) # richiama la funzione parse annotation -> vedi dopo
        if len(objects['boxes']) == 0: #verifico che ci siano bbox nell'immagine altrimenti passo alla prossima immagine
            continue
        n_objects += len(objects) 
        #inserisco i bbox e le labels nelle liste preposte
        train_objects.append(objects)
        train_images.append(os.path.join(img_dict, 'JPEGImages', id + '.jpg'))

    assert len(train_objects) == len(train_images)

    # Save to file
    with open(train_image, 'w') as j:
        json.dump(train_images, j)
    with open(train_bbox, 'w') as j:
        json.dump(train_objects, j)
    with open(train_label, 'w') as j:
        json.dump(train_label_map, j)  # save label map too

    print('\nThere are %d training images containing a total of %d objects. Files have been saved to %s.' % (
        len(train_images), n_objects, os.path.abspath(output_folder)))


    # VALIDATION
    val_images = list() #lista per la memorizzazione di tutte le immagini presenti nel dataset di train
    val_objects = list() # lista di bounding_boxes 
    val_label_map = label_map() #richiama la funzione label_map()
    n_objects = 0 # numero di bounding boxes nel train
    
    # Validation data
    # Find IDs of images in training data
    with open(val_img_path) as f:
        ids = f.read().splitlines()

    for id in ids: #ricavo per ogni immagine nel set di training la lista di bbox con le corrispondenti labels
        # VA CAMBIATO IN BASE AL DATASET
        objects = parse_annotation(os.path.join(img_dict, 'Annotations', id + '.xml'))
        if len(objects['boxes']) == 0: #verifico che ci siano bbox nell'immagine altrimenti passo alla prossima immagine
            continue
        n_objects += len(objects) 
        #inserisco i bbox e le labels nelle liste preposte
        val_objects.append(objects)
        val_images.append(os.path.join(img_dict, 'JPEGImages', id + '.jpg'))

    assert len(val_objects) == len(val_images)

    # Save to file
    with open(val_image, 'w') as j:
        json.dump(val_images, j)
    with open(val_bbox, 'w') as j:
        json.dump(val_objects, j)
    with open(val_label, 'w') as j:
        json.dump(val_label_map, j)  # save label map too

    print('\nThere are %d training images containing a total of %d objects. Files have been saved to %s.' % (
        len(train_images), n_objects, os.path.abspath(output_folder)))


    # TEST
    test_images = list()
    test_objects = list()
    n_objects = 0

    # Find IDs of images in the test data
    with open(os.path.join(test_img_path, 'ImageSets/Main/test.txt')) as f:
        ids = f.read().splitlines()

    for id in ids:
        # VA CAMBIATO IN BASE AL DATASET
        objects = parse_annotation(os.path.join(test_img_path, 'Annotations', id + '.xml'))
        if len(objects) == 0:
            continue
        test_objects.append(objects)
        n_objects += len(objects)
        test_images.append(os.path.join(test_img_path, 'JPEGImages', id + '.jpg'))

    assert len(test_objects) == len(test_images)

    # Save to file
    with open(test_image, 'w') as j:
        json.dump(test_images, j)
    with open(test_bbox, 'w') as j:
        json.dump(test_objects, j)

    print('\nThere are %d test images containing a total of %d objects. Files have been saved to %s.' % (
        len(test_images), n_objects, os.path.abspath(output_folder)))


In [33]:
def create_data_lists(img_dict, annotations_path, train_img_path, val_img_path, test_img_path, output_folder):
    """
    Create lists of images, bounding boxes, and labels, and save them to files.
    - param img_dict: Directory containing the images.
    - param annotations_path: Path to the JSON file with annotations.
    - param train_img_path: Path to file with IDs of training images.
    - param val_img_path: Path to file with IDs of validation images.
    - param test_img_path: Path to file with IDs of test images.
    - param output_folder: Folder where the JSONs must be saved.
    """

    # Load the annotations JSON
    with open(annotations_path, 'r') as f:
        annotations_data = json.load(f)

    def create_annotations_dict(data):
        """
        Organizza le annotazioni in un dizionario basato sull'image_id.
    
        :param data: Dizionario contenente "images" e "annotations".
        :return: Dizionario con image_id come chiave e lista di annotazioni come valore.
        """
        # Verifica che il formato dei dati sia corretto
        if "annotations" not in data or not isinstance(data["annotations"], list):
            raise ValueError("Il file JSON non contiene una chiave 'annotations' valida.")
        
        # Creazione del dizionario organizzato per image_id
        annotations_dict = {}
        for annotation in data["annotations"]:
            image_id = annotation["image_id"]  # Estrai l'image_id
            if image_id not in annotations_dict:
                annotations_dict[image_id] = []  # Inizializza una lista per il nuovo image_id
            annotations_dict[image_id].append(annotation)  # Aggiungi l'annotazione alla lista
    
        return annotations_dict


    # Convert annotations data into a dictionary for faster access
    annotations_dict = {str(item['id']): item for item in annotations_data}
    #annotations_for_image = annotations_dict.get(image_id, [])

    def process_image_set(image_ids_path):
        """
        Process a set of images (train, val, or test).
        """
        images = []
        objects = []
        n_objects = 0

        # Read image IDs
        with open(image_ids_path, 'r') as f: # da modificare perchè i path sono nel formato /kaggle/working/images/img_886_2240_2240.jpg e non /kaggle/input/our-xview-dataset/images/img_100_0_0.jpg 
            image_id = f.read().splitlines()
            image_ids = [line.strip().replace('/kaggle/working/images/', '/kaggle/input/our-xview-dataset/images/') for line in image_id]

        for image_id in image_ids:
            # Parse annotations for this image
            image_id = image_id
            annotation = parse_annotation(image_id, annotations_dict)

            # Skip images without bounding boxes
            if len(annotation['boxes']) == 0:
                continue

            # Add image path and annotations
            file_name = annotations_dict[str(image_id)]['file_name']
            images.append(os.path.join(img_dict, file_name))
            objects.append(annotation)
            n_objects += len(annotation['boxes'])

        return images, objects, n_objects

    # Process TRAINING data
    train_images, train_objects, train_n_objects = process_image_set(train_img_path)
    with open(os.path.join(output_folder, 'TRAIN_images.json'), 'w') as j:
        json.dump(train_images, j)
    with open(os.path.join(output_folder, 'TRAIN_objects.json'), 'w') as j:
        json.dump(train_objects, j)

    print(f'\nThere are {len(train_images)} training images containing a total of {train_n_objects} objects. '
          f'Files have been saved to {os.path.abspath(output_folder)}.')

    # Process VALIDATION data
    val_images, val_objects, val_n_objects = process_image_set(val_img_path)
    with open(os.path.join(output_folder, 'VAL_images.json'), 'w') as j:
        json.dump(val_images, j)
    with open(os.path.join(output_folder, 'VAL_objects.json'), 'w') as j:
        json.dump(val_objects, j)

    print(f'\nThere are {len(val_images)} validation images containing a total of {val_n_objects} objects. '
          f'Files have been saved to {os.path.abspath(output_folder)}.')

    # Process TEST data
    test_images, test_objects, test_n_objects = process_image_set(test_img_path)
    with open(os.path.join(output_folder, 'TEST_images.json'), 'w') as j:
        json.dump(test_images, j)
    with open(os.path.join(output_folder, 'TEST_objects.json'), 'w') as j:
        json.dump(test_objects, j)

    print(f'\nThere are {len(test_images)} test images containing a total of {test_n_objects} objects. '
          f'Files have been saved to {os.path.abspath(output_folder)}.')

## Conversione label/classi numeriche e viceversa

### associazione colori differenti ai bounding box relativi a classi differenti

In [34]:
def label_map():
    """
    Create a label map for the dataset, mapping class names to unique numeric identifiers.
    """
    labels = [
        "Fixed-wing Aircraft", "Small Aircraft", "Passenger/Cargo Plane", "Helicopter",
        "Passenger Vehicle", "Small Car", "Bus", "Pickup Truck", "Utility Truck", "Truck",
        "Cargo Truck", "Truck Tractor w/ Box Trailer", "Truck Tractor", "Trailer",
        "Truck Tractor w/ Flatbed Trailer", "Truck Tractor w/ Liquid Tank", "Crane Truck",
        "Railway Vehicle", "Passenger Car", "Cargo/Container Car", "Flat Car", "Tank car",
        "Locomotive", "Maritime Vessel", "Motorboat", "Sailboat", "Tugboat", "Barge",
        "Fishing Vessel", "Ferry", "Yacht", "Container Ship", "Oil Tanker", "Engineering Vehicle",
        "Tower crane", "Container Crane", "Reach Stacker", "Straddle Carrier", "Mobile Crane",
        "Dump Truck", "Haul Truck", "Scraper/Tractor", "Front loader/Bulldozer", "Excavator",
        "Cement Mixer", "Ground Grader", "Hut/Tent", "Shed", "Building", "Aircraft Hangar",
        "Damaged Building", "Facility", "Construction Site", "Vehicle Lot", "Helipad",
        "Storage Tank", "Shipping container lot", "Shipping Container", "Pylon", "Tower"
    ]
    
    # Generate a mapping from label names to unique numeric identifiers
    return {label: idx + 1 for idx, label in enumerate(labels)}


def rev_label_msp():
    label_map['background'] = 0 # classe dello sfondo
    rev_label_map = {v: k for k, v in label_map.items()}  # ricavo le lable dalle etichette numeriche
    return rev_label_map

def label_color_map():
    # Color map for bounding boxes of detected objects from https://sashat.me/2017/01/11/list-of-20-simple-distinct-colors/
    distinct_colors = ['#e6194b', '#3cb44b', '#ffe119', '#0082c8', '#f58231', '#911eb4', '#46f0f0', '#f032e6',
                   '#d2f53c', '#fabebe', '#008080', '#000080']
    label_color_map = {k: distinct_colors[i] for i, k in enumerate(label_map.keys())}
    return label_color_map
    

## Pre-elaborazione delle immagini

In [35]:
def resize(image, boxes, dims=(300, 300), return_percent_coords=True):
    """
    Resize image. For the SSD300, resize to (300, 300).

    Since percent/fractional coordinates are calculated for the bounding boxes (w.r.t image dimensions) in this process,
    you may choose to retain them.

    :param image: image, a PIL Image
    :param boxes: bounding boxes in boundary coordinates, a tensor of dimensions (n_objects, 4)
    :return: resized image, updated bounding box coordinates (or fractional coordinates, in which case they remain the same)
    """
    # Resize image
    new_image = FT.resize(image, dims)

    # Resize bounding boxes
    old_dims = torch.FloatTensor([image.width, image.height, image.width, image.height]).unsqueeze(0)
    new_boxes = boxes / old_dims  # percent coordinates

    if not return_percent_coords:
        new_dims = torch.FloatTensor([dims[1], dims[0], dims[1], dims[0]]).unsqueeze(0)
        new_boxes = new_boxes * new_dims

    return new_image, new_boxes


In [36]:
def transform(image, boxes, labels):
    """
    Apply the transformations above.

    :param image: image, a PIL Image
    :param boxes: bounding boxes in boundary coordinates, a tensor of dimensions (n_objects, 4)
    :param labels: labels of objects, a tensor of dimensions (n_objects)
    :param difficulties: difficulties of detection of these objects, a tensor of dimensions (n_objects)
    :param split: one of 'TRAIN' or 'TEST', since different sets of transformations are applied
    :return: transformed image, transformed bounding box coordinates, transformed labels, transformed difficulties
    """

    # Mean and standard deviation of ImageNet data that our base VGG from torchvision was trained on
    # see: https://pytorch.org/docs/stable/torchvision/models.html
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    new_image = image
    new_boxes = boxes
    new_labels = labels

    # Resize image to (300, 300) - this also converts absolute boundary coordinates to their fractional form
    new_image, new_boxes = resize(new_image, new_boxes, dims=(300, 300))

    # Convert PIL image to Torch tensor
    new_image = FT.to_tensor(new_image)

    # Normalize by mean and standard deviation of ImageNet data that our base VGG was trained on
    new_image = FT.normalize(new_image, mean=mean, std=std)

    return new_image, new_boxes, new_labels
    

## Gestione dei bounding boxes

In [37]:
def xy_to_cxcy(xy):
    """
    Convert bounding boxes from boundary coordinates (x_min, y_min, x_max, y_max) to center-size coordinates (c_x, c_y, w, h).

    :param xy: bounding boxes in boundary coordinates, a tensor of size (n_boxes, 4)
    :return: bounding boxes in center-size coordinates, a tensor of size (n_boxes, 4)
    """
    return torch.cat([(xy[:, 2:] + xy[:, :2]) / 2,  # c_x, c_y
                      xy[:, 2:] - xy[:, :2]], 1)  # w, h


def cxcy_to_xy(cxcy):
    """
    Convert bounding boxes from center-size coordinates (c_x, c_y, w, h) to boundary coordinates (x_min, y_min, x_max, y_max).

    :param cxcy: bounding boxes in center-size coordinates, a tensor of size (n_boxes, 4)
    :return: bounding boxes in boundary coordinates, a tensor of size (n_boxes, 4)
    """
    return torch.cat([cxcy[:, :2] - (cxcy[:, 2:] / 2),  # x_min, y_min
                      cxcy[:, :2] + (cxcy[:, 2:] / 2)], 1)  # x_max, y_max


In [38]:
def cxcy_to_gcxgcy(cxcy, priors_cxcy):
    """
    Encode bounding boxes (that are in center-size form) w.r.t. the corresponding prior boxes (that are in center-size form).

    For the center coordinates, find the offset with respect to the prior box, and scale by the size of the prior box.
    For the size coordinates, scale by the size of the prior box, and convert to the log-space.

    In the model, we are predicting bounding box coordinates in this encoded form.

    :param cxcy: bounding boxes in center-size coordinates, a tensor of size (n_priors, 4)
    :param priors_cxcy: prior boxes with respect to which the encoding must be performed, a tensor of size (n_priors, 4)
    :return: encoded bounding boxes, a tensor of size (n_priors, 4)
    """

    # The 10 and 5 below are referred to as 'variances' in the original Caffe repo, completely empirical
    # They are for some sort of numerical conditioning, for 'scaling the localization gradient'
    # See https://github.com/weiliu89/caffe/issues/155
    return torch.cat([(cxcy[:, :2] - priors_cxcy[:, :2]) / (priors_cxcy[:, 2:] / 10),  # g_c_x, g_c_y
                      torch.log(cxcy[:, 2:] / priors_cxcy[:, 2:]) * 5], 1)  # g_w, g_h


def gcxgcy_to_cxcy(gcxgcy, priors_cxcy):
    """
    Decode bounding box coordinates predicted by the model, since they are encoded in the form mentioned above.

    They are decoded into center-size coordinates.

    This is the inverse of the function above.

    :param gcxgcy: encoded bounding boxes, i.e. output of the model, a tensor of size (n_priors, 4)
    :param priors_cxcy: prior boxes with respect to which the encoding is defined, a tensor of size (n_priors, 4)
    :return: decoded bounding boxes in center-size form, a tensor of size (n_priors, 4)
    """

    return torch.cat([gcxgcy[:, :2] * priors_cxcy[:, 2:] / 10 + priors_cxcy[:, :2],  # c_x, c_y
                      torch.exp(gcxgcy[:, 2:] / 5) * priors_cxcy[:, 2:]], 1)  # w, h


In [39]:
def parse_annotation(image_file_name, data):
    """
    Extract bounding boxes and labels for a given image file name from JSON annotations.

    :param image_file_name: Path or name of the image file (e.g., "/path/to/img_2355_0_640.jpg").
    :param data: Dictionary containing both 'images' and 'annotations'.
    :return: Dictionary with 'boxes' and 'labels' for the given image file name.
    """
    # Step 1: Extract the base name of the file (remove path)
    base_name = os.path.basename(image_file_name)

    # Step 2: Convert the base name to corresponding image ID
    # Remove "img_" and "_" to create a numeric string, then convert to integer
    try:
        image_id = int(base_name.replace("img_", "").replace("_", "").replace(".jpg", ""))
    except ValueError:
        raise ValueError(f"Invalid image file name format: {image_file_name}")

    #print(image_id)

    # Step 3: Find the corresponding image entry in 'images'
    image_entry = None
    for image in data.get("images", []):
        if image["id"] == image_id:
            image_entry = image
            print(image)
            break

    #print(image_entry)

    if not image_entry:
        return {'boxes': [], 'labels': []}  # No matching image found

    # Step 4: Find all annotations linked to the image ID
    annotations = [
        ann for ann in data.get("annotations", [])
        if ann["image_id"] == image_id
    ]

    print(annotations)

    if not annotations:
        return {'boxes': [], 'labels': []}  # No annotations for this image

    # Step 5: Extract bounding boxes and category labels
    boxes = [eval(ann["bbox"]) for ann in annotations]  # Use eval to convert string to list
    labels = [ann["category_id"] for ann in annotations]

    return {'boxes': boxes, 'labels': labels}


In [40]:
create_data_lists(img_dict, annotations, train_img_path, val_img_path, test_img_path, output_folder)

TypeError: string indices must be integers

### per la gestione dell'interazione tra più bounding boxes

In [None]:
def find_intersection(set_1, set_2):
    """
    Find the intersection of every box combination between two sets of boxes that are in boundary coordinates.

    :param set_1: set 1, a tensor of dimensions (n1, 4)
    :param set_2: set 2, a tensor of dimensions (n2, 4)
    :return: intersection of each of the boxes in set 1 with respect to each of the boxes in set 2, a tensor of dimensions (n1, n2)
    """

    # PyTorch auto-broadcasts singleton dimensions
    lower_bounds = torch.max(set_1[:, :2].unsqueeze(1), set_2[:, :2].unsqueeze(0))  # (n1, n2, 2)
    upper_bounds = torch.min(set_1[:, 2:].unsqueeze(1), set_2[:, 2:].unsqueeze(0))  # (n1, n2, 2)
    intersection_dims = torch.clamp(upper_bounds - lower_bounds, min=0)  # (n1, n2, 2)
    return intersection_dims[:, :, 0] * intersection_dims[:, :, 1]  # (n1, n2)


def find_jaccard_overlap(set_1, set_2):
    """
    Find the Jaccard Overlap (IoU) of every box combination between two sets of boxes that are in boundary coordinates.

    :param set_1: set 1, a tensor of dimensions (n1, 4)
    :param set_2: set 2, a tensor of dimensions (n2, 4)
    :return: Jaccard Overlap of each of the boxes in set 1 with respect to each of the boxes in set 2, a tensor of dimensions (n1, n2)
    """

    # Find intersections
    intersection = find_intersection(set_1, set_2)  # (n1, n2)

    # Find areas of each box in both sets
    areas_set_1 = (set_1[:, 2] - set_1[:, 0]) * (set_1[:, 3] - set_1[:, 1])  # (n1)
    areas_set_2 = (set_2[:, 2] - set_2[:, 0]) * (set_2[:, 3] - set_2[:, 1])  # (n2)

    # Find the union
    # PyTorch auto-broadcasts singleton dimensions
    union = areas_set_1.unsqueeze(1) + areas_set_2.unsqueeze(0) - intersection  # (n1, n2)

    return intersection / union  # (n1, n2)


## Dataloader

In [None]:
class CustomDataset(Dataset): # da modificare in base al dataset
    """
    A PyTorch Dataset class to be used in a PyTorch DataLoader to create batches.
    """

    def __init__(self, path_image, path_bbox, aug=False):
        """
        :param data_folder: folder where data files are stored
        :param split: split, one of 'TRAIN' or 'TEST'
        :param keep_difficult: keep or discard objects that are considered difficult to detect?
        """

        self.data_folder = data_folder
        self.aug = aug # è presente ma in realtà non serve perchè applico lo stesso tipo di trasformazione per tutti i set di dati

        # Read data files
        with open(path_image, 'r') as j:
            self.images = json.load(j)
        with open(path_bbox, 'r') as j:
            self.objects = json.load(j)

        assert len(self.images) == len(self.objects)

    def __getitem__(self, i):
        # Read image
        image = Image.open(self.images[i], mode='r') 
        image = image.convert('RGB')

        # Read objects in this image (bounding boxes, labels, difficulties)
        objects = self.objects[i] # dal file bbox rixhavo i bounding box presenti nell'immagine con le relative label
        boxes = torch.FloatTensor(objects['boxes'])  # (n_objects, 4)
        labels = torch.LongTensor(objects['labels'])  # (n_objects)

        boxes = boxes[1 - difficulties]
        labels = labels[1 - difficulties]

        # Apply transformations
        image, boxes, labels = transform(image, boxes, labels)

        return image, boxes, labels

    def __len__(self):
        return len(self.images)


In [None]:
def collate_fn(self, batch):
        """
        Since each image may have a different number of objects, we need a collate function (to be passed to the DataLoader).

        This describes how to combine these tensors of different sizes. We use lists.

        Note: this need not be defined in this Class, can be standalone.

        :param batch: an iterable of N sets from __getitem__()
        :return: a tensor of images, lists of varying-size tensors of bounding boxes, labels, and difficulties
        """

        images = list()
        boxes = list()
        labels = list()

        for b in batch:
            images.append(b[0])
            boxes.append(b[1])
            labels.append(b[2])

        images = torch.stack(images, dim=0)

        return images, boxes, labels  # tensor (N, 3, 300, 300), 3 lists of N tensors each

In [None]:
train_dataset = CustomDataset(train_image, train_bbox)
val_dataset = CustomDataset(val_image, val_bbox)
test_dataset = CustomDataset(test_image, test_bbox)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, 
                                               collate_fn=train_dataset.collate_fn, num_workers=num_workers, pin_memory=True)
val_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, 
                                             collate_fn=train_dataset.collate_fn, num_workers=num_workers, pin_memory=True)
test_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, 
                                              collate_fn=train_dataset.collate_fn, num_workers=num_workers, pin_memory=True)


## Model

fare l'import da GitHub https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection/blob/master/model.py

## Training

def adjust_learning_rate(optimizer, scale):
    """
    Scale learning rate by a specified factor.

    :param optimizer: optimizer whose learning rate must be shrunk.
    :param scale: factor to multiply learning rate with.
    """
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * scale
    print("DECAYING learning rate.\n The new LR is %f\n" % (optimizer.param_groups[1]['lr'],))


def accuracy(scores, targets, k):
    """
    Computes top-k accuracy, from predicted and true labels.

    :param scores: scores from the model
    :param targets: true labels
    :param k: k in top-k accuracy
    :return: top-k accuracy
    """
    batch_size = targets.size(0)
    _, ind = scores.topk(k, 1, True, True)
    correct = ind.eq(targets.view(-1, 1).expand_as(ind))
    correct_total = correct.view(-1).float().sum()  # 0D tensor
    return correct_total.item() * (100.0 / batch_size)


def save_checkpoint(epoch, model, optimizer):
    """
    Save model checkpoint.

    :param epoch: epoch number
    :param model: model
    :param optimizer: optimizer
    """
    state = {'epoch': epoch,
             'model': model,
             'optimizer': optimizer}
    filename = 'checkpoint_ssd300.pth.tar'
    torch.save(state, filename)

def clip_gradient(optimizer, grad_clip):
    """
    Clips gradients computed during backpropagation to avoid explosion of gradients.

    :param optimizer: optimizer with the gradients to be clipped
    :param grad_clip: clip value
    """
    for group in optimizer.param_groups:
        for param in group['params']:
            if param.grad is not None:
                param.grad.data.clamp_(-grad_clip, grad_clip)

In [None]:
class Trainer:
    def __init__(self, model, train_dataset, train_dataloader, criterion, optimizer, batch_size, num_workers, device, 
                 grad_clip=None, print_freq=10, iterations=120000, decay_lr_at=None, decay_lr_to=0.1, 
                 momentum=0.9, weight_decay=5e-4):
        """
        Initialize the Trainer.
        
        :param model: SSD300 model instance
        :param train_dataset: Dataset object
        :param criterion: Loss function
        :param optimizer: Optimizer
        :param batch_size: Training batch size
        :param num_workers: Number of data loading workers
        :param device: Device to use for training ('cuda' or 'cpu')
        :param grad_clip: Gradient clipping value (default: None)
        :param print_freq: Frequency of printing training progress
        :param iterations: Total number of training iterations
        :param decay_lr_at: Iterations to decay learning rate
        :param decay_lr_to: Learning rate decay factor
        :param momentum: Momentum for optimizer
        :param weight_decay: Weight decay for optimizer
        """
        self.model = model
        self.train_dataset = train_dataset
        self.criterion = criterion
        self.optimizer = optimizer
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.device = device
        self.grad_clip = grad_clip
        self.print_freq = print_freq
        self.iterations = iterations
        self.decay_lr_at = decay_lr_at if decay_lr_at is not None else [80000, 100000]
        self.decay_lr_to = decay_lr_to
        self.momentum = momentum
        self.weight_decay = weight_decay

        # Prepare dataloader
        self.train_loader = train_dataloader

        # Calculate epochs and decay epochs
        self.epochs = iterations // (len(train_dataset) // 32)
        self.decay_epochs = [it // (len(train_dataset) // 32) for it in self.decay_lr_at]

    def adjust_learning_rate(self, epoch):
        """
        Adjust the learning rate at specific epochs.
        """
        if epoch in self.decay_epochs:
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = param_group['lr'] * self.decay_lr_to
            print(f"Learning rate adjusted to {param_group['lr']} at epoch {epoch}")

    def train_one_epoch(self, epoch):
        """
        Perform one epoch of training.
        """
        self.model.train()
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()

        start = time.time()

        for i, (images, boxes, labels, _) in enumerate(self.train_loader):
            data_time.update(time.time() - start)

            # Move to device
            images = images.to(self.device)
            boxes = [b.to(self.device) for b in boxes]
            labels = [l.to(self.device) for l in labels]

            # Forward pass
            predicted_locs, predicted_scores = self.model(images)

            # Compute loss
            loss = self.criterion(predicted_locs, predicted_scores, boxes, labels)

            # Backward pass
            self.optimizer.zero_grad()
            loss.backward()

            # Gradient clipping
            if self.grad_clip is not None:
                clip_gradient(self.optimizer, self.grad_clip)

            # Update model parameters
            self.optimizer.step()

            # Update metrics
            losses.update(loss.item(), images.size(0))
            batch_time.update(time.time() - start)

            start = time.time()

            # Print status
            if i % self.print_freq == 0:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(epoch, i, len(self.train_loader),
                                                                      batch_time=batch_time,
                                                                      data_time=data_time,
                                                                      loss=losses))

        del predicted_locs, predicted_scores, images, boxes, labels

    def save_checkpoint(self, epoch):
        """
        Save model checkpoint.
        """
        torch.save({
            'epoch': epoch,
            'model': self.model,
            'optimizer': self.optimizer,
        }, f'checkpoint_epoch_{epoch}.pth')
        print(f"Checkpoint saved for epoch {epoch}.")

    def train(self, start_epoch=0):
        """
        Train the model across all epochs.
        """
        for epoch in range(start_epoch, self.epochs):
            self.adjust_learning_rate(epoch)
            self.train_one_epoch(epoch)
            self.save_checkpoint(epoch)


In [None]:
# Model parameters
n_classes = len(label_map())  # number of different types of objects

# Learning parameters
checkpoint = None  # path to model checkpoint, None if none
batch_size = 8  # batch size
iterations = 10  # number of iterations to train
workers = 4  # number of workers for loading data in the DataLoader
print_freq = 200  # print training status every __ batches
lr = 1e-3  # learning rate
decay_lr_at = [80000, 100000]  # decay learning rate after these many iterations
decay_lr_to = 0.1  # decay learning rate to this fraction of the existing learning rate
momentum = 0.9  # momentum
weight_decay = 5e-4  # weight decay
grad_clip = None  # clip if gradients are exploding, which may happen at larger batch sizes (sometimes at 32) - you will recognize it by a sorting error in the MuliBox loss calculation


In [None]:
criterion = MultiBoxLoss(priors_cxcy=model.priors_cxcy).to(device)

# Ottimizzatore
biases = [param for name, param in model.named_parameters() if param.requires_grad and name.endswith('.bias')]
not_biases = [param for name, param in model.named_parameters() if param.requires_grad and not name.endswith('.bias')]
optimizer = torch.optim.SGD(params=[{'params': biases, 'lr': 2 * lr}, {'params': not_biases}],
                            lr=lr, momentum=momentum, weight_decay=weight_decay)


In [None]:
# Creazione e avvio del trainer
trainer = Trainer(model, train_dataset, criterion, optimizer, batch_size, workers, device, grad_clip=grad_clip)

trainer.train()

## Testing sulle predizioni

In [None]:
class Evaluator:
    def __init__(self, model, test_dataset, batch_size, num_workers, device):
        """
        Initialize the Evaluator.
        
        :param model: Trained SSD model to be evaluated
        :param test_dataset: Dataset object for testing
        :param batch_size: Batch size for evaluation
        :param num_workers: Number of data loading workers
        :param device: Device to use for evaluation ('cuda' or 'cpu')
        """
        self.model = model.to(device)
        self.test_dataset = test_dataset
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.device = device
        self.pp = PrettyPrinter()  # For printing APs nicely

        # Prepare dataloader
        self.test_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=test_dataset.collate_fn,
            num_workers=num_workers,
            pin_memory=True
        )

    def evaluate(self):
        """
        Perform evaluation and compute mAP.
        """
        self.model.eval()

        # Lists to store detected and true boxes, labels, scores
        det_boxes = list()
        det_labels = list()
        det_scores = list()
        true_boxes = list()
        true_labels = list()
        true_difficulties = list()

        with torch.no_grad():
            for i, (images, boxes, labels, difficulties) in enumerate(tqdm(self.test_loader, desc='Evaluating')):
                images = images.to(self.device)

                # Forward pass
                predicted_locs, predicted_scores = self.model(images)

                # Detect objects
                det_boxes_batch, det_labels_batch, det_scores_batch = self.model.detect_objects(
                    predicted_locs, predicted_scores,
                    min_score=0.01, max_overlap=0.45, top_k=200
                )

                # Store this batch's results
                boxes = [b.to(self.device) for b in boxes]
                labels = [l.to(self.device) for l in labels]
                difficulties = [d.to(self.device) for d in difficulties]

                det_boxes.extend(det_boxes_batch)
                det_labels.extend(det_labels_batch)
                det_scores.extend(det_scores_batch)
                true_boxes.extend(boxes)
                true_labels.extend(labels)
                true_difficulties.extend(difficulties)

        # Calculate mAP
        APs, mAP = self.calculate_mAP(det_boxes, det_labels, det_scores, true_boxes, true_labels, true_difficulties)

        # Print AP for each class
        self.pp.pprint(APs)
        print('\nMean Average Precision (mAP): %.3f' % mAP)

    @staticmethod
    def calculate_mAP(det_boxes, det_labels, det_scores, true_boxes, true_labels, true_difficulties):
        """
        Calculate Mean Average Precision (mAP).
        Placeholder for an actual implementation.
        
        :param det_boxes: Detected boxes
        :param det_labels: Detected labels
        :param det_scores: Detected scores
        :param true_boxes: Ground truth boxes
        :param true_labels: Ground truth labels
        :param true_difficulties: Ground truth difficulties
        :return: APs and mAP
        """
        # Replace this with your actual mAP calculation logic
        APs = {f'class_{i}': 0.0 for i in range(1, 21)}  # Dummy values for each class
        mAP = 0.0  # Dummy value for mAP
        return APs, mAP


In [None]:
# Caricamento del modello
checkpoint = torch.load(checkpoint_path)
model = checkpoint['model']


# Creazione e avvio del valutatore
evaluator = Evaluator(model=model, test_dataset=test_dataset, batch_size=64, num_workers=4, device=device)
evaluator.evaluate()


In [None]:
def calculate_mAP(det_boxes, det_labels, det_scores, true_boxes, true_labels, true_difficulties):
    """
    Calculate the Mean Average Precision (mAP) of detected objects.

    See https://medium.com/@jonathan_hui/map-mean-average-precision-for-object-detection-45c121a31173 for an explanation

    :param det_boxes: list of tensors, one tensor for each image containing detected objects' bounding boxes
    :param det_labels: list of tensors, one tensor for each image containing detected objects' labels
    :param det_scores: list of tensors, one tensor for each image containing detected objects' labels' scores
    :param true_boxes: list of tensors, one tensor for each image containing actual objects' bounding boxes
    :param true_labels: list of tensors, one tensor for each image containing actual objects' labels
    :param true_difficulties: list of tensors, one tensor for each image containing actual objects' difficulty (0 or 1)
    :return: list of average precisions for all classes, mean average precision (mAP)
    """
    assert len(det_boxes) == len(det_labels) == len(det_scores) == len(true_boxes) == len(true_labels)  # these are all lists of tensors of the same length, i.e. number of images
    n_classes = len(label_map)

    # Store all (true) objects in a single continuous tensor while keeping track of the image it is from
    true_images = list()
    for i in range(len(true_labels)):
        true_images.extend([i] * true_labels[i].size(0))
    true_images = torch.LongTensor(true_images).to(
        device)  # (n_objects), n_objects is the total no. of objects across all images
    true_boxes = torch.cat(true_boxes, dim=0)  # (n_objects, 4)
    true_labels = torch.cat(true_labels, dim=0)  # (n_objects)

    assert true_images.size(0) == true_boxes.size(0) == true_labels.size(0)

    # Store all detections in a single continuous tensor while keeping track of the image it is from
    det_images = list()
    for i in range(len(det_labels)):
        det_images.extend([i] * det_labels[i].size(0))
    det_images = torch.LongTensor(det_images).to(device)  # (n_detections)
    det_boxes = torch.cat(det_boxes, dim=0)  # (n_detections, 4)
    det_labels = torch.cat(det_labels, dim=0)  # (n_detections)
    det_scores = torch.cat(det_scores, dim=0)  # (n_detections)

    assert det_images.size(0) == det_boxes.size(0) == det_labels.size(0) == det_scores.size(0)

    # Calculate APs for each class (except background)
    average_precisions = torch.zeros((n_classes - 1), dtype=torch.float)  # (n_classes - 1)
    for c in range(1, n_classes):
        # Extract only objects with this class
        true_class_images = true_images[true_labels == c]  # (n_class_objects)
        true_class_boxes = true_boxes[true_labels == c]  # (n_class_objects, 4)
        n_easy_class_objects = (1 - true_class_difficulties).sum().item()  # ignore difficult objects

        # Keep track of which true objects with this class have already been 'detected'
        # So far, none
        true_class_boxes_detected = torch.zeros((true_class_difficulties.size(0)), dtype=torch.uint8).to(
            device)  # (n_class_objects)

        # Extract only detections with this class
        det_class_images = det_images[det_labels == c]  # (n_class_detections)
        det_class_boxes = det_boxes[det_labels == c]  # (n_class_detections, 4)
        det_class_scores = det_scores[det_labels == c]  # (n_class_detections)
        n_class_detections = det_class_boxes.size(0)
        if n_class_detections == 0:
            continue

        # Sort detections in decreasing order of confidence/scores
        det_class_scores, sort_ind = torch.sort(det_class_scores, dim=0, descending=True)  # (n_class_detections)
        det_class_images = det_class_images[sort_ind]  # (n_class_detections)
        det_class_boxes = det_class_boxes[sort_ind]  # (n_class_detections, 4)

        # In the order of decreasing scores, check if true or false positive
        true_positives = torch.zeros((n_class_detections), dtype=torch.float).to(device)  # (n_class_detections)
        false_positives = torch.zeros((n_class_detections), dtype=torch.float).to(device)  # (n_class_detections)
        for d in range(n_class_detections):
            this_detection_box = det_class_boxes[d].unsqueeze(0)  # (1, 4)
            this_image = det_class_images[d]  # (), scalar

            # Find objects in the same image with this class, their difficulties, and whether they have been detected before
            object_boxes = true_class_boxes[true_class_images == this_image]  # (n_class_objects_in_img)
            # If no such object in this image, then the detection is a false positive
            if object_boxes.size(0) == 0:
                false_positives[d] = 1
                continue

            # Find maximum overlap of this detection with objects in this image of this class
            overlaps = find_jaccard_overlap(this_detection_box, object_boxes)  # (1, n_class_objects_in_img)
            max_overlap, ind = torch.max(overlaps.squeeze(0), dim=0)  # (), () - scalars

            # 'ind' is the index of the object in these image-level tensors 'object_boxes', 'object_difficulties'
            # In the original class-level tensors 'true_class_boxes', etc., 'ind' corresponds to object with index...
            original_ind = torch.LongTensor(range(true_class_boxes.size(0)))[true_class_images == this_image][ind]
            # We need 'original_ind' to update 'true_class_boxes_detected'

            # If the maximum overlap is greater than the threshold of 0.5, it's a match
            if max_overlap.item() > 0.5:
                # If this object has already not been detected, it's a true positive
                if true_class_boxes_detected[original_ind] == 0:
                    true_positives[d] = 1
                    true_class_boxes_detected[original_ind] = 1  # this object has now been detected/accounted for
                # Otherwise, it's a false positive (since this object is already accounted for)
                else:
                    false_positives[d] = 1
            # Otherwise, the detection occurs in a different location than the actual object, and is a false positive
            else:
                false_positives[d] = 1

        # Compute cumulative precision and recall at each detection in the order of decreasing scores
        cumul_true_positives = torch.cumsum(true_positives, dim=0)  # (n_class_detections)
        cumul_false_positives = torch.cumsum(false_positives, dim=0)  # (n_class_detections)
        cumul_precision = cumul_true_positives / (
                cumul_true_positives + cumul_false_positives + 1e-10)  # (n_class_detections)
        cumul_recall = cumul_true_positives / n_easy_class_objects  # (n_class_detections)

        # Find the mean of the maximum of the precisions corresponding to recalls above the threshold 't'
        recall_thresholds = torch.arange(start=0, end=1.1, step=.1).tolist()  # (11)
        precisions = torch.zeros((len(recall_thresholds)), dtype=torch.float).to(device)  # (11)
        for i, t in enumerate(recall_thresholds):
            recalls_above_t = cumul_recall >= t
            if recalls_above_t.any():
                precisions[i] = cumul_precision[recalls_above_t].max()
            else:
                precisions[i] = 0.
        average_precisions[c - 1] = precisions.mean()  # c is in [1, n_classes - 1]

    # Calculate Mean Average Precision (mAP)
    mean_average_precision = average_precisions.mean().item()

    # Keep class-wise average precisions in a dictionary
    average_precisions = {rev_label_map[c + 1]: v for c, v in enumerate(average_precisions.tolist())}

    return average_precisions, mean_average_precision
