In [None]:
# https://towardsdatascience.com/optimize-pytorch-performance-for-speed-and-memory-efficiency-2022-84f453916ea6

In [1]:
import sys
sys.path.append("../")

from src.detection.engine import train_one_epoch, evaluate

In [2]:
import os
import pandas as pd
import numpy as np
import torch
from PIL import Image

import torchvision
from torchvision import transforms as T

In [3]:
# train_df = pd.read_csv("/home/nacho/TFI-Cazcarra/data/csv/augmented_train_diagramas.csv", 
#                        header=None)
# train_df.columns = ['image_path', 'xmin', 'ymin', 'xmax', 'ymax', 'label']

# test_df = pd.read_csv("/home/nacho/TFI-Cazcarra/data/csv/augmented_test_diagramas.csv")

train_df = pd.read_csv(f"/home/nacho/TFI-Cazcarra/data/tiles/train_cardinalidades_linux.csv")
test_df = pd.read_csv(f"/home/nacho/TFI-Cazcarra/data/tiles/test_cardinalidades_linux.csv")

In [4]:
ELEMENT_TO_TRAIN = "diagramas"
PATH = "/home/nacho/TFI-Cazcarra"

CLASSES_CSV = f"{PATH}/data/csv/classes_{ELEMENT_TO_TRAIN}.csv"
# IMAGES_DIR = f"{PATH}/data/imagenes_diagramas"
IMAGES_DIR = f"{PATH}/data/tiles/image_slices/"

In [5]:
from sklearn.preprocessing import LabelEncoder

classes = pd.read_csv(CLASSES_CSV)
le = LabelEncoder()
le.fit(classes.nombre)

le_num_arr = le.transform(classes.nombre) + 1 #Empezamos por 1 para dejarle el 0 a la 'background class'
le_label_arr = classes.nombre.values

le_dict = {k:v for k,v in zip(le_label_arr,le_num_arr)}
le_dict

{'tabla': 3,
 'muchos_opcional': 2,
 'muchos_obligatorio': 1,
 'uno_opcional': 5,
 'uno_obligatorio': 4}

In [6]:
train_df['label_transformed'] = train_df['label'].apply(lambda x: le_dict[x])
test_df['label_transformed'] = test_df['label'].apply(lambda x: le_dict[x])

In [7]:
class PennFudanDataset(object):
    def __init__(self, csv, images_dir, transforms):
        self.csv = csv
        self.transforms = transforms
        self.images_dir = images_dir
        # load all image files, sorting them to
        # ensure that they are aligned
        self.imgs = [i for i in os.listdir(self.images_dir) \
                       if os.path.join(self.images_dir, i) in self.csv['image_path'].unique()]

    def __getitem__(self, idx):
        # load images
        img_path = self.imgs[idx]
        full_image_path = os.path.join(self.images_dir, img_path)
        img = Image.open(full_image_path).convert("RGB")
        filtered_df = self.csv[self.csv['image_path']==full_image_path]
        # get bounding box coordinates
        num_objs = len(filtered_df)
        boxes = []
        for xmin, ymin, xmax, ymax in zip(filtered_df['xmin'], filtered_df['ymin'], filtered_df['xmax'], filtered_df['ymax']):
            boxes.append([xmin, ymin, xmax, ymax])       
        boxes = torch.as_tensor(boxes, dtype=torch.float32)

        labels = torch.tensor(list(filtered_df['label_transformed'].values), dtype=torch.int64)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            img = self.transforms(img)

        return img, target

    def __len__(self):
        return len(self.imgs)

In [8]:
def get_model_instance_segmentation(num_classes):
    # load an instance segmentation model pre-trained pre-trained on COCO
    model = torchvision.models.detection.retinanet_resnet50_fpn(pretrained=True, min_size=600)
    return model

In [9]:
def get_transform(train):
    transforms = []
    transforms.append(T.ToTensor())
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
        transforms.append(T.RandomVerticalFlip(0.5))
    return T.Compose(transforms)

In [10]:
# train on the GPU or on the CPU, if a GPU is not available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

num_classes = len(le_dict)+1 

dataset = PennFudanDataset(csv=train_df, images_dir=IMAGES_DIR, 
                           transforms=get_transform(train=True))
dataset_test = PennFudanDataset(csv=test_df, images_dir=IMAGES_DIR, 
                                transforms=get_transform(train=False))

In [11]:
def collate_fn(batch):
    return tuple(zip(*batch))

# define training and validation data loaders
data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=2, shuffle=True, num_workers=0,
    collate_fn=collate_fn)

data_loader_test = torch.utils.data.DataLoader(
    dataset_test, batch_size=1, shuffle=False, num_workers=0,
    collate_fn=collate_fn)

In [12]:
class SaveBestModel:
    """
    Class to save the best model while training. If the current epoch's 
    loss is less than the previous least less, then save the
    model state.
    """
    def __init__(
        self, best_loss=float('inf')
    ):
        self.best_loss = best_loss
        
    def __call__(
        self, current_loss, 
        epoch, model, optimizer
    ):
        if current_loss < self.best_loss:
            self.best_loss = round(current_loss,3)
            print(f"\nBest loss: {self.best_loss}")
            print(f"Saving best model for epoch: {epoch+1}\n")
            
            model_name = model.__class__.__name__.lower()
            PATH_TO_SAVE_MODEL = f"/home/nacho/TFI-Cazcarra/data/models/best_model_{model_name}.pt"
            torch.save({
                'epoch': epoch+1,
                'model_state_dict': model.state_dict(),
                'loss': self.best_loss,
                }, PATH_TO_SAVE_MODEL)

In [13]:
# get the model using our helper function
model = get_model_instance_segmentation(num_classes)
# move model to the right device
model.to(device)



RetinaNet(
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d(256, eps=0.0)


In [14]:
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
params

33792599

## Training model

In [15]:
save_best_model = SaveBestModel()

In [16]:
# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.0025,#0.005,
                            momentum=0.9, weight_decay=0.0005)
# and a learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=5,
                                               gamma=0.1)

In [17]:
num_epochs = 30

for epoch in range(num_epochs):
    metric_logger, loss_value = train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=5)
    lr_scheduler.step()
    evaluate(model, data_loader_test, device=device)
    save_best_model(loss_value, epoch, model, optimizer)
    del metric_logger

Epoch: [0]  [ 0/69]  eta: 0:09:27  lr: 0.000039  loss: 3.1070 (3.1070)  classification: 1.9340 (1.9340)  bbox_regression: 1.1730 (1.1730)  time: 8.2240  data: 0.0518
Epoch: [0]  [ 5/69]  eta: 0:09:05  lr: 0.000223  loss: 3.2222 (3.3053)  classification: 2.0264 (2.0578)  bbox_regression: 1.1730 (1.2475)  time: 8.5217  data: 0.0261
Epoch: [0]  [10/69]  eta: 0:08:10  lr: 0.000407  loss: 2.7526 (2.8870)  classification: 1.7343 (1.7700)  bbox_regression: 1.0256 (1.1170)  time: 8.3093  data: 0.0324
Epoch: [0]  [15/69]  eta: 0:07:38  lr: 0.000590  loss: 2.6539 (2.6685)  classification: 1.5392 (1.6355)  bbox_regression: 0.9240 (1.0329)  time: 8.4828  data: 0.0329
Epoch: [0]  [20/69]  eta: 0:07:02  lr: 0.000774  loss: 2.2419 (2.4850)  classification: 1.1440 (1.4932)  bbox_regression: 0.9164 (0.9918)  time: 8.6415  data: 0.0316
Epoch: [0]  [25/69]  eta: 0:06:32  lr: 0.000957  loss: 1.9369 (2.3832)  classification: 1.0802 (1.4066)  bbox_regression: 0.8596 (0.9765)  time: 9.0529  data: 0.0319
Epoc

Epoch: [2]  [10/69]  eta: 0:08:54  lr: 0.002500  loss: 1.6926 (1.7498)  classification: 1.0711 (1.0606)  bbox_regression: 0.7388 (0.6892)  time: 9.0671  data: 0.0277
Epoch: [2]  [15/69]  eta: 0:08:17  lr: 0.002500  loss: 1.6780 (1.7380)  classification: 1.0711 (1.0803)  bbox_regression: 0.5853 (0.6577)  time: 9.2158  data: 0.0272
Epoch: [2]  [20/69]  eta: 0:07:29  lr: 0.002500  loss: 1.6440 (1.6822)  classification: 1.0615 (1.0457)  bbox_regression: 0.5579 (0.6365)  time: 9.2392  data: 0.0253
Epoch: [2]  [25/69]  eta: 0:06:43  lr: 0.002500  loss: 1.5834 (1.6386)  classification: 1.0446 (1.0176)  bbox_regression: 0.5558 (0.6210)  time: 9.2703  data: 0.0238
Epoch: [2]  [30/69]  eta: 0:05:52  lr: 0.002500  loss: 1.5798 (1.6253)  classification: 0.9797 (0.9872)  bbox_regression: 0.5853 (0.6381)  time: 9.0370  data: 0.0255
Epoch: [2]  [35/69]  eta: 0:05:06  lr: 0.002500  loss: 1.4585 (1.6045)  classification: 0.8991 (0.9779)  bbox_regression: 0.5580 (0.6266)  time: 8.8688  data: 0.0275
Epoc

Epoch: [4]  [20/69]  eta: 0:07:07  lr: 0.002500  loss: 1.4659 (1.4908)  classification: 0.9008 (0.8874)  bbox_regression: 0.5629 (0.6034)  time: 8.7749  data: 0.0298
Epoch: [4]  [25/69]  eta: 0:06:30  lr: 0.002500  loss: 1.4497 (1.4728)  classification: 0.9008 (0.8775)  bbox_regression: 0.5629 (0.5953)  time: 9.0744  data: 0.0258
Epoch: [4]  [30/69]  eta: 0:05:49  lr: 0.002500  loss: 1.5000 (1.4678)  classification: 0.9039 (0.8715)  bbox_regression: 0.5489 (0.5963)  time: 9.0968  data: 0.0266
Epoch: [4]  [35/69]  eta: 0:05:02  lr: 0.002500  loss: 1.3983 (1.4532)  classification: 0.8723 (0.8665)  bbox_regression: 0.4922 (0.5868)  time: 9.0381  data: 0.0276
Epoch: [4]  [40/69]  eta: 0:04:21  lr: 0.002500  loss: 1.3983 (1.4266)  classification: 0.8060 (0.8471)  bbox_regression: 0.5457 (0.5795)  time: 9.3353  data: 0.0285
Epoch: [4]  [45/69]  eta: 0:03:34  lr: 0.002500  loss: 1.4571 (1.4394)  classification: 0.8189 (0.8518)  bbox_regression: 0.5529 (0.5876)  time: 9.0732  data: 0.0293
Epoc

Epoch: [6]  [30/69]  eta: 0:05:55  lr: 0.000250  loss: 1.0979 (1.2439)  classification: 0.7380 (0.7633)  bbox_regression: 0.4141 (0.4806)  time: 9.5369  data: 0.0250
Epoch: [6]  [35/69]  eta: 0:05:13  lr: 0.000250  loss: 1.0979 (1.2626)  classification: 0.7380 (0.7711)  bbox_regression: 0.4155 (0.4915)  time: 9.2694  data: 0.0291
Epoch: [6]  [40/69]  eta: 0:04:24  lr: 0.000250  loss: 1.0184 (1.2521)  classification: 0.7435 (0.7620)  bbox_regression: 0.4068 (0.4901)  time: 9.1272  data: 0.0302
Epoch: [6]  [45/69]  eta: 0:03:38  lr: 0.000250  loss: 1.0184 (1.2281)  classification: 0.6788 (0.7434)  bbox_regression: 0.3959 (0.4848)  time: 9.3385  data: 0.0337
Epoch: [6]  [50/69]  eta: 0:02:53  lr: 0.000250  loss: 1.2171 (1.2665)  classification: 0.6984 (0.7555)  bbox_regression: 0.4564 (0.5109)  time: 9.1911  data: 0.0331
Epoch: [6]  [55/69]  eta: 0:02:07  lr: 0.000250  loss: 1.3446 (1.2795)  classification: 0.7537 (0.7670)  bbox_regression: 0.4564 (0.5125)  time: 8.9755  data: 0.0290
Epoc

Epoch: [8]  [40/69]  eta: 0:04:11  lr: 0.000250  loss: 1.2606 (1.2692)  classification: 0.6572 (0.7451)  bbox_regression: 0.5031 (0.5241)  time: 8.4039  data: 0.0320
Epoch: [8]  [45/69]  eta: 0:03:32  lr: 0.000250  loss: 1.2606 (1.2719)  classification: 0.7384 (0.7537)  bbox_regression: 0.5136 (0.5181)  time: 9.0983  data: 0.0314
Epoch: [8]  [50/69]  eta: 0:02:47  lr: 0.000250  loss: 1.2603 (1.2659)  classification: 0.7599 (0.7496)  bbox_regression: 0.5316 (0.5162)  time: 8.9435  data: 0.0281
Epoch: [8]  [55/69]  eta: 0:02:03  lr: 0.000250  loss: 1.1682 (1.2569)  classification: 0.7065 (0.7457)  bbox_regression: 0.5136 (0.5112)  time: 8.9378  data: 0.0275
Epoch: [8]  [60/69]  eta: 0:01:20  lr: 0.000250  loss: 1.3865 (1.2763)  classification: 0.7920 (0.7601)  bbox_regression: 0.4761 (0.5162)  time: 9.5004  data: 0.0269
Epoch: [8]  [65/69]  eta: 0:00:35  lr: 0.000250  loss: 1.1934 (1.2644)  classification: 0.7824 (0.7536)  bbox_regression: 0.4761 (0.5108)  time: 9.1890  data: 0.0272
Epoc

Epoch: [10]  [45/69]  eta: 0:03:37  lr: 0.000025  loss: 1.2087 (1.2531)  classification: 0.7389 (0.7625)  bbox_regression: 0.4387 (0.4905)  time: 8.9853  data: 0.0332
Epoch: [10]  [50/69]  eta: 0:02:49  lr: 0.000025  loss: 1.2087 (1.2273)  classification: 0.7336 (0.7450)  bbox_regression: 0.4387 (0.4823)  time: 8.8816  data: 0.0296
Epoch: [10]  [55/69]  eta: 0:02:05  lr: 0.000025  loss: 1.0091 (1.2235)  classification: 0.6428 (0.7436)  bbox_regression: 0.4135 (0.4800)  time: 8.8746  data: 0.0260
Epoch: [10]  [60/69]  eta: 0:01:20  lr: 0.000025  loss: 1.1319 (1.2384)  classification: 0.7229 (0.7524)  bbox_regression: 0.4135 (0.4860)  time: 8.7006  data: 0.0276
Epoch: [10]  [65/69]  eta: 0:00:35  lr: 0.000025  loss: 1.2252 (1.2470)  classification: 0.7533 (0.7511)  bbox_regression: 0.4773 (0.4959)  time: 8.4607  data: 0.0284
Epoch: [10]  [68/69]  eta: 0:00:08  lr: 0.000025  loss: 1.2622 (1.2443)  classification: 0.7745 (0.7532)  bbox_regression: 0.4773 (0.4911)  time: 8.3577  data: 0.029

Epoch: [12]  [55/69]  eta: 0:02:07  lr: 0.000025  loss: 1.3163 (1.2160)  classification: 0.8015 (0.7303)  bbox_regression: 0.4759 (0.4857)  time: 9.5505  data: 0.0315
Epoch: [12]  [60/69]  eta: 0:01:23  lr: 0.000025  loss: 1.2023 (1.2103)  classification: 0.7369 (0.7301)  bbox_regression: 0.4213 (0.4801)  time: 9.8070  data: 0.0292
Epoch: [12]  [65/69]  eta: 0:00:36  lr: 0.000025  loss: 1.2023 (1.2122)  classification: 0.7369 (0.7340)  bbox_regression: 0.4364 (0.4782)  time: 9.1621  data: 0.0320
Epoch: [12]  [68/69]  eta: 0:00:09  lr: 0.000025  loss: 1.3134 (1.2163)  classification: 0.8470 (0.7422)  bbox_regression: 0.4320 (0.4741)  time: 9.1573  data: 0.0332
Epoch: [12] Total time: 0:10:33 (9.1871 s / it)
creating index...
index created!
Test:  [ 0/61]  eta: 0:03:54  model_time: 3.8163 (3.8163)  evaluator_time: 0.0141 (0.0141)  time: 3.8369  data: 0.0064
Test:  [60/61]  eta: 0:00:02  model_time: 2.5203 (2.8272)  evaluator_time: 0.0043 (0.0047)  time: 2.8059  data: 0.0060
Test: Total t

KeyboardInterrupt: 

## Save model
https://pytorch.org/tutorials/beginner/saving_loading_models.html

In [None]:
model_name = model.__class__.__name__.lower()
PATH_TO_SAVE_MODEL = f"/home/nacho/TFI-Cazcarra/data/models/model_{model_name}_final_cardinalidades.pt"
torch.save(model.state_dict(), PATH_TO_SAVE_MODEL)

torch.save({
            'epoch': epoch+1,
            'model_state_dict': model.state_dict(),
            'loss': loss_value,
            }, PATH_TO_SAVE_MODEL)

## Testing

In [None]:
import cv2
import PIL
from IPython.display import display

def get_class_name(num_label, le_dict):
    reversed_le_dict = {v:k for k,v in le_dict.items()}
    return reversed_le_dict[num_label]

def draw_bbox(img, xmin, ymin, xmax, ymax, score, label): 
    txt = get_class_name(label, le_dict) + ' ' + str(score)
    img = cv2.putText(img, txt, (int(xmin), int(ymin)),
                      cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0,0,255), 1)

    return cv2.rectangle(img, (int(xmin), int(ymin)), (int(xmax), int(ymax)), 
                         (255,0,0), 1)

In [None]:
model.eval()

In [None]:
for i in range(len(dataset_test)):
    tensor_image = dataset_test.__getitem__(i)[0]
    to_pil = T.ToPILImage()
    pil_image = to_pil(tensor_image)
    predictions = model([tensor_image])
    image = pil_image
    for prediction in predictions:
        for box, score, label in zip(prediction['boxes'],prediction['scores'],prediction['labels']):
            score = round(score.item(), 3)
            label = label.item()
            if score < 0.5:
                break
            xmin = box[0].item()
            ymin = box[1].item()
            xmax = box[2].item()
            ymax = box[3].item()
            print(xmin, ymin, xmax, ymax)
            if isinstance(image, PIL.Image.Image):
                image = draw_bbox(np.array(image), xmin, ymin, xmax, ymax, score, label)
            else:
                image = draw_bbox(image, xmin, ymin, xmax, ymax, score, label)
        display(Image.fromarray(image)) if not isinstance(image, PIL.Image.Image) else display(image)

In [None]:
for prediction in predictions:
    for box, score, label in zip(prediction['boxes'],prediction['scores'],prediction['labels']):
        print(box, score, label)
    print("\n")

## Load the two final models & calculate AP for them
- https://torchmetrics.readthedocs.io/en/stable/classification/average_precision.html
- https://torchmetrics.readthedocs.io/en/stable/retrieval/map.html

In [None]:
model = get_model_instance_segmentation(len(le_dict)+1)
model_name = model.__class__.__name__.lower()
PATH_TO_LOAD_MODEL = f"/home/nacho/TFI-Cazcarra/data/models/best_model_{model_name}.pt"

model_obj = torch.load(PATH_TO_LOAD_MODEL)
model.load_state_dict(model_obj['model_state_dict'])

In [None]:
model.eval()

In [None]:
from torchmetrics.detection.mean_ap import MeanAveragePrecision

In [None]:
# En batch mata el kernel; hay que intentar hacerlo
test_tensor_images = [dataset_test.__getitem__(i)[0] for i in range(len(dataset_test))]
predictions = model(test_tensor_images)

In [None]:
len(predictions)