In [1]:
# Attach Google Drive to the Colab VM's filesystem. `force_remount=True`
# asks for a fresh mount even when one is already present.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [2]:
# Make the project folder on Drive the working directory. The `src.*` imports
# in the following cell are presumably resolved relative to it — TODO confirm.
%cd /content/drive/MyDrive/Colab Notebooks/UFRGS/CV/TF
# List the folder as a quick check that the mount and the path are correct.
%ls -lah

/content/drive/MyDrive/Colab Notebooks/UFRGS/CV/TF
total 20K
drwx------ 2 root root 4.0K Apr 11 00:13 board/
drwx------ 2 root root 4.0K Apr 11 00:08 checkpoints/
drwx------ 2 root root 4.0K Apr  5 11:56 data/
drwx------ 2 root root 4.0K Apr  6 22:25 notebooks/
drwx------ 2 root root 4.0K Apr 10 20:16 src/


In [3]:
import numpy as np

import pathlib as pl
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.tensorboard import SummaryWriter
from torchsummary import summary

from src.models.unet import UNet
from src.utils.data_loader import BacteriaDataset
from src.utils.plot_utils import plot_data
from src.utils.train_utils import train, validation, test, save_model_with_meta

In [4]:
# --- Training hyper-parameters ------------------------------------------------
BATCH_SIZE = 32
RESOLUTION = (512, 512)        # handed to BacteriaDataset as `resolution`
CLASS_WEIGHTS = [0.5, 1, 1]    # per-class weights for the loss (one entry per class)
NUM_WORKERS = 2                # DataLoader worker processes
PATIENCE = 100                 # epochs without a new best val acc before stopping
START_EPOCH = 0
MAX_EPOCHS = 1000

# --- Mutable training state (overwritten by the training loop) ----------------
BEST_EPOCH = 0
BEST_VAL_ACC = -9999           # sentinel: any `val_acc` above it counts as an improvement
EXTRA_INFO = None              # stored as 'additional_info' in the checkpoint metadata
TENSORBOARD_DIR = "board"

# --- Paths ----------------------------------------------------------------------
# Use the same "MyDrive" spelling as the `%cd` at the top of the notebook, whose
# directory listing confirms this folder exists. This matters because the
# `mkdir(parents=True, exist_ok=True)` calls below would silently create an
# empty tree under a mistyped path instead of failing.
# All paths stay plain `str`: later cells build sub-paths with "/".join([...]).
WORKING_PATH = "/content/drive/MyDrive/Colab Notebooks/UFRGS/CV/TF"
DATA_BASE_PATH = f"{WORKING_PATH}/data"

CHECKPOINT_PATH = f"{WORKING_PATH}/checkpoints"
pl.Path(CHECKPOINT_PATH).mkdir(parents=True, exist_ok=True)
MODEL_PATH = f"{CHECKPOINT_PATH}/unet.pth"

WRITER_PATH = f"{WORKING_PATH}/{TENSORBOARD_DIR}"
pl.Path(WRITER_PATH).mkdir(parents=True, exist_ok=True)
WRITER = SummaryWriter(log_dir=WRITER_PATH)

# --- Device ---------------------------------------------------------------------
# Use the first CUDA device when one is visible, otherwise run on the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Training on {DEVICE}")

Training on cpu


In [5]:
def _make_split(split, shuffle):
    """Build the dataset and its DataLoader for one split folder.

    Args:
        split: Name of the sub-folder of DATA_BASE_PATH ("train", "val", "test").
        shuffle: Whether the DataLoader shuffles samples.

    Returns:
        A `(dataset, loader)` pair.
    """
    dataset = BacteriaDataset(base_path="/".join((DATA_BASE_PATH, split)),
                              resolution=RESOLUTION)
    loader = torch.utils.data.DataLoader(dataset,
                                         batch_size=BATCH_SIZE,
                                         shuffle=shuffle,
                                         num_workers=NUM_WORKERS)
    return dataset, loader


# Only the training split is shuffled; validation and test keep a fixed order.
train_dataset, train_loader = _make_split("train", shuffle=True)
val_dataset, val_loader = _make_split("val", shuffle=False)
test_dataset, test_loader = _make_split("test", shuffle=False)

In [6]:
# Model
print('==> Building model..')

# One output channel per class reported by the training dataset.
num_classes = train_dataset.num_classes

# NOTE(review): start_filters=1 yields 1-channel feature maps at full
# resolution (see the torchsummary output below) — confirm this width is
# intended and not left over from a smoke test.
unet_kwargs = dict(
    in_channels=3,
    out_channels=num_classes,
    n_blocks=4,
    start_filters=1,
    activation='relu',
    normalization='batch',
    conv_mode='same',
    dim=2,
)
model = UNet(**unet_kwargs).to(DEVICE)

print(model)
print('==> Done!')

==> Building model..
{'UNet': {'in_channels': 3, 'out_channels': 3, 'n_blocks': 4, 'start_filters': 1, 'activation': 'relu', 'normalization': 'batch', 'conv_mode': 'same', 'dim': 2, 'up_mode': 'transposed'}}
==> Done!


In [7]:
# Per-class loss weights as a float32 tensor living on the training device.
weights = torch.tensor(CLASS_WEIGHTS, dtype=torch.float32, device=DEVICE)

optimizer = optim.Adam(model.parameters(), lr=0.01)

# Weighted cross-entropy; targets equal to -1 are ignored by the loss.
criterion = nn.CrossEntropyLoss(weight=weights, ignore_index=-1)

# mode='max': the monitored quantity is expected to increase (presumably the
# validation accuracy, since the scheduler is handed to `validation` — TODO confirm).
# NOTE(review): `verbose` is deprecated in recent PyTorch releases — check it
# against the installed version.
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    patience=20,
    verbose=True,
)

In [8]:
# Checking shapes
# Pull a single batch and push it through the network to confirm that the
# image, mask and output tensors line up before starting a long training run.
sample_batch = next(iter(train_loader))
images = sample_batch['image'].to(DEVICE)
masks = sample_batch['mask'].to(DEVICE)

# Run the probe in eval mode: in training mode the BatchNorm layers would
# update their running statistics even under `no_grad`, so this diagnostic
# cell would alter the model (and do so again on every re-run).
# The previous mode is restored afterwards.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        outputs = model(images)
finally:
    model.train(was_training)

print(images.shape)
print(masks.shape)
print(outputs.shape)

# Class index per pixel from the mask's dim 1 (presumably a one-hot class
# axis — TODO confirm), and log-probabilities over the output's dim 1.
targets = torch.argmax(masks, dim=1)
preds = torch.nn.functional.log_softmax(outputs, dim=1)
print(preds.shape)
print(targets.shape)

plot_data(sample_batch, train_dataset)

Output hidden; open in https://colab.research.google.com to view.

In [9]:
# torch summary
# Store the result under its own name. Assigning it back to `summary` would
# replace the imported torchsummary function with its return value, and
# re-running this cell would then fail with a TypeError.
model_summary = summary(model, (3, 512, 512))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 1, 512, 512]              28
              ReLU-2          [-1, 1, 512, 512]               0
       BatchNorm2d-3          [-1, 1, 512, 512]               2
            Conv2d-4          [-1, 1, 512, 512]              10
              ReLU-5          [-1, 1, 512, 512]               0
       BatchNorm2d-6          [-1, 1, 512, 512]               2
         MaxPool2d-7          [-1, 1, 256, 256]               0
         DownBlock-8  [[-1, 1, 256, 256], [-1, 1, 512, 512]]               0
            Conv2d-9          [-1, 2, 256, 256]              20
             ReLU-10          [-1, 2, 256, 256]               0
      BatchNorm2d-11          [-1, 2, 256, 256]               4
           Conv2d-12          [-1, 2, 256, 256]              38
             ReLU-13          [-1, 2, 256, 256]               0
      BatchNorm2d-14      

In [None]:
string = "# ================================================================== # \n" \
         "#                         Starting Training!                         # \n" \
         "# ================================================================== #"
print(string)

# Train / validate / test once per epoch, checkpoint on every new best
# validation accuracy, and stop early once PATIENCE epochs pass without one.
# The try/finally closes the TensorBoard writer however the loop ends: early
# stop, reaching MAX_EPOCHS, an exception, or a manual interrupt.
try:
    for epoch in range(START_EPOCH, MAX_EPOCHS):
        train_acc, train_loss = train(epoch=epoch,
                                      model=model,
                                      criterion=criterion,
                                      optimizer=optimizer,
                                      data_loader=train_loader,
                                      device=DEVICE,
                                      writer=WRITER)

        # The LR scheduler is handed to `validation`; presumably it is stepped
        # there on the validation metric — TODO confirm in train_utils.
        val_acc, val_loss = validation(epoch=epoch,
                                       model=model,
                                       criterion=criterion,
                                       data_loader=val_loader,
                                       device=DEVICE,
                                       lr_scheduler=lr_scheduler,
                                       writer=WRITER)

        # Test metrics are computed every epoch but are not used for model
        # selection or stopping below.
        test_acc, test_loss = test(epoch=epoch,
                                   model=model,
                                   criterion=criterion,
                                   data_loader=test_loader,
                                   device=DEVICE,
                                   writer=WRITER)

        # Checkpoints are only written from epoch 10 onwards; earlier epochs
        # never update BEST_VAL_ACC / BEST_EPOCH.
        if epoch >= 10 and val_acc > BEST_VAL_ACC:
            BEST_VAL_ACC = val_acc
            BEST_EPOCH = epoch
            save_model_with_meta(MODEL_PATH,
                                 model,
                                 optimizer,
                                 {'train_acc': train_acc,
                                  'val_acc': val_acc,
                                  'train_loss': train_loss,
                                  'val_loss': val_loss,
                                  'best_epoch': BEST_EPOCH,
                                  'additional_info': EXTRA_INFO})
            print('----- New best validation acc. Saving... -----')

        # Early stopping: more than PATIENCE epochs since the last improvement.
        if (epoch - BEST_EPOCH) > PATIENCE:
            print(f"Finishing training, best validation acc: {BEST_VAL_ACC:.2f} at epoch: {BEST_EPOCH}")
            break
finally:
    WRITER.close()

#                         Starting Training!                         # 
Epoch: 0 => Train Acc: 0.00 | Train Loss: 1.10 | Avg time/img: 0.45 s
