In [1]:
import torch
from torch.backends import cudnn 
from torchvision import transforms
from torch.utils.data import DataLoader
cudnn.benchmark = True # might speed up runtime, test later or leave out

import os
import shutil
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import models
import losses
import datasets
from helpers import io, trainer, run_manager

# MODEL TRAINING

The purpose of this notebook is to test different model learning configurations. Outputs (model checkpoints, configuration dictionary) are saved in the directory specified below. The data path should point to the output folder created during preprocessing.


In [2]:
# Input/output locations. The defaults match the original environment; set the
# DENMARK_DATA_PATH / DENMARK_SAVE_PATH environment variables to run the
# notebook on another machine without editing this cell.
# DATA_PATH: output folder of the preprocessing step (contains 'image_sets').
DATA_PATH = os.environ.get("DENMARK_DATA_PATH", "/home/jovyan/work/DENMARK/256x256")
# SAVE_PATH: root folder for run outputs (checkpoints, manager, backups).
SAVE_PATH = os.environ.get("DENMARK_SAVE_PATH", "/home/jovyan/work/runs/MODELS")

In [3]:
# Identifier of this experiment; handed to the run manager together with SAVE_PATH.
EXP_ID = "lcfcn1"
# Run manager for this experiment: later cells store all settings on it as
# attributes and pickle it next to the model checkpoints.
EM = run_manager.Manager(EXP_ID, SAVE_PATH)


## Data

Dataset class takes list of image names as _images_ parameter. These lists for training and validation sets are created in the next cell from .txt files created during preprocessing in the _image_sets_ folder.

Preprocessing assigns image names to files based on which label information is available for each image. This is done to get a predictable output from the dataloader.


In [4]:
# basic settings for dataset
EM.dataset_type = "denmark_points"
EM.batch_size_train = 1
EM.batch_size_val = 1
EM.n_classes = 2

# share of the image list assigned to each split; the remaining images are
# not used anywhere in this notebook
TRAIN_FRACTION = 0.7
VAL_FRACTION = 0.2

# load image-lists from files
images_path = os.path.join(DATA_PATH, 'image_sets', 'points.txt')
# strip every kind of line ending ("\n" as well as "\r\n") and skip blank
# lines, so an empty trailing line can never turn into a bogus image name
stripped_names = (name.strip() for name in io.readText(images_path))
images_list = [name for name in stripped_names if name]
if not images_list:
    raise ValueError(f"No image names found in {images_path}")

# sequential split in file order: first TRAIN_FRACTION for training, the next
# VAL_FRACTION for validation
# NOTE(review): the list is not shuffled here - confirm that the order written
# by preprocessing is suitable for a positional split
train_size = round(len(images_list) * TRAIN_FRACTION)
train_images = images_list[:train_size]
val_size = round(len(images_list) * VAL_FRACTION)
val_images = images_list[train_size:(train_size + val_size)]

# create transformation object
# per-channel statistics used for normalisation
# (presumably computed on the training imagery - TODO confirm)
transform_mean = [0.492, 0.475, 0.430]
transform_std = [0.176, 0.173, 0.176]

EM.transform = transforms.Compose([transforms.ToTensor(),
                                   transforms.Normalize(mean = transform_mean, 
                                                        std = transform_std)])

print(f"Dataset sizes: \n - train: {len(train_images)} \n - val: {len(val_images)}")

Dataset sizes: 
 - train: 451 
 - val: 129


In [5]:
# keyword arguments that are identical for the training and validation dataset
dataset_kwargs = dict(name = EM.dataset_type,
                      path = DATA_PATH,
                      n_classes = EM.n_classes,
                      transform = EM.transform)

# --- training data: random order, incomplete final batch dropped ---
train_set = datasets.getDataset(images = train_images, **dataset_kwargs)
train_sampler = torch.utils.data.RandomSampler(train_set)
train_loader = DataLoader(train_set,
                          batch_size = EM.batch_size_train,
                          sampler = train_sampler,
                          num_workers = 2,
                          drop_last = True)

# --- validation data: fixed sequential order, every sample kept ---
val_set = datasets.getDataset(images = val_images, **dataset_kwargs)
val_sampler = torch.utils.data.SequentialSampler(val_set)
val_loader = DataLoader(val_set,
                        batch_size = EM.batch_size_val,
                        sampler = val_sampler,
                        num_workers = 2)

## Model

The model can be selected from _vgg16_, _resnet_ and _unet_. Available loss functions for point-supervision are _lcfcn_ and _cob_.

In [6]:
# basic settings for model
# net_name is passed to models.getNet, loss_name to losses.getLoss and
# opt_name selects the optimizer branch in the next cell
EM.net_name = 'vgg16'
EM.opt_name = 'adam'
EM.loss_name = 'lcfcn'

# optimizer-specific settings
# forwarded to torch.optim.Adam as lr, betas and weight_decay respectively
EM.adam_learning_rate = 1e-5
# NOTE(review): beta1 = 0.99 differs from the torch.optim.Adam default of 0.9 -
# confirm this is intentional
EM.adam_betas = (0.99, 0.999)
EM.adam_decay = 0.0005 

In [7]:
# build the network for the configured number of classes and move it to the GPU
model = models.getNet(EM.net_name, EM.n_classes).cuda()

# loss function selected by name
criterion = losses.getLoss(EM.loss_name)

if EM.opt_name == 'adam':
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr = EM.adam_learning_rate,
                                 betas = EM.adam_betas,
                                 weight_decay = EM.adam_decay)
else:
    # fail right here: merely printing a message would leave `optimizer`
    # undefined and make a later cell crash with an unrelated NameError
    raise ValueError(f"Unsupported optimizer '{EM.opt_name}' - this notebook only configures 'adam'.")

## Run Management

Check if a previous run with the same ID exists and either load the last state dicts or move the run folder into the backup folder.

In [8]:
EM.begin()

if os.path.exists(EM.save_path):
    # normalise the answer so that e.g. "Load " is still recognised
    confirm = input("Run with same ID found - load, backup or cancel?: ").strip().lower()
    
    if confirm == 'load':
        # load state dicts
        checkpoint = torch.load(os.path.join(EM.save_path, 'checkpoint_last.pth'))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        # take epoch settings from manager
        # (pickle file - only load manager files written by this notebook)
        manager = io.loadPKL(os.path.join(EM.save_path, 'manager.pkl'))
        EM.resume(manager)
        print(f"Loaded previous run - continuing from epoch {EM.current_epoch}...")
   
    elif confirm == 'backup':
        # move existing folder to backup subfolder
        backup_root = os.path.join(SAVE_PATH, 'backup')
        os.makedirs(backup_root, exist_ok = True)
        # shutil.move() into an already existing directory would nest the run
        # inside the older backup (and fail on the next attempt), so pick the
        # first free name: <id>, <id>_1, <id>_2, ...
        backup_path = os.path.join(backup_root, EM.id)
        backup_index = 1
        while os.path.exists(backup_path):
            backup_path = os.path.join(backup_root, f"{EM.id}_{backup_index}")
            backup_index += 1
        shutil.move(EM.save_path, backup_path)
        print(f"Previous run moved to {backup_path}")
        print(f"Starting new run from epoch 0...")
    
    else:
        # the existing run folder is left untouched
        print("No action taken...")

## Main Epoch Loop

Each epoch consists of training, validation, updating the statistics, and saving the best as well as the most recent model and validation statistics.

In [9]:
EM.epochs = 5

# A resumed run already owns its folder, so an existing directory is only
# accepted when continuing (EM.current_epoch > 0); a fresh run still fails
# loudly instead of silently overwriting the checkpoints of an earlier run.
# (assumes EM.resume() leaves current_epoch > 0 after loading - TODO confirm)
os.makedirs(EM.save_path, exist_ok = EM.current_epoch > 0)

# output files of this run
last_checkpoint_path = os.path.join(EM.save_path, 'checkpoint_last.pth')
best_checkpoint_path = os.path.join(EM.save_path, 'checkpoint_best.pth')
manager_path = os.path.join(EM.save_path, 'manager.pkl')

for epoch in tqdm(range(EM.current_epoch, EM.epochs)):
    
    # Training Phase
    train_loss = trainer.trainModel(model, optimizer, train_loader, criterion)
    print(f"Training done with loss: {train_loss}")
    
    # Validation Phase
    val_loss = trainer.valModel(model, val_loader, criterion)
    print(f"Validation done with loss: {val_loss}")
    
    # update experiment manager with losses
    loss_dict = {'epoch': epoch, 'train': train_loss, 'val': val_loss}
    EM.loss_list += [loss_dict]
    print("\n", pd.DataFrame(EM.loss_list).tail(), "\n")
    
    # save model and optimizer state as the most recent checkpoint
    checkpoint = {'epoch': epoch, 'model': model.state_dict(), 'optimizer': optimizer.state_dict()}
    torch.save(checkpoint, last_checkpoint_path)
    
    # check if new best model
    # NOTE(review): "best" is decided on the training loss; selecting on
    # val_loss is the usual choice to avoid keeping an overfitted model - confirm
    if epoch == 0 or train_loss < EM.best_loss:
        shutil.copy(last_checkpoint_path, best_checkpoint_path)
        EM.best_loss = train_loss
    
    # persist the manager only after best_loss was updated, so the pickled
    # state is consistent with the checkpoints written for this epoch
    io.savePKL(manager_path, EM)
    print("Checkpoint saved... ")

print(f"Run completed!")

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/451 [00:00<?, ?it/s]

Training done with loss: 6.2771950854430445


  0%|          | 0/129 [00:00<?, ?it/s]

Validation done with loss: 4.002596998399542

    epoch     train       val
0      0  6.277195  4.002597 

Checkpoint saved... 


  0%|          | 0/451 [00:00<?, ?it/s]

Training done with loss: 6.027341973490831


  0%|          | 0/129 [00:00<?, ?it/s]

Validation done with loss: 3.9265935060589814

    epoch     train       val
0      0  6.277195  4.002597
1      1  6.027342  3.926594 

Checkpoint saved... 


  0%|          | 0/451 [00:00<?, ?it/s]

Training done with loss: 5.145236099440348


  0%|          | 0/129 [00:00<?, ?it/s]

Validation done with loss: 3.744081718630569

    epoch     train       val
0      0  6.277195  4.002597
1      1  6.027342  3.926594
2      2  5.145236  3.744082 

Checkpoint saved... 


  0%|          | 0/451 [00:00<?, ?it/s]

Training done with loss: 4.334796893606561


  0%|          | 0/129 [00:00<?, ?it/s]

Validation done with loss: 3.9450777063305065

    epoch     train       val
0      0  6.277195  4.002597
1      1  6.027342  3.926594
2      2  5.145236  3.744082
3      3  4.334797  3.945078 

Checkpoint saved... 


  0%|          | 0/451 [00:00<?, ?it/s]

Training done with loss: 3.6590152058044154


  0%|          | 0/129 [00:00<?, ?it/s]

Validation done with loss: 4.19382977270514

    epoch     train       val
0      0  6.277195  4.002597
1      1  6.027342  3.926594
2      2  5.145236  3.744082
3      3  4.334797  3.945078
4      4  3.659015  4.193830 

Checkpoint saved... 
Run completed!


In [None]:
####################### EXPERIMENT SETTINGS ################################
def _experiment(model_name, base, train_size = 'all', val_size = 'all'):
    """Return the settings dict of one experiment group.

    All groups share dataset, batch size, epoch count, optimizer and learning
    rate; only the model and the dataset sizes vary. Every call builds new
    dict objects, so groups never share mutable state.
    """
    return {"dataset": {'name': 'denmark', 'transform': 'rgb_normalize'},
            "model": {'name': model_name, 'base': base},
            "batch_size": 1,
            "max_epoch": 5,
            "dataset_size": {'train': train_size, 'val': val_size},
            "optimizer": 'adam',
            "lr": 1e-5}


# experiment groups keyed by name
EXP_GROUPS = {
    'denmark': _experiment('lcfcn', "fcn8_vgg16"),
    # same as 'denmark', but restricted to a handful of images
    'denmark_debug': _experiment('lcfcn', "fcn8_vgg16", train_size = 10, val_size = 5),
    'denmark_cob': _experiment('cob', "fcn8_vgg16"),
    'denmark_sup': _experiment('supervised', "unet"),
}