# MODEL TRAINING

The purpose of this notebook is to train different model learning configurations. Outputs (model checkpoints, configuration dictionary) are saved in the specified directory below. Data path should lead to output folder created during preprocessing.


In [3]:
import torch
import torchvision
from torch.backends import cudnn 
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
cudnn.benchmark = True # might speed up runtime

import os
import shutil
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import models
import losses
import datasets
from helpers import io, trainer, run_manager

In [4]:
EXP_ID = "buildings_points_256_10"

DATA_PATH = "/home/jovyan/work/processed/256x256"
SAVE_PATH = "/home/jovyan/work/runs"

EM = run_manager.Manager(EXP_ID, SAVE_PATH)
TB = SummaryWriter(EM.save_path)


## Data

Dataset class takes list of image names as _images_ parameter. These lists for training and validation sets are created in the next cell from .txt files created during preprocessing in the _image_sets_ folder.

Preporcessing assignes image names to files based on what label-information is available for each image. This is done to get a predictable output for the dataloader.


In [17]:
# basic settings for dataset
EM.object_type = "buildings"
EM.dataset_type = "denmark_points" # denmark_points, denmark_points_cob, denmark_shapes, 
EM.n_classes = 2 # 0: background, 1: object
EM.batch_size_train = 1
EM.batch_size_val = 1

# load image-lists from files
images_path = os.path.join(DATA_PATH, 'image_sets_'+EM.object_type, 'points.txt')
images_list_all = [name.replace("\n","") for name in io.readText(images_path)]
#images_path = os.path.join(DATA_PATH, 'image_sets_'+EM.object_type, 'points.txt')
#images_list_points = [name.replace("\n","") for name in io.readText(images_path)]
#images_list_no_points = list(set(images_list_all) - set(images_list_points))

train_size = round(len(images_list_all) * 0.8)
train_images = images_list[:train_size]
val_size = round(len(images_list) * 0.1)
val_images = images_list[train_size:(train_size + val_size)]

# create transformation object
transform_mean = [0.492, 0.475, 0.430] # from preprocessing
transform_std = [0.176, 0.173, 0.176]

EM.transform = transforms.Compose([transforms.ToTensor(),
                                   transforms.Normalize(mean = transform_mean, 
                                                        std = transform_std)])

print(f"Dataset sizes: \n - train: {len(train_images)} \n - val: {len(val_images)}")

Dataset sizes: 
 - train: 64228 
 - val: 13600


In [18]:
train_set = datasets.getDataset(name = EM.dataset_type,
                                path = DATA_PATH,
                                images = train_images,
                                object_type = EM.object_type,
                                n_classes = EM.n_classes,
                                transform = EM.transform)

train_sampler = torch.utils.data.RandomSampler(train_set)

train_loader = DataLoader(train_set, sampler = train_sampler,
                          batch_size = EM.batch_size_train, 
                          drop_last = True, num_workers = 2)

val_set = datasets.getDataset(name = EM.dataset_type,
                              path = DATA_PATH,
                              images = val_images,
                              object_type = EM.object_type,
                              n_classes = EM.n_classes,
                              transform = EM.transform)

val_sampler = torch.utils.data.SequentialSampler(val_set)

val_loader = DataLoader(val_set, sampler = val_sampler,
                        batch_size = EM.batch_size_val,
                        num_workers = 2)

print("Dataloaders ready...")

Dataloaders ready...


## Model

Model can be selected from _vgg16_, _resnet_ and _unet_. Available loss functions for point-supervision are _lcfcn_ and and _cob_. 

In [19]:
# basic settings for model
EM.type = 'point' # point, point_cob, supervised
EM.net_name = 'vgg16' # vgg16 for point & point_cob, unet for supervised (resnet available)
EM.opt_name = 'adam' # adam for point & point_cob, sgd for supervised
EM.loss_name = 'point' #  point, point_cob (custom) and BCELoss, CrossEntropy (stock)

# optimizer-specific settings
EM.adam_learning_rate = 1e-5
EM.adam_betas = (0.99, 0.999)
EM.adam_decay = 0.0005 

In [20]:
model = models.getNet(EM.net_name, EM.n_classes).cuda()

criterion = losses.getLoss(EM.loss_name)

if EM.opt_name == 'adam':
    optimizer = torch.optim.Adam(model.parameters(), lr = EM.adam_learning_rate, betas = EM.adam_betas, weight_decay = EM.adam_decay)
elif EM.opt_name == 'sgd':
    optimizer = torch.optim.SGD(model.parameters(), lr = EM.adam_learning_rate)
else:
    raise ValueError("not implemented, check settings")
print("Model ready...")

Model ready...


## Run Management

Check if a previous run with the same ID exists and either load the last state dicts or move the run folder into the backup folder.

In [21]:
EM.begin()

if os.path.exists(os.path.join(EM.save_path, 'checkpoint_last.pth')):
    confirm = input("Saved run with same ID found - load (l), rename (r) or cancel (c)?: ")
    
    if confirm == 'load' or confirm == 'l':
        # load state dicts
        checkpoint = torch.load(os.path.join(EM.save_path, 'checkpoint_last.pth'))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        # take epoch settings from manager
        manager = io.loadPKL(os.path.join(EM.save_path, 'manager.pkl'))
        EM.resume(manager)
        print(f"Loaded previous run - continuing from epoch {EM.current_epoch}...")
   
    elif confirm == 'rename' or confirm == 'r':
        # rename existing experiment
        TB.close()
        os.rename(EM.save_path, os.path.join(SAVE_PATH, EM.id+"_"+str(np.random.randint(100, 999))))
        TB = SummaryWriter(EM.save_path)
        print(f"Starting new run from epoch 0...")
    
    else:
        print("No action taken...")
else:
    print(f"Starting new run from epoch 0...")

Starting new run from epoch 0...


## Main Epoch Loop

Each epoch conists of training, validation, updating the statstics and saving the best as well as the most recent model and validation statistics

In [None]:
EM.epochs = 25
start_epoch = EM.current_epoch

for epoch in tqdm(range(start_epoch, EM.epochs)):
    
    # Training Phase
    train_loss = trainer.trainModel(model, optimizer, train_loader, criterion, EM.type)
    TB.add_scalar('training loss', train_loss, epoch)
    print(f"Training done with loss: {train_loss}")
    
    # Validation Phase
    val_loss = trainer.valModel(model, val_loader, criterion, EM.type)
    TB.add_scalar('validation loss', val_loss, epoch)
    print(f"Validation done with loss: {val_loss}")
    
    # update experiment manager with losses
    loss_dict = {'epoch': epoch+1, 'train': train_loss, 'val': val_loss}
    EM.loss_list += [loss_dict]
    EM.current_epoch = epoch
    print("\n", pd.DataFrame(EM.loss_list).tail(), "\n")
    
    # save model optimizer and manager as checkpoint
    checkpoint = {'epoch': epoch+1, 'model': model.state_dict(), 'optimizer': optimizer.state_dict()}
    torch.save(checkpoint, os.path.join(EM.save_path, 'checkpoint_last.pth'))
    io.savePKL(os.path.join(EM.save_path, 'manager.pkl'), EM)
    
    # check if new best model
    if epoch == 0 or val_loss < EM.best_loss:
        shutil.copy(os.path.join(EM.save_path, 'checkpoint_last.pth'), os.path.join(EM.save_path, 'checkpoint_best.pth'))
        EM.best_loss = val_loss
    print("Checkpoint saved... ")

print(f"Run completed!")
TB.close()

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/64228 [00:00<?, ?it/s]

# EXPLORATION REGION