In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import importlib
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

from model.model import get_model
from utils.gen_utils import get_logger

from datasets.Prisma3TT1T2Dataset import get_train_loaders
from model.losses import get_loss_criterion
from model.metrics import get_evaluation_metric
from model.trainer import get_trainer
from model.utils import get_number_of_learnable_parameters, get_tensorboard_formatter

# Notebook setup: create the logger, load the YAML experiment configuration
# and resolve the torch device that the rest of the notebook trains on.

# Create main logger
logger = get_logger('VaryTrainer')

# Top level configuration
import yaml
import config.runconfig as runconfig
c = runconfig.Config_Run()

# Load experiment configuration; the context manager closes the file handle
# even if YAML parsing fails.
with open(c.config_file, 'r') as config_stream:
    config = yaml.safe_load(config_stream)

# Use the device named in the configuration, otherwise the first GPU if one is
# available, otherwise the CPU.
DEFAULT_DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
device = config.get('device', DEFAULT_DEVICE)
config['device'] = torch.device(device)

# Only touch the CUDA runtime when a CUDA device was actually selected:
# torch.cuda.set_device fails on machines without CUDA, which would defeat the
# CPU fallback above.
if config['device'].type == 'cuda':
    # Follow the configured GPU index so the current CUDA device matches the
    # device the model is sent to; an index-less 'cuda' keeps the original
    # hard-coded choice of GPU 0.
    cuda_index = config['device'].index
    torch.cuda.set_device(0 if cuda_index is None else cuda_index)
print(config['device'])


cuda:0


In [2]:
def _create_optimizer(config, model):
    assert 'optimizer' in config, 'Cannot find optimizer configuration'
    optimizer_config = config['optimizer']
    learning_rate = optimizer_config['learning_rate']
    weight_decay = optimizer_config.get('weight_decay', None)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    return optimizer

def _create_lr_scheduler(config, optimizer):
    lr_config = config.get('lr_scheduler', None)
    if lr_config is None:
        # use ReduceLROnPlateau as a default scheduler
        return ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=20, verbose=True)
    else:
        class_name = lr_config.pop('name')
        m = importlib.import_module('torch.optim.lr_scheduler')
        clazz = getattr(m, class_name)
        # add optimizer to the config
        lr_config['optimizer'] = optimizer
        return clazz(**lr_config)


In [3]:
# Training driver: build every training component from `config` and hand them
# all to the trainer's fit() call at the end of the cell.

# Create the model
model = get_model(config)
# Move the model to the configured device (config['device'] is a torch.device)
logger.info(f"Sending the model to '{config['device']}'")
model = model.to(config['device'])

# Log the number of learnable parameters
logger.info(f'Number of learnable params {get_number_of_learnable_parameters(model)}')

# Create loss criterion
loss_criterion = get_loss_criterion(config)

# Create evaluation metric
eval_criterion = get_evaluation_metric(config)

# Create the optimizer
optimizer = _create_optimizer(config, model)

# Create learning rate adjustment strategy
lr_scheduler = _create_lr_scheduler(config, optimizer)

# Create data loaders from the training and validation patch CSV paths
loaders = get_train_loaders(config, c.train_patch_csv, c.val_patch_csv)

# Create model trainer
trainer = get_trainer(config)

# Get the tensorboard formatter; None is passed when the trainer section has
# no 'tensorboard_formatter' entry
trainer_config = config['trainer']
tensorboard_formatter = get_tensorboard_formatter(trainer_config.get('tensorboard_formatter', None))

# Start training
# NOTE(review): `device` is the raw value read from the configuration (or the
# DEFAULT_DEVICE string), not the torch.device stored in config['device'] --
# confirm that trainer.fit accepts that form.
trainer.fit(model=model, optimizer=optimizer, lr_scheduler=lr_scheduler, device=device, 
            loss_criterion=loss_criterion, eval_criterion=eval_criterion, loaders=loaders,
            logger=logger, checkpoint_dir = c.checkpointDir, tensorboard_formatter=tensorboard_formatter)

2021-07-19 10:55:45,322 [MainThread] INFO VaryTrainer - Sending the model to 'cuda:0'
2021-07-19 10:55:47,308 [MainThread] INFO VaryTrainer - Number of learnable params 9484459
2021-07-19 10:55:47,310 [MainThread] INFO Prisma3TT1T2Dataset - Creating training and validation set loaders...
2021-07-19 10:55:47,310 [MainThread] INFO Prisma3TT1T2Dataset - Number of workers for train/val dataloader: 2
2021-07-19 10:55:47,311 [MainThread] INFO Prisma3TT1T2Dataset - Batch size for train loader: 3
2021-07-19 10:55:47,311 [MainThread] INFO Prisma3TT1T2Dataset - Batch size for validation loader: 6
2021-07-19 10:55:47,327 [MainThread] INFO VaryTrainer - eval_score_higher_is_better: True
2021-07-19 10:55:47,327 [MainThread] INFO VaryTrainer - LabelfusionUNet3DLabelWeightFineTunningUNetPerChannelSkipMaskCoordMapsBothDeepSupervision4LevelReLu(
  (encoders): ModuleList(
    (0): DownConv(
      (conv1): Conv3d(7, 32, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
      (conv2): Conv3d(32,

2021-07-19 10:56:07,912 [MainThread] INFO VaryTrainer - Training iteration 7. Batch 6. Epoch [0/19]
2021-07-19 10:56:11,727 [MainThread] INFO VaryTrainer - Training iteration 8. Batch 7. Epoch [0/19]
2021-07-19 10:56:15,560 [MainThread] INFO VaryTrainer - Training iteration 9. Batch 8. Epoch [0/19]
2021-07-19 10:56:18,802 [MainThread] INFO VaryTrainer - Training iteration 10. Batch 9. Epoch [0/19]
2021-07-19 10:56:27,467 [MainThread] INFO VaryTrainer - Training stats. Loss: 1.0283604919910432. Evaluation score: 0.8591499485596708


invalid value encountered in true_divide


2021-07-19 10:56:28,194 [MainThread] INFO VaryTrainer - Training iteration 11. Batch 10. Epoch [0/19]
2021-07-19 10:56:31,352 [MainThread] INFO VaryTrainer - Training iteration 12. Batch 11. Epoch [0/19]
2021-07-19 10:56:34,979 [MainThread] INFO VaryTrainer - Validating...
2021-07-19 10:56:35,581 [MainThread] INFO VaryTrainer - Validation iteration 0
2021-07-19 10:56:44,087 [MainThread] INFO VaryTrainer - Validation iteration 1
2021-07-19 10:56:50,967 [MainThread] INFO VaryTrainer - Validation iteration 2
2021-07-19 10:56:58,483 [MainThread] INFO VaryTrainer - Validation iteration 3
2021-07-19 10:57:05,127 [MainThread] INFO VaryTrainer - Validation iteration 4
2021-07-19 10:57:11,894 [MainThread] INFO VaryTrainer - Validation iteration 5
2021-07-19 10:57:18,481 [MainThread] INFO VaryTrainer - Validation iteration 6
2021-07-19 10:57:25,143 [MainThread] INFO VaryTrainer - Validation iteration 7
2021-07-19 10:57:33,002 [MainThread] INFO VaryTrainer - Validation iteration 8
2021-07-19 10:5

KeyboardInterrupt: 