<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setting-up-imports" data-toc-modified-id="Setting-up-imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setting up imports</a></span></li><li><span><a href="#Setting-up-Constant-Hyperparameters" data-toc-modified-id="Setting-up-Constant-Hyperparameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Setting up Constant Hyperparameters</a></span></li><li><span><a href="#Setting-up-Parameters-and-Functions-for-Training" data-toc-modified-id="Setting-up-Parameters-and-Functions-for-Training-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Setting up Parameters and Functions for Training</a></span><ul class="toc-item"><li><span><a href="#Hyperparameters-Search-Space" data-toc-modified-id="Hyperparameters-Search-Space-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Hyperparameters Search Space</a></span></li><li><span><a href="#Creating-the-training-function" data-toc-modified-id="Creating-the-training-function-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Creating the training function</a></span></li><li><span><a href="#Creating-the-evaluation-function" data-toc-modified-id="Creating-the-evaluation-function-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Creating the evaluation function</a></span></li></ul></li><li><span><a href="#Running-the-training" data-toc-modified-id="Running-the-training-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Running the training</a></span><ul class="toc-item"><li><span><a href="#Loading-data-for-training" data-toc-modified-id="Loading-data-for-training-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Loading data for training</a></span></li><li><span><a href="#Configuring-the-Tuner-with-a-Scheduler-and-a-Search-Algorithm" data-toc-modified-id="Configuring-the-Tuner-with-a-Scheduler-and-a-Search-Algorithm-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Configuring the Tuner with a Scheduler and a Search Algorithm</a></span></li><li><span><a href="#Running-the-Tuner" data-toc-modified-id="Running-the-Tuner-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Running the Tuner</a></span></li></ul></li><li><span><a href="#Evaluating-the-best-Results" data-toc-modified-id="Evaluating-the-best-Results-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Evaluating the best Results</a></span></li></ul></div>

# Setting up imports

In [1]:
import os
from itertools import product

import torch
from torch.nn import CrossEntropyLoss, Sequential
from torch.nn.functional import normalize
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader
from torchvision.transforms import CenterCrop, Resize, GaussianBlur
# from torchvision.transforms.functional import invert

import ray
from ray import tune
from ray.air import session, RunConfig, CheckpointConfig
from ray.air.checkpoint import Checkpoint
from ray.tune.schedulers import ASHAScheduler
# from ray.tune.search.hyperopt import HyperOptSearch
# from ray.tune.search.optuna import OptunaSearch
# from ray.tune.search import ConcurrencyLimiter


from dataset import POCDataReader, data_augment_, POCDataset
from metrics import Metrics, EvaluationMetrics
from models import UNet
from loss import *
from pipelines import InputPipeline, SequenceFilters, SumFilters
from pipelines.filters import *
from train import training_loop, validation_loop
from train_tqdm import evaluation_loop


# Setting up Constant Hyperparameters

In [2]:
EPOCHS = 20
NUM_SAMPLES = 1

NUM_AUGMENT = 1

LOAD_DATA_ON_GPU = False
GPUS_PER_TRIAL = 1
CPUS_PER_TRIAL = 4

##### Selecting Cuda device

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


# Setting up Parameters and Functions for Training

## Hyperparameters Search Space

##### Preload Losses Functions

In [4]:
pixel_loss_list = [
    CrossEntropyLoss(weight=torch.tensor([.3, .7])),
    FocalLoss(weight=torch.tensor([.3, .7]), gamma=2),
]

volume_loss_list = [
    JaccardLoss(),
    TverskyLoss(alpha=0.3, beta=0.7),
    FocalTverskyLoss(alpha=0.3, beta=0.7, gamma=2),
]

loss_list = []
for ploss in pixel_loss_list:
    loss_list.append(PixelLoss(pixel_loss=ploss, volume_loss=None))
for vloss in volume_loss_list:
    loss_list.append(VolumeLoss(pixel_loss=None, volume_loss=vloss))
for (ploss, vloss) in product(pixel_loss_list, volume_loss_list):
    loss_list.append(CombinedLoss(loss1=ploss, loss2=vloss, ratio=0.3))
    loss_list.append(BorderedLoss(border_loss=ploss, volume_loss=vloss, ratio=0.7))

##### Preload Filter

In [5]:
filter_list = [normalize] #, invert]

layer_list = [
    SobelFilter(),
    LaplacianFilter(),
    FrangiFilter(),
    SatoFilter(),
    SumFilters(FrangiFilter(), SatoFilter()),
    SkeletonFilter(SequenceFilters(SumFilters(FrangiFilter(), SatoFilter()), CrackBinaryFilter())),
]

pipeline_list = []
for f, l in product(filter_list, layer_list):
    pipeline_list.append(InputPipeline(transformer=f, layer_transformer=l))

##### Search Space

In [6]:
search_space = {
    "Network": UNet, #tune.grid_search([Unet, DeepCrack, SubUNet]),
    "Optimizer": Adam,

    "Learning Rate": 1e-4, #tune.loguniform(1e-6, 1e-4),
    "Batch Size": 4,           #tune.qrandint(2, 8, 2),

    "Loss Function": tune.grid_search(loss_list),

    "Negative Mining": True, #tune.choice([True, False]),
    "Smooth Labeling": False, #tune.choice([True, False]),

    "Input Pipeline": tune.grid_search(pipeline_list),
}

## Creating the training function

In [7]:
def train(config, train_data, val_data):

    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    inpip = config["Input Pipeline"]
    if LOAD_DATA_ON_GPU:
        inpip = inpip.to(device)

    train_dataset = POCDataset(
        train_data,
        transform=Sequential(inpip, CenterCrop(size=(480, 480)), Resize(size=(400, 400))),
        target_transform= Sequential(
            GaussianBlur(kernel_size=3, sigma=0.7),
            CenterCrop(size=(480, 480)),
            Resize(size=(400, 400)),
        ) if config["Smooth Labeling"] else Sequential(
            CenterCrop(size=(480, 480)),
            Resize(size=(400, 400)),
        ),
        negative_mining=config["Negative Mining"],
        load_on_gpu=LOAD_DATA_ON_GPU)
    train_dataset.precompute_transform()

    if LOAD_DATA_ON_GPU:
        training_dataloader = DataLoader(
            train_dataset,
            batch_size=int(config["Batch Size"]),
            sampler=train_dataset.sampler,
            shuffle= True if train_dataset.sampler is None else None,
        )
    else:
        training_dataloader = DataLoader(
            train_dataset,
            batch_size=int(config["Batch Size"]),
            sampler=train_dataset.sampler,
            shuffle= True if train_dataset.sampler is None else None,
            num_workers=CPUS_PER_TRIAL//2,
            pin_memory=True,
            pin_memory_device=device)

    val_dataset = POCDataset(
        val_data, 
        transform=Sequential(inpip, CenterCrop(size=(480, 480)), Resize(size=(400, 400))),
        target_transform=Sequential(CenterCrop(size=(480, 480)), Resize(size=(400, 400))),
        negative_mining=False,
        load_on_gpu=LOAD_DATA_ON_GPU)
    val_dataset.precompute_transform()
    
    if LOAD_DATA_ON_GPU:
        validation_dataloader = DataLoader(
            val_dataset,
            batch_size=int(config["Batch Size"]),
            shuffle=True)
    else:
        validation_dataloader = DataLoader(
            val_dataset,
            batch_size=int(config["Batch Size"]),
            shuffle=True,
            num_workers=CPUS_PER_TRIAL//2,
            pin_memory=True,
            pin_memory_device=device)

    model = config["Network"](n_channels=inpip.nb_channel, n_classes=2)
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)

    loss_fn = config["Loss Function"].to(device)
    optimizer = config["Optimizer"](model.parameters(), lr=config["Learning Rate"], betas=(0.9, 0.99))
    lr_scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)

    loaded_checkpoint = session.get_checkpoint()
    if loaded_checkpoint:
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            model_state, optimizer_state, scheduler_state = torch.load(os.path.join(loaded_checkpoint_dir, "checkpoint.pt"))
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)
        lr_scheduler.load_state_dict(scheduler_state)

    train_metrics = Metrics(
        buffer_size=len(training_dataloader),
        mode="Training",
        hyperparam=config,
        device=device)

    val_metrics = Metrics(
        buffer_size=len(validation_dataloader),
        mode="Validation",
        hyperparam=config,
        device=device)


    for epoch in range(1, EPOCHS+1):  # loop over the dataset multiple times
        training_loop(epoch, training_dataloader, model, loss_fn, optimizer, lr_scheduler, train_metrics, device)
        validation_loop(epoch, validation_dataloader, model, loss_fn, val_metrics, device)

        # Here we save a checkpoint. It is automatically registered with
        # Ray Tune and can be accessed through `session.get_checkpoint()`
        # API in future iterations.
        os.makedirs("model", exist_ok=True)
        torch.save((model.state_dict(), optimizer.state_dict(), lr_scheduler.state_dict()), "model/checkpoint.pt")
        checkpoint = Checkpoint.from_directory("model")
        session.report(metrics=val_metrics.get_metrics(epoch), checkpoint=checkpoint)

    train_metrics.close_tensorboard()
    val_metrics.close_tensorboard()


## Creating the evaluation function

In [8]:
def evaluate(test_data, result):

    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    inpip = config["Input Pipeline"]
    if LOAD_DATA_ON_GPU:
        inpip = inpip.to(device)

    test_dataset = POCDataset(
        test_data,
        transform=Sequential(inpip, CenterCrop(size=(480, 480)), Resize(size=(400, 400))),
        target_transform=Sequential(CenterCrop(size=(480, 480)), Resize(size=(400, 400))),
        negative_mining=False,
        load_on_gpu=LOAD_DATA_ON_GPU)
    
    if LOAD_DATA_ON_GPU:
        evaluation_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True)
    else:
        evaluation_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True, num_workers=2*CPUS_PER_TRIAL, pin_memory=True, pin_memory_device=device)

    best_trained_model = result.config["Network"](n_channels=inpip.nb_channel, n_classes=2, bilinear=True, crop=False).to(device)

    checkpoint_path = os.path.join(result.checkpoint.to_directory(), "checkpoint.pt")
    model_state, _, _ = torch.load(checkpoint_path)
    best_trained_model.load_state_dict(model_state)

    test_metrics = EvaluationMetrics(
        buffer_size=len(evaluation_dataloader),
        hyperparam=result.config,
        epochs=result.metrics["Epoch"],
        device=device)

    evaluation_loop(dataloader=evaluation_dataloader, model=best_trained_model, metric=test_metrics, device=device)


# Running the training

## Loading data for training

In [9]:
data_reader = POCDataReader(root_dir="../data/POC", load_on_gpu=False, verbose=True)
train_data, val_data, test_data = data_reader.split([0.7, 0.1, 0.2])
data_augment_(train_data, n=NUM_AUGMENT, load_on_gpu=False, verbose=True)

Loading dataset into RAM:   0%|          | 0/2744 [00:00<?, ?it/s]

	- Loading done, RAM used: 5.40GiB / free: 34.27GiB / total: 62.73GiB
	- Got a total of 2744 images.


Expending the dataset 1 more times:   0%|          | 0/1920 [00:00<?, ?it/s]

	- Augmentation done, RAM used: 8.16GiB / free: 31.51GiB / total: 62.73GiB
	- Got 1920 new images and a total of 3840 images.


## Configuring the Tuner with a Scheduler and a Search Algorithm

In [10]:
# scheduler = ASHAScheduler(max_t=EPOCHS, grace_period=2, reduction_factor=2)
# search_algo = HyperOptSearch()
# search_algo = OptunaSearch()

tune_config = tune.TuneConfig(
    metric="CrackIoU",
    mode="max",
    num_samples=NUM_SAMPLES,
#     scheduler=scheduler,
#     search_alg=search_algo,
)

tuner = tune.Tuner(
    tune.with_resources(
        tune.with_parameters(train, train_data=train_data, val_data=val_data),
        resources={"cpu": CPUS_PER_TRIAL, "gpu": GPUS_PER_TRIAL}),
    tune_config=tune_config,
    param_space=search_space,
    run_config=RunConfig(
        local_dir="~/Documents/POC-Project/ray_results",
        checkpoint_config=CheckpointConfig(
            num_to_keep=1,
            checkpoint_score_attribute="CrackIoU",
            checkpoint_score_order="max")))

## Running the Tuner

In [11]:
results = tuner.fit()

2023-04-07 12:34:05,842	INFO worker.py:1544 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


0,1
Current time:,2023-04-10 01:42:58
Running for:,"2 days, 13:08:43.35"
Memory:,28.9/62.7 GiB

Trial name,# failures,error file
train_12206_00057,1,"/home/piai/Documents/POC-Project/ray_results/train_2023-04-07_12-33-56/train_12206_00057_57_Input_Pipeline=InputPipeline_normalize_SatoFilter,Loss_Function=CombinedLoss_CrossEntropyLoss_FocalTverskyLos_2023-04-09_19-05-22/error.txt"
train_12206_00058,1,"/home/piai/Documents/POC-Project/ray_results/train_2023-04-07_12-33-56/train_12206_00058_58_Input_Pipeline=InputPipeline_normalize_SumFilters_FrangiFilter_SatoFilter,Loss_Function=CombinedLoss_CrossEnt_2023-04-09_19-59-08/error.txt"
train_12206_00059,1,"/home/piai/Documents/POC-Project/ray_results/train_2023-04-07_12-33-56/train_12206_00059_59_Input_Pipeline=InputPipeline_normalize_SkeletonFilter,Loss_Function=CombinedLoss_CrossEntropyLoss_FocalTversk_2023-04-09_19-59-30/error.txt"
train_12206_00060,1,"/home/piai/Documents/POC-Project/ray_results/train_2023-04-07_12-33-56/train_12206_00060_60_Input_Pipeline=InputPipeline_normalize_SobelFilter,Loss_Function=BorderedLoss_CrossEntropyLoss_FocalTverskyLo_2023-04-09_19-59-30/error.txt"
train_12206_00061,1,"/home/piai/Documents/POC-Project/ray_results/train_2023-04-07_12-33-56/train_12206_00061_61_Input_Pipeline=InputPipeline_normalize_LaplacianFilter,Loss_Function=BorderedLoss_CrossEntropyLoss_FocalTvers_2023-04-09_20-01-34/error.txt"
train_12206_00062,1,"/home/piai/Documents/POC-Project/ray_results/train_2023-04-07_12-33-56/train_12206_00062_62_Input_Pipeline=InputPipeline_normalize_FrangiFilter,Loss_Function=BorderedLoss_CrossEntropyLoss_FocalTverskyL_2023-04-09_20-05-59/error.txt"
train_12206_00063,1,"/home/piai/Documents/POC-Project/ray_results/train_2023-04-07_12-33-56/train_12206_00063_63_Input_Pipeline=InputPipeline_normalize_SatoFilter,Loss_Function=BorderedLoss_CrossEntropyLoss_FocalTverskyLos_2023-04-09_20-21-39/error.txt"
train_12206_00064,1,"/home/piai/Documents/POC-Project/ray_results/train_2023-04-07_12-33-56/train_12206_00064_64_Input_Pipeline=InputPipeline_normalize_SumFilters_FrangiFilter_SatoFilter,Loss_Function=BorderedLoss_CrossEnt_2023-04-09_20-30-15/error.txt"
train_12206_00065,1,"/home/piai/Documents/POC-Project/ray_results/train_2023-04-07_12-33-56/train_12206_00065_65_Input_Pipeline=InputPipeline_normalize_SkeletonFilter,Loss_Function=BorderedLoss_CrossEntropyLoss_FocalTversk_2023-04-09_20-34-16/error.txt"
train_12206_00066,1,"/home/piai/Documents/POC-Project/ray_results/train_2023-04-07_12-33-56/train_12206_00066_66_Input_Pipeline=InputPipeline_normalize_SobelFilter,Loss_Function=CombinedLoss_FocalLoss_JaccardLoss_2023-04-09_20-57-11/error.txt"

Trial name,status,loc,Input Pipeline,Loss Function,iter,total time (s),Epoch,Loss,CrackIoU
train_12206_00000,TERMINATED,141.223.107.22:16771,InputPipeline(n_f100,PixelLoss:Cross_d5d0,20.0,5857.44,20.0,0.0976571,0.74874
train_12206_00001,TERMINATED,141.223.107.22:16816,InputPipeline(n_d720,PixelLoss:Cross_d2a0,20.0,6606.41,20.0,0.0977389,0.739624
train_12206_00002,TERMINATED,141.223.107.22:16771,InputPipeline(n_e860,PixelLoss:Cross_cb50,20.0,6662.15,20.0,0.097758,0.738588
train_12206_00003,TERMINATED,141.223.107.22:16816,InputPipeline(n_e890,PixelLoss:Cross_ea40,20.0,7076.28,20.0,0.0977079,0.743115
train_12206_00004,TERMINATED,141.223.107.22:16771,InputPipeline(n_b250,PixelLoss:Cross_e0e0,20.0,7366.22,20.0,0.0977144,0.734599
train_12206_00005,TERMINATED,141.223.107.22:16816,InputPipeline(n_b100,PixelLoss:Cross_b670,20.0,8263.78,20.0,0.0977378,0.738433
train_12206_00006,TERMINATED,141.223.107.22:16771,InputPipeline(n_bd00,PixelLoss:FocalLoss,20.0,5894.06,20.0,0.00770159,0.73222
train_12206_00007,TERMINATED,141.223.107.22:16816,InputPipeline(n_8430,PixelLoss:FocalLoss,20.0,6634.1,20.0,0.00773644,0.729173
train_12206_00008,TERMINATED,141.223.107.22:16771,InputPipeline(n_91b0,PixelLoss:FocalLoss,20.0,6689.24,20.0,0.00777491,0.718509
train_12206_00009,TERMINATED,141.223.107.22:16816,InputPipeline(n_a2c0,PixelLoss:FocalLoss,20.0,7089.16,20.0,0.00775036,0.727848


Trial name,date,experiment_id,hostname,node_ip,pid,timestamp,trial_id
train_12206_00000,2023-04-07_14-11-57,a6b2cb5999574c94930a0ce71be3c4ef,piai-Precision-Tower-7910,141.223.107.22,16771,1680844317,12206_00000
train_12206_00001,2023-04-07_14-24-30,3341b9c1035f4e63a28414608ccda31f,piai-Precision-Tower-7910,141.223.107.22,16816,1680845070,12206_00001
train_12206_00002,2023-04-07_16-02-59,a6b2cb5999574c94930a0ce71be3c4ef,piai-Precision-Tower-7910,141.223.107.22,16771,1680850979,12206_00002
train_12206_00003,2023-04-07_16-22-27,3341b9c1035f4e63a28414608ccda31f,piai-Precision-Tower-7910,141.223.107.22,16816,1680852147,12206_00003
train_12206_00004,2023-04-07_18-05-46,a6b2cb5999574c94930a0ce71be3c4ef,piai-Precision-Tower-7910,141.223.107.22,16771,1680858346,12206_00004
train_12206_00005,2023-04-07_18-40-11,3341b9c1035f4e63a28414608ccda31f,piai-Precision-Tower-7910,141.223.107.22,16816,1680860411,12206_00005
train_12206_00006,2023-04-07_19-44-00,a6b2cb5999574c94930a0ce71be3c4ef,piai-Precision-Tower-7910,141.223.107.22,16771,1680864240,12206_00006
train_12206_00007,2023-04-07_20-30-45,3341b9c1035f4e63a28414608ccda31f,piai-Precision-Tower-7910,141.223.107.22,16816,1680867045,12206_00007
train_12206_00008,2023-04-07_21-35-30,a6b2cb5999574c94930a0ce71be3c4ef,piai-Precision-Tower-7910,141.223.107.22,16771,1680870930,12206_00008
train_12206_00009,2023-04-07_22-28-55,3341b9c1035f4e63a28414608ccda31f,piai-Precision-Tower-7910,141.223.107.22,16816,1680874135,12206_00009


2023-04-07 14:11:57,700	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'Input Pipeline': InputPipeline(
  (layer_transformer): ModuleList(
    (0): SobelFilter
  )
), 'Loss Function': PixelLoss(
  (pixel_loss): CrossEntropyLoss()
)}
2023-04-07 14:24:30,765	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'Input Pipeline': InputPipeline(
  (layer_transformer): ModuleList(
    (0): LaplacianFilter
  )
), 'Loss Function': PixelLoss(
  (pixel_loss): CrossEntropyLoss()
)}
2023-04-07 16:03:00,167	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'Input Pipeline': InputPipeline(
  (layer_transformer): ModuleList(
    (0): FrangiFilter
  )
), 'Loss Function': PixelLoss(
  (pixel_loss): CrossEntropyLoss()
)}
2023-04-07 16:22:27,309	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'Inpu

2023-04-08 18:02:18,800	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'Input Pipeline': InputPipeline(
  (layer_transformer): ModuleList(
    (0): SkeletonFilter
  )
), 'Loss Function': VolumeLoss(
  (volume_loss): FocalTverskyLoss
)}
2023-04-08 18:56:29,492	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'Input Pipeline': InputPipeline(
  (layer_transformer): ModuleList(
    (0): SobelFilter
  )
), 'Loss Function': CombinedLoss(
  (loss1): CrossEntropyLoss()
  (loss2): JaccardLoss
)}
2023-04-08 19:42:53,969	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'Input Pipeline': InputPipeline(
  (layer_transformer): ModuleList(
    (0): LaplacianFilter
  )
), 'Loss Function': CombinedLoss(
  (loss1): CrossEntropyLoss()
  (loss2): JaccardLoss
)}
2023-04-08 20:57:22,230	INFO tensorboardx.py:267 -- Removed the following hyperparamete

2023-04-09 19:05:22,110	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'Input Pipeline': InputPipeline(
  (layer_transformer): ModuleList(
    (0): LaplacianFilter
  )
), 'Loss Function': CombinedLoss(
  (loss1): CrossEntropyLoss()
  (loss2): FocalTverskyLoss
)}
2023-04-09 19:59:08,093	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'Input Pipeline': InputPipeline(
  (layer_transformer): ModuleList(
    (0): FrangiFilter
  )
), 'Loss Function': CombinedLoss(
  (loss1): CrossEntropyLoss()
  (loss2): FocalTverskyLoss
)}
[2m[33m(raylet)[0m [2023-04-09 19:59:11,939 E 16313 16313] (raylet) node_manager.cc:3040: 1 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: f23f2b0e98a5dbe3f362b7b0159eafdd21a7c11a8d8772aa42c80af4, IP: 141.223.107.22) over the last time period. To see more information about the Workers killed on t

[2m[36m(train pid=31198)[0m   return torch._C._cuda_getDeviceCount() > 0
[2m[36m(train pid=31196)[0m   return torch._C._cuda_getDeviceCount() > 0
[2m[33m(raylet)[0m [2023-04-09 20:00:11,941 E 16313 16313] (raylet) node_manager.cc:3040: 3 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: f23f2b0e98a5dbe3f362b7b0159eafdd21a7c11a8d8772aa42c80af4, IP: 141.223.107.22) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 141.223.107.22`
[2m[33m(raylet)[0m 
[2m[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker k

[2m[36m(train pid=31857)[0m   return torch._C._cuda_getDeviceCount() > 0
2023-04-09 20:30:14,261	ERROR trial_runner.py:1062 -- Trial train_12206_00059: Error processing event.
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc.train()[39m (pid=31196, ip=141.223.107.22, repr=train)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/piai/anaconda3/

[2m[36m(train pid=32653)[0m   return torch._C._cuda_getDeviceCount() > 0
2023-04-09 20:59:39,476	ERROR trial_runner.py:1062 -- Trial train_12206_00066: Error processing event.
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc.train()[39m (pid=32653, ip=141.223.107.22, repr=train)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/piai/anaconda3/

[2m[36m(train pid=631)[0m   return torch._C._cuda_getDeviceCount() > 0
2023-04-09 21:17:25,638	ERROR trial_runner.py:1062 -- Trial train_12206_00069: Error processing event.
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc.train()[39m (pid=631, ip=141.223.107.22, repr=train)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/piai/anaconda3/envs

[2m[36m(train pid=1579)[0m   return torch._C._cuda_getDeviceCount() > 0
2023-04-09 21:47:05,672	ERROR trial_runner.py:1062 -- Trial train_12206_00072: Error processing event.
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc.train()[39m (pid=1579, ip=141.223.107.22, repr=train)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/piai/anaconda3/en

[2m[36m(train pid=1981)[0m   return torch._C._cuda_getDeviceCount() > 0
2023-04-09 22:03:56,622	ERROR trial_runner.py:1062 -- Trial train_12206_00075: Error processing event.
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc.train()[39m (pid=1981, ip=141.223.107.22, repr=train)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/piai/anaconda3/en

[2m[36m(train pid=3273)[0m   return torch._C._cuda_getDeviceCount() > 0
2023-04-09 22:33:34,374	ERROR trial_runner.py:1062 -- Trial train_12206_00078: Error processing event.
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc.train()[39m (pid=3273, ip=141.223.107.22, repr=train)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/piai/anaconda3/en

[2m[36m(train pid=3662)[0m   return torch._C._cuda_getDeviceCount() > 0
2023-04-09 22:50:40,120	ERROR trial_runner.py:1062 -- Trial train_12206_00081: Error processing event.
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc.train()[39m (pid=3662, ip=141.223.107.22, repr=train)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/piai/anaconda3/en

[2m[36m(train pid=4526)[0m   return torch._C._cuda_getDeviceCount() > 0
2023-04-09 23:20:14,935	ERROR trial_runner.py:1062 -- Trial train_12206_00084: Error processing event.
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc.train()[39m (pid=4526, ip=141.223.107.22, repr=train)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/piai/anaconda3/en

[2m[36m(train pid=4934)[0m   return torch._C._cuda_getDeviceCount() > 0
2023-04-09 23:37:21,795	ERROR trial_runner.py:1062 -- Trial train_12206_00087: Error processing event.
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc.train()[39m (pid=4934, ip=141.223.107.22, repr=train)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/piai/anaconda3/en

[2m[36m(train pid=5796)[0m   return torch._C._cuda_getDeviceCount() > 0
2023-04-10 00:06:51,723	ERROR trial_runner.py:1062 -- Trial train_12206_00090: Error processing event.
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc.train()[39m (pid=5796, ip=141.223.107.22, repr=train)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/piai/anaconda3/en

[2m[36m(train pid=6292)[0m   return torch._C._cuda_getDeviceCount() > 0
2023-04-10 00:24:11,725	ERROR trial_runner.py:1062 -- Trial train_12206_00093: Error processing event.
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc.train()[39m (pid=6292, ip=141.223.107.22, repr=train)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/piai/anaconda3/en

[2m[36m(train pid=7144)[0m   return torch._C._cuda_getDeviceCount() > 0
2023-04-10 00:53:46,348	ERROR trial_runner.py:1062 -- Trial train_12206_00096: Error processing event.
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc.train()[39m (pid=7144, ip=141.223.107.22, repr=train)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/piai/anaconda3/en

[2m[36m(train pid=7531)[0m   return torch._C._cuda_getDeviceCount() > 0
2023-04-10 01:11:04,328	ERROR trial_runner.py:1062 -- Trial train_12206_00099: Error processing event.
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc.train()[39m (pid=7531, ip=141.223.107.22, repr=train)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/piai/anaconda3/en

2023-04-10 01:42:58,767	ERROR trial_runner.py:1062 -- Trial train_12206_00101: Error processing event.
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc.train()[39m (pid=7920, ip=141.223.107.22, repr=train)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/piai/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 3

# Evaluating the best Results

In [12]:
best_result = results.get_best_result(metric="CrackIoU", mode="max", scope="all")  # Get best result object
print("Best trial config: {}".format(best_result.config))
print("Best trial final validation loss: {}".format(best_result.metrics["Loss"]))
print("Best trial final validation CrackIoU: {}".format(best_result.metrics["CrackIoU"]))

for result in results:
    evaluate(test_data=test_data, result=result)

Best trial config: {'Network': <class 'models.unet.UNet'>, 'Optimizer': <class 'torch.optim.adam.Adam'>, 'Learning Rate': 0.0001, 'Batch Size': 4, 'Loss Function': BorderedLoss(
  (border_loss): CrossEntropyLoss()
  (volume_loss): JaccardLoss
), 'Negative Mining': True, 'Smooth Labeling': False, 'Input Pipeline': InputPipeline(
  (layer_transformer): ModuleList(
    (0): SobelFilter
  )
)}
Best trial final validation loss: 0.08369708806276321
Best trial final validation CrackIoU: 0.7667993903160095


NameError: name 'config' is not defined