<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setting-up-imports" data-toc-modified-id="Setting-up-imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setting up imports</a></span></li><li><span><a href="#Setting-up-Constant-Hyperparameters" data-toc-modified-id="Setting-up-Constant-Hyperparameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Setting up Constant Hyperparameters</a></span></li><li><span><a href="#Setting-up-Parameters-and-Functions-for-Training" data-toc-modified-id="Setting-up-Parameters-and-Functions-for-Training-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Setting up Parameters and Functions for Training</a></span><ul class="toc-item"><li><span><a href="#Hyperparameters-Search-Space" data-toc-modified-id="Hyperparameters-Search-Space-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Hyperparameters Search Space</a></span></li><li><span><a href="#Creating-the-training-function" data-toc-modified-id="Creating-the-training-function-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Creating the training function</a></span></li><li><span><a href="#Creating-the-evaluation-function" data-toc-modified-id="Creating-the-evaluation-function-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Creating the evaluation function</a></span></li></ul></li><li><span><a href="#Running-the-training" data-toc-modified-id="Running-the-training-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Running the training</a></span><ul class="toc-item"><li><span><a href="#Loading-data-for-training" data-toc-modified-id="Loading-data-for-training-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Loading data for training</a></span></li><li><span><a href="#Configuring-the-Tuner-with-a-Scheduler-and-a-Search-Algorithm" data-toc-modified-id="Configuring-the-Tuner-with-a-Scheduler-and-a-Search-Algorithm-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Configuring the Tuner with a Scheduler and a Search 
Algorithm</a></span></li><li><span><a href="#Running-the-Tuner" data-toc-modified-id="Running-the-Tuner-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Running the Tuner</a></span></li></ul></li><li><span><a href="#Evaluating-the-best-Results" data-toc-modified-id="Evaluating-the-best-Results-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Evaluating the best Results</a></span></li></ul></div>

# Setting up imports

In [1]:
import os
from itertools import product

import torch
from torch.nn import CrossEntropyLoss, Sequential
from torch.nn.functional import normalize
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader
from torchvision.transforms import CenterCrop, Resize, GaussianBlur
# from torchvision.transforms.functional import invert

import ray
from ray import tune
from ray.air import session, RunConfig, CheckpointConfig
from ray.air.checkpoint import Checkpoint
from ray.tune.schedulers import ASHAScheduler
# from ray.tune.search.hyperopt import HyperOptSearch
from ray.tune.search.optuna import OptunaSearch
from ray.tune.search import ConcurrencyLimiter


from dataset import POCDataReader, data_augment_, POCDataset
from metrics import Metrics, EvaluationMetrics
from models import UNet, DeepCrack, SubUNet
from loss import *
from pipelines import InputPipeline, SequenceFilters, SumFilters
from pipelines.filters import *
from train import training_loop, validation_loop
from train_tqdm import evaluation_loop


# Setting up Constant Hyperparameters

In [2]:
# --- Search budget -----------------------------------------------------------
# Number of hyperparameter configurations sampled by the tuner.
NUM_SAMPLES = 150
# Upper bound on training epochs per trial (also used as the scheduler horizon).
EPOCHS = 20

# --- Data --------------------------------------------------------------------
# Value passed as `n` to `data_augment_` for the training split.
NUM_AUGMENT = 1
# When True the datasets are asked to keep their tensors on the GPU and the
# DataLoaders are created without worker processes / pinned memory.
LOAD_DATA_ON_GPU = True

# --- Resources requested from Ray for every trial ------------------------------
CPUS_PER_TRIAL = 20
GPUS_PER_TRIAL = 1

##### Selecting CUDA device

In [3]:
# Use the GPU whenever PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


# Setting up Parameters and Functions for Training

## Hyperparameters Search Space

##### Preload Loss Functions

In [None]:
# Candidate per-pixel losses; both use the class weights [0.3, 0.7].
pixel_loss_list = [
    CrossEntropyLoss(weight=torch.tensor([.3, .7])),
    FocalLoss(weight=torch.tensor([.3, .7]), gamma=2),
]

# Candidate region/overlap ("volume") losses.
volume_loss_list = [
    JaccardLoss(),
    TverskyLoss(alpha=0.3, beta=0.7),
    FocalTverskyLoss(alpha=0.3, beta=0.7, gamma=2),
]

# Full list of loss candidates, in this order:
#   1. every pixel loss on its own,
#   2. every volume loss on its own,
#   3. for every (pixel, volume) pair: a CombinedLoss followed by a BorderedLoss.
loss_list = [PixelLoss(pixel_loss=pixel, volume_loss=None) for pixel in pixel_loss_list]
loss_list += [VolumeLoss(pixel_loss=None, volume_loss=volume) for volume in volume_loss_list]
for ploss, vloss in product(pixel_loss_list, volume_loss_list):
    loss_list.extend((
        CombinedLoss(loss1=ploss, loss2=vloss, ratio=0.3),
        BorderedLoss(border_loss=ploss, volume_loss=vloss, ratio=0.7),
    ))

##### Preload Pipeline

In [None]:
# Image-level transformers applied by the input pipeline (`invert` is disabled).
filter_list = [normalize]

# Candidate extra-layer transformers.
layer_list = [
    SobelFilter(),
    LaplacianFilter(),
    FrangiFilter(),
    SatoFilter(),
    SumFilters(FrangiFilter(), SatoFilter()),
    SkeletonFilter(
        SequenceFilters(
            SumFilters(FrangiFilter(), SatoFilter()),
            CrackBinaryFilter(),
        )
    ),
]

# One pipeline per (filter, layer) combination, in `product` order.
pipeline_list = [
    InputPipeline(transformer=image_filter, layer_transformer=layer)
    for image_filter, layer in product(filter_list, layer_list)
]

# Baseline pipeline without any extra layer.
# NOTE(review): here `transformer` receives a list, while the pipelines above
# receive the bare callable -- confirm InputPipeline accepts both forms.
no_layer_pip = InputPipeline(transformer=[normalize], layer_transformer=None)

##### Search Space

In [4]:
# Parameter space handed to the Tuner. Plain values are fixed for every trial;
# `tune.*` entries are sampled by the search algorithm. Keys are read by `train`
# and `evaluate`, so their spelling must stay in sync with those functions.
search_space = {}

# Model and optimiser (alternative considered: grid search over
# [UNet, DeepCrack, SubUNet]).
search_space.update({
    "Network": SubUNet,
    "Optimizer": Adam,
})

# Optimisation.
search_space.update({
    "Learning Rate": tune.loguniform(1e-6, 1e-3),
    "Batch Size": 4,
})

# Loss: the combiner class is instantiated from a pixel loss and a volume loss.
# "Loss Pixel_weight" = w becomes the class-weight tensor [w, 1 - w] in `train`.
# (Alternative considered: grid search over the preloaded `loss_list`.)
search_space.update({
    "Loss Combiner": BorderedLoss,
    "Loss Combiner_ratio": tune.uniform(0, 1),
    "Loss Volume": JaccardLoss,
    "Loss Pixel": FocalLoss,
    "Loss Pixel_gamma": tune.uniform(0, 5),
    "Loss Pixel_weight": tune.uniform(0, 1),
})

# Dataset options (alternative considered: tune.choice([True, False]) for each).
search_space.update({
    "Negative Mining": False,
    "Smooth Labeling": False,
})

# Input pipeline (alternative considered: grid search over `pipeline_list`).
search_space.update({
    "Pipe Filter": normalize,
    "Pipe Layer": LaplacianFilter,
    "Pipe Layer_threshold": tune.loguniform(0.5, 5),
})

## Creating the training function

In [5]:
def train(config, train_data, val_data):
    """Ray Tune trainable: train one configuration and report validation metrics.

    Args:
        config: Trial configuration. Keys read here: "Pipe Filter", "Pipe Layer",
            "Pipe Layer_threshold", "Smooth Labeling", "Negative Mining",
            "Batch Size", "Network", "Loss Pixel", "Loss Pixel_weight",
            "Loss Pixel_gamma", "Loss Volume", "Loss Combiner",
            "Loss Combiner_ratio", "Optimizer" and "Learning Rate".
        train_data: Training samples, forwarded to `POCDataset`.
        val_data: Validation samples, forwarded to `POCDataset`.

    Side effects:
        After every epoch writes "model/checkpoint.pt" (relative to the current
        working directory) containing the tuple
        (model state, optimizer state, LR-scheduler state) and reports it to
        Ray together with the validation metrics of that epoch.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    crop_size = (480, 480)
    output_size = (384, 384)

    inpip = InputPipeline(
        transformer=config["Pipe Filter"],
        layer_transformer=config["Pipe Layer"](threshold=config["Pipe Layer_threshold"]))
    if LOAD_DATA_ON_GPU:
        inpip = inpip.to(device)

    # Target transform: optionally blur the labels before the crop/resize.
    target_steps = [CenterCrop(size=crop_size), Resize(size=output_size)]
    if config["Smooth Labeling"]:
        target_steps.insert(0, GaussianBlur(kernel_size=3, sigma=0.7))

    train_dataset = POCDataset(
        train_data,
        transform=Sequential(inpip, CenterCrop(size=crop_size), Resize(size=output_size)),
        target_transform=Sequential(*target_steps),
        negative_mining=config["Negative Mining"],
        load_on_gpu=LOAD_DATA_ON_GPU)
    train_dataset.precompute_transform()

    val_dataset = POCDataset(
        val_data,
        transform=Sequential(inpip, CenterCrop(size=crop_size), Resize(size=output_size)),
        target_transform=Sequential(CenterCrop(size=crop_size), Resize(size=output_size)),
        negative_mining=False,
        load_on_gpu=LOAD_DATA_ON_GPU)
    val_dataset.precompute_transform()

    # Options shared by both loaders. Worker processes and pinned memory are
    # only requested when the data is NOT already resident on the GPU.
    loader_kwargs = {"batch_size": int(config["Batch Size"])}
    if not LOAD_DATA_ON_GPU:
        loader_kwargs.update(
            num_workers=CPUS_PER_TRIAL // 2,
            pin_memory=True,
            pin_memory_device=device)

    # `sampler` and `shuffle` are mutually exclusive in DataLoader, so shuffling
    # is only enabled when the dataset does not provide its own sampler.
    training_dataloader = DataLoader(
        train_dataset,
        sampler=train_dataset.sampler,
        shuffle=True if train_dataset.sampler is None else None,
        **loader_kwargs)
    validation_dataloader = DataLoader(val_dataset, shuffle=True, **loader_kwargs)

    # `network` is the bare module; `model` is what the loops call and may be a
    # DataParallel wrapper around it. Checkpoints always go through `network`
    # so that their keys carry no "module." prefix and can be loaded into an
    # unwrapped model (as `evaluate` does).
    network = config["Network"](n_channels=inpip.nb_channel, n_classes=2)
    model = network
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        # Fully qualified: `nn` is not imported by name in this notebook.
        model = torch.nn.DataParallel(network)
    model.to(device)

    pixel_weight = config["Loss Pixel_weight"]
    pixel_loss_fn = config["Loss Pixel"](
        weight=torch.tensor([pixel_weight, 1.0 - pixel_weight]),
        gamma=config["Loss Pixel_gamma"])
    volume_loss_fn = config["Loss Volume"]()
    loss_fn = config["Loss Combiner"](
        border_loss=pixel_loss_fn,
        volume_loss=volume_loss_fn,
        ratio=config["Loss Combiner_ratio"]).to(device)

    optimizer = config["Optimizer"](model.parameters(), lr=config["Learning Rate"], betas=(0.9, 0.99))
    lr_scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)

    # Restore state when Ray hands us a checkpoint of this trial.
    # NOTE(review): the epoch counter is not stored in the checkpoint, so a
    # restored trial still runs the full range of epochs below -- confirm that
    # this is the intended behaviour.
    loaded_checkpoint = session.get_checkpoint()
    if loaded_checkpoint:
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            model_state, optimizer_state, scheduler_state = torch.load(
                os.path.join(loaded_checkpoint_dir, "checkpoint.pt"),
                map_location=device)
        network.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)
        lr_scheduler.load_state_dict(scheduler_state)

    train_metrics = Metrics(
        buffer_size=len(training_dataloader),
        mode="Training",
        hyperparam=config,
        device=device)

    val_metrics = Metrics(
        buffer_size=len(validation_dataloader),
        mode="Validation",
        hyperparam=config,
        device=device)

    try:
        for epoch in range(1, EPOCHS + 1):  # loop over the dataset multiple times
            training_loop(epoch, training_dataloader, model, loss_fn, optimizer, lr_scheduler, train_metrics, device)
            validation_loop(epoch, validation_dataloader, model, loss_fn, val_metrics, device)

            # Save a checkpoint and register it with Ray Tune together with the
            # metrics; it can later be retrieved via `session.get_checkpoint()`.
            os.makedirs("model", exist_ok=True)
            torch.save(
                (network.state_dict(), optimizer.state_dict(), lr_scheduler.state_dict()),
                "model/checkpoint.pt")
            checkpoint = Checkpoint.from_directory("model")
            session.report(metrics=val_metrics.get_metrics(epoch), checkpoint=checkpoint)
    finally:
        # Close the writers even when an epoch raises, so a failing trial does
        # not leave them open.
        train_metrics.close_tensorboard()
        val_metrics.close_tensorboard()


## Creating the evaluation function

In [6]:
def evaluate(test_data, result):
    """Evaluate the checkpointed model of one tuning result on the test data.

    Args:
        test_data: Test samples, forwarded to `POCDataset`.
        result: A Ray Tune result. Its `config` supplies "Pipe Filter",
            "Pipe Layer", "Pipe Layer_threshold" and "Network"; its `metrics`
            must contain "Epoch"; its `checkpoint` must hold "checkpoint.pt"
            as written by `train`.

    Raises:
        ValueError: If `result` has no checkpoint (for example because the
            trial failed before its first report).
    """
    if result.checkpoint is None:
        raise ValueError("Cannot evaluate a result that has no checkpoint.")

    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    inpip = InputPipeline(
        transformer=result.config["Pipe Filter"],
        layer_transformer=result.config["Pipe Layer"](threshold=result.config["Pipe Layer_threshold"]))
    if LOAD_DATA_ON_GPU:
        inpip = inpip.to(device)

    # NOTE(review): unlike `train`, `precompute_transform()` is not called on
    # this dataset -- confirm that POCDataset applies the transforms lazily.
    test_dataset = POCDataset(
        test_data,
        transform=Sequential(inpip, CenterCrop(size=(480, 480)), Resize(size=(384, 384))),
        target_transform=Sequential(CenterCrop(size=(480, 480)), Resize(size=(384, 384))),
        negative_mining=False,
        load_on_gpu=LOAD_DATA_ON_GPU)

    # Worker processes and pinned memory are only requested when the data is
    # not already resident on the GPU.
    loader_kwargs = {"batch_size": 1, "shuffle": True}
    if not LOAD_DATA_ON_GPU:
        loader_kwargs.update(
            num_workers=2 * CPUS_PER_TRIAL,
            pin_memory=True,
            pin_memory_device=device)
    evaluation_dataloader = DataLoader(test_dataset, **loader_kwargs)

    best_trained_model = result.config["Network"](n_channels=inpip.nb_channel, n_classes=2).to(device)

    # `as_directory()` gives temporary access to the checkpoint files, matching
    # how `train` reads checkpoints. `map_location` lets a checkpoint written
    # on a GPU be loaded on whatever device is available here.
    with result.checkpoint.as_directory() as checkpoint_dir:
        model_state, _, _ = torch.load(
            os.path.join(checkpoint_dir, "checkpoint.pt"),
            map_location=device)
    best_trained_model.load_state_dict(model_state)

    test_metrics = EvaluationMetrics(
        buffer_size=len(evaluation_dataloader),
        hyperparam=result.config,
        epochs=result.metrics["Epoch"],
        device=device)

    evaluation_loop(dataloader=evaluation_dataloader, model=best_trained_model, metric=test_metrics, device=device)


# Running the training

## Loading data for training

In [7]:
# Read the whole dataset into host memory (not onto the GPU).
data_reader = POCDataReader(
    root_dir="../data/POC",
    load_on_gpu=False,
    verbose=True,
)

# Split fractions, in order: train / validation / test.
train_data, val_data, test_data = data_reader.split([0.7, 0.1, 0.2])

# Augment the training split only.
# NOTE(review): the trailing underscore and the unused return value suggest the
# augmentation modifies `train_data` in place -- confirm in `dataset`.
data_augment_(
    train_data,
    n=NUM_AUGMENT,
    load_on_gpu=False,
    verbose=True,
)

Loading dataset into RAM:   0%|          | 0/2744 [00:00<?, ?it/s]

	- Loading done, RAM used: 4.60GiB / free: 113.33GiB / total: 125.40GiB
	- Got a total of 2744 images.


Expending the dataset 1 more times:   0%|          | 0/1920 [00:00<?, ?it/s]

	- Augmentation done, RAM used: 7.32GiB / free: 110.61GiB / total: 125.40GiB
	- Got 1920 new images and a total of 3840 images.


## Configuring the Tuner with a Scheduler and a Search Algorithm

In [8]:
# Scheduler: every trial runs at least 2 epochs and at most EPOCHS.
scheduler = ASHAScheduler(
    max_t=EPOCHS,
    grace_period=2,
    reduction_factor=2,
)

# Search algorithm proposing new configurations (HyperOptSearch is the
# alternative that was tried).
search_algo = OptunaSearch()

# Tuning objective: maximise the reported "CrackIoU" metric.
tune_config = tune.TuneConfig(
    metric="CrackIoU",
    mode="max",
    num_samples=NUM_SAMPLES,
    scheduler=scheduler,
    search_alg=search_algo,
    max_concurrent_trials=4,
)

# Bind the datasets to the training function and declare per-trial resources.
trainable = tune.with_resources(
    tune.with_parameters(train, train_data=train_data, val_data=val_data),
    resources={"cpu": CPUS_PER_TRIAL, "gpu": GPUS_PER_TRIAL},
)

# Keep a single checkpoint per trial, ranked by "CrackIoU".
checkpoint_config = CheckpointConfig(
    num_to_keep=1,
    checkpoint_score_attribute="CrackIoU",
    checkpoint_score_order="max",
    checkpoint_at_end=False,
)

run_config = RunConfig(
    local_dir="~/POC-Project/ray_results",
    checkpoint_config=checkpoint_config,
)

tuner = tune.Tuner(
    trainable,
    tune_config=tune_config,
    param_space=search_space,
    run_config=run_config,
)

## Running the Tuner

In [None]:
# Launch the hyperparameter search; `results` holds one entry per trial and is
# consumed by the evaluation cell below.
results = tuner.fit()

2023-04-27 11:42:08,207	INFO worker.py:1544 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
  return ot.distributions.LogUniformDistribution(
  return ot.distributions.UniformDistribution(
[32m[I 2023-04-27 11:42:21,522][0m A new study created in memory with name: optuna[0m


0,1
Current time:,2023-04-27 11:51:57
Running for:,00:09:35.84
Memory:,41.4/125.4 GiB

Trial name,# failures,error file
train_08b85916,1,"/home/pirl/POC-Project/ray_results/train_2023-04-27_11-41-57/train_08b85916_1_Batch_Size=4,Learning_Rate=0.0001,Loss_Combiner=class_loss_combination_loss_BorderedLoss,Loss_Combiner_ratio=0.54_2023-04-27_11-42-21/error.txt"
train_eca6c353,1,"/home/pirl/POC-Project/ray_results/train_2023-04-27_11-41-57/train_eca6c353_2_Batch_Size=4,Learning_Rate=0.0006,Loss_Combiner=class_loss_combination_loss_BorderedLoss,Loss_Combiner_ratio=0.84_2023-04-27_11-42-25/error.txt"
train_dfda29b0,1,"/home/pirl/POC-Project/ray_results/train_2023-04-27_11-41-57/train_dfda29b0_3_Batch_Size=4,Learning_Rate=0.0000,Loss_Combiner=class_loss_combination_loss_BorderedLoss,Loss_Combiner_ratio=0.36_2023-04-27_11-46-17/error.txt"
train_2ce3e7d3,1,"/home/pirl/POC-Project/ray_results/train_2023-04-27_11-41-57/train_2ce3e7d3_4_Batch_Size=4,Learning_Rate=0.0000,Loss_Combiner=class_loss_combination_loss_BorderedLoss,Loss_Combiner_ratio=0.61_2023-04-27_11-46-21/error.txt"

Trial name,status,loc,Learning Rate,Loss Combiner_ratio,Loss Pixel_gamma,Loss Pixel_weight,Pipe Layer_threshold
train_92b4549e,RUNNING,141.223.108.122:36942,0.000326549,0.617568,3.77372,0.0795479,1.05296
train_dd276671,RUNNING,141.223.108.122:37019,7.26246e-06,0.625032,1.15126,0.764014,3.06962
train_7532e275,PENDING,,0.000458685,0.794631,0.74034,0.731942,1.26741
train_08b85916,ERROR,141.223.108.122:36499,5.07095e-05,0.542386,3.97579,0.247624,1.22112
train_eca6c353,ERROR,141.223.108.122:36584,0.000603518,0.845697,3.6948,0.326855,3.95367
train_dfda29b0,ERROR,141.223.108.122:36729,2.00492e-06,0.361901,0.0504258,0.965534,0.569752
train_2ce3e7d3,ERROR,141.223.108.122:36806,2.54547e-06,0.619572,2.39281,0.534353,0.597239


2023-04-27 11:46:16,643	ERROR trial_runner.py:1062 -- Trial train_08b85916: Error processing event.
ray.exceptions.RayTaskError(KeyError): [36mray::ImplicitFunc.train()[39m (pid=36499, ip=141.223.108.122, repr=train)
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 398, i

Trial name,date,experiment_id,hostname,node_ip,pid,timestamp,trial_id
train_08b85916,2023-04-27_11-42-25,1b8a5914f5e840d388f461ebaf61e94f,pirl-PowerEdge-T640,141.223.108.122,36499,1682563345,08b85916
train_2ce3e7d3,2023-04-27_11-46-24,a1dba36a845643d6bc320c6d23ae6c66,pirl-PowerEdge-T640,141.223.108.122,36806,1682563584,2ce3e7d3
train_dfda29b0,2023-04-27_11-46-20,1b90afffccbe4859a73ab30fa82e055c,pirl-PowerEdge-T640,141.223.108.122,36729,1682563580,dfda29b0
train_eca6c353,2023-04-27_11-42-28,09fac4c6f281406088583dd30abe0e33,pirl-PowerEdge-T640,141.223.108.122,36584,1682563348,eca6c353


2023-04-27 11:46:20,599	ERROR trial_runner.py:1062 -- Trial train_eca6c353: Error processing event.
ray.exceptions.RayTaskError(KeyError): [36mray::ImplicitFunc.train()[39m (pid=36584, ip=141.223.108.122, repr=train)
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 398, i

# Evaluating the best Results

In [None]:
# Pick the trial with the highest "CrackIoU". With scope="all" every reported
# iteration of a trial is considered, not only its last one.
best_result = results.get_best_result(metric="CrackIoU", mode="max", scope="all")  # Get best result object
print("Best trial config: {}".format(best_result.config))
print("Best trial final validation loss: {}".format(best_result.metrics["Loss"]))
print("Best trial final validation CrackIoU: {}".format(best_result.metrics["CrackIoU"]))

# Evaluate every trial on the test split. Trials that failed, or that never
# produced a checkpoint, cannot be evaluated and are skipped rather than
# aborting the whole loop.
for result in results:
    if result.error is not None or result.checkpoint is None:
        print("Skipping trial without a usable checkpoint (error: {})".format(result.error))
        continue
    evaluate(test_data=test_data, result=result)