<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setting-up-imports" data-toc-modified-id="Setting-up-imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setting up imports</a></span></li><li><span><a href="#Setting-up-Constant-Hyperparameters" data-toc-modified-id="Setting-up-Constant-Hyperparameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Setting up Constant Hyperparameters</a></span></li><li><span><a href="#Setting-up-Parameters-and-Functions-for-Training" data-toc-modified-id="Setting-up-Parameters-and-Functions-for-Training-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Setting up Parameters and Functions for Training</a></span><ul class="toc-item"><li><span><a href="#Hyperparameters-Search-Space" data-toc-modified-id="Hyperparameters-Search-Space-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Hyperparameters Search Space</a></span></li><li><span><a href="#Creating-the-training-function" data-toc-modified-id="Creating-the-training-function-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Creating the training function</a></span></li><li><span><a href="#Creating-the-evaluation-function" data-toc-modified-id="Creating-the-evaluation-function-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Creating the evaluation function</a></span></li></ul></li><li><span><a href="#Running-the-training" data-toc-modified-id="Running-the-training-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Running the training</a></span><ul class="toc-item"><li><span><a href="#Loading-data-for-training" data-toc-modified-id="Loading-data-for-training-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Loading data for training</a></span></li><li><span><a href="#Configuring-the-Tuner-with-a-Scheduler-and-a-Search-Algorithm" data-toc-modified-id="Configuring-the-Tuner-with-a-Scheduler-and-a-Search-Algorithm-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Configuring the Tuner with a Scheduler and a Search Algorithm</a></span></li><li><span><a href="#Running-the-Tuner" data-toc-modified-id="Running-the-Tuner-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Running the Tuner</a></span></li></ul></li><li><span><a href="#Evaluating-the-best-Results" data-toc-modified-id="Evaluating-the-best-Results-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Evaluating the best Results</a></span></li></ul></div>

# Setting up imports

In [1]:
import os

import torch
from torch.nn import CrossEntropyLoss
from torch.nn.functional import normalize
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader
from torchvision.transforms import GaussianBlur
from torchvision.transforms.functional import invert

import ray
from ray import tune
from ray.air import session
from ray.air.checkpoint import Checkpoint
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch


from Dataset import POCDataReader, data_augment_, POCDataset
from metrics import Metrics, EvaluationMetrics
from models import UNet
from loss import *
from pipelines import *
from train import training_loop, validation_loop
from train_tqdm import evaluation_loop


# Setting up Constant Hyperparameters

In [6]:
EPOCHS = 15
NUM_SAMPLES = 30
NUM_MODEL_TEST = 10

NUM_AUGMENT = 1

LOAD_DATA_ON_GPU = True
GPUS_PER_TRIAL = 1
CPUS_PER_TRIAL = 20

##### Selecting Cuda device

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


# Setting up Parameters and Functions for Training

##### Setting up the loss function sampler

In [None]:
def loss_fn_sampler():
    pixel_losses_list = [
        CrossEntropyLoss(weight=torch.tensor([.3, .7])), 
        FocalLoss(weight=torch.tensor([.3, .7]), gamma=2)
    ]
    volume_losses_list = [
        JaccardLoss(),
        TverskyLoss(alpha=0.3, beta=0.7),
        FocalTverskyLoss(alpha=0.3, beta=0.7, gamma=2)
    ]
    loss_combinators_list = [ CombinedLoss, BorderedLoss ]

    complete_list = pixel_losses_list + volume_losses_list

    for combinator in loss_combinators_list:
        complete_list += [combinator(loss1, loss2) for loss1 in pixel_losses_list for loss2 in volume_losses_list]

    return complete_list


## Hyperparameters Search Space

In [2]:
search_space = {
    "Network": UNet,
    "Optimizer": Adam,
    
    "Learning Rate": 1e-4,   #tune.qloguniform(1e-5, 1e-2, 5e-6),
    "Batch Size": 4,         #tune.qrandint(2, 8, 2),


    "Pixel Loss": tune.choice([CrossEntropyLoss(weight=torch.tensor([.3, .7])), FocalLoss(weight=torch.tensor([.3, .7]), gamma=2)]),
    "Volume Loss": tune.choice([JaccardLoss(), TverskyLoss(alpha=0.3, beta=0.7), FocalTverskyLoss(alpha=0.3, beta=0.7, gamma=2)]),
    "Combine Loss": tune.choice([CombinedLoss, BorderedLoss, PixelLoss, VolumeLoss]),
    
    "Negative Mining": tune.choice([True, False]),
    "Smooth Labeling": tune.choice([True, False]),

    "Input Filter": tune.choice([None, invert]),
    "Input Layer": tune.choice([None, LaplacianFilter(), SobelFilter()]),   #, DINOFilter()]),
}

Using cache found in /home/pirl/.cache/torch/hub/facebookresearch_dino_main


## Creating the training function

In [8]:
def train(config, train_data, val_data):

    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    inpip = InputPipeline(
        transformer=[normalize, config["Input Filter"]] if config["Input Filter"] is not None else normalize, 
        layer_transformer=config["Input Layer"])
    if LOAD_DATA_ON_GPU:
        inpip = inpip.to(device)

    train_dataset = POCDataset(
        train_data,
        transform=inpip,
        target_transform= GaussianBlur(kernel_size=3, sigma=0.7) if config["Smooth Labeling"] else None,
        negative_mining=config["Negative Mining"])
#     train_dataset.precompute_transform(LOAD_DATA_ON_GPU)

    if LOAD_DATA_ON_GPU:
        training_dataloader = DataLoader(
            train_dataset,
            batch_size=int(config["Batch Size"]),
            sampler=train_dataset.sampler)
    else:
        training_dataloader = DataLoader(
            train_dataset,
            batch_size=int(config["Batch Size"]),
            sampler=train_dataset.sampler,
            num_workers=CPUS_PER_TRIAL//2,
            pin_memory=True,
            pin_memory_device=device)

    val_dataset = POCDataset(val_data, transform=inpip, target_transform=None, negative_mining=False)
#     val_dataset.precompute_transform(LOAD_DATA_ON_GPU)
    
    if LOAD_DATA_ON_GPU:
        validation_dataloader = DataLoader(
            val_dataset,
            batch_size=int(config["Batch Size"]),
            shuffle=True)
    else:
        validation_dataloader = DataLoader(
            val_dataset,
            batch_size=int(config["Batch Size"]),
            shuffle=True,
            num_workers=CPUS_PER_TRIAL//2,
            pin_memory=True,
            pin_memory_device=device)

    model = config["Network"](n_channels=inpip.nb_channel, n_classes=2, bilinear=True, crop=False)
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)

    loss_fn = config["Combine Loss"](config["Pixel Loss"], config["Volume Loss"]).to(device)
    optimizer = config["Optimizer"](model.parameters(), lr=config["Learning Rate"], betas=(0.9, 0.99))
    lr_scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS//2)

    # To restore a checkpoint, use `session.get_checkpoint()`.
    loaded_checkpoint = session.get_checkpoint()
    if loaded_checkpoint:
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            model_state, optimizer_state, scheduler_state = torch.load(os.path.join(loaded_checkpoint_dir, "checkpoint.pt"))
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)
        lr_scheduler.load_state_dict(scheduler_state)

    train_metrics = Metrics(
        buffer_size=len(training_dataloader),
        mode="Training",
        hyperparam=config,
        device=device)

    val_metrics = Metrics(
        buffer_size=len(validation_dataloader),
        mode="Validation",
        hyperparam=config,
        device=device)


    for epoch in range(1, EPOCHS+1):  # loop over the dataset multiple times
        training_loop(epoch, training_dataloader, model, loss_fn, optimizer, lr_scheduler, train_metrics, device)
        validation_loop(epoch, validation_dataloader, model, loss_fn, val_metrics, device)

        # Here we save a checkpoint. It is automatically registered with
        # Ray Tune and can be accessed through `session.get_checkpoint()`
        # API in future iterations.
        os.makedirs("model", exist_ok=True)
        torch.save((model.state_dict(), optimizer.state_dict(), lr_scheduler.state_dict()), "model/checkpoint.pt")
        checkpoint = Checkpoint.from_directory("model")
        session.report(metrics=val_metrics.get_metrics(epoch), checkpoint=checkpoint)

    train_metrics.close_tensorboard()
    val_metrics.close_tensorboard()


## Creating the evaluation function

In [9]:
def evaluate(test_data, best_result):

    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    inpip = InputPipeline(
        transformer=[normalize, best_result.config["Input Filter"]] if best_result.config["Input Filter"] is not None else normalize, 
        layer_transformer=best_result.config["Input Layer"])
    if LOAD_DATA_ON_GPU:
        inpip = inpip.to(device)

    test_dataset = POCDataset(test_data, transform=inpip, target_transform=None, negative_mining=False)
    
    if LOAD_DATA_ON_GPU:
        evaluation_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True)
    else:
        evaluation_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True, num_workers=20, pin_memory=True, pin_memory_device=device)

    best_trained_model = best_result.config["Network"](n_channels=inpip.nb_channel, n_classes=2, bilinear=True, crop=False).to(device)

    checkpoint_path = os.path.join(best_result.checkpoint.to_directory(), "checkpoint.pt")
    model_state, _, _ = torch.load(checkpoint_path)
    best_trained_model.load_state_dict(model_state)
    
    loss_fn = best_result.config["Combine Loss"](best_result.config["Pixel Loss"], best_result.config["Volume Loss"]).to(device)

    test_metrics = EvaluationMetrics(
        buffer_size=len(evaluation_dataloader),
        hyperparam=best_result.config,
        device=device)

    evaluation_loop(dataloader=evaluation_dataloader, model=best_trained_model, metric=test_metrics, device=device)

def evaluate_df(test_data, results_df):

    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    results_df.sort_values("CrackIoU", ascending=False, inplace=True)
    for index, res in results_df.head(NUM_MODEL_TEST).iterrows():
        
        inpip = InputPipeline(
            transformer=eval(res["config/Input Filter"]),
            layer_transformer=eval(res["config/Input Layer"]))
        if LOAD_DATA_ON_GPU:
            inpip = inpip.to(device)
        
        test_dataset = POCDataset(test_data, transform=inpip, target_transform=None, negative_mining=False)

        if LOAD_DATA_ON_GPU:
            evaluation_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True)
        else:
            evaluation_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True, num_workers=20, pin_memory=True, pin_memory_device=device)

        trained_model = eval(res["config/Network"])(n_channels=inpip.nb_channel, n_classes=2, bilinear=True, crop=False).to(device)

        checkpoint_path = os.path.join(res["logdir"], "model/checkpoint.pt")
        model_state, _, _ = torch.load(checkpoint_path)
        trained_model.load_state_dict(model_state)

        hyperparam = {
            "Network": str(res["config/Network"]),
            "Optimizer": str(res["config/Optimizer"]),
            "Learning Rate": str(res["config/Learning Rate"]),
            "Batch Size": str(res["config/Batch Size"]),
            "Pixel Loss": str(res["config/Pixel Loss"]),
            "Volume Loss": str(res["config/Volume Loss"]),
            "Combine Loss": str(res["config/Combine Loss"]),
            "Negative Mining": str(res["config/Negative Mining"]),
            "Smooth Labeling": str(res["config/Smooth Labeling"]),
            "Input Filter": str(res["config/Input Filter"]),
            "Input Layer": str(res["config/Input Layer"]),
        }
        
        test_metrics = EvaluationMetrics(
            buffer_size=len(evaluation_dataloader),
            hyperparam=hyperparam,
            device=device)

        evaluation_loop(dataloader=evaluation_dataloader, model=trained_model, metric=test_metrics, device=device)


SyntaxError: invalid syntax (2132963852.py, line 66)

# Running the training

## Loading data for training

In [10]:
data_reader = POCDataReader(root_dir="../data", load_on_gpu=LOAD_DATA_ON_GPU)
train_data, val_data, test_data = data_reader.split([0.7, 0.1, 0.2])
data_augment_(train_data, n=NUM_AUGMENT, load_on_gpu=LOAD_DATA_ON_GPU)

Loading dataset into GPU:   0%|          | 0/2744 [00:00<?, ?it/s]

	- Loading done, GPU memory used: 3.14GiB / free: 17.97GiB / total: 22.17GiB
	- Got a total of 2744 images.


Expending the dataset 1 more times:   0%|          | 0/1920 [00:00<?, ?it/s]

	- Augmentation done, GPU memory used: 5.34GiB / free: 15.24GiB / total: 22.17GiB
	- Got 1920 new images and a total of 3840 images.


## Configuring the Tuner with a Scheduler and a Search Algorithm

In [11]:
scheduler = ASHAScheduler(max_t=EPOCHS, grace_period=2, reduction_factor=2)
search_algo = HyperOptSearch()

tune_config = tune.TuneConfig(
    metric="CrackIoU",
    mode="max",
    num_samples=NUM_SAMPLES,
    scheduler=scheduler,
    search_alg=search_algo)

tuner = tune.Tuner(
    tune.with_resources(
        tune.with_parameters(train, train_data=train_data, val_data=val_data),
        resources={"cpu": CPUS_PER_TRIAL, "gpu": GPUS_PER_TRIAL}),
    tune_config=tune_config,
    param_space=search_space)

## Running the Tuner

In [12]:
results = tuner.fit()

2023-03-21 10:47:11,831	INFO worker.py:1544 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


0,1
Current time:,2023-03-21 10:57:14
Running for:,00:09:46.40
Memory:,32.2/125.4 GiB

Trial name,# failures,error file
train_eb3587eb,1,"/home/pirl/ray_results/train_2023-03-21_10-46-58/train_eb3587eb_1_Batch_Size=4,Combine_Loss=class_loss_loss_BorderedLoss,Input_Filter=None,Input_Layer=None,Learning_Rate=0.0001,Ne_2023-03-21_10-47-28/error.txt"
train_9424845c,1,"/home/pirl/ray_results/train_2023-03-21_10-46-58/train_9424845c_2_Batch_Size=4,Combine_Loss=class_loss_loss_BorderedLoss,Input_Filter=function_invert_at_0x7f5ab4363250,Input_Layer_2023-03-21_10-47-31/error.txt"

Trial name,status,loc,Batch Size,Combine Loss,Input Filter,Input Layer,Learning Rate,Negative Mining,Network,Optimizer,Pixel Loss,Smooth Labeling,Volume Loss
train_94231a90,RUNNING,141.223.108.122:31130,4,<class 'loss.lo_9eb0,<function inver_3250,LaplacianFilter,0.0001,True,<class 'models._52b0,<class 'torch.o_7f60,FocalLoss,False,TverskyLoss
train_79af376c,RUNNING,141.223.108.122:31214,4,<class 'loss.lo_9eb0,,,0.0001,False,<class 'models._52b0,<class 'torch.o_7f60,CrossEntropyLoss(),True,TverskyLoss
train_8c59e872,PENDING,,4,<class 'loss.lo_a270,<function inver_3250,,0.0001,False,<class 'models._52b0,<class 'torch.o_7f60,FocalLoss,True,JaccardLoss
train_eb3587eb,ERROR,141.223.108.122:30760,4,<class 'loss.lo_9af0,,,0.0001,True,<class 'models._52b0,<class 'torch.o_7f60,FocalLoss,True,JaccardLoss
train_9424845c,ERROR,141.223.108.122:30848,4,<class 'loss.lo_9af0,<function inver_3250,SobelFilter,0.0001,True,<class 'models._52b0,<class 'torch.o_7f60,FocalLoss,False,TverskyLoss


2023-03-21 10:53:57,949	ERROR trial_runner.py:1062 -- Trial train_9424845c: Error processing event.
ray.exceptions.RayTaskError(KeyError): [36mray::ImplicitFunc.train()[39m (pid=30848, ip=141.223.108.122, repr=train)
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 398, i

Trial name,date,experiment_id,hostname,node_ip,pid,timestamp,trial_id
train_9424845c,2023-03-21_10-47-34,1c1dc2aa18254557b263dcf9664be9fb,pirl-PowerEdge-T640,141.223.108.122,30848,1679363254,9424845c
train_eb3587eb,2023-03-21_10-47-31,82d5981b5d93461a9d63233e07568837,pirl-PowerEdge-T640,141.223.108.122,30760,1679363251,eb3587eb


2023-03-21 10:54:13,202	ERROR trial_runner.py:1062 -- Trial train_eb3587eb: Error processing event.
ray.exceptions.RayTaskError(KeyError): [36mray::ImplicitFunc.train()[39m (pid=30760, ip=141.223.108.122, repr=train)
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 398, i

KeyboardInterrupt: 

# Evaluating the best Results

In [None]:
best_result = results.get_best_result(metric="CrackIoU", mode="max", scope="all")  # Get best result object
print("Best trial config: {}".format(best_result.config))
print("Best trial final validation loss: {}".format(best_result.metrics["Loss"]))
print("Best trial final validation CrackIoU: {}".format(best_result.metrics["CrackIoU"]))

# evaluate(test_data=test_data, best_result=best_result)

results_df = results.get_dataframe(filter_metric="CrackIoU", filter_mode="max")  # Get all trials by CrackIoU
results_df.sort_values("CrackIoU", ascending=False, inplace=True)

evaluate_df(test_data=test_data, results_df=results_df)