<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setting-up-imports" data-toc-modified-id="Setting-up-imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setting up imports</a></span></li><li><span><a href="#Setting-up-Constant-Hyperparameters" data-toc-modified-id="Setting-up-Constant-Hyperparameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Setting up Constant Hyperparameters</a></span></li><li><span><a href="#Setting-up-Parameters-and-Functions-for-Training" data-toc-modified-id="Setting-up-Parameters-and-Functions-for-Training-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Setting up Parameters and Functions for Training</a></span><ul class="toc-item"><li><span><a href="#Hyperparameters-Search-Space" data-toc-modified-id="Hyperparameters-Search-Space-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Hyperparameters Search Space</a></span></li><li><span><a href="#Creating-the-training-function" data-toc-modified-id="Creating-the-training-function-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Creating the training function</a></span></li><li><span><a href="#Creating-the-evaluation-function" data-toc-modified-id="Creating-the-evaluation-function-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Creating the evaluation function</a></span></li></ul></li><li><span><a href="#Running-the-training" data-toc-modified-id="Running-the-training-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Running the training</a></span><ul class="toc-item"><li><span><a href="#Loading-data-for-training" data-toc-modified-id="Loading-data-for-training-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Loading data for training</a></span></li><li><span><a href="#Configuring-the-Tuner-with-a-Scheduler-and-a-Search-Algorithm" data-toc-modified-id="Configuring-the-Tuner-with-a-Scheduler-and-a-Search-Algorithm-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Configuring the Tuner with a Scheduler and a Search 
Algorithm</a></span></li><li><span><a href="#Running-the-Tuner" data-toc-modified-id="Running-the-Tuner-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Running the Tuner</a></span></li></ul></li><li><span><a href="#Evaluating-the-best-Results" data-toc-modified-id="Evaluating-the-best-Results-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Evaluating the best Results</a></span></li></ul></div>

# Setting up imports

In [None]:
import os

import torch
from torch.nn import CrossEntropyLoss
from torch.nn.functional import normalize
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader
from torchvision.transforms import GaussianBlur
from torchvision.transforms.functional import invert

import ray
from ray import tune
from ray.air import session
from ray.air.checkpoint import Checkpoint
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch


from Dataset import POCDataReader, data_augment_, POCDataset
from metrics import Metrics, EvaluationMetrics
from models import UNet
from loss import *
from pipelines import *
from train import training_loop, validation_loop
from train_tqdm import evaluation_loop


# Setting up Constant Hyperparameters

In [None]:
# --- Search budget -----------------------------------------------------------
# Epochs run by each trial; also used as the scheduler's `max_t`.
EPOCHS = 12
# Number of configurations sampled by the tuner.
NUM_SAMPLES = 30
# Number of top-ranked trials re-evaluated on the test set.
NUM_MODEL_TEST = 5

# --- Data --------------------------------------------------------------------
# Value passed as `n` to `data_augment_` for the training split.
NUM_AUGMENT = 1
# When True the data is loaded on the GPU and the DataLoaders are created
# without worker processes or pinned memory.
LOAD_DATA_ON_GPU = True

# --- Resources requested for every trial -------------------------------------
CPUS_PER_TRIAL = 20
GPUS_PER_TRIAL = 1

##### Selecting the CUDA device

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

# Setting up Parameters and Functions for Training

##### Setting up the loss function sampler

In [None]:
def loss_fn_sampler():
    """Build the list of candidate loss functions for the search space.

    The returned list contains, in this order:

    1. the two class-weighted losses (``CrossEntropyLoss``, ``FocalLoss``),
    2. the three overlap losses (``JaccardLoss``, ``TverskyLoss``,
       ``FocalTverskyLoss``),
    3. every (group 1, group 2) pair wrapped in ``CombinedLoss``,
    4. the same pairs wrapped in ``BorderedLoss``.

    The wrapped pairs reuse the very same loss instances as groups 1 and 2.

    Returns:
        list: the candidate loss objects.
    """
    per_pixel = [
        CrossEntropyLoss(weight=torch.tensor([.3, .7])),
        FocalLoss(weight=torch.tensor([.3, .7]), gamma=2),
    ]
    per_volume = [
        JaccardLoss(),
        TverskyLoss(alpha=0.3, beta=0.7),
        FocalTverskyLoss(alpha=0.3, beta=0.7, gamma=2),
    ]

    # Stand-alone losses first, then every wrapped pair.
    candidates = [*per_pixel, *per_volume]
    for wrap in (CombinedLoss, BorderedLoss):
        for pixel_loss in per_pixel:
            for volume_loss in per_volume:
                candidates.append(wrap(pixel_loss, volume_loss))

    return candidates


##### Setting up the input Pipeline sampler

In [None]:
def input_pip_sampler():
    """Build the list of candidate input pipelines for the search space.

    Each of the two transformer chains (``[invert, normalize]`` and
    ``[normalize]``) is paired with each layer transformer (``None``,
    ``LaplacianFilter()``, ``SobelFilter()``), chain-major.

    Returns:
        list: six ``InputPipeline`` objects.
    """
    base_chains = ([invert, normalize], [normalize])
    # The filter instances are created once and shared by both chains.
    extra_layers = (None, LaplacianFilter(), SobelFilter())

    pipelines = []
    for chain in base_chains:
        for extra_layer in extra_layers:
            pipelines.append(InputPipeline(chain, extra_layer))
    return pipelines


## Hyperparameters Search Space

In [None]:
# Search space handed to the tuner. The learning rate and batch size are
# currently fixed; the sampled alternatives are kept next to them for reference.
search_space = dict(
    lr=1e-4,  # tune.qloguniform(1e-5, 1e-2, 5e-6)
    batch_size=4,  # tune.qrandint(2, 8, 2)
    NM=tune.choice([True, False]),  # forwarded as `negative_mining`
    SL=tune.choice([True, False]),  # enables the Gaussian-blurred targets
    loss_fn=tune.choice(loss_fn_sampler()),
    input_pipeline=tune.choice(input_pip_sampler()),
)

## Creating the training function

In [None]:
def train(config, train_data, val_data):
    """Train and validate one UNet for a single Ray Tune trial.

    After every epoch the validation metrics and a checkpoint holding the
    model, optimizer and LR-scheduler states are reported to Ray Tune. When
    the session provides a checkpoint, those three states are restored before
    training starts.

    Args:
        config: Trial configuration. The keys read are ``"lr"``,
            ``"batch_size"``, ``"NM"`` (forwarded as ``negative_mining``),
            ``"SL"`` (when true, targets go through a ``GaussianBlur``),
            ``"loss_fn"`` and ``"input_pipeline"``.
        train_data: Data wrapped in the training ``POCDataset``.
        val_data: Data wrapped in the validation ``POCDataset``.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    pipeline = config["input_pipeline"]

    # Arguments shared by both loaders. Worker processes and pinned memory are
    # only requested when the data is not already on the GPU.
    loader_kwargs = {"batch_size": int(config["batch_size"])}
    if not LOAD_DATA_ON_GPU:
        loader_kwargs.update(
            num_workers=CPUS_PER_TRIAL // 2,
            pin_memory=True,
            pin_memory_device=device)

    train_dataset = POCDataset(
        train_data,
        transform=pipeline,
        target_transform=GaussianBlur(kernel_size=3, sigma=0.7) if config["SL"] else None,
        negative_mining=config["NM"])
    training_dataloader = DataLoader(train_dataset, sampler=train_dataset.sampler, **loader_kwargs)

    val_dataset = POCDataset(val_data, transform=pipeline, target_transform=None, negative_mining=False)
    validation_dataloader = DataLoader(val_dataset, shuffle=True, **loader_kwargs)

    model = UNet(n_channels=pipeline.nb_channel(), n_classes=2, bilinear=True, crop=False)
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        # Fully qualified: the notebook imports `torch` but never imports `nn`.
        model = torch.nn.DataParallel(model)
    model.to(device)

    loss_fn = config["loss_fn"].to(device)
    optimizer = Adam(model.parameters(), lr=config["lr"], betas=(0.9, 0.99))
    lr_scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS//2)

    # To restore a checkpoint, use `session.get_checkpoint()`.
    loaded_checkpoint = session.get_checkpoint()
    if loaded_checkpoint:
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            # map_location lets a checkpoint saved on another device be restored here.
            model_state, optimizer_state, scheduler_state = torch.load(
                os.path.join(loaded_checkpoint_dir, "checkpoint.pt"),
                map_location=device)
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)
        lr_scheduler.load_state_dict(scheduler_state)

    # Run description shared by the training and validation metrics.
    run_description = dict(
        model_name=model.__class__.__name__,
        loss_name=loss_fn.__class__.__name__,
        opt_name=optimizer.__class__.__name__,
        pip_name=str(pipeline),
        batch_size=config["batch_size"],
        learning_rate=config["lr"],
        negative_mining=config["NM"],
        soft_labels=config["SL"],
        device=device)

    train_metrics = Metrics(buffer_size=len(training_dataloader), mode="Training", **run_description)
    val_metrics = Metrics(buffer_size=len(validation_dataloader), mode="Validation", **run_description)

    os.makedirs("model", exist_ok=True)
    try:
        for epoch in range(1, EPOCHS+1):  # loop over the dataset multiple times
            training_loop(epoch, training_dataloader, model, loss_fn, optimizer, lr_scheduler, train_metrics, device)
            validation_loop(epoch, validation_dataloader, model, loss_fn, val_metrics, device)

            # Here we save a checkpoint. It is automatically registered with
            # Ray Tune and can be accessed through `session.get_checkpoint()`
            # API in future iterations.
            torch.save((model.state_dict(), optimizer.state_dict(), lr_scheduler.state_dict()), "model/checkpoint.pt")
            checkpoint = Checkpoint.from_directory("model")
            session.report(metrics=val_metrics.get_metrics(epoch), checkpoint=checkpoint)
    finally:
        # Close the writers even when an epoch raises.
        train_metrics.close_tensorboard()
        val_metrics.close_tensorboard()


## Creating the evaluation function

In [None]:
def evaluate(test_data, best_result):
    """Evaluate the checkpoint of one tuning result on the test data.

    The test set is transformed with the input pipeline stored in the result's
    config, and the model is built with that pipeline's channel count, so the
    network receives the same kind of input it was trained on.

    Args:
        test_data: Data wrapped in the evaluation ``POCDataset``.
        best_result: Tuning result exposing ``config`` (with the keys
            ``"input_pipeline"``, ``"loss_fn"``, ``"batch_size"``, ``"lr"``,
            ``"NM"``, ``"SL"``) and ``checkpoint``.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    trial_config = best_result.config
    pipeline = trial_config["input_pipeline"]

    test_dataset = POCDataset(test_data, transform=pipeline, target_transform=None, negative_mining=False)

    if LOAD_DATA_ON_GPU:
        evaluation_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True)
    else:
        evaluation_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True, num_workers=20, pin_memory=True, pin_memory_device=device)

    best_trained_model = UNet(n_channels=pipeline.nb_channel(), n_classes=2, bilinear=True, crop=False).to(device)

    checkpoint_path = os.path.join(best_result.checkpoint.to_directory(), "checkpoint.pt")
    # The checkpoint is a (model, optimizer, scheduler) state tuple; only the
    # model state is needed. map_location lets it load on the current device.
    model_state, _, _ = torch.load(checkpoint_path, map_location=device)
    best_trained_model.load_state_dict(model_state)

    test_metrics = EvaluationMetrics(
        buffer_size=len(evaluation_dataloader),
        model_name=best_trained_model.__class__.__name__,
        loss_name=trial_config["loss_fn"].__class__.__name__,
        opt_name="Adam",
        pip_name=str(pipeline),
        epochs=EPOCHS,
        batch_size=trial_config["batch_size"],
        learning_rate=trial_config["lr"],
        negative_mining=trial_config["NM"],
        soft_labels=trial_config["SL"],
        device=device)

    evaluation_loop(dataloader=evaluation_dataloader, model=best_trained_model, metric=test_metrics, device=device)

def evaluate_df(test_data, results_df):
    """Evaluate the ``NUM_MODEL_TEST`` best trials of a results table on the test data.

    Trials are ranked by descending ``"CrackIoU"``. For each selected trial the
    test set is transformed with that trial's own input pipeline and the model
    is restored from ``<logdir>/model/checkpoint.pt``.

    Args:
        test_data: Data wrapped in the evaluation ``POCDataset``.
        results_df: DataFrame with the columns ``"CrackIoU"``, ``"logdir"``,
            ``"Epoch"`` and the flattened ``"config/..."`` columns. It is not
            modified; ranking is done on a sorted copy.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    loader_kwargs = {"batch_size": 1, "shuffle": True}
    if not LOAD_DATA_ON_GPU:
        loader_kwargs.update(num_workers=20, pin_memory=True, pin_memory_device=device)

    ranked = results_df.sort_values("CrackIoU", ascending=False)
    for _, res in ranked.head(NUM_MODEL_TEST).iterrows():

        # NOTE(review): assumes the "config/input_pipeline" column holds the
        # pipeline object itself (not a string) -- TODO confirm.
        pipeline = res["config/input_pipeline"]

        # Built per trial: the transform and the channel count depend on the
        # pipeline the trial was trained with.
        test_dataset = POCDataset(test_data, transform=pipeline, target_transform=None, negative_mining=False)
        evaluation_dataloader = DataLoader(test_dataset, **loader_kwargs)

        trained_model = UNet(n_channels=pipeline.nb_channel(), n_classes=2, bilinear=True, crop=False).to(device)

        checkpoint_path = os.path.join(res["logdir"], "model", "checkpoint.pt")
        # Only the model state of the (model, optimizer, scheduler) tuple is used.
        model_state, _, _ = torch.load(checkpoint_path, map_location=device)
        trained_model.load_state_dict(model_state)

        test_metrics = EvaluationMetrics(
            buffer_size=len(evaluation_dataloader),
            model_name=trained_model.__class__.__name__,
            loss_name=res["config/loss_fn"],
            opt_name="Adam",
            pip_name=str(pipeline),
            epochs=res["Epoch"],
            batch_size=res["config/batch_size"],
            learning_rate=res["config/lr"],
            negative_mining=res["config/NM"],
            soft_labels=res["config/SL"],
            device=device)

        evaluation_loop(dataloader=evaluation_dataloader, model=trained_model, metric=test_metrics, device=device)


# Running the training

## Loading data for training

In [9]:
# Read the dataset, split it into train / validation / test parts, then
# augment the training part only (`data_augment_` is called for its effect on
# `train_data`; its return value is not used).
data_reader = POCDataReader(
    root_dir="../data",
    load_on_gpu=LOAD_DATA_ON_GPU,
)
split_ratios = [0.7, 0.1, 0.2]
train_data, val_data, test_data = data_reader.split(split_ratios)
data_augment_(
    train_data,
    n=NUM_AUGMENT,
    load_on_gpu=LOAD_DATA_ON_GPU,
)

Loading dataset into GPU:   0%|          | 0/2744 [00:00<?, ?it/s]

	- Loading done, GPU memory used: 3.14GiB / free: 17.97GiB / total: 22.17GiB
	- Got a total of 2744 images.


Expending the dataset 1 more times:   0%|          | 0/1920 [00:00<?, ?it/s]

	- Augmentation done, GPU memory used: 5.34GiB / free: 15.24GiB / total: 22.17GiB
	- Got 1920 new images and a total of 3840 images.


## Configuring the Tuner with a Scheduler and a Search Algorithm

In [10]:
# The scheduler's time budget (`max_t`) matches the number of epochs each
# trial runs.
scheduler = ASHAScheduler(max_t=EPOCHS, grace_period=2, reduction_factor=2)
search_algo = HyperOptSearch()

# Trials are compared on the "CrackIoU" metric, higher is better.
tune_config = tune.TuneConfig(
    metric="CrackIoU",
    mode="max",
    num_samples=NUM_SAMPLES,
    scheduler=scheduler,
    search_alg=search_algo,
)

# Bind the datasets to the training function, then attach the per-trial
# resource request.
trainable = tune.with_parameters(train, train_data=train_data, val_data=val_data)
trainable = tune.with_resources(
    trainable,
    resources={"cpu": CPUS_PER_TRIAL, "gpu": GPUS_PER_TRIAL},
)

tuner = tune.Tuner(trainable, tune_config=tune_config, param_space=search_space)

## Running the Tuner

In [11]:
# Run the hyperparameter search; `results` is queried below for the best trial
# and for the per-trial dataframe.
results = tuner.fit()

2023-03-17 11:38:56,604	INFO worker.py:1544 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


0,1
Current time:,2023-03-17 11:39:31
Running for:,00:00:20.71
Memory:,24.8/125.4 GiB

Trial name,# failures,error file
train_9de30aff,1,"/home/pirl/ray_results/train_2023-03-17_11-38-44/train_9de30aff_1_NM=True,SL=False,batch_size=4,input_pipeline=InputPipeline_invert_normalize,loss_fn=CrossEntropyLoss,lr=0.0001_2023-03-17_11-39-10/error.txt"
train_13042d22,1,"/home/pirl/ray_results/train_2023-03-17_11-38-44/train_13042d22_2_NM=False,SL=True,batch_size=4,input_pipeline=InputPipeline_invert_normalize_SobelFilter,loss_fn=FocalLoss,lr=0.00_2023-03-17_11-39-13/error.txt"

Trial name,status,loc,NM,SL,batch_size,input_pipeline,loss_fn,lr
train_a01bfd90,RUNNING,141.223.108.122:39044,False,False,4,InputPipeline(n_2230,(B:FocalLoss+V:_be20,0.0001
train_6ca31522,RUNNING,141.223.108.122:39123,False,False,4,InputPipeline(n_2410,FocalLoss,0.0001
train_813fefa1,PENDING,,False,True,4,InputPipeline(i_3370,FocalLoss,0.0001
train_9de30aff,ERROR,141.223.108.122:38866,True,False,4,InputPipeline(i_c3a0,CrossEntropyLoss(),0.0001
train_13042d22,ERROR,141.223.108.122:38953,False,True,4,InputPipeline(i_9960,FocalLoss,0.0001


2023-03-17 11:39:21,363	ERROR trial_runner.py:1062 -- Trial train_9de30aff: Error processing event.
ray.exceptions.RayTaskError(AttributeError): [36mray::ImplicitFunc.train()[39m (pid=38866, ip=141.223.108.122, repr=train)
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 

Trial name,date,experiment_id,hostname,node_ip,pid,timestamp,trial_id
train_13042d22,2023-03-17_11-39-16,1d4cd90168a24273b084ebe8e57cd312,pirl-PowerEdge-T640,141.223.108.122,38953,1679020756,13042d22
train_9de30aff,2023-03-17_11-39-13,227a0c660edd4c0e868552ea874089e4,pirl-PowerEdge-T640,141.223.108.122,38866,1679020753,9de30aff


2023-03-17 11:39:24,133	ERROR trial_runner.py:1062 -- Trial train_13042d22: Error processing event.
ray.exceptions.RayTaskError(AttributeError): [36mray::ImplicitFunc.train()[39m (pid=38953, ip=141.223.108.122, repr=train)
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/home/pirl/anaconda3/envs/POC-env/lib/python3.10/site-packages/ray/tune/trainable/util.py", line 

# Evaluating the best Results

In [None]:
# Best trial according to CrackIoU, considering every reported iteration.
best_result = results.get_best_result(metric="CrackIoU", mode="max", scope="all")
best_metrics = best_result.metrics
print("Best trial config: {}".format(best_result.config))
print("Best trial final validation loss: {}".format(best_metrics["Loss"]))
print("Best trial final validation CrackIoU: {}".format(best_metrics["CrackIoU"]))

# Alternative: evaluate only the single best trial.
# evaluate(test_data=test_data, best_result=best_result)

# One row per trial (taken at its best CrackIoU), ranked best first.
results_df = (
    results
    .get_dataframe(filter_metric="CrackIoU", filter_mode="max")
    .sort_values("CrackIoU", ascending=False)
)

evaluate_df(test_data=test_data, results_df=results_df)