<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setting-up-imports" data-toc-modified-id="Setting-up-imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setting up imports</a></span></li><li><span><a href="#Setting-up-Constant-Hyperparameters" data-toc-modified-id="Setting-up-Constant-Hyperparameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Setting up Constant Hyperparameters</a></span></li><li><span><a href="#Setting-up-Parameters-and-Functions-for-Training" data-toc-modified-id="Setting-up-Parameters-and-Functions-for-Training-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Setting up Parameters and Functions for Training</a></span><ul class="toc-item"><li><span><a href="#Hyperparameters-Search-Space" data-toc-modified-id="Hyperparameters-Search-Space-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Hyperparameters Search Space</a></span></li><li><span><a href="#Creating-the-training-function" data-toc-modified-id="Creating-the-training-function-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Creating the training function</a></span></li><li><span><a href="#Creating-the-evaluation-function" data-toc-modified-id="Creating-the-evaluation-function-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Creating the evaluation function</a></span></li></ul></li><li><span><a href="#Running-the-training" data-toc-modified-id="Running-the-training-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Running the training</a></span><ul class="toc-item"><li><span><a href="#Loading-data-for-training" data-toc-modified-id="Loading-data-for-training-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Loading data for training</a></span></li><li><span><a href="#Configuring-the-Tuner-with-a-Scheduler-and-a-Search-Algorithm" data-toc-modified-id="Configuring-the-Tuner-with-a-Scheduler-and-a-Search-Algorithm-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Configuring the Tuner with a Scheduler and a Search 
Algorithm</a></span></li><li><span><a href="#Running-the-Tuner" data-toc-modified-id="Running-the-Tuner-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Running the Tuner</a></span></li></ul></li><li><span><a href="#Evaluating-the-best-Results" data-toc-modified-id="Evaluating-the-best-Results-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Evaluating the best Results</a></span></li></ul></div>

# Setting up imports

In [1]:
import os

import torch
from torch.nn import CrossEntropyLoss
from torch.nn.functional import normalize
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader
from torchvision.transforms import GaussianBlur
from torchvision.transforms.functional import invert

import ray
from ray import tune
from ray.air import session
from ray.air.checkpoint import Checkpoint
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch


from Dataset import POCDataReader, data_augment_, POCDataset
from metrics import Metrics, EvaluationMetrics
from models import UNet
from loss import *
from pipelines import *
from train import training_loop, validation_loop
from train_tqdm import evaluation_loop


# Setting up Constant Hyperparameters

In [2]:
# Number of epochs each trial runs in `train`; also the ASHA scheduler's `max_t`
# and (halved) the cosine-annealing period.
EPOCHS = 15
# Number of hyperparameter configurations sampled by the tuner (`num_samples`).
NUM_SAMPLES = 30
# How many of the top trials (ranked by CrackIoU) `evaluate_df` evaluates.
NUM_MODEL_TEST = 10

# Passed as `n` to `data_augment_` when expanding the training split.
NUM_AUGMENT = 1

# When True the data reader and input pipeline are placed on the GPU and the
# DataLoaders are created without worker processes or pinned memory.
LOAD_DATA_ON_GPU = True
# Resources requested from Ray for every trial.
GPUS_PER_TRIAL = 1
CPUS_PER_TRIAL = 20

##### Selecting CUDA device

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


# Setting up Parameters and Functions for Training

##### Setting up the loss function sampler

In [4]:
def loss_fn_sampler():
    """Enumerate every candidate loss function.

    Returns:
        A list holding, in order: the pixel losses, the volume losses, then for
        each combinator (CombinedLoss, BorderedLoss) one combined loss per
        (pixel loss, volume loss) pair.
    """
    pixel_wise = [
        CrossEntropyLoss(weight=torch.tensor([.3, .7])),
        FocalLoss(weight=torch.tensor([.3, .7]), gamma=2),
    ]
    region_wise = [
        JaccardLoss(),
        TverskyLoss(alpha=0.3, beta=0.7),
        FocalTverskyLoss(alpha=0.3, beta=0.7, gamma=2),
    ]

    # Start with the standalone losses, then append every combination.
    candidates = [*pixel_wise, *region_wise]
    for combine in (CombinedLoss, BorderedLoss):
        for pixel_term in pixel_wise:
            for region_term in region_wise:
                candidates.append(combine(pixel_term, region_term))

    return candidates


## Hyperparameters Search Space

In [5]:
# Hyperparameter space handed to the tuner as `param_space`.
# Plain values are constant across trials; `tune.choice` entries are sampled per trial.
search_space = {
    # Model class and optimizer class; `train` instantiates both.
    "Network": UNet,
    "Optimizer": Adam,

    # Currently fixed; the commented expressions are the searchable alternatives.
    "Learning Rate": 1e-4,   #tune.qloguniform(1e-5, 1e-2, 5e-6),
    "Batch Size": 4,         #tune.qrandint(2, 8, 2),

    # Loss components: `train` builds the loss as
    # config["Combine Loss"](config["Pixel Loss"], config["Volume Loss"]).
    # NOTE(review): PixelLoss / VolumeLoss are assumed to come from `from loss import *` -- confirm.
    "Pixel Loss": tune.choice([CrossEntropyLoss(weight=torch.tensor([.3, .7])), FocalLoss(weight=torch.tensor([.3, .7]), gamma=2)]),
    "Volume Loss": tune.choice([JaccardLoss(), TverskyLoss(alpha=0.3, beta=0.7), FocalTverskyLoss(alpha=0.3, beta=0.7, gamma=2)]),
    "Combine Loss": tune.choice([CombinedLoss, BorderedLoss, PixelLoss, VolumeLoss]),

    # Dataset options: negative mining and Gaussian-blurred targets on the training set.
    "Negative Mining": tune.choice([True, False]),
    "Smooth Labeling": tune.choice([True, False]),

    # Input pipeline options: optional extra transform and optional filter layer.
    "Input Filter": tune.choice([None, invert]),
    "Input Layer": tune.choice([None, LaplacianFilter(), SobelFilter(), DINOFilter()]),
}

## Creating the training function

In [6]:
def _build_dataloader(dataset, batch_size, device, **loader_kwargs):
    """Build a DataLoader for `dataset`.

    When LOAD_DATA_ON_GPU is false, worker processes and pinned memory are added
    on top of `loader_kwargs`; otherwise only `loader_kwargs` are used.
    """
    if not LOAD_DATA_ON_GPU:
        loader_kwargs.update(
            num_workers=CPUS_PER_TRIAL//2,
            pin_memory=True,
            pin_memory_device=device)
    return DataLoader(dataset, batch_size=batch_size, **loader_kwargs)


def train(config, train_data, val_data):
    """Trainable executed by Ray Tune for a single trial.

    Args:
        config: Sampled hyperparameters. Keys read here: "Input Filter",
            "Input Layer", "Smooth Labeling", "Negative Mining", "Batch Size",
            "Network", "Combine Loss", "Pixel Loss", "Volume Loss",
            "Optimizer" and "Learning Rate".
        train_data: Data handed to POCDataset for the training set.
        val_data: Data handed to POCDataset for the validation set.

    After every epoch the model, optimizer and scheduler states are saved to
    "model/checkpoint.pt" and reported to Ray Tune together with the
    validation metrics.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    batch_size = int(config["Batch Size"])

    # `normalize` is always applied; the sampled input filter, if any, follows it.
    inpip = InputPipeline(
        transformer=[normalize, config["Input Filter"]] if config["Input Filter"] is not None else normalize,
        layer_transformer=config["Input Layer"])
    if LOAD_DATA_ON_GPU:
        inpip = inpip.to(device)

    train_dataset = POCDataset(
        train_data,
        transform=inpip,
        target_transform=GaussianBlur(kernel_size=3, sigma=0.7) if config["Smooth Labeling"] else None,
        negative_mining=config["Negative Mining"])
    # train_dataset.precompute_transform(LOAD_DATA_ON_GPU)
    training_dataloader = _build_dataloader(
        train_dataset, batch_size, device, sampler=train_dataset.sampler)

    val_dataset = POCDataset(val_data, transform=inpip, target_transform=None, negative_mining=False)
    # val_dataset.precompute_transform(LOAD_DATA_ON_GPU)
    validation_dataloader = _build_dataloader(
        val_dataset, batch_size, device, shuffle=True)

    model = config["Network"](n_channels=inpip.nb_channel, n_classes=2, bilinear=True, crop=False)
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        # Fully qualified: the notebook imports `torch` but never binds `nn` explicitly.
        model = torch.nn.DataParallel(model)
    model.to(device)

    loss_fn = config["Combine Loss"](config["Pixel Loss"], config["Volume Loss"]).to(device)
    optimizer = config["Optimizer"](model.parameters(), lr=config["Learning Rate"], betas=(0.9, 0.99))
    lr_scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS//2)

    # To restore a checkpoint, use `session.get_checkpoint()`.
    loaded_checkpoint = session.get_checkpoint()
    if loaded_checkpoint:
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            # map_location keeps the restore working when the tensors were saved from another device.
            model_state, optimizer_state, scheduler_state = torch.load(
                os.path.join(loaded_checkpoint_dir, "checkpoint.pt"),
                map_location=device)
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)
        lr_scheduler.load_state_dict(scheduler_state)
        # NOTE(review): the epoch counter is not part of the checkpoint, so a
        # restored trial still runs epochs 1..EPOCHS -- confirm this is intended.

    train_metrics = Metrics(
        buffer_size=len(training_dataloader),
        mode="Training",
        hyperparam=config,
        device=device)

    val_metrics = Metrics(
        buffer_size=len(validation_dataloader),
        mode="Validation",
        hyperparam=config,
        device=device)

    for epoch in range(1, EPOCHS+1):  # loop over the dataset multiple times
        training_loop(epoch, training_dataloader, model, loss_fn, optimizer, lr_scheduler, train_metrics, device)
        validation_loop(epoch, validation_dataloader, model, loss_fn, val_metrics, device)

        # Here we save a checkpoint. It is automatically registered with
        # Ray Tune and can be accessed through `session.get_checkpoint()`
        # API in future iterations.
        os.makedirs("model", exist_ok=True)
        torch.save((model.state_dict(), optimizer.state_dict(), lr_scheduler.state_dict()), "model/checkpoint.pt")
        checkpoint = Checkpoint.from_directory("model")
        session.report(metrics=val_metrics.get_metrics(epoch), checkpoint=checkpoint)

    train_metrics.close_tensorboard()
    val_metrics.close_tensorboard()


## Creating the evaluation function

In [7]:
def evaluate(test_data, best_result):
    """Evaluate the checkpointed model of one tuning result on the test data.

    Args:
        test_data: Data handed to POCDataset for the test set.
        best_result: Tuning result whose `config` supplies the hyperparameters
            and whose `checkpoint` directory contains "checkpoint.pt".
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    config = best_result.config

    # Same input pipeline as in training: `normalize`, then the optional filter.
    inpip = InputPipeline(
        transformer=[normalize, config["Input Filter"]] if config["Input Filter"] is not None else normalize,
        layer_transformer=config["Input Layer"])
    if LOAD_DATA_ON_GPU:
        inpip = inpip.to(device)

    test_dataset = POCDataset(test_data, transform=inpip, target_transform=None, negative_mining=False)

    if LOAD_DATA_ON_GPU:
        evaluation_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True)
    else:
        evaluation_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True, num_workers=20, pin_memory=True, pin_memory_device=device)

    best_trained_model = config["Network"](n_channels=inpip.nb_channel, n_classes=2, bilinear=True, crop=False).to(device)

    checkpoint_path = os.path.join(best_result.checkpoint.to_directory(), "checkpoint.pt")
    # The checkpoint is a (model, optimizer, scheduler) state tuple; only the model
    # state is needed. map_location lets a GPU-saved checkpoint load on any device.
    model_state, _, _ = torch.load(checkpoint_path, map_location=device)
    best_trained_model.load_state_dict(model_state)

    test_metrics = EvaluationMetrics(
        buffer_size=len(evaluation_dataloader),
        hyperparam=config,
        device=device)

    evaluation_loop(dataloader=evaluation_dataloader, model=best_trained_model, metric=test_metrics, device=device)

def _resolve_config_entry(entry):
    """Turn one config cell of the results DataFrame back into a Python object.

    Live objects are returned unchanged, strings are evaluated, and NaN (how
    pandas represents a missing cell) is mapped to None.
    """
    if isinstance(entry, float) and entry != entry:
        return None
    if isinstance(entry, str):
        # SECURITY: eval() executes arbitrary code. Only use this on result
        # frames produced by your own tuning runs, never on untrusted files.
        return eval(entry)
    return entry


def evaluate_df(test_data, results_df):
    """Evaluate the NUM_MODEL_TEST best trials of a results DataFrame.

    Args:
        test_data: Data handed to POCDataset for the test set.
        results_df: One row per trial, with a "CrackIoU" column, a "logdir"
            column and the hyperparameters in "config/<name>" columns.
            The frame is not modified.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Rank on a sorted copy so the caller's frame is left untouched.
    ranked = results_df.sort_values("CrackIoU", ascending=False)
    for _, res in ranked.head(NUM_MODEL_TEST).iterrows():

        # Rebuild the trial's hyperparameter dict from the flattened "config/..." columns.
        # NOTE(review): assumed to be an acceptable `hyperparam` for EvaluationMetrics,
        # mirroring `best_result.config` in `evaluate` -- confirm.
        hyperparam = {
            key[len("config/"):]: value
            for key, value in res.items()
            if isinstance(key, str) and key.startswith("config/")}

        # Same input pipeline as in training: `normalize`, then the optional filter.
        input_filter = _resolve_config_entry(res["config/Input Filter"])
        inpip = InputPipeline(
            transformer=[normalize, input_filter] if input_filter is not None else normalize,
            layer_transformer=_resolve_config_entry(res["config/Input Layer"]))
        if LOAD_DATA_ON_GPU:
            inpip = inpip.to(device)

        test_dataset = POCDataset(test_data, transform=inpip, target_transform=None, negative_mining=False)

        if LOAD_DATA_ON_GPU:
            evaluation_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True)
        else:
            evaluation_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True, num_workers=20, pin_memory=True, pin_memory_device=device)

        network = _resolve_config_entry(res["config/Network"])
        trained_model = network(n_channels=inpip.nb_channel, n_classes=2, bilinear=True, crop=False).to(device)

        # `train` writes its checkpoint to "model/checkpoint.pt" relative to its
        # working directory; it is looked up under the trial's log directory here.
        checkpoint_path = os.path.join(res["logdir"], "model/checkpoint.pt")
        model_state, _, _ = torch.load(checkpoint_path, map_location=device)
        trained_model.load_state_dict(model_state)

        test_metrics = EvaluationMetrics(
            buffer_size=len(evaluation_dataloader),
            hyperparam=hyperparam,
            device=device)

        evaluation_loop(dataloader=evaluation_dataloader, model=trained_model, metric=test_metrics, device=device)


# Running the training

## Loading data for training

In [8]:
# Read the dataset from ../data (onto the GPU when LOAD_DATA_ON_GPU is set).
data_reader = POCDataReader(root_dir="../data", load_on_gpu=LOAD_DATA_ON_GPU)
# Split with ratios 0.7 / 0.1 / 0.2 into training, validation and test data.
train_data, val_data, test_data = data_reader.split([0.7, 0.1, 0.2])
# Augment the training split only; the return value is ignored, so
# `data_augment_` presumably extends `train_data` in place -- TODO confirm.
data_augment_(train_data, n=NUM_AUGMENT, load_on_gpu=LOAD_DATA_ON_GPU)

Loading dataset into GPU:   0%|          | 0/2744 [00:00<?, ?it/s]

	- Loading done, GPU memory used: 3.14GiB / free: 17.97GiB / total: 22.17GiB
	- Got a total of 2744 images.


Expending the dataset 1 more times:   0%|          | 0/1920 [00:00<?, ?it/s]

	- Augmentation done, GPU memory used: 5.34GiB / free: 15.24GiB / total: 22.17GiB
	- Got 1920 new images and a total of 3840 images.


## Configuring the Tuner with a Scheduler and a Search Algorithm

In [9]:
# Scheduler and search algorithm used by the tuner.
scheduler = ASHAScheduler(
    max_t=EPOCHS,
    grace_period=2,
    reduction_factor=2,
)
search_algo = HyperOptSearch()

# Trials are ranked by validation CrackIoU (higher is better).
tune_config = tune.TuneConfig(
    search_alg=search_algo,
    scheduler=scheduler,
    num_samples=NUM_SAMPLES,
    metric="CrackIoU",
    mode="max",
)

# Bind the datasets to the training function, then attach per-trial resources.
parameterized_train = tune.with_parameters(train, train_data=train_data, val_data=val_data)
trial_resources = {"cpu": CPUS_PER_TRIAL, "gpu": GPUS_PER_TRIAL}

tuner = tune.Tuner(
    tune.with_resources(parameterized_train, resources=trial_resources),
    param_space=search_space,
    tune_config=tune_config,
)

## Running the Tuner

In [None]:
# Resume a previous experiment from its results directory. This rebinds `tuner`,
# replacing any tuner created earlier in the notebook.
restored_trainable = tune.with_resources(
    tune.with_parameters(train, train_data=train_data, val_data=val_data),
    resources={"cpu": CPUS_PER_TRIAL, "gpu": GPUS_PER_TRIAL},
)
tuner = tune.Tuner.restore(
    path="~/ray_results/train_2023-03-17_13-15-27/",
    trainable=restored_trainable,
)

In [None]:
# Run the tuning experiment; `results` is read by the evaluation section below.
results = tuner.fit()

2023-03-20 10:43:22,797	INFO worker.py:1544 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


0,1
Current time:,2023-03-20 16:23:42
Running for:,05:40:06.36
Memory:,34.0/125.4 GiB

Trial name,status,loc,NM,SL,batch_size,input_filter,input_layer,loss_fn,lr,iter,total time (s),Epoch,Loss,CrackIoU
train_3f492d9b,RUNNING,141.223.108.122:13234,False,True,4,,SobelFilter,JaccardLoss,0.0001,1.0,404.065,1.0,0.227214,0.553854
train_9e7d1aba,RUNNING,141.223.108.122:13321,True,True,4,,SobelFilter,(FocalLoss+Foca_7c70,0.0001,,,,,
train_17477bf9,PENDING,,False,False,4,,,JaccardLoss,0.0001,,,,,
train_b49c5ec4,TERMINATED,141.223.108.122:13234,False,False,4,<function inver_a5f0,,CrossEntropyLoss(),0.0001,2.0,803.627,2.0,0.00709964,0.524054
train_f57c0247,TERMINATED,141.223.108.122:13321,False,False,4,,SobelFilter,(FocalLoss+Foca_47c0,0.0001,15.0,5688.85,15.0,0.0110812,0.722994
train_5f2f5b13,TERMINATED,141.223.108.122:13234,False,True,4,<function inver_a5f0,LaplacianFilter,(FocalLoss+Foca_4fa0,0.0001,4.0,1615.66,4.0,0.0693425,0.740707
train_e06393f4,TERMINATED,141.223.108.122:13234,False,False,4,,SobelFilter,(B:FocalLoss+V:_2500,0.0001,15.0,6013.11,15.0,0.0550434,0.78741
train_96f08d9d,TERMINATED,141.223.108.122:13321,False,False,4,<function inver_a5f0,SobelFilter,(FocalLoss+Foca_5750,0.0001,15.0,5685.02,15.0,0.0604369,0.774816
train_92064c4d,TERMINATED,141.223.108.122:13234,False,True,4,,,TverskyLoss,0.0001,2.0,800.654,2.0,0.155389,0.657915
train_dd36fdab,TERMINATED,141.223.108.122:13234,True,False,4,<function inver_a5f0,,(B:FocalLoss+V:_8eb0,0.0001,2.0,801.878,2.0,0.0962959,0.666161


Trial name,CrackIoU,Epoch,Loss,MeanIoU,Tversky,date,done,episodes_total,experiment_id,hostname,iterations_since_restore,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
train_39a6d2f6,0.684427,2,0.0845465,0.839631,0.852816,2023-03-20_15-53-37,True,,bdf8ba7b44af4224bccbb3b689931187,pirl-PowerEdge-T640,2,141.223.108.122,13321,True,765.512,380.296,765.512,1679295217,0,,2,39a6d2f6,0.0294926
train_3f492d9b,0.553854,1,0.227214,0.772786,0.764319,2023-03-20_16-17-58,False,,65a9cb3f82ff4743aff93bcb6ddab20f,pirl-PowerEdge-T640,1,141.223.108.122,13234,True,404.065,404.065,404.065,1679296678,0,,1,3f492d9b,0.0292459
train_5dd370c5,0.722238,4,0.0096748,0.858971,0.874625,2023-03-20_16-19-16,True,,bdf8ba7b44af4224bccbb3b689931187,pirl-PowerEdge-T640,4,141.223.108.122,13321,True,1538.84,383.556,1538.84,1679296756,0,,4,5dd370c5,0.0294926
train_5f2f5b13,0.740707,4,0.0693425,0.868359,0.878377,2023-03-20_11-23-58,True,,65a9cb3f82ff4743aff93bcb6ddab20f,pirl-PowerEdge-T640,4,141.223.108.122,13234,True,1615.66,402.897,1615.66,1679279038,0,,4,5f2f5b13,0.0292459
train_7c3b8869,0.791605,8,0.0961495,0.894269,0.90385,2023-03-20_14-24-32,True,,65a9cb3f82ff4743aff93bcb6ddab20f,pirl-PowerEdge-T640,8,141.223.108.122,13234,True,3217.46,401.577,3217.46,1679289872,0,,8,7c3b8869,0.0292459
train_80c589fe,0.800267,15,0.101277,0.898723,0.899894,2023-03-20_15-28-03,True,,bdf8ba7b44af4224bccbb3b689931187,pirl-PowerEdge-T640,15,141.223.108.122,13321,True,5686.66,378.814,5686.66,1679293683,0,,15,80c589fe,0.0294926
train_92064c4d,0.657915,2,0.155389,0.826024,0.844611,2023-03-20_13-17-32,True,,65a9cb3f82ff4743aff93bcb6ddab20f,pirl-PowerEdge-T640,2,141.223.108.122,13234,True,800.654,398.167,800.654,1679285852,0,,2,92064c4d,0.0292459
train_96f08d9d,0.774816,15,0.0604369,0.885784,0.890203,2023-03-20_13-53-16,True,,bdf8ba7b44af4224bccbb3b689931187,pirl-PowerEdge-T640,15,141.223.108.122,13321,True,5685.02,378.742,5685.02,1679287996,0,,15,96f08d9d,0.0294926
train_a4cf2457,0.789891,8,0.0061726,0.893424,0.90246,2023-03-20_15-31-10,True,,65a9cb3f82ff4743aff93bcb6ddab20f,pirl-PowerEdge-T640,8,141.223.108.122,13234,True,3196.51,399.018,3196.51,1679293870,0,,8,a4cf2457,0.0292459
train_afcd0fde,0.655343,2,0.0829205,0.824679,0.843387,2023-03-20_14-37-54,True,,65a9cb3f82ff4743aff93bcb6ddab20f,pirl-PowerEdge-T640,2,141.223.108.122,13234,True,801.597,398.858,801.597,1679290674,0,,2,afcd0fde,0.0292459


2023-03-20 10:57:03,222	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'input_filter': <function invert at 0x7ffa4cbfa5f0>, 'loss_fn': CrossEntropyLoss()}
2023-03-20 11:23:58,957	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'input_filter': <function invert at 0x7ffa4cbfa5f0>, 'input_layer': LaplacianFilter, 'loss_fn': (FocalLoss+FocalTverskyLoss)}
2023-03-20 12:18:31,602	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'input_layer': SobelFilter, 'loss_fn': (FocalLoss+FocalTverskyLoss)}
2023-03-20 13:04:12,146	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'input_layer': SobelFilter, 'loss_fn': (B:FocalLoss+V:FocalTverskyLoss)}
2023-03-20 13:17:32,874	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'loss_fn': Tversky

# Evaluating the best Results

In [None]:
# Best trial by CrackIoU, considering every reported iteration (scope="all").
best_result = results.get_best_result(metric="CrackIoU", mode="max", scope="all")
best_metrics = best_result.metrics
print("Best trial config: {}".format(best_result.config))
print("Best trial final validation loss: {}".format(best_metrics["Loss"]))
print("Best trial final validation CrackIoU: {}".format(best_metrics["CrackIoU"]))

# evaluate(test_data=test_data, best_result=best_result)

# One row per trial (taken at its best CrackIoU), ordered from best to worst.
results_df = (
    results
    .get_dataframe(filter_metric="CrackIoU", filter_mode="max")
    .sort_values("CrackIoU", ascending=False)
)

evaluate_df(test_data=test_data, results_df=results_df)