<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setting-up-imports" data-toc-modified-id="Setting-up-imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setting up imports</a></span></li><li><span><a href="#Setting-up-Constant-Hyperparameters" data-toc-modified-id="Setting-up-Constant-Hyperparameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Setting up Constant Hyperparameters</a></span><ul class="toc-item"><li><span><a href="#Creating-the-training-function" data-toc-modified-id="Creating-the-training-function-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Creating the training function</a></span></li><li><span><a href="#Creating-the-evaluation-function" data-toc-modified-id="Creating-the-evaluation-function-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Creating the evaluation function</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Setting-up-the-loss-function-sampler" data-toc-modified-id="Setting-up-the-loss-function-sampler-2.2.0.1"><span class="toc-item-num">2.2.0.1&nbsp;&nbsp;</span>Setting up the loss function sampler</a></span></li></ul></li></ul></li></ul></li><li><span><a href="#Running-the-training" data-toc-modified-id="Running-the-training-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Running the training</a></span><ul class="toc-item"><li><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Hyperparameters-Search-Space" data-toc-modified-id="Hyperparameters-Search-Space-3.0.0.1"><span class="toc-item-num">3.0.0.1&nbsp;&nbsp;</span>Hyperparameters Search Space</a></span></li><li><span><a href="#Selecting-Cuda-device" data-toc-modified-id="Selecting-Cuda-device-3.0.0.2"><span class="toc-item-num">3.0.0.2&nbsp;&nbsp;</span>Selecting Cuda device</a></span></li><li><span><a href="#Loading-data-for-training" data-toc-modified-id="Loading-data-for-training-3.0.0.3"><span class="toc-item-num">3.0.0.3&nbsp;&nbsp;</span>Loading data for training</a></span></li><li><span><a 
href="#Configuring-the-Tuner-with-a-Scheduler-and-a-Search-Algorithm" data-toc-modified-id="Configuring-the-Tuner-with-a-Scheduler-and-a-Search-Algorithm-3.0.0.4"><span class="toc-item-num">3.0.0.4&nbsp;&nbsp;</span>Configuring the Tuner with a Scheduler and a Search Algorithm</a></span></li><li><span><a href="#Running-the-Tuner" data-toc-modified-id="Running-the-Tuner-3.0.0.5"><span class="toc-item-num">3.0.0.5&nbsp;&nbsp;</span>Running the Tuner</a></span></li><li><span><a href="#Displaying-and-Evaluating-the-best-Result" data-toc-modified-id="Displaying-and-Evaluating-the-best-Result-3.0.0.6"><span class="toc-item-num">3.0.0.6&nbsp;&nbsp;</span>Displaying and Evaluating the best Result</a></span></li></ul></li></ul></li></ul></li></ul></div>

# Setting up imports

In [1]:
import os

import torch
from torch.nn import CrossEntropyLoss
from torch.nn.functional import normalize
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader
from torchvision.transforms import GaussianBlur

import ray
from ray import tune
from ray.air import session
from ray.air.checkpoint import Checkpoint
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch


from Dataset import POCDataReader, data_augment_, POCDataset
from metrics import Metrics, EvaluationMetrics
from models import UNet
from loss import *
from train import training_loop, validation_loop
from train_tqdm import evaluation_loop


# Setting up Constant Hyperparameters

In [2]:
# Number of epochs each trial runs in `train`; also used as ASHA's `max_t`
# and (halved) as the cosine-annealing `T_max`.
EPOCHS = 12
# Number of hyperparameter configurations sampled by the tuner (`num_samples`).
NUM_SAMPLES = 30
# Number of top trials (ranked by CrackIoU) that `evaluate_df` runs on the test data.
NUM_MODEL_TEST = 5

# Passed as `n` to `data_augment_` when augmenting the training split.
NB_AUGMENT = 2

# Forwarded as `load_on_gpu` to `POCDataReader` and `data_augment_`.
LOAD_DATA_ON_GPU = False
# Resources requested from Ray for every trial; each of the two dataloaders
# created in `train` uses CPUS_PER_TRIAL // 2 workers.
GPUS_PER_TRIAL = 1
CPUS_PER_TRIAL = 20

## Creating the training function

In [3]:
def train(config, train_data, val_data):
    """Ray Tune trainable: train a UNet for EPOCHS epochs and report validation metrics.

    After every epoch the model/optimizer/scheduler states are written to
    ``model/checkpoint.pt`` (relative to the current working directory) and
    reported to Ray Tune together with the validation metrics.

    Args:
        config: Sampled hyperparameters. Reads the keys ``"lr"``,
            ``"batch_size"``, ``"NM"`` (negative mining), ``"SL"`` (soft
            labels) and ``"loss_fn"`` (a loss module instance).
        train_data: Training split, handed to ``POCDataset``.
        val_data: Validation split, handed to ``POCDataset``.
    """
    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"

    # Pinned host memory only makes sense with a CUDA device; requesting
    # pinning towards "cpu" is not valid, so it is disabled on the CPU fallback.
    if use_cuda:
        pin_kwargs = {"pin_memory": True, "pin_memory_device": device}
    else:
        pin_kwargs = {"pin_memory": False}

    train_dataset = POCDataset(
        train_data,
        transform=normalize,
        # Soft labels: blur the target mask when config["SL"] is set.
        target_transform=GaussianBlur(kernel_size=3, sigma=0.7) if config["SL"] else None,
        negative_mining=config["NM"])
    training_dataloader = DataLoader(
        train_dataset,
        batch_size=int(config["batch_size"]),
        sampler=train_dataset.sampler,
        num_workers=CPUS_PER_TRIAL//2,
        **pin_kwargs)

    val_dataset = POCDataset(val_data, transform=normalize, target_transform=None, negative_mining=False)
    validation_dataloader = DataLoader(
        val_dataset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=CPUS_PER_TRIAL//2,
        **pin_kwargs)

    model = UNet(n_channels=1, n_classes=2, bilinear=True, crop=False)
    if use_cuda and torch.cuda.device_count() > 1:
        # `torch.nn` is referenced explicitly: a bare `nn` is not imported by this notebook.
        model = torch.nn.DataParallel(model)
    model.to(device)
    # The underlying network. Its state_dict keys carry no "module." prefix,
    # so checkpoints can be loaded into a plain UNet by the evaluation functions.
    bare_model = model.module if isinstance(model, torch.nn.DataParallel) else model

    loss_fn = config["loss_fn"].to(device)
    optimizer = Adam(model.parameters(), lr=config["lr"], betas=(0.9, 0.99))
    lr_scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS//2)

    # To restore a checkpoint, use `session.get_checkpoint()`.
    # NOTE: the epoch counter is not part of the checkpoint, so a restored
    # trial runs the epoch loop below from 1 again.
    loaded_checkpoint = session.get_checkpoint()
    if loaded_checkpoint:
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            model_state, optimizer_state, scheduler_state = torch.load(
                os.path.join(loaded_checkpoint_dir, "checkpoint.pt"),
                map_location=device)
        bare_model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)
        lr_scheduler.load_state_dict(scheduler_state)

    # Run description shared by the training and validation metric trackers.
    metrics_kwargs = dict(
        model_name=bare_model.__class__.__name__,
        loss_name=loss_fn.__class__.__name__,
        opt_name=optimizer.__class__.__name__,
        batch_size=config["batch_size"],
        learning_rate=config["lr"],
        negative_mining=config["NM"],
        soft_labels=config["SL"],
        device=device)

    train_metrics = Metrics(buffer_size=len(training_dataloader), mode="Training", **metrics_kwargs)
    val_metrics = Metrics(buffer_size=len(validation_dataloader), mode="Validation", **metrics_kwargs)

    try:
        for epoch in range(1, EPOCHS+1):  # loop over the dataset multiple times
            training_loop(epoch, training_dataloader, model, loss_fn, optimizer, lr_scheduler, train_metrics, device)
            validation_loop(epoch, validation_dataloader, model, loss_fn, val_metrics, device)

            # Here we save a checkpoint. It is automatically registered with
            # Ray Tune and can be accessed through `session.get_checkpoint()`
            # API in future iterations.
            os.makedirs("model", exist_ok=True)
            torch.save(
                (bare_model.state_dict(), optimizer.state_dict(), lr_scheduler.state_dict()),
                "model/checkpoint.pt")
            checkpoint = Checkpoint.from_directory("model")
            session.report(metrics=val_metrics.get_metrics(epoch), checkpoint=checkpoint)
    finally:
        # Close the writers even if an epoch raises.
        train_metrics.close_tensorboard()
        val_metrics.close_tensorboard()


## Creating the evaluation function

In [4]:
def evaluate(test_data, best_result):
    """Evaluate the checkpointed model of one tuning result on the test data.

    Args:
        test_data: Test split, handed to ``POCDataset``.
        best_result: Tuning result exposing ``.checkpoint`` (containing a
            ``checkpoint.pt`` written by ``train``) and ``.config`` (the
            hyperparameters of that trial).
    """
    use_cuda = torch.cuda.is_available()
    device = "cuda:0" if use_cuda else "cpu"

    # Pinned host memory only makes sense with a CUDA device.
    if use_cuda:
        pin_kwargs = {"pin_memory": True, "pin_memory_device": device}
    else:
        pin_kwargs = {"pin_memory": False}

    test_dataset = POCDataset(test_data, transform=normalize, target_transform=None, negative_mining=False)
    evaluation_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True, num_workers=20, **pin_kwargs)

    best_trained_model = UNet(n_channels=1, n_classes=2, bilinear=True, crop=False).to(device)

    # `as_directory()` cleans up any temporary copy of the checkpoint, and
    # `map_location` lets a checkpoint saved on GPU be loaded on a CPU-only host.
    with best_result.checkpoint.as_directory() as checkpoint_dir:
        model_state, _, _ = torch.load(os.path.join(checkpoint_dir, "checkpoint.pt"), map_location=device)
    best_trained_model.load_state_dict(model_state)

    test_metrics = EvaluationMetrics(
        buffer_size=len(evaluation_dataloader),
        model_name=best_trained_model.__class__.__name__,
        loss_name=best_result.config["loss_fn"].__class__.__name__,
        opt_name="Adam",
        epochs=EPOCHS,
        batch_size=best_result.config["batch_size"],
        learning_rate=best_result.config["lr"],
        negative_mining=best_result.config["NM"],
        soft_labels=best_result.config["SL"],
        device=device)

    evaluation_loop(dataloader=evaluation_dataloader, model=best_trained_model, metric=test_metrics, device=device)
    
def evaluate_df(test_data, results_df):
    """Evaluate the NUM_MODEL_TEST best trials (by CrackIoU) on the test data.

    For every selected row the model weights are read from
    ``<logdir>/model/checkpoint.pt``, i.e. the file ``train`` overwrites after
    each epoch.

    Args:
        test_data: Test split, handed to ``POCDataset``.
        results_df: DataFrame with one row per trial. Reads the columns
            ``"CrackIoU"``, ``"logdir"``, ``"Epoch"`` and the ``"config/..."``
            hyperparameter columns. The frame is not modified.
    """
    use_cuda = torch.cuda.is_available()
    device = "cuda:0" if use_cuda else "cpu"

    # Pinned host memory only makes sense with a CUDA device.
    if use_cuda:
        pin_kwargs = {"pin_memory": True, "pin_memory_device": device}
    else:
        pin_kwargs = {"pin_memory": False}

    test_dataset = POCDataset(test_data, transform=normalize, target_transform=None, negative_mining=False)
    evaluation_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True, num_workers=20, **pin_kwargs)

    # Rank on a sorted copy so the caller's DataFrame is left untouched.
    ranked_df = results_df.sort_values("CrackIoU", ascending=False)
    for _, res in ranked_df.head(NUM_MODEL_TEST).iterrows():

        trained_model = UNet(n_channels=1, n_classes=2, bilinear=True, crop=False).to(device)

        checkpoint_path = os.path.join(res["logdir"], "model", "checkpoint.pt")
        # `map_location` lets a checkpoint saved on GPU be loaded on a CPU-only host.
        model_state, _, _ = torch.load(checkpoint_path, map_location=device)
        trained_model.load_state_dict(model_state)

        # Report the loss by class name, as `train` and `evaluate` do; a value
        # that is already a string is passed through unchanged.
        loss_fn = res["config/loss_fn"]
        loss_name = loss_fn if isinstance(loss_fn, str) else loss_fn.__class__.__name__

        test_metrics = EvaluationMetrics(
            buffer_size=len(evaluation_dataloader),
            model_name=trained_model.__class__.__name__,
            loss_name=loss_name,
            opt_name="Adam",
            epochs=res["Epoch"],
            batch_size=res["config/batch_size"],
            learning_rate=res["config/lr"],
            negative_mining=res["config/NM"],
            soft_labels=res["config/SL"],
            device=device)

        evaluation_loop(dataloader=evaluation_dataloader, model=trained_model, metric=test_metrics, device=device)


#### Setting up the loss function sampler

In [5]:
def loss_fn_sampler():
    """Build the list of candidate loss functions for the search space.

    Returns:
        A list of 17 loss instances: the 2 pixel-wise losses, the 3
        volume-based losses, then every (pixel, volume) pair wrapped first in
        ``CombinedLoss`` and then in ``BorderedLoss``. The wrapped losses
        reuse the same base loss instances that appear on their own.
    """
    pixel_wise = [
        CrossEntropyLoss(weight=torch.tensor([.3, .7])),
        FocalLoss(weight=torch.tensor([.3, .7]), gamma=2),
    ]
    volume_wise = [
        JaccardLoss(),
        TverskyLoss(alpha=0.3, beta=0.7),
        FocalTverskyLoss(alpha=0.3, beta=0.7, gamma=2),
    ]

    # Stand-alone losses first, in pixel-then-volume order.
    candidates = [*pixel_wise, *volume_wise]

    # Then every pixel/volume pairing, grouped by combinator.
    candidates.extend(
        wrapper(pixel_loss, volume_loss)
        for wrapper in (CombinedLoss, BorderedLoss)
        for pixel_loss in pixel_wise
        for volume_loss in volume_wise
    )

    return candidates


# Running the training

#### Hyperparameters Search Space

In [6]:
# Hyperparameter distributions sampled by Ray Tune; each trial receives one
# draw of this dict as the `config` argument of `train`.
search_space = {
    # Learning rate: log-uniform over [1e-5, 1e-2], rounded to multiples of 5e-6.
    "lr": tune.qloguniform(1e-5, 1e-2, 5e-6),
    # Batch size: integer between 2 and 8, quantized in steps of 2.
    "batch_size": tune.qrandint(2, 8, 2),
    # Negative mining flag, forwarded to POCDataset(negative_mining=...).
    "NM": tune.choice([True, False]),
    # Soft labels flag: when True, `train` blurs targets with a GaussianBlur.
    "SL": tune.choice([True, False]),
    # One of the loss instances built by loss_fn_sampler().
    "loss_fn": tune.choice(loss_fn_sampler()),
}

#### Selecting Cuda device

In [7]:
# Informational check only: `train`, `evaluate` and `evaluate_df` each pick
# their own device locally rather than reading this variable.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


#### Loading data for training

In [8]:
# Read the dataset from ../data and split it 70% / 10% / 20% into
# train / validation / test.
data_reader = POCDataReader(root_dir="../data", load_on_gpu=LOAD_DATA_ON_GPU)
train_data, val_data, test_data = data_reader.split([0.7, 0.1, 0.2])
# Augment the training split only. The return value is discarded, so this
# presumably extends `train_data` in place (TODO confirm in Dataset.data_augment_).
data_augment_(train_data, n=NB_AUGMENT, load_on_gpu=LOAD_DATA_ON_GPU)

Loading dataset into RAM: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 2744/2744 [00:36<00:00, 75.05it/s]


	- Loading done, RAM used: 3.78GiB / free: 117.85GiB / total: 125.40GiB
	- Got a total of 2744 images.


Expending the dataset 2 more times: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1920/1920 [00:32<00:00, 59.35it/s]

	- Augmentation done, RAM used: 7.19GiB / free: 114.41GiB / total: 125.40GiB
	- Got 3840 new images and a total of 5760 images.





#### Configuring the Tuner with a Scheduler and a Search Algorithm

In [None]:
# ASHA early stopping: trials run at most EPOCHS reported iterations, may be
# stopped after the first one (grace_period=1), with reduction factor 2.
scheduler = ASHAScheduler(max_t=EPOCHS, grace_period=1, reduction_factor=2)
# HyperOpt-based search algorithm proposes the configurations.
search_algo = HyperOptSearch()

# Maximize the "CrackIoU" metric reported by `train` over NUM_SAMPLES trials.
tune_config = tune.TuneConfig(
    metric="CrackIoU",
    mode="max",
    num_samples=NUM_SAMPLES,
    scheduler=scheduler,
    search_alg=search_algo)

# `with_parameters` binds the datasets to `train`; `with_resources` reserves
# CPUS_PER_TRIAL CPUs and GPUS_PER_TRIAL GPUs for every trial.
tuner = tune.Tuner(
    tune.with_resources(
        tune.with_parameters(train, train_data=train_data, val_data=val_data),
        resources={"cpu": CPUS_PER_TRIAL, "gpu": GPUS_PER_TRIAL}),
    tune_config=tune_config,
    param_space=search_space)

#### Running the Tuner

In [None]:
# Launch the hyperparameter search (long-running); `results` is consumed by
# the result-display cell below.
results = tuner.fit()

#### Displaying and Evaluating the best Result

In [None]:
# Pick the trial with the highest CrackIoU, considering every reported iteration.
best_result = results.get_best_result(metric="CrackIoU", mode="max", scope="all")
best_metrics = best_result.metrics
print("Best trial config: {}".format(best_result.config))
print("Best trial final validation loss: {}".format(best_metrics["Loss"]))
print("Best trial final validation CrackIoU: {}".format(best_metrics["CrackIoU"]))

# Alternative: evaluate only the single best trial.
# evaluate(test_data=test_data, best_result=best_result)

# One row per trial (filtered on max CrackIoU), ranked best first.
results_df = (
    results
    .get_dataframe(filter_metric="CrackIoU", filter_mode="max")
    .sort_values("CrackIoU", ascending=False)
)

evaluate_df(test_data=test_data, results_df=results_df)