## Info
This notebook is designed for hyperparameter search. The experimental configurations are loaded from the `config.yaml` file, and the training setup is initialized accordingly. Note that absolute paths of the dataset files and model weights should be used in the `config.yaml` file.

During the training process:
- The Ray library (Ray Tune) is used to run the hyperparameter search.
- The training configuration file (`config.yaml`) is copied to the output directory for reference.
- Search progress can be monitored in the notebook.
- It can be useful to implement your own data sampling procedure (see `sample_subset`) to speed up the hyperparameter search.
- You can find more details about Ray Tune [here](https://docs.ray.io/en/latest/tune/index.html).

The setup ensures that key information related to hyperparameter search is easily accessible and logged for future analysis and model comparison.

In [None]:
import datetime
import os
import random
import shutil
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import numpy as np
import pandas as pd
import ray
from ray import tune
from ray.air import RunConfig
from ray.air import session
from ray.train import Checkpoint
from ray.tune.search import ConcurrencyLimiter
from ray.tune.search.bayesopt import BayesOptSearch
from ray.tune.schedulers import ASHAScheduler
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms as tt

from data import ClassificationDataset
from data import Transforms as T
from models import Evaluator, GMIC, GMICLoss, Trainer
from utils import Config

In [None]:
# torch.manual_seed(0)
# random.seed(0)

In [None]:
def sample_subset(metadata, n_breastid=100):
    """
    Placeholder hook for subsampling the dataset metadata.

    Parameters
    ----------
    metadata : object
        Dataset metadata to subsample.
    n_breastid : int, optional
        Not used by this placeholder implementation.

    Returns
    -------
    object
        The input ``metadata``, returned unchanged.
    """
    # No sampling is performed yet; plug in a custom sampling algorithm here.
    subset = metadata
    return subset

In [None]:
def get_dataset(cfg):
    """
    Build the training and validation datasets together with their transforms.

    Parameters
    ----------
    cfg : Config
        Configuration object. ``cfg.data.train_xlsx_path``, ``cfg.data.val_xlsx_path``,
        ``cfg.data.inp_height`` and ``cfg.data.inp_width`` are read.

    Returns
    -------
    tuple
        ``(train, val)`` ClassificationDataset objects. The training dataset receives
        both 'dicom' and 'pytorch' transforms; the validation dataset receives only
        'dicom' transforms (its 'pytorch' entry is None).
    """

    # Transforms listed under the 'dicom' key of the training set.
    dicom_train = [
        T.UIntToFloat32(),
        T.StandardScoreNormalization(),
        T.RandomGaussianNoise(mean=.0, std=.005),
    ]

    # Random torchvision augmentations, applied to the training set only.
    augmentations = [
        tt.RandomHorizontalFlip(p=0.5),
        tt.RandomRotation([-15, +15]),
        tt.RandomAffine(degrees=0, translate=(0, 0.1), shear=(-25, +25)),
        tt.RandomResizedCrop((cfg.data.inp_height, cfg.data.inp_width), scale=(0.8, 1.6)),
    ]

    transform_train = {'dicom': dicom_train, 'pytorch': tt.Compose(augmentations)}

    # Validation uses no random noise and no torchvision augmentation.
    transform_val = {
        'dicom': [T.UIntToFloat32(), T.StandardScoreNormalization()],
        'pytorch': None,
    }

    train = ClassificationDataset(cfg.data.train_xlsx_path, transform=transform_train)
    val = ClassificationDataset(cfg.data.val_xlsx_path, transform=transform_val)

    # Optional: shrink the training set to speed up the search.
    # train.metadata = sample_subset(train.metadata, n_breastid=250)

    return train, val

In [None]:
def crate_dataloaders(cfg, train, val):
    """
    Creates data loaders for training and validation datasets.

    The training loader reshuffles the samples every epoch, while the validation
    loader iterates in a fixed order so that evaluation is reproducible.

    Note: the function name (``crate_dataloaders``) is kept as-is so that existing
    callers keep working.

    Parameters
    ----------
    cfg : Config
        Configuration object; ``cfg.data.batch_size`` is used as the batch size of both loaders
    train : Dataset
        Training dataset
    val : Dataset
        Validation dataset

    Returns
    -------
    tuple
        (train_loader, val_loader) - Tuple containing training and validation DataLoader objects
    """

    # Shuffle only the training data; shuffling validation data has no benefit
    # and makes per-batch results harder to compare between runs.
    train_loader = DataLoader(train, batch_size=cfg.data.batch_size, shuffle=True,
                              num_workers=2, pin_memory=True)
    val_loader = DataLoader(val, batch_size=cfg.data.batch_size, shuffle=False,
                            num_workers=2, pin_memory=True)

    return train_loader, val_loader

In [None]:
def get_model(cfg):
    """
    Build the GMIC model, load its pretrained weights and move it to the configured device.

    Parameters
    ----------
    cfg : Config
        Configuration object. ``cfg.gmic_parameters`` is passed to the model constructor,
        ``cfg.model.weight_path`` points to the weight file and
        ``cfg.gmic_parameters.device_type`` is the target device.

    Returns
    -------
    model
        The GMIC model with the loaded weights, placed on the target device.
    """

    network = GMIC(cfg.gmic_parameters)

    # Deserialize the weights onto the CPU first; the model is moved to the target device afterwards.
    cpu = torch.device('cpu')
    state_dict = torch.load(cfg.model.weight_path, map_location=cpu)

    # strict=False: keys that do not match between the file and the model are not treated as errors.
    network.load_state_dict(state_dict, strict=False)

    target_device = cfg.gmic_parameters.device_type
    network.to(target_device)

    return network

In [None]:
def objective(config, training_cfg_path):
    """
    Ray Tune trainable: train and evaluate the model for one hyperparameter configuration.

    Parameters
    ----------
    config : dict
        Sampled hyperparameters; the keys ``'lr'`` and ``'beta'`` are read.
    training_cfg_path : str
        Path to the training configuration YAML file.

    Returns
    -------
    None
        After every epoch the metrics and a checkpoint are reported to Ray Tune
        via session.report().
    """

    # Load the base configuration and override it with the sampled hyperparameters.
    cfg = Config(training_cfg_path)
    cfg['train']['lr'] = config['lr']
    cfg['train']['beta'] = config['beta']

    # Data and model.
    train_set, val_set = get_dataset(cfg)
    train_loader, val_loader = crate_dataloaders(cfg, train_set, val_set)
    model = get_model(cfg)

    # Loss, optimizer and the training / evaluation drivers.
    criterion = GMICLoss(beta=cfg.train.beta)
    optimizer = optim.Adam(model.parameters(), lr=cfg.train.lr, weight_decay=0.001)
    trainer = Trainer(
        criterion=criterion,
        model=model,
        optimizer=optimizer,
        total_epochs=cfg.train.epoch,
        data_loader=train_loader,
    )
    evaluator = Evaluator(model=model, data_loader=val_loader)

    for epoch in range(cfg.train.epoch):
        train_metrics = trainer.fit(prog_bar=False)
        val_metrics = evaluator.evaluate(prog_bar=False)

        metrics = {
            'train/roc_auc': train_metrics['roc']['auc'],
            'train/pr_auc': train_metrics['pr']['auc'],
            'total_loss': train_metrics['total_loss'],
            'val/roc_auc': val_metrics['roc']['auc'],
            'val/pr_auc': val_metrics['pr']['auc'],
        }

        # Write the current model state into a 'checkpoint' folder inside the trial directory.
        checkpoint_dir = os.path.join(session.get_trial_dir(), 'checkpoint')
        os.makedirs(checkpoint_dir, exist_ok=True)
        state = {'epoch': epoch, 'model_state': model.state_dict()}
        torch.save(state, os.path.join(checkpoint_dir, 'checkpoint.pt'))

        # Send the metrics of this epoch, together with the checkpoint, back to Tune.
        session.report(metrics=metrics, checkpoint=Checkpoint.from_directory(checkpoint_dir))

In [None]:
# Start Ray. Binding the dashboard to 0.0.0.0 makes the Ray Dashboard accessible from any IP address.
# NOTE(review): this exposes the dashboard on every network interface; use it only on a trusted network.
ray.init(dashboard_host='0.0.0.0')  

In [None]:
# If the search is interrupted, set resume=True to continue from the last checkpoint
# instead of starting a new search.
# NOTE: the experiment name defined below contains today's date, so when resuming on a
# later day make sure the experiment name still matches the interrupted run.
resume = False

In [None]:
# Training configuration path.
# NOTE: the paths *inside* the config file (dataset files, model weights) must be absolute,
# because the trials are executed by Ray worker processes and relative paths may not resolve there.
training_cfg_path = '/home/user/project_name/notebooks/batch_norm_experiment/config.yaml'

# Output path of ray results.
storage_path = '/home/user/project_name/models/hparam_search/'

# Define experiment name. Date will be an identifier for hyperparameter optimization experiments.
# When resuming a search on a later day, set the name of the interrupted experiment explicitly.
date = datetime.date.today().strftime('%Y_%m_%d')
experiment_name = '{}_lr.beta_search'.format(date)

# Directory that holds everything belonging to this experiment.
experiment_dir = os.path.join(storage_path, experiment_name)

# Define hyperparameter search space.
search_space = {
    'lr':   tune.loguniform(10**-5.5, 10**-4),
    'beta': tune.loguniform(10**-5.5, 10**-3.5),
}

# Trainable method for ray tune.
# Each trial requests a single GPU: CUDA_VISIBLE_DEVICES exposes only one device and the model is
# moved to a single device, so requesting more GPUs per trial than are visible would keep trials
# from being scheduled. Keep this value in sync with CUDA_VISIBLE_DEVICES.
trainable_with_gpu = tune.with_resources(tune.with_parameters(objective, training_cfg_path=training_cfg_path),
                                         resources={'cpu': 2, 'gpu': 1})

# Hyperparameter search algorithm: grid, random, or Bayesian Optimization.
# None falls back to Tune's default search algorithm.
bayesopt = None # BayesOptSearch(metric='train/pr_auc', mode='max')

# Tune configuration
tune_config = tune.TuneConfig(num_samples=50,
                              scheduler=ASHAScheduler(metric='val/pr_auc', mode='max', grace_period=10),
                              search_alg=bayesopt)

# Run configuration
run_config = RunConfig(name=experiment_name, storage_path=storage_path)

if not resume:
    tuner = tune.Tuner(trainable_with_gpu,
                       tune_config=tune_config,
                       param_space=search_space,
                       run_config=run_config,
                       )

    # Copy training configuration like dataset file paths, model path, etc. into this folder too.
    # Make sure the folder exists first; otherwise copy2 would create a plain *file* named like
    # the experiment instead of placing config.yaml inside the experiment folder.
    os.makedirs(experiment_dir, exist_ok=True)
    shutil.copy2(training_cfg_path, experiment_dir)
else:
    tuner = tune.Tuner.restore(path=experiment_dir,
                               trainable=trainable_with_gpu)

# Start hyperparameter search.
results = tuner.fit()

In [None]:
# Folder that contains the results of every trial of this experiment.
experiment_path = os.path.join(storage_path, experiment_name)

# Load the stored results of the experiment.
analysis = tune.ExperimentAnalysis(experiment_path)

# Columns to show: sampled hyperparameters first, reported metrics afterwards.
columns = [
    'config/lr',
    'config/beta',
    'val/roc_auc',
    'val/pr_auc',
    'train/roc_auc',
    'train/pr_auc',
    'total_loss',
]

# Stack the per-trial result tables into one DataFrame (one row per reported result),
# order it by validation PR-AUC (best first) and keep only the selected columns.
per_trial_frames = analysis.trial_dataframes.values()
all_trials = pd.concat(per_trial_frames, ignore_index=True)
all_trials = all_trials.sort_values('val/pr_auc', ascending=False)
all_trials = all_trials[columns]

# See the best results
all_trials.head()

In [None]:
# Print the exact hyperparameter values of the top row.
# Columns 0 and 1 of `all_trials` are 'config/lr' and 'config/beta'.
best_lr = all_trials.iloc[0, 0]
print(f'LR of best result: {best_lr}')
best_beta = all_trials.iloc[0, 1]
print(f'Beta of best_result: {best_beta}')