## Data generation

This notebook creates MMLJobDescriptions - configurations for running training and inference based on the MML framework. MMLJobDescriptions may be rendered as bash scripts that contain the commands to start each of the MML experiments to generate the predictions. Alternatively, the jobs can be started via a runner, which supports both remote (LSF cluster) and local job submission. To run the notebook and the actual experiments, the MML framework ('mml-core') needs to be installed. In addition, 'mml-data' provides the necessary code to create the data locally, and the provided MML plugin for the data splitting tag ('mml-prevalences') needs to be installed as well.

All resulting bash scripts will be stored in the "training_scripts" folder.

In [1]:
import copy
import os
from pathlib import Path

import mml.interactive
import pandas as pd
import torch

# initialize interactive MML usage
mml.interactive.init(Path(mml.__file__).parent / 'mml.env')
# import default runner and planning utils
from mml.interactive.loading import get_task_structs
from mml.interactive.planning import MMLJobDescription, SubprocessJobRunner

# if mml-lsf is installed also LSF runner is imported
try:
    from mml_lsf.requirements import LSFSubmissionRequirements
    from mml_lsf.runner import LSFJobRunner

    LSF_AVAILABLE = True
except ImportError:
    LSFSubmissionRequirements = None
    LSFJobRunner = None
    LSF_AVAILABLE = False
# list of tasks used in this study
from prev.data_loading import all_tasks

# setting the necessary paths (everything is resolved relative to the notebook folder)
current_path = os.getcwd()
# This cell ends with a chdir into 'training_scripts'. When it is executed a second time in the
# same kernel the working directory already is that folder, so step back to the notebook folder.
if Path(current_path).name == 'training_scripts':
    current_path = str(Path(current_path).parent)
DATA_PATH = Path(current_path).parent / 'data'
RESULT_PATH = Path(current_path).parent / 'results'
assert DATA_PATH.exists() and RESULT_PATH.exists(), \
    f'expected existing folders {DATA_PATH} and {RESULT_PATH}'
TRAINING_SCRIPTS_PATH = Path(current_path) / 'training_scripts'
# make sure the output folder exists, os.chdir would raise FileNotFoundError otherwise
TRAINING_SCRIPTS_PATH.mkdir(parents=True, exist_ok=True)
# here we will write out commands
os.chdir(TRAINING_SCRIPTS_PATH)

In [3]:
# Requirements attached as prefix to every job description:
# LSF submission settings if mml-lsf could be imported, the MML defaults otherwise.
if not LSF_AVAILABLE:
    reqs = mml.interactive.planning.DefaultRequirements()
else:
    reqs = LSFSubmissionRequirements(
        num_gpus=1,
        vram_per_gpu=11.0,
        queue='gpu',
        script_name='mml_0_13.sh',
    )

## Data preparation

In [4]:
# The data preparation consists of three consecutive steps, each written to its own command file.

# step one: task creation, download datasets, set up task descriptions (one job for all tasks)
create_cmds = [
    MMLJobDescription(prefix_req=reqs, mode='create', config_options={'task_list': all_tasks}),
]
# step two: preprocess the data (one job per task)
pp_cmds = [
    MMLJobDescription(
        prefix_req=reqs,
        mode='pp',
        config_options={'task_list': [task], 'preprocessing': 'default'},
    )
    for task in all_tasks
]
# step three: redistribute the splits according to the strategy defined in the mml_plugin (one job per task)
tag_cmds = [
    MMLJobDescription(
        prefix_req=reqs,
        mode='info',
        config_options={'task_list': [task], 'tagging.all': '+miccai?1337', 'preprocessing': 'default'},
    )
    for task in all_tasks
]
# generate descriptions
for cmd_list, file_name in ((create_cmds, '01_create_cmds'),
                            (pp_cmds, '02_pp_cmds'),
                            (tag_cmds, '03_tag_cmds')):
    mml.interactive.write_out_commands(cmd_list=cmd_list, name=file_name)

## standard experiment repeated

In [5]:
# Reproduction of the original computations with the updated mml backend:
# one train + predict job for every combination of random seed and task.
project = 'mic23_predictions_reproduce'
predict_cmds = []
for ix in range(10, 16):
    for t in all_tasks:
        split_task = f'{t}+miccai?1337'
        nested_task = f'{split_task}+nested?0'
        opts = {
            'sampling.balanced': True,
            'sampling.batch_size': 300,
            'callbacks': 'early',
            # NOTE(review): this key starts with a blank - kept verbatim, confirm that it is intended
            ' loss.auto_activate_weighing': False,
            '+callbacks.early.patience': 7,
            'mode.nested': False,
            'mode.cv': False,
            'lr_scheduler': 'plateau',
            'lr_scheduler.patience': 5,
            'mode.store_parameters': True,
            'seed': ix,
            'mode.subroutines': '[train,predict]',
            'proj': f'{project}_{ix}',
            'task_list': [split_task, nested_task],
            'pivot.name': split_task,
            'mode.eval_on': [split_task, nested_task],
            'trainer.max_epochs': 40,
            'augmentations': 'baseline256',
            'reuse.clean_up.parameters': True,
            'preprocessing': 'default',
            'trainer.min_epochs': 5,
        }
        predict_cmds.append(MMLJobDescription(prefix_req=reqs, config_options=opts, mode='train'))
mml.interactive.write_out_commands(cmd_list=predict_cmds, name='04_reproduce_cmds')

## Interactive job submission

The newly introduced runner class allows submitting the generated jobs (or running them locally) directly from the notebook, as an alternative to iterating over the generated command .txt files.

In [68]:
# use the LSF runner whenever mml-lsf could be imported, fall back to the subprocess runner otherwise
if LSF_AVAILABLE:
    runner = LSFJobRunner()
else:
    runner = SubprocessJobRunner()

In [None]:
# hand every job description of the reproduction experiment (`predict_cmds`) to the selected runner
for job in predict_cmds:
    runner.run(job)

## re-distribution ablation experiments
The tags created here (`+miccai?<seed>`) would have to replace the `+miccai?1337` tags given in the prediction commands.

In [6]:
# seeds of the alternative data re-distributions; each one is used as `miccai?<seed>` tag below
dist_seeds = [3, 31, 314, 3141, 31415]

In [8]:
# one tagging job per alternative split seed, each covering all tasks at once
alt_dist_seed_cmds = [
    MMLJobDescription(
        prefix_req=reqs,
        mode='info',
        config_options={
            'task_list': all_tasks,
            'tagging.all': f'+miccai?{split_seed}',
            'preprocessing': 'default',
            'proj': 'default',
        },
    )
    for split_seed in dist_seeds
]
mml.interactive.write_out_commands(cmd_list=alt_dist_seed_cmds, name='05_dataseed_cmds')

In [9]:
# Train + predict jobs on the alternative data splits: one job per (split seed, task).
project = 'mic23_predictions_datasplit_seed'
predict_dataseed_cmds = []
for seed in dist_seeds:
    for t in all_tasks:
        split_task = f'{t}+miccai?{seed}'
        nested_task = f'{split_task}+nested?0'
        opts = {
            'sampling.balanced': True,
            'sampling.batch_size': 300,
            'callbacks': 'early',
            # NOTE(review): this key starts with a blank - kept verbatim, confirm that it is intended
            ' loss.auto_activate_weighing': False,
            '+callbacks.early.patience': 7,
            'mode.nested': False,
            'mode.cv': False,
            'lr_scheduler': 'plateau',
            'lr_scheduler.patience': 5,
            'mode.store_parameters': True,
            # the training seed is kept constant in these experiments, only the data split varies
            'seed': 42,
            'mode.subroutines': '[train,predict]',
            'proj': f'{project}_{seed}',
            'task_list': [split_task, nested_task],
            'pivot.name': split_task,
            'mode.eval_on': [split_task, nested_task],
            'trainer.max_epochs': 40,
            'augmentations': 'baseline256',
            'reuse.clean_up.parameters': True,
            'preprocessing': 'default',
            'trainer.min_epochs': 5,
        }
        predict_dataseed_cmds.append(MMLJobDescription(prefix_req=reqs, config_options=opts, mode='train'))
mml.interactive.write_out_commands(cmd_list=predict_dataseed_cmds, name='06_dataseed_predict_cmds')

## Adapt loss weights during training 

In our extension we also tested the impact of adapting the cross entropy weights during training (both with perfectly known and imperfectly estimated prevalences). For prevalence estimation we rely on results of the ACC method. These are stored as `24_prev_estimation_df.pkl` in the `results` folder.

In [10]:
def imbalance_ratio(class_prevalences: torch.Tensor) -> torch.Tensor:
    """Calculates the imbalance ratio.

    :param class_prevalences: tensor holding one prevalence value per class
    :return: 0-dim tensor with the largest prevalence divided by the smallest one
    """
    min_frac = torch.min(class_prevalences)
    max_frac = torch.max(class_prevalences)
    return max_frac / min_frac


def scale_prevalences_ir(prev_class_prevalences: torch.Tensor, ir: float = 1.) -> torch.Tensor:
    """Re-implements the same method in `prev.scaling` but returns the prevalences instead of data.

    The input tensor is left untouched, a new tensor is returned.

    :param prev_class_prevalences: tensor holding one prevalence value per class
    :param ir: imbalance ratio (largest / smallest prevalence) the result shall have
    :return: prevalences that sum up to one and have an imbalance ratio of ``ir``
    :raises AssertionError: if the re-scaled prevalences do not reach the requested imbalance ratio
    """
    # compute the original imbalance ratio
    orig_ir = imbalance_ratio(prev_class_prevalences)
    min_frac = torch.min(prev_class_prevalences)
    max_frac = torch.max(prev_class_prevalences)
    if ir >= orig_ir:
        # find index of class with maximal prevalence (the first one in case of ties)
        max_class = torch.argmax(prev_class_prevalences)
        # downsample all but the max_class
        class_prevalences = (prev_class_prevalences * max_frac) / (min_frac * ir)
        class_prevalences[max_class] = prev_class_prevalences[max_class]
    else:
        # calculate the temperature, shrinks the distance of every class to the smallest class
        temp = (ir - 1) / (orig_ir - 1)
        class_prevalences = min_frac + temp * (prev_class_prevalences - min_frac)
    class_prevalences = class_prevalences / class_prevalences.sum()
    new_ir = imbalance_ratio(class_prevalences)
    # build the reference value with the dtype of new_ir, so an integer `ir` or a float64 input
    # does not lead to a dtype mismatch inside torch.isclose
    assert torch.isclose(new_ir, torch.tensor(ir, dtype=new_ir.dtype)), f"{ir=} {new_ir.item()=}"
    return class_prevalences

In [11]:
# gather prevalences in training data, stored per task under the task name without tags
train_prevs = {}
structs = get_task_structs(tasks=[t + '+miccai?1337' for t in all_tasks])
with mml.interactive.default_file_manager():
    for struct in structs:
        # strip the '+miccai?1337' tag again
        name = struct.name.split('+')[0]
        cls_occ = {idx: struct.class_occ[cls_name] for idx, cls_name in struct.idx_to_class.items()}
        assert set(cls_occ.keys()) == set(range(len(cls_occ)))  # no subclasses ?
        train_prevs[name] = torch.tensor([cls_occ[idx] for idx in range(len(cls_occ))],
                                         dtype=torch.float) / struct.num_samples
        # float32 rounding may let the sum deviate slightly from one, so compare with a tolerance
        assert torch.isclose(train_prevs[name].sum(), torch.tensor(1.0)), \
            f'prevalences of {name} sum up to {train_prevs[name].sum().item()}'

In [12]:
# Estimated prevalences (column 'ACC'), generated by notebook 3 on the original predictions.
# The resulting series is indexed by (task, ir).
prev_estimation_df = pd.read_pickle(RESULT_PATH / '24_prev_estimation_df.pkl')
estimated_prevalences = prev_estimation_df.set_index(['task', 'ir'])['ACC']

In [13]:
# function to determine cross entropy loss weights, when re-training with anticipated prevalence shift
def get_loss_weights(task: str, target_ir: float, balanced_sampling: bool = False,
                     estimated: bool = False) -> list:
    """Computes normalized cross entropy class weights for an anticipated prevalence shift.

    :param task: task name, used as key of `train_prevs` (and of `estimated_prevalences`)
    :param target_ir: imbalance ratio of the anticipated deployment prevalences
    :param balanced_sampling: if True the target prevalences themselves are used as weights,
        otherwise they are divided by the training prevalences
    :param estimated: if True the target prevalences are looked up in `estimated_prevalences`,
        otherwise they are derived from the training prevalences via `scale_prevalences_ir`
    :return: list with one weight per class, the weights sum up to one
    """
    current = train_prevs[task]
    # named `target` instead of `eval` to not shadow the builtin
    if estimated:
        target = torch.tensor(estimated_prevalences.loc[task, target_ir])
    else:
        target = scale_prevalences_ir(current, target_ir)
    if balanced_sampling:
        weights = target  # model will see each class equally often
    else:
        weights = target / current  # model will see prevalent classes more often
    return (weights / weights.sum()).tolist()

In [14]:
# Re-training with loss weights derived from the re-scaled training prevalences:
# one train + predict job per task and target imbalance ratio.
adapted_retraining_cmds = []
target_irs = [half_steps / 2 for half_steps in range(2, 21)]  # 1.0, 1.5, ..., 10.0
for ix in range(1):  # we did not repeat this with multiple random seeds
    for t in all_tasks:
        split_task = f'{t}+miccai?1337'
        nested_task = f'{split_task}+nested?0'
        for ir in target_irs:
            class_weights = get_loss_weights(task=t, target_ir=ir, balanced_sampling=True)
            opts = {
                'sampling.balanced': True,
                'sampling.batch_size': 300,
                'callbacks': 'early',
                # NOTE(review): this key starts with a blank - kept verbatim, confirm that it is intended
                ' loss.auto_activate_weighing': False,
                '+callbacks.early.patience': 7,
                'mode.nested': False,
                'mode.cv': False,
                'lr_scheduler': 'plateau',
                'lr_scheduler.patience': 5,
                'mode.store_parameters': True,
                'seed': ix,
                'mode.subroutines': '[train,predict]',
                'proj': f'mic23_predictions_extension_balanced_{ix}_{ir}',
                'task_list': [split_task, nested_task],
                'pivot.name': split_task,
                'mode.eval_on': [split_task, nested_task],
                'trainer.max_epochs': 40,
                'augmentations': 'baseline256',
                'reuse.clean_up.parameters': True,
                'loss.class_weights': class_weights,
                'preprocessing': 'default',
                'trainer.min_epochs': 5,
            }
            adapted_retraining_cmds.append(MMLJobDescription(prefix_req=reqs, mode='train', config_options=opts))
mml.interactive.write_out_commands(cmd_list=adapted_retraining_cmds, name='07_retraining_cmds')

In [15]:
# Re-training with loss weights derived from the estimated prevalences:
# one train + predict job per task and target imbalance ratio.
adapted_retraining_estimated_cmds = []
target_irs = [half_steps / 2 for half_steps in range(2, 21)]  # 1.0, 1.5, ..., 10.0
for ix in range(1):  # we did not repeat this with multiple random seeds
    for t in all_tasks:
        split_task = f'{t}+miccai?1337'
        nested_task = f'{split_task}+nested?0'
        for ir in target_irs:
            class_weights = get_loss_weights(task=t, target_ir=ir, balanced_sampling=True, estimated=True)
            opts = {
                'sampling.balanced': True,
                'sampling.batch_size': 300,
                'callbacks': 'early',
                # NOTE(review): this key starts with a blank - kept verbatim, confirm that it is intended
                ' loss.auto_activate_weighing': False,
                '+callbacks.early.patience': 7,
                'mode.nested': False,
                'mode.cv': False,
                'lr_scheduler': 'plateau',
                'lr_scheduler.patience': 5,
                'mode.store_parameters': True,
                'seed': ix,
                'mode.subroutines': '[train,predict]',
                'proj': f'mic23_predictions_extension_balanced_estimated_{ix}_{ir}',
                'task_list': [split_task, nested_task],
                'pivot.name': split_task,
                'mode.eval_on': [split_task, nested_task],
                'trainer.max_epochs': 40,
                'augmentations': 'baseline256',
                'reuse.clean_up.parameters': True,
                'loss.class_weights': class_weights,
                'preprocessing': 'default',
                'trainer.min_epochs': 5,
            }
            adapted_retraining_estimated_cmds.append(
                MMLJobDescription(prefix_req=reqs, mode='train', config_options=opts))
mml.interactive.write_out_commands(cmd_list=adapted_retraining_estimated_cmds, name='08_retraining_estimated_cmds')