In [1]:
# this notebook generates all commands for the recent mml version 
import dataclasses
import os
import warnings
from pathlib import Path
from typing import Dict, Union

try:
    import mml.interactive
except ImportError:
    raise RuntimeError('This reproduction expects a recent version of MML - please refer to the README for detailed instructions.')

mml.interactive.init(Path('~/.config/mml.env').expanduser())
from mml.interactive import DefaultRequirements, MMLJobDescription
from mml_tf.tasks import all_tasks, get_valid_sources, shrinkable_tasks, target_tasks, source_tasks, train_tasks, \
    all_tasks_including_shrunk, task_infos

CLUSTER_USAGE = False  # change if you (want / do not want) to run on the cluster

if CLUSTER_USAGE:
    from mml_lsf.requirements import LSFSubmissionRequirements

 _____ ______   _____ ______   ___
|\   _ \  _   \|\   _ \  _   \|\  \
\ \  \\\__\ \  \ \  \\\__\ \  \ \  \
 \ \  \\|__| \  \ \  \\|__| \  \ \  \
  \ \  \    \ \  \ \  \    \ \  \ \  \____
   \ \__\    \ \__\ \__\    \ \__\ \_______\
    \|__|     \|__|\|__|     \|__|\|_______|
         ____  _  _    __  _  _  ____  _  _
        (  _ \( \/ )  (  )( \/ )/ ___)( \/ )
         ) _ ( )  /    )( / \/ \\___ \ )  /
        (____/(__/    (__)\_)(_/(____/(__/
Interactive MML API initialized.


In [2]:
# note that final experiments have to be run multiple times to ensure significance
rerun = 3

In [3]:
if CLUSTER_USAGE:
    # cluster submission prepends, add yours here in case you have other gpu requirements
    base_reqs = LSFSubmissionRequirements(special_requirements=[],
                                          undesired_hosts=['e230-dgx2-2', 'e230-dgxa100-2', 'e230-dgxa100-4',
                                                           'e230-dgxa100-1',
                                                           'e230-dgxa100-2', 'e230-dgxa100-3', 'e230-dgxa100-4',
                                                           'lsf22-gpu08', 'lsf22-gpu01', 'lsf22-gpu02', 'lsf22-gpu03',
                                                           'lsf22-gpu04', 'lsf22-gpu05', 'lsf22-gpu06', 'lsf22-gpu07'],
                                          num_gpus=1, vram_per_gpu=11.0, queue='gpu-lowprio',
                                          mail='EMAIL.ADDRESS@dkfz-heidelberg.de', script_name='mml.sh',
                                          job_group='/USERNAME/pami_rerun'
                                          )
    # see for example this local setup
    # base_reqs = pp_reqs = aa_reqs = def_reqs = arch_reqs = tl_reqs = multi_reqs = nb.DefaultRequirements()
    pp_reqs = dataclasses.replace(base_reqs, queue='gpu')
    aa_reqs = dataclasses.replace(base_reqs, script_name='aa.sh', vram_per_gpu=13.0)
    def_reqs = dataclasses.replace(base_reqs, special_requirements=['tensorcore'])
    tl_reqs = dataclasses.replace(base_reqs, special_requirements=['tensorcore'], vram_per_gpu=24.0)
    multi_reqs = dataclasses.replace(base_reqs, special_requirements=['tensorcore'], vram_per_gpu=14.0)
else:
    base_reqs = pp_reqs = aa_reqs = def_reqs = tl_reqs = multi_reqs = DefaultRequirements()

In [4]:
# project overview -> points to MML projects we use, we will append indices for each "rerun"
projects = {
    'base': 'pami2_base_02',
    'raw_baseline': 'pami2_raw_03',
    'raw_shrunk': 'pami2_raw_shrunk_10',
    # aa search can not be carried out with recent MML version, we provide the created policies in data/auto_augmentations
    'aa_search': 'pami2_t_aa_search_01',
    # the above are shared with train (!) since stuff is only computed once anyway
    'transfer': 'pami2_t_transfer_20',
    'multi_task': 'test_multi_balanced_test_split_10', 
    'multi_shrunk': 'test_multi_balanced_shrunk_test_split_10',
    'arch_search': 'pami2_t_arch_search_02',
    'arch_infer': 'pami2_t_arch_infer_02',
    'aa_infer': 'pami2_t_aa_infer_02'
}

In [5]:
# prepare steps
prep_cmds = list()
# step one: plain task creation

prep_cmds.append(MMLJobDescription(prefix_req=pp_reqs, mode='create',
                                   config_options={'tasks': 'pami', 'proj': projects['base']}))
# step two: plain task preprocessing
prep_cmds.append(MMLJobDescription(prefix_req=pp_reqs, mode='pp',
                                   config_options={'tasks': 'pami', 'proj': projects['base']}))
# step three: shrunk preprocessing
prep_cmds.append(MMLJobDescription(prefix_req=pp_reqs, mode='info',
                                   config_options={'tasks': 'pami_shrinkable_800', 'proj': projects['base']}))
mml.interactive.write_out_commands(cmd_list=prep_cmds, name='prepare')

Stored 3 commands at prepare.txt.


In [6]:
# OPTIONALLY: compute dimensions (used for Fig. 3) and some additional experiments not shown in the paper
dim_cmds = list()
dim_cmds.append(MMLJobDescription(prefix_req=def_reqs, mode='dim', config_options={'tasks': 'pami_shrink_mix',
                                                                                   'proj': projects["base"],
                                                                                   'mode.inv_mle': True}))
mml.interactive.write_out_commands(cmd_list=dim_cmds, name='dimensions')

Stored 1 commands at dimensions.txt.


In [7]:
# convenience function for easier retrieve from cluster results, 
user_id = 'USERNAME'  
# use as print(get_retrieve_for_proj('my_project')) and run the result in a shell to get the results of 'my_project' from cluster to your local system
def get_retrieve_for_proj(proj):
    return f"rsync -rtvu --stats --exclude=PARAMETERS --exclude=hpo --exclude=runs --exclude=FIMS --exclude=FC_TUNED {user_id}@odcf-worker01:{os.getenv('MML_CLUSTER_RESULTS_PATH')}/{proj}/ {os.getenv('MML_RESULTS_PATH')}/{proj}"


# the following optimizes a jobs epochs in a way that target task is seen at least 40 epochs but at max 4000 steps (plus finishing the epoch)
def optimize_epochs(target_task: str, batch_size: int = 300, max_steps: int = 4000, max_epoch: int = 40) -> int:
    return min(max_epoch, (max_steps // ((int(task_infos.num_samples[target_task] * 0.8) // batch_size) + 1)) + 1)

In [8]:
# baselines
# these are the default options for all tasks, they should not be modified without justification
def get_default_config(target_task: str, shrunk: bool = False) -> Dict[str, Union[str, bool, int]]:
    if shrunk:
        epochs = 40
    else:
        epochs = optimize_epochs(target_task=target_task, batch_size=300, max_steps=4000, max_epoch=40)
    default_options = {'tasks': 'pami', 'pivot.name': t, 'mode.cv': False, 'mode.nested': False,
                       'mode.store_parameters': False, 'sampling.balanced': True,
                       'sampling.batch_size': 300, 'callbacks': 'none', 'lr_scheduler': 'step',
                       '+trainer.check_val_every_n_epoch': epochs,
                       'trainer.max_epochs': epochs, 'augmentations': 'baseline256', 'sampling.enable_caching': True}
    return default_options


base_cmds = list()
for ix in range(rerun):
    for t in all_tasks:
        opts = get_default_config(t)
        opts.update({'proj': f'{projects["raw_baseline"]}_{ix}', 'seed': ix, 'mode.store_parameters': True})
        base_cmds.append(MMLJobDescription(prefix_req=def_reqs, mode='train', config_options=opts))
        if t in shrinkable_tasks:
            shrink_opts = get_default_config(t, shrunk=True)
            shrink_opts.update({'proj': f'{projects["raw_shrunk"]}_{ix}', 'tasks': 'pami_shrink'})
            base_cmds.append(MMLJobDescription(prefix_req=def_reqs, mode='train', config_options=shrink_opts))
mml.interactive.write_out_commands(cmd_list=base_cmds, name='baseline')

Stored 381 commands at baseline.txt.


In [33]:
#################################
# EXPERIMENT 1: MODEL TRANSFER  #
#################################
# VRAM requirements for timm architectures
model_transfer_arch_reqs = {
    'tf_efficientnet_b2': 23.0,
    'tf_efficientnet_b2_ap': 24.0,
    'tf_efficientnet_b2_ns': 24.0,
    'tf_efficientnet_cc_b0_4e': 22.0,
    'swsl_resnet50': 20.0,
    'ssl_resnext50_32x4d': 24.0,
    'regnetx_032': 20.5,
    'regnety_032': 22.0,
    'rexnet_100': 20.5,
    'ecaresnet50d': 24.0,
    'cspdarknet53': 23.0,
    'mixnet_l': 25.0,
    'cspresnext50': 24.0,
    'cspresnet50': 18.0,
    'ese_vovnet39b': 25.0,
    'resnest50d': 25.5,
    'hrnet_w18': 24.0,
    'skresnet34': 16.5,
    'mobilenetv3_large_100': 13.5,
    'res2net50_26w_4s': 24.5
}
arch_cmds = list()
for ix in range(rerun):
    for t in source_tasks:
        for arch, vram in model_transfer_arch_reqs.items():
            opts = get_default_config(t)
            opts.update({'proj': f'{projects["arch_search"]}_{ix}',
                         'arch.name': arch, 'seed': ix})
            # the following goes back to a rare occurrence of incompatible singleton batches with some batch_norms
            # avoid this by minimally wiggle batch size
            if t == 'mura_xr_wrist' and arch in ['rexnet_100', 'resnest50d', 'skresnet34']:
                opts.update({'sampling.batch_size': 301})
            if CLUSTER_USAGE:
                arch_reqs = dataclasses.replace(def_reqs, vram_per_gpu=vram)
            else:
                arch_reqs = def_reqs
            arch_cmds.append(MMLJobDescription(prefix_req=arch_reqs, mode='train',
                                               config_options=opts))
mml.interactive.write_out_commands(cmd_list=arch_cmds, name='full_arch', max_cmds=2000)
arch_shrunk_cmds = list()
for ix in range(rerun):
    for t in target_tasks:
        if task_infos.num_classes[t] > 40 or task_infos.num_samples[t] <= 1000:
            continue
        for arch, vram in model_transfer_arch_reqs.items():
            opts = get_default_config(t, shrunk=True)
            opts.update({'proj': f'{projects["arch_infer"]}_{ix}', 'tasks': 'pami_shrink',
                         'arch.classification.id': arch, 'seed': ix})
            if CLUSTER_USAGE:
                arch_reqs = dataclasses.replace(def_reqs, vram_per_gpu=vram)
            else:
                arch_reqs = def_reqs
            arch_shrunk_cmds.append(MMLJobDescription(prefix_req=arch_reqs, mode='train',
                                                         config_options=opts))
mml.interactive.write_out_commands(cmd_list=arch_shrunk_cmds, name='arch_shrunk', max_cmds=2000)

Stored 2000 commands at full_arch_0.txt.
Stored 2000 commands at full_arch_1.txt.
Stored 260 commands at full_arch_2.txt.
Stored 2000 commands at arch_shrunk_0.txt.
Stored 160 commands at arch_shrunk_1.txt.


In [10]:
####################################
# EXPERIMENT 2: TRANSFER LEARNING  #
####################################
trans_cmds = list()
for ix in range(rerun):
    for t in target_tasks:
        # only small tasks are used as targets
        if task_infos.num_classes[t] > 40:
            continue
        for s in get_valid_sources(t):
            mod_task_file = 'pami' if task_infos.num_samples[t] <= 1000 else 'pami_shrink'
            opts = get_default_config(t, shrunk=True)
            opts.update({'proj': f'{projects["transfer"]}_{ix}', 'tasks': mod_task_file,
                         'reuse.models': f'{projects["raw_baseline"]}_{ix}', 'mode.pretrain_task': s,
                         'seed': ix})
            trans_cmds.append(MMLJobDescription(prefix_req=def_reqs, config_options=opts, mode='tl'))
mml.interactive.write_out_commands(cmd_list=trans_cmds, name='transfer', max_cmds=2000)

Stored 2000 commands at transfer_0.txt.
Stored 2000 commands at transfer_1.txt.
Stored 2000 commands at transfer_2.txt.
Stored 2000 commands at transfer_3.txt.
Stored 496 commands at transfer_4.txt.


In [11]:
######################################
# EXPERIMENT 3: AUG POLICY TRANSFER  #
######################################
# Step 1:  training the auto augmentation pipeline for each potential source
if not all([(Path(os.getenv('MML_RESULTS_PATH')) / (projects['aa_search'] + f'_{ix}')).exists() for ix in range(rerun)]):
    raise RuntimeError(f"AA mode is not supported anymore with the recent version of MML, you need to import the following projects manually -> pami2_t_aa_search_01_0, pami2_t_aa_search_01_1 and pami2_t_aa_search_01_2 from the data/auto_augmentations folder. Put these to your MML results folder at {os.getenv('MML_RESULTS_PATH')}.")
# Step 2: evaluating the augmentation pipeline
policy_cmds = list()
for ix in range(rerun):
    for t in target_tasks:
        # only small tasks are used as targets
        if task_infos.num_classes[t] > 40:
            continue
        for s in get_valid_sources(t):
            mod_task_file = 'pami' if task_infos.num_samples[t] <= 1000 else 'pami_shrink'
            opts = get_default_config(t, shrunk=True)
            opts.update({'proj': f'{projects["aa_infer"]}_{ix}', 'tasks': mod_task_file,
                         '+reuse.aa': f'{projects["aa_search"]}_{ix}',
                         'augmentations': 'load_aa_from',
                         'augmentations.source': s, 'seed': ix})
            # note that we use the aatrain mode here to inject the augmentation
            policy_cmds.append(MMLJobDescription(prefix_req=def_reqs, config_options=opts, mode='aatrain'))
mml.interactive.write_out_commands(cmd_list=policy_cmds, name='policy', max_cmds=2000)

Stored 2000 commands at policy_0.txt.
Stored 2000 commands at policy_1.txt.
Stored 2000 commands at policy_2.txt.
Stored 2000 commands at policy_3.txt.
Stored 496 commands at policy_4.txt.


In [12]:
######################################
# EXPERIMENT 4: MULTI-TASK LEARNING  #
######################################
# We did not use full multitask learning with full sized target tasks in the paper (except for small tasks)
multi_cmds = list()
for ix in range(rerun):
    for t in target_tasks:
        for s in get_valid_sources(t):
            opts = get_default_config(t)
            opts.update(
                {
                    'proj': f'{projects["multi_task"]}_{ix}',
                    'mode.multitask': 2,
                    'sampling.balanced': True,
                    'mode.co_tasks': [s],
                    'sampling.sample_num': int(0.8 * task_infos.num_samples[t]),
                    'loss.auto_activate_weighing': False, 'seed': ix})
            multi_cmds.append(MMLJobDescription(prefix_req=def_reqs, config_options=opts, mode='train'))
mml.interactive.write_out_commands(cmd_list=multi_cmds, name='full_multi', max_cmds=2000)

multi_shrunk_cmds = list()
for ix in range(rerun):
    for t in target_tasks:
        # unshrinkable or already covered above
        if task_infos.num_classes[t] > 40 or task_infos.num_samples[t] <= 1000:
            continue
        for s in get_valid_sources(t):
            opts = get_default_config(t, shrunk=True)
            opts.update(
                {'tasks': 'pami_shrink',
                 'proj': f'{projects["multi_shrunk"]}_{ix}',
                 'mode.multitask': 2,
                 'sampling.balanced': True,
                 'mode.co_tasks': [s],
                 'sampling.sample_num': min(int(0.8 * task_infos.num_samples[t]), 800),
                 'loss.auto_activate_weighing': False, 'seed': ix})
            multi_shrunk_cmds.append(MMLJobDescription(prefix_req=def_reqs, config_options=opts, mode='train'))
mml.interactive.write_out_commands(cmd_list=multi_shrunk_cmds, name='multi_shrunk', max_cmds=2000)

Stored 2000 commands at full_multi_0.txt.
Stored 2000 commands at full_multi_1.txt.
Stored 2000 commands at full_multi_2.txt.
Stored 2000 commands at full_multi_3.txt.
Stored 496 commands at full_multi_4.txt.
Stored 2000 commands at multi_shrunk_0.txt.
Stored 2000 commands at multi_shrunk_1.txt.
Stored 2000 commands at multi_shrunk_2.txt.
Stored 1026 commands at multi_shrunk_3.txt.


In [13]:
all_train_cmds = base_cmds + arch_cmds + arch_shrunk_cmds + trans_cmds + policy_cmds + multi_shrunk_cmds
print(f'Our experiments trained {len(all_train_cmds)} models.')

Our experiments trained 30819 models.


In [14]:
# if you want to submit jobs to the cluster or run them locally, consider the runner functionality
# see mml_lsf README instructions on how to set this up 
# the following demonstrates submission of the baseline jobs
if CLUSTER_USAGE:
    from mml_lsf.runner import LSFJobRunner

    runner = LSFJobRunner()
    for job in base_cmds:
        runner.run(job)

In [15]:
# after running all experiments results can be transferred back with these retrieve commands
if CLUSTER_USAGE:
    sync_cmds = list()
    for ix in range(rerun):
        for proj_id in ['multi_task', 'aa_infer', 'transfer', 'arch_search', 'raw_shrunk',
                        'raw_baseline', 'multi_shrunk', 'arch_infer']:
            sync_cmds.append(get_retrieve_for_proj(f'{projects[proj_id]}_{ix}'))
    with open(Path(os.path.abspath('')) / 'output_sync.txt', 'w') as file:
        file.write('\n'.join(sync_cmds))
    print(f'Stored {len(sync_cmds)} commands at output_sync.txt.')

## Feature and FIM extraction

This is how task feature extraction works. Note that full features comprise several GB and are not provided directly (also for licensing compatibility issues). The computed task distances are provided in the `cache` folder top-level.

In [16]:
updated_shrunk_task_list = [t.replace(' --shrink_train 800', '+shrink_train?800') for t in all_tasks_including_shrunk]

features_cmd = MMLJobDescription(prefix_req=def_reqs,
                                 config_options={'task_list': updated_shrunk_task_list, 'proj': 'pami2_features',
                                                 'distance': 'emd', 
                                                 'distance._mode.subroutines': ['feature'], 'augmentations': 'baseline256'},
                                 mode='similarity')
fim_cmd = MMLJobDescription(prefix_req=def_reqs,
                            config_options={'task_list': updated_shrunk_task_list, 'proj': 'pami2_fims_recent', 'distance': 'fed', 
                                            'distance._mode.subroutines': ['tune', 'fim'], 'sampling.sample_num': 8000,
                                            'sampling.balanced': True, 'distance.fim.samples': 2000,
                                            'augmentations': 'baseline256', }, mode='similarity')

In [17]:
# the following demonstrates how to run these locally from within this notebook
# CAUTION: it produces a lot of logging output to the notebook - consider running these commands in the terminal as described below
from mml.interactive import SubprocessJobRunner

local_reqs = DefaultRequirements()
runner = SubprocessJobRunner()
for job in [features_cmd, fim_cmd]:
    job.prefix_req = local_reqs
    # runner.run(job)  # uncomment to run

In [18]:
# want to run in the terminal - follow here
local_reqs = DefaultRequirements()
for job in [features_cmd, fim_cmd]:
    job.prefix_req = local_reqs
features_cmd.render()  # paste the output into terminal (remove surrounding quotes) takes ~20 minutes

"mml emd task_list=['lapgyn4_anatomical_structures','lapgyn4_surgical_actions','lapgyn4_instrument_count','lapgyn4_anatomical_actions','sklin2_skin_lesions','identify_nbi_infframes','laryngeal_tissues','nerthus_bowel_cleansing_quality','stanford_dogs_image_categorization','svhn','caltech101_object_classification','caltech256_object_classification','cifar10_object_classification','cifar100_object_classification','mnist_digit_classification','emnist_digit_classification','hyperkvasir_anatomical-landmarks','hyperkvasir_pathological-findings','hyperkvasir_quality-of-mucosal-views','hyperkvasir_therapeutic-interventions','cholec80_grasper_presence','cholec80_bipolar_presence','cholec80_hook_presence','cholec80_scissors_presence','cholec80_clipper_presence','cholec80_irrigator_presence','cholec80_specimenbag_presence','derm7pt_skin_lesions','idle_action_recognition','barretts_esophagus_diagnosis','brain_tumor_classification','mednode_melanoma_classification','brain_tumor_type_classification'

In [19]:
fim_cmd.render()

"mml fed task_list=['lapgyn4_anatomical_structures','lapgyn4_surgical_actions','lapgyn4_instrument_count','lapgyn4_anatomical_actions','sklin2_skin_lesions','identify_nbi_infframes','laryngeal_tissues','nerthus_bowel_cleansing_quality','stanford_dogs_image_categorization','svhn','caltech101_object_classification','caltech256_object_classification','cifar10_object_classification','cifar100_object_classification','mnist_digit_classification','emnist_digit_classification','hyperkvasir_anatomical-landmarks','hyperkvasir_pathological-findings','hyperkvasir_quality-of-mucosal-views','hyperkvasir_therapeutic-interventions','cholec80_grasper_presence','cholec80_bipolar_presence','cholec80_hook_presence','cholec80_scissors_presence','cholec80_clipper_presence','cholec80_irrigator_presence','cholec80_specimenbag_presence','derm7pt_skin_lesions','idle_action_recognition','barretts_esophagus_diagnosis','brain_tumor_classification','mednode_melanoma_classification','brain_tumor_type_classification'

In [20]:
 my_source = 'aptos19_blindness_detection'
 my_target = 'breast_cancer_classification_v2'
 some_other_task = 'bean_plant_disease_classification'
 assert my_source in get_valid_sources(my_target)
 assert some_other_task in shrinkable_tasks
 from mml_tf.tasks import shrink_map, old_to_new
 my_shrunk_source = old_to_new(shrink_map[my_source])

In [21]:
 create_cmd = prep_cmds[0]                                      # pick the task creation job for all tasks
 create_cmd.config_options['tasks'] = 'none'                    # remove the creation of all tasks
 create_cmd.config_options['task_list'] = [some_other_task]      # set the single task to be created
 runner.run(create_cmd)  

[[36m2025-03-14 12:56:20,418[0m][[34mmml[0m][[32mINFO[0m] - Started MML 1.0.2 on Python 3.10.16 with mode CREATE.[0m
[[36m2025-03-14 12:56:20,419[0m][[34mmml[0m][[32mINFO[0m] - Plugins loaded: ['mml-sql', 'mml-similarity', 'mml-dimensionality', 'mml-tasks', 'mml-tf', 'mml-lsf'][0m
[[36m2025-03-14 12:56:20,629[0m][[34mmml.core.scripts.schedulers.create_scheduler[0m][[32mINFO[0m] - Skipping creation of task bean_plant_disease_classification because there already seems to be a RAW version of that.[0m
[[36m2025-03-14 12:56:20,629[0m][[34mmml.core.scripts.schedulers.base_scheduler[0m][[32mINFO[0m] - Executing after init hook: check_lsf_workers[0m
[[36m2025-03-14 12:56:20,629[0m][[34mmml_lsf.workers[0m][[32mINFO[0m] - LSF cluster plugin detected local system, no changes made to the number of workers.[0m
[[36m2025-03-14 12:56:20,630[0m][[34mmml[0m][[32mINFO[0m] - MML init time was 0.0h 0.0m  0.21s.[0m
[[36m2025-03-14 12:56:20,632[0m][[34mmml.core.s

In [22]:
 pp_cmd = prep_cmds[1]                                      # pick the task preprocessing job for all tasks
 pp_cmd.config_options['tasks'] = 'none'                    # remove the preprocessing of all tasks
 pp_cmd.config_options['task_list'] = [some_other_task]      # set the single task to be preprocessed
 runner.run(pp_cmd)                                         # run the job

[[36m2025-03-14 12:56:26,407[0m][[34mmml[0m][[32mINFO[0m] - Started MML 1.0.2 on Python 3.10.16 with mode PP.[0m
[[36m2025-03-14 12:56:26,407[0m][[34mmml[0m][[32mINFO[0m] - Plugins loaded: ['mml-sql', 'mml-similarity', 'mml-dimensionality', 'mml-tasks', 'mml-tf', 'mml-lsf'][0m
[[36m2025-03-14 12:56:26,626[0m][[34mmml.core.scripts.schedulers.base_scheduler[0m][[32mINFO[0m] - Executing after init hook: check_lsf_workers[0m
[[36m2025-03-14 12:56:26,627[0m][[34mmml_lsf.workers[0m][[32mINFO[0m] - LSF cluster plugin detected local system, no changes made to the number of workers.[0m
[[36m2025-03-14 12:56:26,629[0m][[34mmml[0m][[32mINFO[0m] - MML init time was 0.0h 0.0m  0.22s.[0m
[[36m2025-03-14 12:56:26,631[0m][[34mmml.core.scripts.schedulers.base_scheduler[0m][[32mINFO[0m] - Preparing experiment ...[0m
[[36m2025-03-14 12:56:26,632[0m][[34mmml.core.scripts.schedulers.base_scheduler[0m][[32mINFO[0m] - Starting experiment![0m
[[36m2025-03-14 1

In [23]:
 shrink_cmd = prep_cmds[2]                                      # pick the task shrinking job for all tasks
 shrink_cmd.config_options['tasks'] = 'none'                    # remove the shrinking of all tasks
 shrink_cmd.config_options['task_list'] = [old_to_new(shrink_map[some_other_task])]      # set the single task to be shrinked
 runner.run(shrink_cmd)

[[36m2025-03-14 12:56:32,452[0m][[34mmml[0m][[32mINFO[0m] - Started MML 1.0.2 on Python 3.10.16 with mode INFO.[0m
[[36m2025-03-14 12:56:32,452[0m][[34mmml[0m][[32mINFO[0m] - Plugins loaded: ['mml-sql', 'mml-similarity', 'mml-dimensionality', 'mml-tasks', 'mml-tf', 'mml-lsf'][0m
[[36m2025-03-14 12:56:32,665[0m][[34mmml.core.scripts.schedulers.info_scheduler[0m][[32mINFO[0m] - Was given no study name to search for, so showing all studies with project prefix.[0m
[[36m2025-03-14 12:56:32,666[0m][[34mmml.core.scripts.schedulers.base_scheduler[0m][[32mINFO[0m] - Executing after init hook: check_lsf_workers[0m
[[36m2025-03-14 12:56:32,666[0m][[34mmml_lsf.workers[0m][[32mINFO[0m] - LSF cluster plugin detected local system, no changes made to the number of workers.[0m
[[36m2025-03-14 12:56:32,668[0m][[34mmml[0m][[32mINFO[0m] - MML init time was 0.0h 0.0m  0.22s.[0m
[[36m2025-03-14 12:56:32,670[0m][[34mmml.core.scripts.schedulers.base_scheduler[0m][

Gathering sizes: 100%|██████████| 1034/1034 [00:00<00:00, 17535.77it/s]
Gathering mean and std: 100%|██████████| 11/11 [00:02<00:00,  4.49it/s]
Loading samples: 100%|██████████| 1/1 [00:00<00:00, 56.57it/s]


[[36m2025-03-14 12:56:35,442[0m][[34mmml.core.data_loading.file_manager[0m][[32mINFO[0m] - Writing task description at /home/scholzpa/Pictures/datasets/mml_data/PREPROCESSED/default/DSET_ibean/TASK_bean_plant_disease_classification+shrink_train?800.json.[0m
[[36m2025-03-14 12:56:35,444[0m][[34mmml.core.data_preparation.task_creator[0m][[32mINFO[0m] - Testing the loading of /home/scholzpa/Pictures/datasets/mml_data/PREPROCESSED/default/DSET_ibean/TASK_bean_plant_disease_classification+shrink_train?800.json...[0m
[[36m2025-03-14 12:56:35,453[0m][[34mmml.core.data_preparation.task_creator[0m][[32mINFO[0m] - Testing of /home/scholzpa/Pictures/datasets/mml_data/PREPROCESSED/default/DSET_ibean/TASK_bean_plant_disease_classification+shrink_train?800.json finished, dataset loading time was  0.01 seconds, sample loading time was  0.00 seconds.[0m
[[36m2025-03-14 12:56:35,454[0m][[34mmml.core.scripts.schedulers.base_scheduler[0m][[32mINFO[0m] - Starting experiment![0m

In [24]:
 dim_cmd = dim_cmds[0]                                      # pick the dimensionality computing job for all tasks
 dim_cmd.config_options['tasks'] = 'none'                   # remove the computation of all tasks
 dim_cmd.config_options['task_list'] = [some_other_task]    # set the single task to be computed
 runner.run(dim_cmd)                                        # run the job

[[36m2025-03-14 13:06:49,613[0m][[34mmml[0m][[32mINFO[0m] - Started MML 1.0.2 on Python 3.10.16 with mode DIM.[0m
[[36m2025-03-14 13:06:49,613[0m][[34mmml[0m][[32mINFO[0m] - Plugins loaded: ['mml-sql', 'mml-similarity', 'mml-dimensionality', 'mml-tasks', 'mml-tf', 'mml-lsf'][0m
[[36m2025-03-14 13:06:49,910[0m][[34mmml.core.scripts.schedulers.base_scheduler[0m][[32mINFO[0m] - Executing after init hook: check_lsf_workers[0m
[[36m2025-03-14 13:06:49,910[0m][[34mmml_lsf.workers[0m][[32mINFO[0m] - LSF cluster plugin detected local system, no changes made to the number of workers.[0m
[[36m2025-03-14 13:06:49,912[0m][[34mmml[0m][[32mINFO[0m] - MML init time was 0.0h 0.0m  0.30s.[0m
[[36m2025-03-14 13:06:49,914[0m][[34mmml.core.scripts.schedulers.base_scheduler[0m][[32mINFO[0m] - Preparing experiment ...[0m
[[36m2025-03-14 13:06:49,914[0m][[34mmml.core.scripts.schedulers.base_scheduler[0m][[32mINFO[0m] - Starting experiment![0m
[[36m2025-03-14 

Iterate subsets:   0%|          | 0/1 [00:00<?, ?it/s]
Compute KNN:   0%|          | 0/10 [00:00<?, ?it/s][A
Compute KNN:  10%|█         | 1/10 [00:01<00:11,  1.26s/it][A
Compute KNN:  20%|██        | 2/10 [00:02<00:08,  1.01s/it][A
Compute KNN:  30%|███       | 3/10 [00:02<00:06,  1.08it/s][A
Compute KNN:  40%|████      | 4/10 [00:03<00:05,  1.14it/s][A
Compute KNN:  50%|█████     | 5/10 [00:04<00:04,  1.17it/s][A
Compute KNN:  60%|██████    | 6/10 [00:05<00:03,  1.18it/s][A
Compute KNN:  70%|███████   | 7/10 [00:06<00:02,  1.19it/s][A
Compute KNN:  80%|████████  | 8/10 [00:07<00:01,  1.19it/s][A
Compute KNN:  90%|█████████ | 9/10 [00:07<00:00,  1.20it/s][A
Compute KNN: 100%|██████████| 10/10 [00:08<00:00,  1.20it/s][A
Iterate subsets: 100%|██████████| 1/1 [00:08<00:00,  8.93s/it]A


[[36m2025-03-14 13:06:58,866[0m][[34mmml_dimensionality.scripts.dimensionality_scheduler[0m][[32mINFO[0m] - Finished dimensionality estimation for task [33m[46m[1mbean_plant_disease_classification[0m
[[36m2025-03-14 13:06:58,869[0m][[34mmml.core.data_loading.file_manager[0m][[32mINFO[0m] - A total of 1 paths have been created during this run.[0m
[[36m2025-03-14 13:06:58,869[0m][[34mmml.core.scripts.schedulers.base_scheduler[0m][[32mINFO[0m] - Successfully finished all experiments![0m
[[36m2025-03-14 13:06:58,869[0m][[34mmml[0m][[32mINFO[0m] - MML run time was 0.0h 0.0m  8.96s.[0m


In [26]:
 base_cmd = [cmd for cmd in base_cmds if cmd.config_options['pivot.name'] == my_target][0]  # pick the baseline computing job for the target task
 base_cmd.config_options['tasks'] = 'none'                   # remove the loading of all tasks
 runner.run(base_cmd)                                         # run the job

[[36m2025-03-14 13:13:38,996[0m][[34mmml[0m][[32mINFO[0m] - Started MML 1.0.2 on Python 3.10.16 with mode TRAIN.[0m
[[36m2025-03-14 13:13:38,997[0m][[34mmml[0m][[32mINFO[0m] - Plugins loaded: ['mml-sql', 'mml-similarity', 'mml-dimensionality', 'mml-tasks', 'mml-tf', 'mml-lsf'][0m
[[36m2025-03-14 13:13:39,210[0m][[34mmml.core.scripts.schedulers.base_scheduler[0m][[32mINFO[0m] - Added pivot task breast_cancer_classification_v2 to task_list.[0m
[[36m2025-03-14 13:13:39,210[0m][[34mmml.core.scripts.schedulers.base_scheduler[0m][[32mINFO[0m] - Pivot task is [33m[46m[1mbreast_cancer_classification_v2[0m.[0m
[[36m2025-03-14 13:13:39,215[0m][[34mmml.core.scripts.schedulers.base_scheduler[0m][[32mINFO[0m] - Executing after init hook: check_lsf_workers[0m
[[36m2025-03-14 13:13:39,215[0m][[34mmml_lsf.workers[0m][[32mINFO[0m] - LSF cluster plugin detected local system, no changes made to the number of workers.[0m
[[36m2025-03-14 13:13:39,217[0m][[34

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]




Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]



Finding best initial lr: 100%|██████████| 100/100 [00:46<00:00,  2.14it/s]
Learning rate set to 0.006918309709189364


[[36m2025-03-14 13:14:28,640[0m][[34mlightning_fabric.utilities.rank_zero[0m][[32mINFO[0m] - `Trainer.fit` stopped: `max_steps=100` reached.[0m
[[36m2025-03-14 13:14:28,642[0m][[34mlightning_fabric.utilities.rank_zero[0m][[32mINFO[0m] - Restoring states from the checkpoint path at /home/scholzpa/Documents/exp/tf_repro/pami2_raw_03_0/runs/2025-03-14/13-13-38-945239/.lr_find_b070f235-71a1-4bf2-ad9e-5bf946f9c505.ckpt[0m
[[36m2025-03-14 13:14:28,787[0m][[34mlightning_fabric.utilities.rank_zero[0m][[32mINFO[0m] - Restored all states from the checkpoint at /home/scholzpa/Documents/exp/tf_repro/pami2_raw_03_0/runs/2025-03-14/13-13-38-945239/.lr_find_b070f235-71a1-4bf2-ad9e-5bf946f9c505.ckpt[0m


Caching: 100%|██████████| 624/624 [00:00<00:00, 691.76it/s] 
Caching:   0%|          | 0/156 [00:00<?, ?it/s]

[[36m2025-03-14 13:14:30,002[0m][[34mmml.core.data_loading.task_dataset[0m][[32mINFO[0m] - Caching activated for breast_cancer_classification_v2.[0m
[[36m2025-03-14 13:14:30,003[0m][[34mmml.core.data_loading.task_dataset[0m][[32mINFO[0m] - Cached 624 samples.[0m


Caching: 100%|██████████| 156/156 [00:00<00:00, 334.99it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type             | Params | Mode 
-----------------------------------------------------------
0 | model         | TimmGenericModel | 21.3 M | train
1 | criteria      | ModuleDict       | 0      | train
2 | train_metrics | ModuleDict       | 0      | train
3 | val_metrics   | ModuleDict       | 0      | train
4 | test_metrics  | ModuleDict       | 0      | train
5 | train_cms     | ModuleDict       | 0      | train
6 | val_cms       | ModuleDict       | 0      | train
7 | test_cms      | ModuleDict       | 0      | train
-----------------------------------------------------------
21.3 M    Trainable params
0         Non-trainable params
21.3 M    Total params
85.145    Total estimated model params size (MB)
374       Modules in train mode
0         Modules in eval mode


[[36m2025-03-14 13:14:30,469[0m][[34mmml.core.data_loading.task_dataset[0m][[32mINFO[0m] - Caching activated for breast_cancer_classification_v2.[0m
[[36m2025-03-14 13:14:30,469[0m][[34mmml.core.data_loading.task_dataset[0m][[32mINFO[0m] - Cached 156 samples.[0m
[[36m2025-03-14 13:14:30,523[0m][[34mmml.core.models.lightning_single_frame[0m][[32mINFO[0m] - Using learning rate 0.006918309709189364.[0m
Epoch 39: 100%|██████████| 3/3 [00:00<00:00,  3.07it/s, train/loss=0.233, exp=2025-03-14/13-13-38-945239] 
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  2.36it/s][A
Epoch 39: 100%|██████████| 3/3 [00:02<00:00,  1.05it/s, train/loss=0.233, exp=2025-03-14/13-13-38-945239][[36m2025-03-14 13:15:12,111[0m][[34mlightning_fabric.utilities.rank_zero[0m][[32mINFO[0m] - `Trainer.fit` stopped: 

In [28]:
from mml_tf.tasks import test_tasks
my_target in test_tasks

True

In [31]:
 import random
 arch = random.choice(list(model_transfer_arch_reqs.keys()))  # either use a random architecture or pick one from `model_transfer_arch_reqs`
 arch_cmd = [cmd for cmd in arch_cmds if (cmd.config_options['pivot.name'] == my_target and cmd.config_options['arch.timm.name'] == arch)][0]  # pick the first arch computing job for the target task and selected architecture
 arch_cmd.config_options['tasks'] = 'none'                   # remove the loading of all tasks
 runner.run(arch_cmd)                                         # run the job

Could not override 'arch.timm.name'.
To append to your config use +arch.timm.name=regnetx_032
Key 'timm' is not in struct
    full_key: arch.timm
    object_type=dict

Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.


In [33]:
arch_cmd.render()

'mml train tasks=none pivot.name=breast_cancer_classification_v2 mode.cv=False mode.nested=False mode.store_parameters=False sampling.balanced=True sampling.batch_size=300 callbacks=none lr_scheduler=step +trainer.check_val_every_n_epoch=40 trainer.max_epochs=40 augmentations=baseline256 sampling.enable_caching=True proj=pami2_t_arch_search_02_0 arch.timm.name=regnetx_032 seed=0'