# Best Practices for Using Runners

In [1]:
# Remove any installed copy of todd_ai, then install the package from the
# parent directory so the notebook runs against the local checkout.
%pip uninstall -y todd_ai
%pip install ..

Found existing installation: todd-ai 0.4.0
Uninstalling todd-ai-0.4.0:
  Successfully uninstalled todd-ai-0.4.0
Note: you may need to restart the kernel to use updated packages.
Processing /Users/bytedance/Developer/todd
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: todd-ai
  Building wheel for todd-ai (pyproject.toml) ... [?25ldone
[?25h  Created wheel for todd-ai: filename=todd_ai-0.4.0-py3-none-any.whl size=109857 sha256=3e4dbb39f9adbfd52fc9d2bd67d3dbeb0bdaf1d4112c950025805601046ffe74
  Stored in directory: /private/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/pip-ephem-wheel-cache-2hfcgmpx/wheels/15/ef/5a/9fc12e257ce5cef16b333a2ed6c992ff9cbcc9167f7199e6ac
Successfully built todd-ai
Installing collected packages: todd-ai
Successfully installed todd-ai-0.4.0

[1m[[0m

In [2]:
import os
import pathlib
import tempfile
from pprint import pprint
from typing import Any, NoReturn, TypedDict

import todd
import torch
from torch import nn
import torch.nn.functional as F
import torch.utils.data

# Alias for the free-form ``memo`` dictionary that `RunnerModel.forward`
# receives and returns.
Memo = dict[str, Any]

[2023-08-28 20:13:39,892 36766:140704306648640][patches.py:7 todd <module>] INFO: `ipdb` is installed. Using it for debugging.


## Preparation

### Models

In [3]:
@todd.ModelRegistry.register()
class RunnerModel(nn.Module):
    """Toy model with a single scalar weight, used by the runner demos.

    The prediction is ``x * weight``; the loss is the L1 distance between
    that prediction and ``batch['y']``.
    """

    def __init__(self) -> None:
        super().__init__()
        # The only parameter of the model, initialised to zero.
        initial_weight = torch.tensor(0.0)
        self._weight = nn.Parameter(initial_weight)

    @property
    def weight(self) -> torch.nn.Parameter:
        """Read-only access to the scalar weight."""
        return self._weight

    def _forward(self, x: torch.Tensor) -> torch.Tensor:
        """Scale ``x`` by the weight."""
        scaled = x * self._weight
        return scaled

    def forward(
        self,
        runner: todd.runners.BaseRunner,
        batch,
        memo: Memo,
        *args,
        **kwargs,
    ) -> Memo:
        """Compute the L1 loss of ``batch`` and record it in ``memo``.

        Args:
            runner: Not used by this model.
            batch: Mapping with ``'x'`` (inputs) and ``'y'`` (targets).
            memo: Dictionary that receives the loss under ``'loss'``. If it
                has a ``'log'`` entry, the stringified batch, the current
                weight and the loss are written into that entry as well.
            *args: Ignored.
            **kwargs: Ignored.

        Returns:
            The same ``memo`` object that was passed in.
        """
        prediction = self._forward(batch['x'])
        loss = F.l1_loss(prediction, batch['y'])
        memo['loss'] = loss

        log: dict[str, Any] | None = memo.get('log')
        if log is None:
            # Nothing asked for log output on this iteration.
            return memo

        log['batch'] = str(batch)
        log['weight'] = f'{self._weight.item():.3f}'
        log['loss'] = f'{loss:.3f}'
        return memo

### Datasets

In [4]:
class Sample(TypedDict):
    """One dataset item, as returned by `RunnerDataset.__getitem__`."""
    x: int  # input value
    y: int  # target value; `RunnerDataset` sets it to ``x * 2``

In [5]:
@todd.DatasetRegistry.register()
class RunnerDataset(torch.utils.data.Dataset[Sample]):
    """Toy dataset of the integers ``1..n`` paired with their doubles.

    The generic parameter is `Sample` because that is what `__getitem__`
    returns; the dataset does not yield bare integers.
    """

    def __init__(self, n: int) -> None:
        """Create the dataset.

        Args:
            n: Number of samples; the inputs are ``1, 2, ..., n``.
        """
        self._data = list(range(1, n + 1))

    def __len__(self) -> int:
        """Return the number of samples."""
        return len(self._data)

    def __getitem__(self, index: int) -> Sample:
        """Return the sample at ``index`` with ``y == 2 * x``.

        Raises:
            IndexError: If ``index`` is out of range for the underlying list.
        """
        x = self._data[index]
        return Sample(x=x, y=x * 2)

In [6]:
class Batch(TypedDict):
    """Tensor-valued counterpart of `Sample`.

    NOTE(review): no visible cell references this type; presumably it
    describes the ``batch`` argument of `RunnerModel.forward` -- confirm.
    """
    x: torch.Tensor  # inputs
    y: torch.Tensor  # targets

## Validators

In [7]:
# Base configuration that the later demos copy and adapt: a validator that
# iterates over 20 samples, one per batch, with the toy model.
validator_demo = todd.Config(
    type='Validator',
    name='validator',
    dataloader={
        'batch_size': 1,
        'dataset': {'type': 'RunnerDataset', 'n': 20},
    },
    strategy={
        'type': 'BaseStrategy',
        'model': {'type': 'RunnerModel'},
    },
)

In [8]:
# Build the validator inside a throwaway work directory and run it.
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.Validator = todd.RunnerRegistry.build(
        validator_demo, 
        work_dir=dict(root=work_dirs),
    )
    runner.run()
    
    # List the work directory while it still exists; it is deleted when the
    # `with` block exits.
    !echo
    !tree $work_dirs

[2m[2023-08-28 20:13:42,082 36766:140704306648640][base.py:54 todd.Validator.validator __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m



[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmphjs1zw17[0m
└── [1;36mvalidator[0m

2 directories, 0 files


In [9]:
# Attach a log callback with an interval of 5 to the base configuration.
validator_demo.callbacks = {'type': 'LogCallback', 'interval': 5}


In [10]:
# Same run as before, now with the log callback configured on
# `validator_demo`.
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.Validator = todd.RunnerRegistry.build(
        validator_demo, 
        work_dir=dict(root=work_dirs),
    )
    runner.run()
    
    # List the work directory before the `with` block removes it.
    !echo
    !tree $work_dirs

[2m[2023-08-28 20:13:42,415 36766:140704306648640][base.py:54 todd.Validator.validator __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-08-28 20:13:42,419 36766:140704306648640][log.py:84 todd.Validator.validator after_run_iter] INFO: Iter [5/20] ETA 0:00:00 batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000
[2023-08-28 20:13:42,421 36766:140704306648640][log.py:84 todd.Validator.validator after_run_iter] INFO: Iter [10/20] ETA 0:00:00 batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000
[2023-08-28 20:13:42,424 36766:140704306648640][log.py:84 todd.Validator.validator after_run_iter] INFO: Iter [15/20] ETA 0:00:00 batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000
[2023-08-28 20:13:42,426 36766:140704306648640][log.py:84 todd.Validator.validator after_run_iter] INFO: Iter [20/20] ETA 0:00:00 batch={'x': tensor([20]), 'y': tensor([40])} weight=0.000 loss=40.000



[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp4cb3xd0q[0m
└── [1;36mvalidator[0m

2 directories, 0 files


## Trainers

In [11]:
# Common trainer configuration: start from the validator configuration, drop
# its runner type (each trainer demo sets its own), then swap in a shuffled
# training dataloader and add an optimizer.
trainer_demo = validator_demo.copy()
trainer_demo.pop('type')
trainer_demo.dataloader = todd.Config(
    batch_size=2,
    shuffle=True,
    dataset={'type': 'RunnerDataset', 'n': 67},
)
trainer_demo.optimizer = todd.Config(type='SGD', lr=0.005)


### Iteration Based

In [12]:
# Iteration-based trainer: copy the common trainer configuration and set the
# runner type, the runner name and the number of iterations to run.
iter_based_trainer_demo = trainer_demo.copy()
iter_based_trainer_demo.type = 'IterBasedTrainer'
iter_based_trainer_demo.name = 'iter_based_trainer'
iter_based_trainer_demo.iters = 53

In [13]:
# Build and run the iteration-based trainer in a throwaway work directory.
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.IterBasedTrainer = todd.RunnerRegistry.build(
        iter_based_trainer_demo,
        work_dir={'root': work_dirs},
    )
    runner.run()

[2m[2023-08-28 20:13:42,726 36766:140704306648640][base.py:54 todd.IterBasedTrainer.iter_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-08-28 20:13:42,729 36766:140704306648640][log.py:84 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [5/53] ETA 0:00:00 batch={'x': tensor([44,  8]), 'y': tensor([88, 16])} weight=0.000 loss=52.000
[2023-08-28 20:13:42,732 36766:140704306648640][log.py:84 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [10/53] ETA 0:00:00 batch={'x': tensor([ 6, 53]), 'y': tensor([ 12, 106])} weight=0.000 loss=59.000
[2023-08-28 20:13:42,734 36766:140704306648640][log.py:84 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [15/53] ETA 0:00:00 batch={'x': tensor([18, 48]), 'y': tensor([36, 96])} weight=0.000 loss=66.000
[2023-08-28 20:13:42,736 36766:140704306648640][log.py:84 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [20/53] ETA 0:00:00 batch={'x': 

### Epoch Based

In [14]:
# Epoch-based trainer: copy the common trainer configuration and set the
# runner type, the runner name and the number of epochs to run.
epoch_based_trainer_demo = trainer_demo.copy()
epoch_based_trainer_demo.type = 'EpochBasedTrainer'
epoch_based_trainer_demo.name = 'epoch_based_trainer'
epoch_based_trainer_demo.epochs = 3

In [15]:
# Build and run the epoch-based trainer in a throwaway work directory.
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.EpochBasedTrainer = todd.RunnerRegistry.build(
        epoch_based_trainer_demo,
        work_dir={'root': work_dirs},
    )
    runner.run()

[2m[2023-08-28 20:13:42,768 36766:140704306648640][base.py:54 todd.EpochBasedTrainer.epoch_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-08-28 20:13:42,768 36766:140704306648640][log.py:90 todd.EpochBasedTrainer.epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2023-08-28 20:13:42,772 36766:140704306648640][log.py:84 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [5/102] ETA 0:00:00 batch={'x': tensor([61, 17]), 'y': tensor([122,  34])} weight=0.000 loss=78.000
[2023-08-28 20:13:42,774 36766:140704306648640][log.py:84 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [10/102] ETA 0:00:00 batch={'x': tensor([16, 25]), 'y': tensor([32, 50])} weight=0.000 loss=41.000
[2023-08-28 20:13:42,777 36766:140704306648640][log.py:84 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [15/102] ETA 0:00:00 batch={'x': tensor([10, 21]), 'y': tensor([20, 42])} weight=0.000 loss=31.000
[2023-08-2

## Callbacks

### Log

In [16]:
log_callback_demo = validator_demo.copy()
# Customise a copy of the callback configuration rather than the object
# reached through `log_callback_demo.callbacks`. If `Config.copy()` is
# shallow, that object is shared with `validator_demo` (and with every
# configuration copied from it earlier), so editing it in place would
# silently change those configurations when cells are re-run.
log_callback = log_callback_demo.callbacks.copy()
log_callback.collect_env = todd.Config(verbose=False)
log_callback.with_file_handler = True
# `callbacks` now becomes a list; later demos reuse `log_callback` in theirs.
log_callback_demo.callbacks = [log_callback]

In [17]:
# Run the validator with the file-logging callback configuration.
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.Validator = todd.RunnerRegistry.build(
        log_callback_demo, 
        work_dir=dict(root=work_dirs),
    )
    runner.run()

    # Show the work directory and the contents of the log file(s) under
    # `validator/` before the temporary directory is removed.
    !echo
    !tree {work_dirs}
    !echo
    !cat {work_dirs}/validator/*.log

[2023-08-28 20:13:42,999 36766:140704306648640][log.py:50 todd.Validator.validator init] INFO: 
Platform: macOS-13.4.1
NVIDIA SMI: None
Python version: 3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]
PyTorch version: 2.0.1
TorchVision version: 0.15.2
OpenCV version: 4.7.0
Todd version: 0.4.0
CUDA_HOME: None
Git commit ID: dcd00d8
Git status: 
M todd/runners/base.py
 M todd/runners/callbacks/optimize.py
 M todd/runners/callbacks/tensorboard.py
 M todd/runners/epoch_based_trainer.py
 M todd/runners/iter_based_trainer.py
 M todd/runners/trainer.py
 M todd/runners/validator.py
 M tutorials/runners.ipynb
[2m[2023-08-28 20:13:43,001 36766:140704306648640][base.py:54 todd.Validator.validator __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-08-28 20:13:43,005 36766:140704306648640][log.py:84 todd.Validator.validator after_run_iter] INFO: Iter [5/20] ETA 0:00:00 batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000
[2023-08-28 20:


[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpsdjb1c5d[0m
└── [1;36mvalidator[0m
    └── 2023-08-28T20-13-42_838268-08-00.log

2 directories, 1 file

[2023-08-28 20:13:42,999 36766:140704306648640][log.py:50 todd.Validator.validator init] INFO: 
Platform: macOS-13.4.1
NVIDIA SMI: None
Python version: 3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]
PyTorch version: 2.0.1
TorchVision version: 0.15.2
OpenCV version: 4.7.0
Todd version: 0.4.0
CUDA_HOME: None
Git commit ID: dcd00d8
Git status: 
M todd/runners/base.py
 M todd/runners/callbacks/optimize.py
 M todd/runners/callbacks/tensorboard.py
 M todd/runners/epoch_based_trainer.py
 M todd/runners/iter_based_trainer.py
 M todd/runners/trainer.py
 M todd/runners/validator.py
 M tutorials/runners.ipynb
[2023-08-28 20:13:43,001 36766:140704306648640][base.py:54 todd.Validator.validator __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R
[2023-08-28 20:13:43,005 36766:140704306648640][log.p

### Optimize

In [18]:
# Callback configuration reused by the later trainer demos.
optimize_callback = todd.Config(type='OptimizeCallback')

# Iteration-based trainer with the optimize callback ahead of the log callback.
optimize_callback_demo = iter_based_trainer_demo.copy()
optimize_callback_demo.callbacks = [optimize_callback, log_callback]

In [19]:
# Build and run the trainer that uses the optimize callback.
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.IterBasedTrainer = todd.RunnerRegistry.build(
        optimize_callback_demo,
        work_dir={'root': work_dirs},
    )
    runner.run()

[2023-08-28 20:13:43,661 36766:140704306648640][log.py:50 todd.IterBasedTrainer.iter_based_trainer init] INFO: 
Platform: macOS-13.4.1
NVIDIA SMI: None
Python version: 3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]
PyTorch version: 2.0.1
TorchVision version: 0.15.2
OpenCV version: 4.7.0
Todd version: 0.4.0
CUDA_HOME: None
Git commit ID: dcd00d8
Git status: 
M todd/runners/base.py
 M todd/runners/callbacks/optimize.py
 M todd/runners/callbacks/tensorboard.py
 M todd/runners/epoch_based_trainer.py
 M todd/runners/iter_based_trainer.py
 M todd/runners/trainer.py
 M todd/runners/validator.py
 M tutorials/runners.ipynb
[2m[2023-08-28 20:13:43,663 36766:140704306648640][base.py:54 todd.IterBasedTrainer.iter_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-08-28 20:13:43,668 36766:140704306648640][log.py:84 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [5/53] ETA 0:00:00 batch={'x': tensor([33, 42]), 'y': t

### Learning Rate Schedule

In [20]:
# Learning-rate schedule callback wrapping a `LinearLR` scheduler.
lr_schedule_callback = todd.Config(
    type='LRScheduleCallback',
    lr_scheduler={'type': 'LinearLR', 'total_iters': 10},
)

# Iteration-based trainer that optimizes, applies the schedule, then logs.
lr_schedule_callback_demo = iter_based_trainer_demo.copy()
lr_schedule_callback_demo.callbacks = [
    optimize_callback,
    lr_schedule_callback,
    log_callback,
]


In [21]:
# Build and run the trainer that uses the learning-rate schedule callback.
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.IterBasedTrainer = todd.RunnerRegistry.build(
        lr_schedule_callback_demo,
        work_dir={'root': work_dirs},
    )
    runner.run()

[2023-08-28 20:13:43,810 36766:140704306648640][log.py:50 todd.IterBasedTrainer.iter_based_trainer init] INFO: 
Platform: macOS-13.4.1
NVIDIA SMI: None
Python version: 3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]
PyTorch version: 2.0.1
TorchVision version: 0.15.2
OpenCV version: 4.7.0
Todd version: 0.4.0
CUDA_HOME: None
Git commit ID: dcd00d8
Git status: 
M todd/runners/base.py
 M todd/runners/callbacks/optimize.py
 M todd/runners/callbacks/tensorboard.py
 M todd/runners/epoch_based_trainer.py
 M todd/runners/iter_based_trainer.py
 M todd/runners/trainer.py
 M todd/runners/validator.py
 M tutorials/runners.ipynb
[2m[2023-08-28 20:13:43,812 36766:140704306648640][base.py:54 todd.IterBasedTrainer.iter_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-08-28 20:13:43,817 36766:140704306648640][log.py:84 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [5/53] ETA 0:00:00 batch={'x': tensor([51, 62]), 'y': t

In [22]:
# Variant of the schedule callback with `by_epoch` enabled.
lr_schedule_by_epoch_callback = lr_schedule_callback.copy()
lr_schedule_by_epoch_callback.by_epoch = True

# Epoch-based trainer that uses the by-epoch schedule callback.
lr_schedule_by_epoch_callback_demo = epoch_based_trainer_demo.copy()
lr_schedule_by_epoch_callback_demo.callbacks = [
    optimize_callback,
    lr_schedule_by_epoch_callback,
    log_callback,
]


In [23]:
# Build and run the epoch-based trainer with the by-epoch schedule callback.
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.EpochBasedTrainer = todd.RunnerRegistry.build(
        lr_schedule_by_epoch_callback_demo,
        work_dir={'root': work_dirs},
    )
    runner.run()

[2023-08-28 20:13:43,994 36766:140704306648640][log.py:50 todd.EpochBasedTrainer.epoch_based_trainer init] INFO: 
Platform: macOS-13.4.1
NVIDIA SMI: None
Python version: 3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]
PyTorch version: 2.0.1
TorchVision version: 0.15.2
OpenCV version: 4.7.0
Todd version: 0.4.0
CUDA_HOME: None
Git commit ID: dcd00d8
Git status: 
M todd/runners/base.py
 M todd/runners/callbacks/optimize.py
 M todd/runners/callbacks/tensorboard.py
 M todd/runners/epoch_based_trainer.py
 M todd/runners/iter_based_trainer.py
 M todd/runners/trainer.py
 M todd/runners/validator.py
 M tutorials/runners.ipynb
[2m[2023-08-28 20:13:43,996 36766:140704306648640][base.py:54 todd.EpochBasedTrainer.epoch_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-08-28 20:13:43,997 36766:140704306648640][log.py:90 todd.EpochBasedTrainer.epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2023-08-28 20:13:44,002 36766:140704306

### Learning Rate Scaler

In [24]:
# Learning-rate scale callback configured with a base batch size of 1.
lr_scaler_callback = todd.Config(
    type='LRScaleCallback',
    lr_scaler={'base_batch_size': 1},
)

# Iteration-based trainer that optimizes, scales the learning rate, then logs.
lr_scaler_callback_demo = iter_based_trainer_demo.copy()
lr_scaler_callback_demo.callbacks = [
    optimize_callback,
    lr_scaler_callback,
    log_callback,
]

In [25]:
# Build and run the trainer that uses the learning-rate scale callback.
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.IterBasedTrainer = todd.RunnerRegistry.build(
        lr_scaler_callback_demo,
        work_dir={'root': work_dirs},
    )
    runner.run()

[2023-08-28 20:13:44,089 36766:140704306648640][lr.py:93 todd.IterBasedTrainer.iter_based_trainer _scale_lr] INFO: base_batch_size=1 batch_size=2 lr_scaler=2.000
[2023-08-28 20:13:44,182 36766:140704306648640][log.py:50 todd.IterBasedTrainer.iter_based_trainer init] INFO: 
Platform: macOS-13.4.1
NVIDIA SMI: None
Python version: 3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]
PyTorch version: 2.0.1
TorchVision version: 0.15.2
OpenCV version: 4.7.0
Todd version: 0.4.0
CUDA_HOME: None
Git commit ID: dcd00d8
Git status: 
M todd/runners/base.py
 M todd/runners/callbacks/optimize.py
 M todd/runners/callbacks/tensorboard.py
 M todd/runners/epoch_based_trainer.py
 M todd/runners/iter_based_trainer.py
 M todd/runners/trainer.py
 M todd/runners/validator.py
 M tutorials/runners.ipynb
[2m[2023-08-28 20:13:44,184 36766:140704306648640][base.py:54 todd.IterBasedTrainer.iter_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-08-28 20:13:4

### Checkpoint

In [26]:
# Checkpoint callback with an interval of 10.
checkpoint_callback = todd.Config(type='CheckpointCallback', interval=10)

# Iteration-based trainer that checkpoints and logs.
checkpoint_callback_demo = iter_based_trainer_demo.copy()
checkpoint_callback_demo.callbacks = [checkpoint_callback, log_callback]

In [27]:
# Run the checkpointing trainer, then inspect what was written to disk.
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.IterBasedTrainer = todd.RunnerRegistry.build(
        checkpoint_callback_demo, 
        work_dir=dict(root=work_dirs),
    )
    runner.run()

    # List the work directory before the `with` block removes it.
    !echo
    !tree {work_dirs}
    !echo

    # Print the content of every `.pth` file in the `iter_50` checkpoint.
    # The files are loaded onto the CPU; `torch.load` unpickles its input,
    # which is acceptable here because the files were just written by this run.
    iter_50 = pathlib.Path(work_dirs) / 'iter_based_trainer' / 'checkpoints' / 'iter_50'
    for f in iter_50.glob('*.pth'):
        print(f"{f.name}:")
        pprint(torch.load(f, 'cpu'))
        print()

[2023-08-28 20:13:44,340 36766:140704306648640][log.py:50 todd.IterBasedTrainer.iter_based_trainer init] INFO: 
Platform: macOS-13.4.1
NVIDIA SMI: None
Python version: 3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]
PyTorch version: 2.0.1
TorchVision version: 0.15.2
OpenCV version: 4.7.0
Todd version: 0.4.0
CUDA_HOME: None
Git commit ID: dcd00d8
Git status: 
M todd/runners/base.py
 M todd/runners/callbacks/optimize.py
 M todd/runners/callbacks/tensorboard.py
 M todd/runners/epoch_based_trainer.py
 M todd/runners/iter_based_trainer.py
 M todd/runners/trainer.py
 M todd/runners/validator.py
 M tutorials/runners.ipynb
[2m[2023-08-28 20:13:44,343 36766:140704306648640][base.py:54 todd.IterBasedTrainer.iter_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-08-28 20:13:44,347 36766:140704306648640][log.py:84 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [5/53] ETA 0:00:00 batch={'x': tensor([61, 14]), 'y': t


[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp574yfv_q[0m
└── [1;36miter_based_trainer[0m
    ├── 2023-08-28T20-13-44_247631-08-00.log
    └── [1;36mcheckpoints[0m
        ├── [1;36miter_10[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36miter_20[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36miter_30[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36miter_40[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36miter_50[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │  

In [28]:
# Variant of the checkpoint callback: interval of 1 with `by_epoch` enabled.
checkpoint_by_epoch_callback = checkpoint_callback.copy()
checkpoint_by_epoch_callback.update(interval=1, by_epoch=True)

# Epoch-based trainer that checkpoints and logs.
checkpoint_by_epoch_callback_demo = epoch_based_trainer_demo.copy()
checkpoint_by_epoch_callback_demo.callbacks = [
    checkpoint_by_epoch_callback,
    log_callback,
]

In [29]:
# Run the epoch-based checkpointing trainer, then inspect what was written.
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.EpochBasedTrainer = todd.RunnerRegistry.build(
        checkpoint_by_epoch_callback_demo, 
        work_dir=dict(root=work_dirs),
    )
    runner.run()

    # List the work directory before the `with` block removes it.
    !echo
    !tree {work_dirs}
    !echo

    # Print the content of every `.pth` file in the `epoch_2` checkpoint.
    # The files are loaded onto the CPU; `torch.load` unpickles its input,
    # which is acceptable here because the files were just written by this run.
    epoch_2 = pathlib.Path(work_dirs) / 'epoch_based_trainer' / 'checkpoints' / 'epoch_2'
    for f in epoch_2.glob('*.pth'):
        print(f"{f.name}:")
        pprint(torch.load(f, 'cpu'))
        print()

[2023-08-28 20:13:44,969 36766:140704306648640][log.py:50 todd.EpochBasedTrainer.epoch_based_trainer init] INFO: 
Platform: macOS-13.4.1
NVIDIA SMI: None
Python version: 3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]
PyTorch version: 2.0.1
TorchVision version: 0.15.2
OpenCV version: 4.7.0
Todd version: 0.4.0
CUDA_HOME: None
Git commit ID: dcd00d8
Git status: 
M todd/runners/base.py
 M todd/runners/callbacks/optimize.py
 M todd/runners/callbacks/tensorboard.py
 M todd/runners/epoch_based_trainer.py
 M todd/runners/iter_based_trainer.py
 M todd/runners/trainer.py
 M todd/runners/validator.py
 M tutorials/runners.ipynb
[2m[2023-08-28 20:13:44,971 36766:140704306648640][base.py:54 todd.EpochBasedTrainer.epoch_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-08-28 20:13:44,973 36766:140704306648640][log.py:90 todd.EpochBasedTrainer.epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2023-08-28 20:13:44,978 36766:140704306


[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpbtgrtc_u[0m
└── [1;36mepoch_based_trainer[0m
    ├── 2023-08-28T20-13-44_874198-08-00.log
    └── [1;36mcheckpoints[0m
        ├── [1;36mepoch_1[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36mepoch_2[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36mepoch_3[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        └── [35mlatest[0m -> [1;36mepoch_3[0m

7 directories, 16 files

strategy.pth:
{}

optim.pth:
{'param_groups': [{'dampening': 0,
                   'differentiable': False,
                   'foreach': None,
                   'lr': 0.005,
                   'maximize': False,
                   'momen

In [30]:
# Same epoch-based checkpoint configuration, with `optimize_callback` placed
# ahead of the checkpoint and log callbacks.
checkpoint_load_from_callback_demo = checkpoint_by_epoch_callback_demo.copy()
checkpoint_load_from_callback_demo.callbacks = [
    optimize_callback,
    checkpoint_by_epoch_callback,
    log_callback,
]

In [31]:
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.EpochBasedTrainer = todd.RunnerRegistry.build(
        checkpoint_load_from_callback_demo, 
        work_dir=dict(root=work_dirs),
    )
    runner.run()

    !echo
    !echo {'-' * 20}
    !echo

    runner: todd.runners.EpochBasedTrainer = todd.RunnerRegistry.build(
        checkpoint_load_from_callback_demo, 
        work_dir=dict(root=work_dirs),
        load_from=os.path.join(work_dirs, 'epoch_based_trainer', 'checkpoints', 'epoch_2')
    )
    runner.run()

[2023-08-28 20:13:45,567 36766:140704306648640][log.py:50 todd.EpochBasedTrainer.epoch_based_trainer init] INFO: 
Platform: macOS-13.4.1
NVIDIA SMI: None
Python version: 3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]
PyTorch version: 2.0.1
TorchVision version: 0.15.2
OpenCV version: 4.7.0
Todd version: 0.4.0
CUDA_HOME: None
Git commit ID: dcd00d8
Git status: 
M todd/runners/base.py
 M todd/runners/callbacks/optimize.py
 M todd/runners/callbacks/tensorboard.py
 M todd/runners/epoch_based_trainer.py
 M todd/runners/iter_based_trainer.py
 M todd/runners/trainer.py
 M todd/runners/validator.py
 M tutorials/runners.ipynb
[2m[2023-08-28 20:13:45,569 36766:140704306648640][base.py:54 todd.EpochBasedTrainer.epoch_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-08-28 20:13:45,571 36766:140704306648640][log.py:90 todd.EpochBasedTrainer.epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2023-08-28 20:13:45,576 36766:140704306


--------------------



[2023-08-28 20:13:46,099 36766:140704306648640][checkpoint.py:46 todd.EpochBasedTrainer.epoch_based_trainer init] INFO: Loading from /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpq64o21fc/epoch_based_trainer/checkpoints/epoch_2
[2023-08-28 20:13:46,103 36766:140704306648640][base.py:62 todd.EpochBasedTrainer.epoch_based_trainer load_model_state_dict] INFO: <All keys matched successfully>
[2023-08-28 20:13:46,191 36766:140704306648640][log.py:50 todd.EpochBasedTrainer.epoch_based_trainer init] INFO: 
Platform: macOS-13.4.1
NVIDIA SMI: None
Python version: 3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]
PyTorch version: 2.0.1
TorchVision version: 0.15.2
OpenCV version: 4.7.0
Todd version: 0.4.0
CUDA_HOME: None
Git commit ID: dcd00d8
Git status: 
M todd/runners/base.py
 M todd/runners/callbacks/optimize.py
 M todd/runners/callbacks/tensorboard.py
 M todd/runners/epoch_based_trainer.py
 M todd/runners/iter_based_trainer.py
 M todd/runners/trainer.py
 M todd

### Monitor

In [32]:
class CustomError(RuntimeError):
    """Error raised on purpose by `FaultyValidator` in the monitor demo."""

In [33]:
@todd.RunnerRegistry.register()
class FaultyValidator(todd.runners.Validator):
    """Validator whose `_run_iter` always fails with `CustomError`."""

    def _run_iter(self, *args, **kwargs) -> NoReturn:
        """Raise `CustomError` unconditionally; all arguments are ignored."""
        error = CustomError('faulty runner')
        raise error

In [34]:
# Callback under demonstration.
monitor_callback = todd.Config(type='MonitorCallback')

# Validator configuration that builds the always-failing runner. The runner
# name is not changed, so it is still 'validator'.
monitor_callback_demo = validator_demo.copy()
monitor_callback_demo.type = 'FaultyValidator'
monitor_callback_demo.callbacks = [monitor_callback, log_callback]

In [35]:
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.Validator = todd.RunnerRegistry.build(
        monitor_callback_demo, 
        work_dir=dict(root=work_dirs),
    )
    try:
        runner.run()
    except CustomError as e:
        pass

    !echo
    !cat {work_dirs}/validator/*.log

[2023-08-28 20:13:46,361 36766:140704306648640][log.py:50 todd.FaultyValidator.validator init] INFO: 
Platform: macOS-13.4.1
NVIDIA SMI: None
Python version: 3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]
PyTorch version: 2.0.1
TorchVision version: 0.15.2
OpenCV version: 4.7.0
Todd version: 0.4.0
CUDA_HOME: None
Git commit ID: dcd00d8
Git status: 
M todd/runners/base.py
 M todd/runners/callbacks/optimize.py
 M todd/runners/callbacks/tensorboard.py
 M todd/runners/epoch_based_trainer.py
 M todd/runners/iter_based_trainer.py
 M todd/runners/trainer.py
 M todd/runners/validator.py
 M tutorials/runners.ipynb
[2m[2023-08-28 20:13:46,363 36766:140704306648640][base.py:54 todd.FaultyValidator.validator __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[1;31m[2023-08-28 20:13:46,366 36766:140704306648640][monitor.py:28 todd.FaultyValidator.validator __exit__] ERROR: Unable to run iter_=1
batch={'x': tensor([1]), 'y': tensor([2])}
memo={'dataloader':


[2023-08-28 20:13:46,361 36766:140704306648640][log.py:50 todd.FaultyValidator.validator init] INFO: 
Platform: macOS-13.4.1
NVIDIA SMI: None
Python version: 3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]
PyTorch version: 2.0.1
TorchVision version: 0.15.2
OpenCV version: 4.7.0
Todd version: 0.4.0
CUDA_HOME: None
Git commit ID: dcd00d8
Git status: 
M todd/runners/base.py
 M todd/runners/callbacks/optimize.py
 M todd/runners/callbacks/tensorboard.py
 M todd/runners/epoch_based_trainer.py
 M todd/runners/iter_based_trainer.py
 M todd/runners/trainer.py
 M todd/runners/validator.py
 M tutorials/runners.ipynb
[2023-08-28 20:13:46,363 36766:140704306648640][base.py:54 todd.FaultyValidator.validator __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R
[2023-08-28 20:13:46,366 36766:140704306648640][monitor.py:28 todd.FaultyValidator.validator __exit__] ERROR: Unable to run iter_=1
batch={'x': tensor([1]), 'y': tensor([2])}
memo={'dataloader': <torch.utils

### Priorities

## Strategies

In [36]:
# The strategy demo uses a copy of the checkpoint-loading configuration as is.
strategy_load_model_from_demo = checkpoint_load_from_callback_demo.copy()

In [37]:
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.EpochBasedTrainer = todd.RunnerRegistry.build(
        strategy_load_model_from_demo, 
        work_dir=dict(root=work_dirs),
    )
    runner.run()

    !echo
    !echo {'-' * 20}
    !echo

    runner: todd.runners.EpochBasedTrainer = todd.RunnerRegistry.build(
        strategy_load_model_from_demo, 
        work_dir=dict(root=work_dirs),
    )
    runner.strategy.load_model_from(os.path.join(work_dirs, 'epoch_based_trainer', 'checkpoints', 'epoch_2', 'model.pth'))
    runner.run()

[2023-08-28 20:13:46,753 36766:140704306648640][log.py:50 todd.EpochBasedTrainer.epoch_based_trainer init] INFO: 
Platform: macOS-13.4.1
NVIDIA SMI: None
Python version: 3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]
PyTorch version: 2.0.1
TorchVision version: 0.15.2
OpenCV version: 4.7.0
Todd version: 0.4.0
CUDA_HOME: None
Git commit ID: dcd00d8
Git status: 
M todd/runners/base.py
 M todd/runners/callbacks/optimize.py
 M todd/runners/callbacks/tensorboard.py
 M todd/runners/epoch_based_trainer.py
 M todd/runners/iter_based_trainer.py
 M todd/runners/trainer.py
 M todd/runners/validator.py
 M tutorials/runners.ipynb
[2m[2023-08-28 20:13:46,755 36766:140704306648640][base.py:54 todd.EpochBasedTrainer.epoch_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-08-28 20:13:46,756 36766:140704306648640][log.py:90 todd.EpochBasedTrainer.epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2023-08-28 20:13:46,762 36766:140704306


--------------------



[2023-08-28 20:13:47,331 36766:140704306648640][log.py:50 todd.EpochBasedTrainer.epoch_based_trainer init] INFO: 
Platform: macOS-13.4.1
NVIDIA SMI: None
Python version: 3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]
PyTorch version: 2.0.1
TorchVision version: 0.15.2
OpenCV version: 4.7.0
Todd version: 0.4.0
CUDA_HOME: None
Git commit ID: dcd00d8
Git status: 
M todd/runners/base.py
 M todd/runners/callbacks/optimize.py
 M todd/runners/callbacks/tensorboard.py
 M todd/runners/epoch_based_trainer.py
 M todd/runners/iter_based_trainer.py
 M todd/runners/trainer.py
 M todd/runners/validator.py
 M tutorials/runners.ipynb
[2m[2023-08-28 20:13:47,333 36766:140704306648640][base.py:54 todd.EpochBasedTrainer.epoch_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-08-28 20:13:47,334 36766:140704306648640][base.py:65 todd.EpochBasedTrainer.epoch_based_trainer load_model_from] INFO: Loading model from /var/folders/v_/1kkfntxs5z74_rwvy

## Dry Run

In [38]:
# Turn on the `DRY_RUN` flag of `todd.Store` for the rest of the session.
# NOTE(review): the effect on runners is not visible in this part of the
# notebook -- confirm against the todd documentation.
todd.Store.DRY_RUN = True