# Best Practices for Using Runners

In [1]:
# Reinstall todd from the parent directory: remove any installed build first,
# then install the checkout located at `..`.
%pip uninstall -y todd_ai
%pip install ..

Found existing installation: todd-ai 0.3.0
Uninstalling todd-ai-0.3.0:
  Successfully uninstalled todd-ai-0.3.0
Note: you may need to restart the kernel to use updated packages.
Processing /Users/bytedance/Developer/todd
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: todd-ai
  Building wheel for todd-ai (pyproject.toml) ... [?25ldone
[?25h  Created wheel for todd-ai: filename=todd_ai-0.3.0-py3-none-any.whl size=98667 sha256=e1ef39dbca2c4b544b30f5d76e8fa381708e8eeccfa3c2f5e93ae71e4817f087
  Stored in directory: /private/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/pip-ephem-wheel-cache-izw26a6y/wheels/15/ef/5a/9fc12e257ce5cef16b333a2ed6c992ff9cbcc9167f7199e6ac
Successfully built todd-ai
Installing collected packages: todd-ai
Successfully installed todd-ai-0.3.0

[1m[[0m

In [2]:
import os
import pathlib
import tempfile
from pprint import pprint
from typing import Any, NoReturn, TypedDict, cast

import todd
import torch
import torch.nn.functional as F
import torch.utils.data

# Type alias for the string-keyed dict that `_run_iter` receives, mutates and
# returns on every iteration.
Memo = dict[str, Any]

[2023-07-21 11:00:06,802 22882:140704357856832][patches.py:72 todd <module>] INFO: `ipdb` is installed. Using it for debugging.


## Preparation

### Models

In [3]:
@todd.ModelRegistry.register()
class RunnerModel(todd.Module):
    """Toy model that multiplies its input by a single learnable scalar.

    The scalar is initialised to ``0.0``.
    """

    def __init__(self) -> None:
        super().__init__()
        initial_weight = torch.tensor(0.0)
        # Keep the attribute name `_weight`: it is the parameter's registered name.
        self._weight = torch.nn.Parameter(initial_weight)

    @property
    def weight(self) -> torch.nn.Parameter:
        """The learnable scalar parameter."""
        return self._weight

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` multiplied by the scalar weight."""
        scaled = x * self._weight
        return scaled

### Datasets

In [4]:
class Sample(TypedDict):
    """One dataset item, as returned by ``RunnerDataset.__getitem__``."""
    x: int  # input value
    y: int  # target value

In [5]:
@todd.DatasetRegistry.register()
class RunnerDataset(torch.utils.data.Dataset[Sample]):
    """Map-style dataset over the integers ``1..n`` with targets ``y = 2 * x``.

    The generic parameter is ``Sample`` because that is what ``__getitem__``
    returns.

    Args:
        n: number of samples; the inputs are ``1, 2, ..., n``.
    """

    def __init__(self, n: int) -> None:
        self._data = list(range(1, n + 1))

    def __len__(self) -> int:
        """Return the number of samples."""
        return len(self._data)

    def __getitem__(self, index: int) -> Sample:
        """Return the sample at ``index``.

        Raises:
            IndexError: if ``index`` is out of range of the underlying list.
        """
        x = self._data[index]
        return Sample(x=x, y=x * 2)

In [6]:
class Batch(TypedDict):
    """The mini-batch dict passed to ``_run_iter``; same keys as ``Sample``."""
    x: torch.Tensor  # inputs fed to the model
    y: torch.Tensor  # targets compared against the model output

### Runners

In [7]:
class RunnerMixin(todd.runners.BaseRunner):
    """Shared iteration step: forward pass followed by an L1 loss."""

    def _run_iter(self, batch: Batch, memo: Memo) -> Memo:
        """Run the model on ``batch['x']`` and record the L1 loss in ``memo``.

        The loss tensor is stored under ``memo['loss']``. If ``memo`` has a
        ``'log'`` entry, the loss formatted to three decimals is added to it.
        """
        model = self._strategy.model
        prediction: torch.Tensor = model(batch['x'])
        l1 = F.l1_loss(prediction, batch['y'])
        memo['loss'] = l1
        if 'log' not in memo:
            return memo
        memo['log']['loss'] = f'{l1.item():.3f}'
        return memo

In [8]:
class TrainerMixin(RunnerMixin):
    """Adds the model weight and the raw batch to the log of each iteration."""

    def _run_iter(self, batch: Batch, memo: Memo) -> Memo:
        """Run the parent step, then extend ``memo['log']`` when it exists."""
        memo = super()._run_iter(batch, memo)
        if 'log' not in memo:
            return memo
        module = cast(RunnerModel, self._strategy.module)
        log = memo['log']
        log['weight'] = f'{module.weight.item():.3f}'
        log['batch'] = str(batch)
        return memo

In [9]:
@todd.RunnerRegistry.register()
class CustomValidator(RunnerMixin, todd.runners.Validator):
    """Validator whose iteration step comes from ``RunnerMixin``."""
    pass

In [10]:
@todd.RunnerRegistry.register()
class CustomIterBasedTrainer(TrainerMixin, todd.runners.IterBasedTrainer):
    """Iteration-based trainer whose iteration step comes from ``TrainerMixin``."""
    pass

In [11]:
@todd.RunnerRegistry.register()
class CustomEpochBasedTrainer(TrainerMixin, todd.runners.EpochBasedTrainer):
    """Epoch-based trainer whose iteration step comes from ``TrainerMixin``."""
    pass

## Validators

In [12]:
# Base config that the later demos derive from via `.copy()`: a validator over
# RunnerDataset(n=20) with batch size 1 and a log callback with interval 5.
validator_demo = todd.Config(
    type='CustomValidator',
    name='custom_validator',
    dataloader=dict(batch_size=1, dataset=dict(type='RunnerDataset', n=20)),
    strategy=dict(type='BaseStrategy', model=dict(type='RunnerModel')),
    callbacks=dict(type='LogCallback', interval=5),
)

In [13]:
with tempfile.TemporaryDirectory() as work_dirs:
    runner: CustomValidator = todd.RunnerRegistry.build(
        validator_demo, 
        work_dir=dict(root=work_dirs),
    )
    runner.run()
    
    !echo
    !tree $work_dirs

[2m[2023-07-21 11:00:07,640 22882:140704357856832][runners.py:61 todd.CustomValidator.custom_validator __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-21 11:00:07,643 22882:140704357856832][log.py:49 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [5/20] loss=10.000
[2023-07-21 11:00:07,645 22882:140704357856832][log.py:49 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [10/20] loss=20.000
[2023-07-21 11:00:07,650 22882:140704357856832][log.py:49 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [15/20] loss=30.000
[2023-07-21 11:00:07,652 22882:140704357856832][log.py:49 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [20/20] loss=40.000



[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpgr0zcfsa[0m
└── [1;36mcustom_validator[0m

2 directories, 0 files


## Trainers

In [14]:
# Common trainer config, derived from the validator config. `type` is removed
# here; each trainer demo below assigns its own.
trainer_demo = validator_demo.copy()
trainer_demo.pop('type')
# 67 samples at batch size 2; the epoch-based runs below report 102 iterations
# for 3 epochs, i.e. 34 per epoch.
trainer_demo.dataloader = todd.Config(
    batch_size=2,
    shuffle=True,
    dataset=dict(type='RunnerDataset', n=67),
)
trainer_demo.optimizer = todd.Config(type='SGD', lr=0.005)


### Iteration Based

In [15]:
# Iteration-based variant of the trainer config, limited to 53 iterations.
iter_based_trainer_demo = trainer_demo.copy()
iter_based_trainer_demo.type = 'CustomIterBasedTrainer'
iter_based_trainer_demo.name = 'custom_iter_based_trainer'
iter_based_trainer_demo.iters = 53

In [16]:
# Run the iteration-based trainer in a temporary work directory. The config
# carries only the log callback, and the logged weight stays at 0.000.
with tempfile.TemporaryDirectory() as work_dirs:
    runner: CustomIterBasedTrainer = todd.RunnerRegistry.build(
        iter_based_trainer_demo,
        work_dir=dict(root=work_dirs),
    )
    runner.run()

[2m[2023-07-21 11:00:07,975 22882:140704357856832][runners.py:61 todd.CustomIterBasedTrainer.custom_iter_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-21 11:00:07,979 22882:140704357856832][log.py:49 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [5/53] loss=33.000 weight=0.000 batch={'x': tensor([13, 20]), 'y': tensor([26, 40])}
[2023-07-21 11:00:07,983 22882:140704357856832][log.py:49 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [10/53] loss=115.000 weight=0.000 batch={'x': tensor([55, 60]), 'y': tensor([110, 120])}
[2023-07-21 11:00:07,986 22882:140704357856832][log.py:49 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [15/53] loss=42.000 weight=0.000 batch={'x': tensor([14, 28]), 'y': tensor([28, 56])}
[2023-07-21 11:00:07,989 22882:140704357856832][log.py:49 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter

### Epoch Based

In [17]:
# Epoch-based variant of the trainer config, limited to 3 epochs.
epoch_based_trainer_demo = trainer_demo.copy()
epoch_based_trainer_demo.type = 'CustomEpochBasedTrainer'
epoch_based_trainer_demo.name = 'custom_epoch_based_trainer'
epoch_based_trainer_demo.epochs = 3

In [18]:
# Run the epoch-based trainer in a temporary work directory. As in the
# iteration-based run, only the log callback is configured.
with tempfile.TemporaryDirectory() as work_dirs:
    runner: CustomEpochBasedTrainer = todd.RunnerRegistry.build(
        epoch_based_trainer_demo,
        work_dir=dict(root=work_dirs),
    )
    runner.run()

[2m[2023-07-21 11:00:08,031 22882:140704357856832][runners.py:61 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-21 11:00:08,032 22882:140704357856832][log.py:55 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2023-07-21 11:00:08,035 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [5/102] loss=78.000 weight=0.000 batch={'x': tensor([60, 18]), 'y': tensor([120,  36])}
[2023-07-21 11:00:08,038 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [10/102] loss=75.000 weight=0.000 batch={'x': tensor([12, 63]), 'y': tensor([ 24, 126])}
[2023-07-21 11:00:08,041 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [15/102] loss=80.000 weight=0.000 batch={'x': tensor([37, 43]),

## Callbacks

### Log

In [19]:
# Turn on file logging for the log callback and reuse it in the demos below.
# The callback config is copied before it is modified: `validator_demo.copy()`
# is presumably shallow (TODO confirm), in which case editing
# `log_callback_demo.callbacks` directly would also change
# `validator_demo.callbacks` and every config that shares it.
log_callback_demo = validator_demo.copy()
log_callback = log_callback_demo.callbacks.copy()
log_callback.with_file_handler = True
log_callback_demo.callbacks = [log_callback]

In [20]:
# Run the validator with file logging enabled, then show the work directory
# tree and print the log file written under `custom_validator/`.
with tempfile.TemporaryDirectory() as work_dirs:
    runner: CustomValidator = todd.RunnerRegistry.build(
        log_callback_demo, 
        work_dir=dict(root=work_dirs),
    )
    runner.run()

    !echo
    !tree {work_dirs}
    !echo
    !cat {work_dirs}/custom_validator/*.log

[2m[2023-07-21 11:00:08,103 22882:140704357856832][runners.py:61 todd.CustomValidator.custom_validator __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-21 11:00:08,105 22882:140704357856832][log.py:49 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [5/20] loss=10.000
[2023-07-21 11:00:08,107 22882:140704357856832][log.py:49 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [10/20] loss=20.000
[2023-07-21 11:00:08,109 22882:140704357856832][log.py:49 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [15/20] loss=30.000
[2023-07-21 11:00:08,112 22882:140704357856832][log.py:49 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [20/20] loss=40.000



[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpajts7xqq[0m
└── [1;36mcustom_validator[0m
    └── 2023-07-21T11-00-08_102390-08-00.log

2 directories, 1 file

[2023-07-21 11:00:08,103 22882:140704357856832][runners.py:61 todd.CustomValidator.custom_validator __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R
[2023-07-21 11:00:08,105 22882:140704357856832][log.py:49 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [5/20] loss=10.000
[2023-07-21 11:00:08,107 22882:140704357856832][log.py:49 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [10/20] loss=20.000
[2023-07-21 11:00:08,109 22882:140704357856832][log.py:49 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [15/20] loss=30.000
[2023-07-21 11:00:08,112 22882:140704357856832][log.py:49 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [20/20] loss=40.000


### Optimize

In [21]:
# Add an OptimizeCallback ahead of the log callback. With it the logged weight
# changes between iterations (presumably it performs the optimizer step).
optimize_callback_demo = iter_based_trainer_demo.copy()
optimize_callback = todd.Config(type='OptimizeCallback')
optimize_callback_demo.callbacks = [optimize_callback, log_callback]

In [22]:
# Run the iteration-based trainer with the optimize and log callbacks.
with tempfile.TemporaryDirectory() as work_dirs:
    runner: CustomIterBasedTrainer = todd.RunnerRegistry.build(
        optimize_callback_demo,
        work_dir=dict(root=work_dirs),
    )
    runner.run()

[2m[2023-07-21 11:00:08,712 22882:140704357856832][runners.py:61 todd.CustomIterBasedTrainer.custom_iter_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-21 11:00:08,721 22882:140704357856832][log.py:49 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [5/53] loss=29.725 weight=0.550 batch={'x': tensor([13, 28]), 'y': tensor([26, 56])}
[2023-07-21 11:00:08,727 22882:140704357856832][log.py:49 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [10/53] loss=28.350 weight=1.528 batch={'x': tensor([63, 57]), 'y': tensor([126, 114])}
[2023-07-21 11:00:08,739 22882:140704357856832][log.py:49 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [15/53] loss=2.870 weight=2.102 batch={'x': tensor([32, 24]), 'y': tensor([64, 48])}
[2023-07-21 11:00:08,756 22882:140704357856832][log.py:49 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [

### Learning Rate Schedule

In [23]:
# Add a learning-rate schedule callback between the optimize and log callbacks.
# `LinearLR` with total_iters=10 presumably maps to torch's scheduler of the
# same name — TODO confirm; the run below logs lr=5.000e-03 from iteration 10.
lr_schedule_callback_demo = iter_based_trainer_demo.copy()
lr_schedule_callback = todd.Config(
    type='LRScheduleCallback',
    lr_scheduler=dict(type='LinearLR', total_iters=10),
)
lr_schedule_callback_demo.callbacks = [
    optimize_callback,
    lr_schedule_callback,
    log_callback,
]


In [24]:
# Run the iteration-based trainer with the learning-rate schedule; the log
# lines now include an `lr=` field.
with tempfile.TemporaryDirectory() as work_dirs:
    runner: CustomIterBasedTrainer = todd.RunnerRegistry.build(
        lr_schedule_callback_demo,
        work_dir=dict(root=work_dirs),
    )
    runner.run()

[2m[2023-07-21 11:00:08,804 22882:140704357856832][runners.py:61 todd.CustomIterBasedTrainer.custom_iter_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-21 11:00:08,809 22882:140704357856832][log.py:49 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [5/53] loss=45.101 weight=0.231 batch={'x': tensor([43,  8]), 'y': tensor([86, 16])} lr=['3.333e-03']
[2023-07-21 11:00:08,812 22882:140704357856832][log.py:49 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [10/53] loss=39.138 weight=0.814 batch={'x': tensor([44, 22]), 'y': tensor([88, 44])} lr=['5.000e-03']
[2023-07-21 11:00:08,817 22882:140704357856832][log.py:49 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [15/53] loss=6.675 weight=1.733 batch={'x': tensor([45,  5]), 'y': tensor([90, 10])} lr=['5.000e-03']
[2023-07-21 11:00:08,820 22882:140704357856832][log.py:49 todd.CustomIterBasedTrainer.custo

In [25]:
# Same schedule on the epoch-based trainer, with `by_epoch=True` set on a copy
# of the schedule callback config so the iteration-based one is left as is.
lr_schedule_by_epoch_callback_demo = epoch_based_trainer_demo.copy()
lr_schedule_by_epoch_callback = lr_schedule_callback.copy()
lr_schedule_by_epoch_callback.by_epoch = True
lr_schedule_by_epoch_callback_demo.callbacks = [
    optimize_callback,
    lr_schedule_by_epoch_callback,
    log_callback,
]


In [26]:
# Run the epoch-based trainer with the per-epoch learning-rate schedule.
with tempfile.TemporaryDirectory() as work_dirs:
    runner: CustomEpochBasedTrainer = todd.RunnerRegistry.build(
        lr_schedule_by_epoch_callback_demo,
        work_dir=dict(root=work_dirs),
    )
    runner.run()

[2m[2023-07-21 11:00:08,868 22882:140704357856832][runners.py:61 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-21 11:00:08,869 22882:140704357856832][log.py:55 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2023-07-21 11:00:08,874 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [5/102] loss=17.684 weight=0.316 batch={'x': tensor([16,  5]), 'y': tensor([32, 10])} lr=['1.667e-03']
[2023-07-21 11:00:08,880 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [10/102] loss=88.160 weight=0.480 batch={'x': tensor([62, 54]), 'y': tensor([124, 108])} lr=['1.667e-03']
[2023-07-21 11:00:08,884 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [15/102] loss=39.157 weight=0.8

### Learning Rate Scaler

In [27]:
# Add a learning-rate scale callback with base_batch_size=1. The run below
# logs `batch_size=2 lr_scaler=2.000` for this configuration.
lr_scaler_callback_demo = iter_based_trainer_demo.copy()
lr_scaler_callback = todd.Config(
    type='LRScaleCallback',
    lr_scaler=dict(base_batch_size=1),
)
lr_scaler_callback_demo.callbacks = [
    optimize_callback,
    lr_scaler_callback,
    log_callback,
]

In [28]:
# Run the iteration-based trainer with the learning-rate scale callback.
with tempfile.TemporaryDirectory() as work_dirs:
    runner: CustomIterBasedTrainer = todd.RunnerRegistry.build(
        lr_scaler_callback_demo,
        work_dir=dict(root=work_dirs),
    )
    runner.run()

[2023-07-21 11:00:09,044 22882:140704357856832][lr.py:92 todd.CustomIterBasedTrainer.custom_iter_based_trainer _scale_lr] INFO: base_batch_size=1 batch_size=2 lr_scaler=2.000
[2m[2023-07-21 11:00:09,068 22882:140704357856832][runners.py:61 todd.CustomIterBasedTrainer.custom_iter_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-21 11:00:09,079 22882:140704357856832][log.py:49 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [5/53] loss=2.325 weight=1.690 batch={'x': tensor([14,  1]), 'y': tensor([28,  2])}
[2023-07-21 11:00:09,087 22882:140704357856832][log.py:49 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [10/53] loss=3.485 weight=1.915 batch={'x': tensor([25, 57]), 'y': tensor([ 50, 114])}
[2023-07-21 11:00:09,092 22882:140704357856832][log.py:49 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [15/53] loss=7.920 weight=1.835 batch={'x': tensor([

### Checkpoint

In [29]:
# Checkpoint callback with interval 10, plus the log callback. No optimize
# callback is included, and the run below logs weight=0.000.
checkpoint_callback_demo = iter_based_trainer_demo.copy()
checkpoint_callback = todd.Config(type='CheckpointCallback', interval=10)
checkpoint_callback_demo.callbacks = [checkpoint_callback, log_callback]

In [30]:
with tempfile.TemporaryDirectory() as work_dirs:
    runner: CustomIterBasedTrainer = todd.RunnerRegistry.build(
        checkpoint_callback_demo, 
        work_dir=dict(root=work_dirs),
    )
    runner.run()

    !echo
    !tree {work_dirs}
    !echo

    iter_50 = pathlib.Path(work_dirs) / 'custom_iter_based_trainer' / 'checkpoints' / 'iter_50'
    for f in iter_50.glob('*.pth'):
        print(f"{f.name}:")
        pprint(torch.load(f, 'cpu'))
        print()

[2m[2023-07-21 11:00:09,176 22882:140704357856832][runners.py:61 todd.CustomIterBasedTrainer.custom_iter_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-21 11:00:09,182 22882:140704357856832][log.py:49 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [5/53] loss=90.000 weight=0.000 batch={'x': tensor([67, 23]), 'y': tensor([134,  46])}
[2023-07-21 11:00:09,189 22882:140704357856832][checkpoint.py:60 todd.CustomIterBasedTrainer.custom_iter_based_trainer _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpjdkh3gfw/custom_iter_based_trainer/checkpoints/iter_10
[2023-07-21 11:00:09,195 22882:140704357856832][log.py:49 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [10/53] loss=100.000 weight=0.000 batch={'x': tensor([62, 38]), 'y': tensor([124,  76])}
[2023-07-21 11:00:09,202 22882:140704357856832][log.py:49 todd.CustomIterBasedTrainer.custom_iter_bas


[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpjdkh3gfw[0m
└── [1;36mcustom_iter_based_trainer[0m
    ├── 2023-07-21T11-00-09_175601-08-00.log
    └── [1;36mcheckpoints[0m
        ├── [1;36miter_10[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36miter_20[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36miter_30[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36miter_40[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36miter_50[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
    

In [31]:
# Epoch-based checkpointing: a copy of the checkpoint callback config with
# interval=1 and by_epoch=True, plus the log callback.
checkpoint_by_epoch_callback_demo = epoch_based_trainer_demo.copy()
checkpoint_by_epoch_callback = checkpoint_callback.copy()
checkpoint_by_epoch_callback.update(interval=1, by_epoch=True)
checkpoint_by_epoch_callback_demo.callbacks = [
    checkpoint_by_epoch_callback,
    log_callback,
]

In [32]:
with tempfile.TemporaryDirectory() as work_dirs:
    runner: CustomEpochBasedTrainer = todd.RunnerRegistry.build(
        checkpoint_by_epoch_callback_demo, 
        work_dir=dict(root=work_dirs),
    )
    runner.run()

    !echo
    !tree {work_dirs}
    !echo

    epoch_2 = pathlib.Path(work_dirs) / 'custom_epoch_based_trainer' / 'checkpoints' / 'epoch_2'
    for f in epoch_2.glob('*.pth'):
        print(f"{f.name}:")
        pprint(torch.load(f, 'cpu'))
        print()

[2m[2023-07-21 11:00:09,771 22882:140704357856832][runners.py:61 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-21 11:00:09,773 22882:140704357856832][log.py:55 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2023-07-21 11:00:09,778 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [5/102] loss=124.000 weight=0.000 batch={'x': tensor([61, 63]), 'y': tensor([122, 126])}
[2023-07-21 11:00:09,786 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [10/102] loss=70.000 weight=0.000 batch={'x': tensor([48, 22]), 'y': tensor([96, 44])}
[2023-07-21 11:00:09,791 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [15/102] loss=43.000 weight=0.000 batch={'x': tensor([ 5, 38]), 

[2023-07-21 11:00:09,863 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [90/102] loss=58.000 weight=0.000 batch={'x': tensor([37, 21]), 'y': tensor([74, 42])}
[2023-07-21 11:00:09,867 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [95/102] loss=39.000 weight=0.000 batch={'x': tensor([16, 23]), 'y': tensor([32, 46])}
[2023-07-21 11:00:09,871 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [100/102] loss=52.000 weight=0.000 batch={'x': tensor([41, 11]), 'y': tensor([82, 22])}
[2023-07-21 11:00:09,873 22882:140704357856832][checkpoint.py:60 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpl3kortw4/custom_epoch_based_trainer/checkpoints/epoch_3
[2023-07-21 11:00:09,879 22882:140704357856832][chec


[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpl3kortw4[0m
└── [1;36mcustom_epoch_based_trainer[0m
    ├── 2023-07-21T11-00-09_770773-08-00.log
    └── [1;36mcheckpoints[0m
        ├── [1;36mepoch_1[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36mepoch_2[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36mepoch_3[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        └── [1;36mlatest[0m
            ├── callbacks.pth
            ├── meta.pth
            ├── model.pth
            ├── optim.pth
            └── strategy.pth

7 directories, 21 files

strategy.pth:
{}

optim.pth:
{'param_groups': [{'dampening': 0,
                   'differentiable': False,
        

In [33]:
# Per-epoch checkpointing again, this time with the optimize callback listed
# first; used below to demonstrate resuming from a saved checkpoint.
checkpoint_load_from_callback_demo = checkpoint_by_epoch_callback_demo.copy()
checkpoint_load_from_callback_demo.callbacks = [
    optimize_callback,
    checkpoint_by_epoch_callback,
    log_callback,
]

In [34]:
with tempfile.TemporaryDirectory() as work_dirs:
    runner: CustomEpochBasedTrainer = todd.RunnerRegistry.build(
        checkpoint_load_from_callback_demo, 
        work_dir=dict(root=work_dirs),
    )
    runner.run()

    !echo
    !echo {'-' * 20}
    !echo

    runner: CustomEpochBasedTrainer = todd.RunnerRegistry.build(
        checkpoint_load_from_callback_demo, 
        work_dir=dict(root=work_dirs),
        load_from=os.path.join(work_dirs, 'custom_epoch_based_trainer', 'checkpoints', 'epoch_2')
    )
    runner.run()

[2m[2023-07-21 11:00:10,412 22882:140704357856832][runners.py:61 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-21 11:00:10,414 22882:140704357856832][log.py:55 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2023-07-21 11:00:10,420 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [5/102] loss=15.600 weight=0.800 batch={'x': tensor([ 6, 20]), 'y': tensor([12, 40])}
[2023-07-21 11:00:10,426 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [10/102] loss=15.159 weight=1.547 batch={'x': tensor([31, 36]), 'y': tensor([62, 72])}
[2023-07-21 11:00:10,433 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [15/102] loss=7.250 weight=2.145 batch={'x': tensor([44, 56]), 'y':

[2023-07-21 11:00:10,551 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [80/102] loss=2.700 weight=1.910 batch={'x': tensor([27, 33]), 'y': tensor([54, 66])}
[2023-07-21 11:00:10,558 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [85/102] loss=4.575 weight=1.850 batch={'x': tensor([ 3, 58]), 'y': tensor([  6, 116])}
[2023-07-21 11:00:10,564 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [90/102] loss=0.035 weight=2.002 batch={'x': tensor([ 4, 24]), 'y': tensor([ 8, 48])}
[2023-07-21 11:00:10,571 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [95/102] loss=6.037 weight=2.115 batch={'x': tensor([59, 46]), 'y': tensor([118,  92])}
[2023-07-21 11:00:10,577 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.c


--------------------



[2023-07-21 11:00:11,018 22882:140704357856832][checkpoint.py:45 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer connect] INFO: Loading from /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpty_5xt_v/custom_epoch_based_trainer/checkpoints/epoch_2
[2m[2023-07-21 11:00:11,022 22882:140704357856832][runners.py:61 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-21 11:00:11,023 22882:140704357856832][log.py:55 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer before_run_epoch] INFO: Epoch [3/3]
[2023-07-21 11:00:11,027 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [70/102] loss=9.315 weight=2.172 batch={'x': tensor([61, 47]), 'y': tensor([122,  94])}
[2023-07-21 11:00:11,029 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [75/102] loss=3.251 weight=1.872 bat

### Monitor

In [35]:
class CustomError(RuntimeError):
    """Error raised on purpose by the faulty runners in this notebook."""

In [36]:
class FaultyRunnerMixin(todd.runners.BaseRunner):
    """Runner mixin whose iteration step always raises ``CustomError``."""

    def _run_iter(self, *args, **kwargs) -> NoReturn:
        """Ignore all arguments and raise ``CustomError('faulty runner')``."""
        raise CustomError('faulty runner')

In [37]:
@todd.RunnerRegistry.register()
class FaultyValidator(FaultyRunnerMixin, todd.runners.Validator):
    """Validator whose iteration step comes from ``FaultyRunnerMixin``."""
    pass

In [38]:
@todd.RunnerRegistry.register()
class FaultyIterBasedTrainer(FaultyRunnerMixin, todd.runners.IterBasedTrainer):
    """Iteration-based trainer whose iteration step comes from ``FaultyRunnerMixin``."""
    pass

In [39]:
@todd.RunnerRegistry.register()
class FaultyEpochBasedTrainer(
    FaultyRunnerMixin,
    todd.runners.EpochBasedTrainer,
):
    """Epoch-based trainer whose iteration step comes from ``FaultyRunnerMixin``."""
    pass

In [40]:
# Validator config switched to the always-failing `FaultyValidator`, with a
# monitor callback ahead of the log callback. `name` is still
# 'custom_validator', so the log file lands under that directory.
monitor_callback_demo = validator_demo.copy()
monitor_callback_demo.type = 'FaultyValidator'
monitor_callback = todd.Config(type='MonitorCallback')
monitor_callback_demo.callbacks = [monitor_callback, log_callback]

In [41]:
with tempfile.TemporaryDirectory() as work_dirs:
    runner: CustomValidator = todd.RunnerRegistry.build(
        monitor_callback_demo, 
        work_dir=dict(root=work_dirs),
    )
    try:
        runner.run()
    except CustomError as e:
        pass

    !echo
    !cat {work_dirs}/custom_validator/*.log

[2m[2023-07-21 11:00:11,109 22882:140704357856832][runners.py:61 todd.FaultyValidator.custom_validator __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[1;31m[2023-07-21 11:00:11,111 22882:140704357856832][monitor.py:29 todd.FaultyValidator.custom_validator __exit__] ERROR: Unable to run iter_=1
batch={'x': tensor([1]), 'y': tensor([2])}
memo={'dataloader': <torch.utils.data.dataloader.DataLoader object at 0x14fe15ed0>}
Traceback (most recent call last):
  File "/Users/bytedance/.local/share/virtualenvs/todd-ARrcnwyq/lib/python3.11/site-packages/todd/runners/runners.py", line 200, in _run
    memo = self._run_iter(batch, memo)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/ipykernel_22882/2137902126.py", line 4, in _run_iter
    raise CustomError('faulty runner')
CustomError: faulty runner[m



[2023-07-21 11:00:11,109 22882:140704357856832][runners.py:61 todd.FaultyValidator.custom_validator __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R
[2023-07-21 11:00:11,111 22882:140704357856832][monitor.py:29 todd.FaultyValidator.custom_validator __exit__] ERROR: Unable to run iter_=1
batch={'x': tensor([1]), 'y': tensor([2])}
memo={'dataloader': <torch.utils.data.dataloader.DataLoader object at 0x14fe15ed0>}
Traceback (most recent call last):
  File "/Users/bytedance/.local/share/virtualenvs/todd-ARrcnwyq/lib/python3.11/site-packages/todd/runners/runners.py", line 200, in _run
    memo = self._run_iter(batch, memo)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/ipykernel_22882/2137902126.py", line 4, in _run_iter
    raise CustomError('faulty runner')
CustomError: faulty runner


### Priorities

## Strategies

In [42]:
# Reuse the checkpoint-resume config (optimize, per-epoch checkpoint and log
# callbacks) for the strategy demo.
strategy_load_model_from_demo = checkpoint_load_from_callback_demo.copy()

In [43]:
with tempfile.TemporaryDirectory() as work_dirs:
    runner: CustomEpochBasedTrainer = todd.RunnerRegistry.build(
        strategy_load_model_from_demo, 
        work_dir=dict(root=work_dirs),
    )
    runner.run()

    !echo
    !echo {'-' * 20}
    !echo

    runner: CustomEpochBasedTrainer = todd.RunnerRegistry.build(
        strategy_load_model_from_demo, 
        work_dir=dict(root=work_dirs),
    )
    runner.strategy.load_model_from(os.path.join(work_dirs, 'custom_epoch_based_trainer', 'checkpoints', 'epoch_2', 'model.pth'))
    runner.run()

[2m[2023-07-21 11:00:11,406 22882:140704357856832][runners.py:61 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-21 11:00:11,407 22882:140704357856832][log.py:55 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2023-07-21 11:00:11,412 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [5/102] loss=93.006 weight=0.382 batch={'x': tensor([62, 53]), 'y': tensor([124, 106])}
[2023-07-21 11:00:11,416 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [10/102] loss=21.133 weight=1.465 batch={'x': tensor([38, 41]), 'y': tensor([76, 82])}
[2023-07-21 11:00:11,419 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [15/102] loss=1.210 weight=2.027 batch={'x': tensor([59, 29]), 'y


--------------------



[2m[2023-07-21 11:00:11,926 22882:140704357856832][runners.py:61 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-21 11:00:11,927 22882:140704357856832][base.py:60 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer load_model_from] INFO: Loading model from /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmptoarcw_k/custom_epoch_based_trainer/checkpoints/epoch_2/model.pth
[2023-07-21 11:00:11,930 22882:140704357856832][log.py:55 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2023-07-21 11:00:11,935 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [5/102] loss=7.035 weight=1.832 batch={'x': tensor([34, 50]), 'y': tensor([ 68, 100])}
[2023-07-21 11:00:11,938 22882:140704357856832][log.py:49 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [10/102] loss=0.000

## Dry Run

In [44]:
# Set todd's `DRY_RUN` flag for the rest of the kernel session.
# NOTE(review): what the flag changes is not shown in this notebook — confirm
# against `todd.Store`.
todd.Store.DRY_RUN = True