# Best Practices for Using Runners

In [1]:
# Reinstall todd_ai from the parent directory so that the notebook runs
# against the local checkout.
%pip uninstall -y todd_ai
%pip install ..

Found existing installation: todd-ai 0.3.0
Uninstalling todd-ai-0.3.0:
  Successfully uninstalled todd-ai-0.3.0
Note: you may need to restart the kernel to use updated packages.
Processing /Users/bytedance/Developer/todd
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: todd-ai
  Building wheel for todd-ai (pyproject.toml) ... [?25ldone
[?25h  Created wheel for todd-ai: filename=todd_ai-0.3.0-py3-none-any.whl size=83430 sha256=42c79faa6bd05058926fbf0fbabd51b91a374adce4aeef76642ad0fa1b2a9f20
  Stored in directory: /private/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/pip-ephem-wheel-cache-535qpn9m/wheels/15/ef/5a/9fc12e257ce5cef16b333a2ed6c992ff9cbcc9167f7199e6ac
Successfully built todd-ai
Installing collected packages: todd-ai
Successfully installed todd-ai-0.3.0

[1m[[0m

In [2]:
import os
import tempfile
from typing import Any, TypedDict, cast

import torch
import torch.nn.functional as F
import torch.utils.data

import todd

# Type alias for the `memo` dictionary that runners pass to `_run_iter`.
Memo = dict[str, Any]

[2023-07-05 20:41:35,709 89557:140704704583232][patches.py:82 todd <module>] INFO: `ipdb` is installed. Using it for debugging.


## Preparation

### Models

Models should be built by users.
The same model can be used by multiple runners, such as a trainer and a validator, simultaneously.

In [3]:
class Model(todd.Module):
    """Toy model that scales its input by a single learnable scalar."""

    def __init__(self) -> None:
        super().__init__()
        # The scalar starts at zero.
        initial_weight = torch.tensor(0.0)
        self._weight = torch.nn.Parameter(initial_weight)

    @property
    def weight(self) -> torch.nn.Parameter:
        """Expose the learnable scalar."""
        return self._weight

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` multiplied by the scalar weight."""
        scaled = x * self._weight
        return scaled

### Datasets

In [4]:
class Sample(TypedDict):
    """A single dataset item: an input ``x`` paired with its target ``y``."""

    x: int  # input value
    y: int  # target value


In contrast to models, datasets are built inside runners.

In [5]:
class Dataset(torch.utils.data.Dataset[Sample]):
    """Map-style dataset of the integers ``1..n``, each paired with its double.

    The generic parameter is ``Sample`` because that is what ``__getitem__``
    returns.

    Args:
        n: number of samples; the inputs are ``1, 2, ..., n``.
    """

    def __init__(self, n: int) -> None:
        self._data = list(range(1, n + 1))

    def __len__(self) -> int:
        """Return the number of samples."""
        return len(self._data)

    def __getitem__(self, index: int) -> Sample:
        """Return the sample at ``index``, with ``y`` equal to ``2 * x``."""
        x = self._data[index]
        return Sample(x=x, y=x * 2)

In [6]:
class Batch(TypedDict):
    """A batch of samples, as consumed by ``_run_iter``."""

    x: torch.Tensor  # batched inputs
    y: torch.Tensor  # batched targets

### Runners

In [7]:
class RunnerMixin(todd.runners.BaseRunner):
    """Shared dataloader construction and forward/loss logic for the runners."""

    def _build_dataloader(
        self,
        config: todd.Config,
    ) -> torch.utils.data.DataLoader:
        """Build a ``DataLoader`` over a ``Dataset`` described by ``config``.

        ``config['dataset']`` holds the keyword arguments of ``Dataset``;
        every remaining entry is forwarded to ``torch.utils.data.DataLoader``.
        """
        # Work on a copy so that popping `dataset` does not alter the
        # caller's config.
        config = config.copy()
        dataset = Dataset(**config.pop('dataset'))
        return torch.utils.data.DataLoader(dataset, **config)

    def _run_iter(self, batch: Batch, memo: Memo) -> None:
        """Run the model on ``batch`` and record the L1 loss in ``memo``.

        The loss tensor is stored under ``memo['loss']``. If ``memo`` has a
        ``'log'`` entry, the formatted loss value is added to it as well.
        """
        y: torch.Tensor = self._model(batch['x'])
        loss = F.l1_loss(y, batch['y'])
        memo['loss'] = loss
        if 'log' in memo:
            memo['log']['loss'] = f'{loss.item():.3f}'

In [8]:
class TrainerMixin(RunnerMixin):
    """Extends the shared iteration with trainer-specific log entries."""

    def _run_iter(self, batch: Batch, memo: Memo) -> None:
        """Run the parent iteration, then log the model weight and the batch."""
        super()._run_iter(batch, memo)
        if 'log' not in memo:
            return
        log = memo['log']
        model = cast(Model, self._model)
        log['weight'] = f'{model.weight.item():.3f}'
        log['batch'] = str(batch)

In [9]:
@todd.RunnerRegistry.register()
class CustomValidator(RunnerMixin, todd.runners.Validator):
    """Validator combining ``RunnerMixin`` with ``todd.runners.Validator``."""

In [10]:
@todd.RunnerRegistry.register()
class CustomIterBasedTrainer(TrainerMixin, todd.runners.IterBasedTrainer):
    """Trainer combining ``TrainerMixin`` with ``IterBasedTrainer``."""

In [11]:
@todd.RunnerRegistry.register()
class CustomEpochBasedTrainer(TrainerMixin, todd.runners.EpochBasedTrainer):
    """Trainer combining ``TrainerMixin`` with ``EpochBasedTrainer``."""

## Validation

Define and register the validator.

Define the validator config.
`validator_config` will be reused by trainers.

In [12]:
# Validator over the 20-sample dataset, one sample per batch, with a log
# callback configured with `interval=5`.
validator_config = todd.Config(
    type='CustomValidator',
    name='custom_validator',
    dataloader={'batch_size': 1, 'dataset': {'n': 20}},
    callbacks={'type': 'LogCallback', 'interval': 5},
)

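Build and run the validator.
The runner creates its working directory under `work_dirs_root`; a log file is written there only once the log callback's `with_file_handler` option is enabled (see below).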

In [13]:

with tempfile.TemporaryDirectory() as work_dirs_root:
    runner: CustomValidator = todd.RunnerRegistry.build(
        validator_config, 
        todd.Config(work_dirs_root=work_dirs_root, model=Model()),
    )
    runner.run()
    
    !echo
    !tree $work_dirs_root

[2m[2023-07-05 20:41:37,312 89557:140704704583232][runners.py:59 todd.CustomValidator.custom_validator __init__] DEBUG: Runner custom_validator initialized by bytedance@C02G870SMD6R[m
[2023-07-05 20:41:37,315 89557:140704704583232][log.py:42 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [5/20] loss=10.000
[2023-07-05 20:41:37,316 89557:140704704583232][log.py:42 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [10/20] loss=20.000
[2023-07-05 20:41:37,318 89557:140704704583232][log.py:42 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [15/20] loss=30.000
[2023-07-05 20:41:37,320 89557:140704704583232][log.py:42 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [20/20] loss=40.000



[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpg6ja8dcx[0m
└── [1;36mcustom_validator[0m

2 directories, 0 files


In [14]:
# Turn on the log callback's `with_file_handler` option so that the next run
# also writes its log messages to a file in the working directory.
validator_config.callbacks.with_file_handler = True

In [15]:
# Re-run the validator, then show the working directory tree and the contents
# of the log file.
with tempfile.TemporaryDirectory() as work_dirs_root:
    runner: CustomValidator = todd.RunnerRegistry.build(
        validator_config, 
        todd.Config(work_dirs_root=work_dirs_root, model=Model()),
    )
    runner.run()

    !echo
    !tree {work_dirs_root}
    !echo
    # Print the first `*.log` file found under the runner's directory.
    !cat $(find {work_dirs_root}/custom_validator -name '*.log' -print -quit)

[2m[2023-07-05 20:41:37,622 89557:140704704583232][runners.py:59 todd.CustomValidator.custom_validator __init__] DEBUG: Runner custom_validator initialized by bytedance@C02G870SMD6R[m
[2023-07-05 20:41:37,627 89557:140704704583232][log.py:42 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [5/20] loss=10.000
[2023-07-05 20:41:37,631 89557:140704704583232][log.py:42 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [10/20] loss=20.000
[2023-07-05 20:41:37,635 89557:140704704583232][log.py:42 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [15/20] loss=30.000
[2023-07-05 20:41:37,637 89557:140704704583232][log.py:42 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [20/20] loss=40.000



[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpfb6iuce5[0m
└── [1;36mcustom_validator[0m
    └── 2023-07-05T20-41-37_626231-08-00.log

2 directories, 1 file

[2023-07-05 20:41:37,627 89557:140704704583232][log.py:42 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [5/20] loss=10.000
[2023-07-05 20:41:37,631 89557:140704704583232][log.py:42 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [10/20] loss=20.000
[2023-07-05 20:41:37,635 89557:140704704583232][log.py:42 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [15/20] loss=30.000
[2023-07-05 20:41:37,637 89557:140704704583232][log.py:42 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [20/20] loss=40.000


## Train

In [16]:
# Derive the shared trainer config from the validator config: drop the runner
# type (each trainer sets its own), then swap in a shuffled training
# dataloader and an SGD optimizer.
trainer_config = validator_config.copy()
trainer_config.pop('type')
trainer_config.dataloader = todd.Config(
    batch_size=2,
    shuffle=True,
    dataset=dict(n=67),
)
trainer_config.optimizer = todd.Config(type='SGD', lr=0.005)

### Iteration Based

In [17]:
# Specialize the shared trainer config for iteration-based training:
# the trainer is configured to run 53 iterations.
iter_based_trainer_config = trainer_config.copy()
iter_based_trainer_config.type = 'CustomIterBasedTrainer'
iter_based_trainer_config.name = 'custom_iter_based_trainer'
iter_based_trainer_config.iters = 53

In [18]:
# Build the iteration-based trainer in a throwaway working directory and run it.
with tempfile.TemporaryDirectory() as work_dirs_root:
    runtime_config = todd.Config(work_dirs_root=work_dirs_root, model=Model())
    runner: CustomIterBasedTrainer = todd.RunnerRegistry.build(
        iter_based_trainer_config,
        runtime_config,
    )
    runner.run()

[2m[2023-07-05 20:41:38,258 89557:140704704583232][runners.py:59 todd.CustomIterBasedTrainer.custom_iter_based_trainer __init__] DEBUG: Runner custom_iter_based_trainer initialized by bytedance@C02G870SMD6R[m
[2023-07-05 20:41:38,263 89557:140704704583232][log.py:42 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [5/53] loss=67.000 weight=0.000 batch={'x': tensor([33, 34]), 'y': tensor([66, 68])}
[2023-07-05 20:41:38,265 89557:140704704583232][log.py:42 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [10/53] loss=85.000 weight=0.000 batch={'x': tensor([19, 66]), 'y': tensor([ 38, 132])}
[2023-07-05 20:41:38,267 89557:140704704583232][log.py:42 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [15/53] loss=26.000 weight=0.000 batch={'x': tensor([ 2, 24]), 'y': tensor([ 4, 48])}
[2023-07-05 20:41:38,269 89557:140704704583232][log.py:42 todd.CustomIterBasedTrainer.custom_iter_based_trainer a

Trainers increment `todd.Store.ITER` to keep track of the training progress.
If multiple trainers are to be run, `todd.Store.ITER` must be manually reset to zero.

### Epoch Based

In [19]:
# Specialize the shared trainer config for epoch-based training:
# the trainer is configured to run 3 epochs.
epoch_based_trainer_config = trainer_config.copy()
epoch_based_trainer_config.type = 'CustomEpochBasedTrainer'
epoch_based_trainer_config.name = 'custom_epoch_based_trainer'
epoch_based_trainer_config.epochs = 3

In [20]:
# Build the epoch-based trainer in a throwaway working directory and run it.
with tempfile.TemporaryDirectory() as work_dirs_root:
    runtime_config = todd.Config(work_dirs_root=work_dirs_root, model=Model())
    runner: CustomEpochBasedTrainer = todd.RunnerRegistry.build(
        epoch_based_trainer_config,
        runtime_config,
    )
    runner.run()

[2m[2023-07-05 20:41:38,296 89557:140704704583232][runners.py:59 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer __init__] DEBUG: Runner custom_epoch_based_trainer initialized by bytedance@C02G870SMD6R[m
[2023-07-05 20:41:38,297 89557:140704704583232][log.py:50 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2023-07-05 20:41:38,299 89557:140704704583232][log.py:42 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [5/102] loss=101.000 weight=0.000 batch={'x': tensor([62, 39]), 'y': tensor([124,  78])}
[2023-07-05 20:41:38,301 89557:140704704583232][log.py:42 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [10/102] loss=84.000 weight=0.000 batch={'x': tensor([58, 26]), 'y': tensor([116,  52])}
[2023-07-05 20:41:38,303 89557:140704704583232][log.py:42 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [15/102] loss=83.000 weight=0.000 b

## Callbacks

### Optimize

In [21]:
# Wrap the existing log callback together with an optimize callback in a
# composed callback, with the optimize callback listed first.
optimize_callback_config = iter_based_trainer_config.copy()
log_callback = optimize_callback_config.callbacks
optimize_callback = todd.Config(type='OptimizeCallback')
composed_callback = todd.Config(
    type='ComposedCallback',
    callbacks=[optimize_callback, log_callback],
)
optimize_callback_config.callbacks = composed_callback

In [22]:
# Run the iteration-based trainer with the optimize callback enabled.
with tempfile.TemporaryDirectory() as work_dirs_root:
    runtime_config = todd.Config(work_dirs_root=work_dirs_root, model=Model())
    runner: CustomIterBasedTrainer = todd.RunnerRegistry.build(
        optimize_callback_config,
        runtime_config,
    )
    runner.run()

[2m[2023-07-05 20:41:38,356 89557:140704704583232][runners.py:59 todd.CustomIterBasedTrainer.custom_iter_based_trainer __init__] DEBUG: Runner custom_iter_based_trainer initialized by bytedance@C02G870SMD6R[m
[2023-07-05 20:41:38,360 89557:140704704583232][log.py:42 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [5/53] loss=5.140 weight=0.715 batch={'x': tensor([1, 7]), 'y': tensor([ 2, 14])}
[2023-07-05 20:41:38,363 89557:140704704583232][log.py:42 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [10/53] loss=35.213 weight=1.217 batch={'x': tensor([30, 60]), 'y': tensor([ 60, 120])}
[2023-07-05 20:41:38,366 89557:140704704583232][log.py:42 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [15/53] loss=4.356 weight=2.102 batch={'x': tensor([44, 41]), 'y': tensor([88, 82])}
[2023-07-05 20:41:38,369 89557:140704704583232][log.py:42 todd.CustomIterBasedTrainer.custom_iter_based_trainer after

### Learning Rate Schedule

In [23]:
# Add a learning rate schedule callback between the optimize and log callbacks,
# and configure a `LinearLR` scheduler with `total_iters=10`.
lr_schedule_callback = todd.Config(type='LrScheduleCallback')
lr_schedule_callback_config = optimize_callback_config.copy()
lr_schedule_callback_config.lr_scheduler = todd.Config(
    type='LinearLR',
    total_iters=10,
)
# Assign a fresh composed callback instead of editing the copied one in place:
# if `copy()` is shallow, the in-place edit would also change
# `optimize_callback_config.callbacks`, so re-running the earlier cell would
# behave differently.
lr_schedule_callback_config.callbacks = todd.Config(
    type='ComposedCallback',
    callbacks=[optimize_callback, lr_schedule_callback, log_callback],
)

In [24]:
# Run the iteration-based trainer with the learning rate schedule enabled.
with tempfile.TemporaryDirectory() as work_dirs_root:
    runtime_config = todd.Config(work_dirs_root=work_dirs_root, model=Model())
    runner: CustomIterBasedTrainer = todd.RunnerRegistry.build(
        lr_schedule_callback_config,
        runtime_config,
    )
    runner.run()

[2m[2023-07-05 20:41:38,402 89557:140704704583232][runners.py:59 todd.CustomIterBasedTrainer.custom_iter_based_trainer __init__] DEBUG: Runner custom_iter_based_trainer initialized by bytedance@C02G870SMD6R[m
[2023-07-05 20:41:38,406 89557:140704704583232][log.py:42 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [5/53] loss=53.416 weight=0.405 batch={'x': tensor([32, 35]), 'y': tensor([64, 70])} lr=['3.333e-03']
[2023-07-05 20:41:38,410 89557:140704704583232][log.py:42 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [10/53] loss=41.335 weight=0.992 batch={'x': tensor([25, 57]), 'y': tensor([ 50, 114])} lr=['5.000e-03']
[2023-07-05 20:41:38,413 89557:140704704583232][log.py:42 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [15/53] loss=3.472 weight=1.893 batch={'x': tensor([ 7, 58]), 'y': tensor([ 14, 116])} lr=['5.000e-03']
[2023-07-05 20:41:38,416 89557:140704704583232][log.py:42 tod

In [25]:
# Same callback stack as the iteration-based example, applied to the
# epoch-based trainer. The schedule callback is copied so that it can be
# changed later without touching `lr_schedule_callback`.
lr_schedule_by_epoch_callback = lr_schedule_callback.copy()
lr_schedule_by_epoch_callback_config = epoch_based_trainer_config.copy()
lr_schedule_by_epoch_callback_config.lr_scheduler = (
    lr_schedule_callback_config.lr_scheduler
)
lr_schedule_by_epoch_callback_config.callbacks = {
    'type': 'ComposedCallback',
    'callbacks': [
        optimize_callback,
        lr_schedule_by_epoch_callback,
        log_callback,
    ],
}

In [26]:
# Run the epoch-based trainer with the learning rate schedule enabled.
with tempfile.TemporaryDirectory() as work_dirs_root:
    runtime_config = todd.Config(work_dirs_root=work_dirs_root, model=Model())
    runner: CustomEpochBasedTrainer = todd.RunnerRegistry.build(
        lr_schedule_by_epoch_callback_config,
        runtime_config,
    )
    runner.run()

[2m[2023-07-05 20:41:38,450 89557:140704704583232][runners.py:59 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer __init__] DEBUG: Runner custom_epoch_based_trainer initialized by bytedance@C02G870SMD6R[m
[2023-07-05 20:41:38,452 89557:140704704583232][log.py:50 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2023-07-05 20:41:38,456 89557:140704704583232][log.py:42 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [5/102] loss=84.326 weight=0.167 batch={'x': tensor([35, 57]), 'y': tensor([ 70, 114])} lr=['3.333e-03']
[2023-07-05 20:41:38,459 89557:140704704583232][log.py:42 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [10/102] loss=34.273 weight=0.818 batch={'x': tensor([43, 15]), 'y': tensor([86, 30])} lr=['5.000e-03']
[2023-07-05 20:41:38,462 89557:140704704583232][log.py:42 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [15/

In [27]:
# Set `by_epoch` on the learning rate schedule callback, which sits at
# index 1 of the composed callback list.
lr_schedule_by_epoch_callback_config.callbacks.callbacks[1].by_epoch = True

In [28]:
# Re-run the epoch-based trainer now that `by_epoch` is set on the schedule
# callback.
with tempfile.TemporaryDirectory() as work_dirs_root:
    runtime_config = todd.Config(work_dirs_root=work_dirs_root, model=Model())
    runner: CustomEpochBasedTrainer = todd.RunnerRegistry.build(
        lr_schedule_by_epoch_callback_config,
        runtime_config,
    )
    runner.run()

[2m[2023-07-05 20:41:38,534 89557:140704704583232][runners.py:59 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer __init__] DEBUG: Runner custom_epoch_based_trainer initialized by bytedance@C02G870SMD6R[m
[2023-07-05 20:41:38,536 89557:140704704583232][log.py:50 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2023-07-05 20:41:38,540 89557:140704704583232][log.py:42 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [5/102] loss=67.093 weight=0.187 batch={'x': tensor([12, 62]), 'y': tensor([ 24, 124])} lr=['1.667e-03']
[2023-07-05 20:41:38,543 89557:140704704583232][log.py:42 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [10/102] loss=55.777 weight=0.493 batch={'x': tensor([59, 15]), 'y': tensor([118,  30])} lr=['1.667e-03']
[2023-07-05 20:41:38,546 89557:140704704583232][log.py:42 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [1

### Checkpoint

In [29]:
# Compose a checkpoint callback (configured with `every_n_iters=10`) with the
# log callback for the iteration-based trainer.
checkpoint_callback = todd.Config(type='CheckpointCallback', every_n_iters=10)
checkpoint_callback_config = iter_based_trainer_config.copy()
checkpoint_callback_config.callbacks = {
    'type': 'ComposedCallback',
    'callbacks': [checkpoint_callback, log_callback],
}

In [30]:
# Run the iteration-based trainer with checkpointing, then inspect the working
# directory, the log file, and one of the saved checkpoints.
with tempfile.TemporaryDirectory() as work_dirs_root:
    runner: CustomIterBasedTrainer = todd.RunnerRegistry.build(
        checkpoint_callback_config, 
        todd.Config(work_dirs_root=work_dirs_root, model=Model()),
    )
    runner.run()

    !echo
    !tree {work_dirs_root}
    !echo
    # Print the first `*.log` file found under the runner's directory.
    !cat $(find {work_dirs_root}/custom_iter_based_trainer -name '*.log' -print -quit)
    !echo

    # Load the checkpoint saved at iteration 50, mapping tensors to the CPU,
    # and show its top-level keys and its `meta` entry.
    checkpoint_path = os.path.join(work_dirs_root, 'custom_iter_based_trainer', 'iter_50.pth')
    checkpoint: dict[str, Any] = torch.load(checkpoint_path, 'cpu')
    print(checkpoint.keys())
    print(checkpoint['meta'])

[2m[2023-07-05 20:41:38,624 89557:140704704583232][runners.py:59 todd.CustomIterBasedTrainer.custom_iter_based_trainer __init__] DEBUG: Runner custom_iter_based_trainer initialized by bytedance@C02G870SMD6R[m
[2023-07-05 20:41:38,627 89557:140704704583232][log.py:42 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [5/53] loss=85.000 weight=0.000 batch={'x': tensor([19, 66]), 'y': tensor([ 38, 132])}
[2023-07-05 20:41:38,629 89557:140704704583232][checkpoint.py:29 todd.CustomIterBasedTrainer.custom_iter_based_trainer _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp1ubutelf/custom_iter_based_trainer/iter_10.pth
[2023-07-05 20:41:38,631 89557:140704704583232][log.py:42 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [10/53] loss=112.000 weight=0.000 batch={'x': tensor([67, 45]), 'y': tensor([134,  90])}
[2023-07-05 20:41:38,633 89557:140704704583232][log.py:42 todd.CustomIterBasedTrain


[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp1ubutelf[0m
└── [1;36mcustom_iter_based_trainer[0m
    ├── 2023-07-05T20-41-38_626008-08-00.log
    ├── iter_10.pth
    ├── iter_20.pth
    ├── iter_30.pth
    ├── iter_40.pth
    ├── iter_50.pth
    └── latest.pth

2 directories, 7 files

[2023-07-05 20:41:38,627 89557:140704704583232][log.py:42 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [5/53] loss=85.000 weight=0.000 batch={'x': tensor([19, 66]), 'y': tensor([ 38, 132])}
[2023-07-05 20:41:38,629 89557:140704704583232][checkpoint.py:29 todd.CustomIterBasedTrainer.custom_iter_based_trainer _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp1ubutelf/custom_iter_based_trainer/iter_10.pth
[2023-07-05 20:41:38,631 89557:140704704583232][log.py:42 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [10/53] loss=112.000 weight=0.000 batch={'x': tensor([67, 45]), 'y': tensor([134

In [31]:
# Reuse the checkpoint and log callbacks with the epoch-based trainer.
checkpoint_by_epoch_callback_config = epoch_based_trainer_config.copy()
checkpoint_by_epoch_callback_config.callbacks = {
    'type': 'ComposedCallback',
    'callbacks': [checkpoint_callback, log_callback],
}

In [32]:
# Run the epoch-based trainer with checkpointing, then inspect the working
# directory and one of the saved checkpoints.
with tempfile.TemporaryDirectory() as work_dirs_root:
    runner: CustomEpochBasedTrainer = todd.RunnerRegistry.build(
        checkpoint_by_epoch_callback_config, 
        todd.Config(work_dirs_root=work_dirs_root, model=Model()),
    )
    runner.run()

    !echo
    !tree {work_dirs_root}
    !echo

    # Load the checkpoint saved at iteration 90, mapping tensors to the CPU,
    # and show its top-level keys and its `meta` entry.
    checkpoint_path = os.path.join(work_dirs_root, 'custom_epoch_based_trainer', 'iter_90.pth')
    checkpoint: dict[str, Any] = torch.load(checkpoint_path, 'cpu')
    print(checkpoint.keys())
    print(checkpoint['meta'])

[2m[2023-07-05 20:41:39,353 89557:140704704583232][runners.py:59 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer __init__] DEBUG: Runner custom_epoch_based_trainer initialized by bytedance@C02G870SMD6R[m
[2023-07-05 20:41:39,356 89557:140704704583232][log.py:50 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2023-07-05 20:41:39,364 89557:140704704583232][log.py:42 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [5/102] loss=63.000 weight=0.000 batch={'x': tensor([48, 15]), 'y': tensor([96, 30])}
[2023-07-05 20:41:39,367 89557:140704704583232][checkpoint.py:29 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpoe6pvopi/custom_epoch_based_trainer/iter_10.pth
[2023-07-05 20:41:39,369 89557:140704704583232][log.py:42 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [10/102] loss


[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpoe6pvopi[0m
└── [1;36mcustom_epoch_based_trainer[0m
    ├── 2023-07-05T20-41-39_356076-08-00.log
    ├── iter_10.pth
    ├── iter_100.pth
    ├── iter_20.pth
    ├── iter_30.pth
    ├── iter_40.pth
    ├── iter_50.pth
    ├── iter_60.pth
    ├── iter_70.pth
    ├── iter_80.pth
    ├── iter_90.pth
    └── latest.pth

2 directories, 12 files

dict_keys(['meta', 'model', 'optimizer'])
{'iter_': 90, 'epoch': 3}


In [34]:
# Reconfigure the checkpoint callback (index 0 of the composed callback list)
# to save once per epoch.
# NOTE(review): `every_n_iters=0` presumably disables iteration-based saving;
# confirm against the CheckpointCallback implementation.
checkpoint_by_epoch_callback_config.callbacks.callbacks[0].update(
    every_n_iters=0,
    every_n_epochs=1,
)

In [35]:
# Re-run the epoch-based trainer with the updated checkpoint settings and list
# the files in the working directory.
with tempfile.TemporaryDirectory() as work_dirs_root:
    runner: CustomEpochBasedTrainer = todd.RunnerRegistry.build(
        checkpoint_by_epoch_callback_config, 
        todd.Config(work_dirs_root=work_dirs_root, model=Model()),
    )
    runner.run()

    !echo
    !tree {work_dirs_root}
    !echo

[2m[2023-07-05 21:24:46,684 89557:140704704583232][runners.py:59 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer __init__] DEBUG: Runner custom_epoch_based_trainer initialized by bytedance@C02G870SMD6R[m
[2023-07-05 21:24:46,686 89557:140704704583232][log.py:50 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2023-07-05 21:24:46,689 89557:140704704583232][log.py:42 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [5/102] loss=88.000 weight=0.000 batch={'x': tensor([39, 49]), 'y': tensor([78, 98])}
[2023-07-05 21:24:46,691 89557:140704704583232][log.py:42 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [10/102] loss=33.000 weight=0.000 batch={'x': tensor([28,  5]), 'y': tensor([56, 10])}
[2023-07-05 21:24:46,695 89557:140704704583232][log.py:42 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [15/102] loss=69.000 weight=0.000 batch=


[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpqaqm9v2n[0m
└── [1;36mcustom_epoch_based_trainer[0m
    ├── 2023-07-05T21-24-46_685671-08-00.log
    ├── epoch_1.pth
    ├── epoch_2.pth
    ├── epoch_3.pth
    └── latest.pth

2 directories, 5 files



## Dry Run

In [33]:
# Enable dry-run mode by setting an attribute on `todd.Store`.
# NOTE(review): this presumably stays in effect for every runner built
# afterwards in the same kernel - confirm, and reset it if later cells
# should run in full.
todd.Store.DRY_RUN = True

If `DRY_RUN` is enabled, the runner will stop upon the first log message.

## State Dicts