# Best Practices for Using Runners

In [1]:
# Remove any installed todd-ai, then install the package from the parent
# directory so the notebook runs against the local checkout.
%pip uninstall -y todd_ai
%pip install ..

Found existing installation: todd-ai 0.3.0
Uninstalling todd-ai-0.3.0:
  Successfully uninstalled todd-ai-0.3.0
Note: you may need to restart the kernel to use updated packages.
Processing /Users/bytedance/Developer/todd
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: todd-ai
  Building wheel for todd-ai (pyproject.toml) ... [?25ldone
[?25h  Created wheel for todd-ai: filename=todd_ai-0.3.0-py3-none-any.whl size=94840 sha256=2f00fdee81befbef715c24c668fa2c52ef5e61b4452e0df40ea980620c5dc838
  Stored in directory: /private/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/pip-ephem-wheel-cache-5etlvmme/wheels/15/ef/5a/9fc12e257ce5cef16b333a2ed6c992ff9cbcc9167f7199e6ac
Successfully built todd-ai
Installing collected packages: todd-ai
Successfully installed todd-ai-0.3.0

[1m[[0m

In [2]:
import os
import tempfile
from typing import Any, TypedDict, cast

import torch
import torch.nn.functional as F
import torch.utils.data

import todd

# Alias for the string-keyed dict that runners pass to `_run_iter` as `memo`.
Memo = dict[str, Any]

[2023-07-13 13:45:00,583 45313:140704293135936][patches.py:72 todd <module>] INFO: `ipdb` is installed. Using it for debugging.


## Preparation

### Models

In [3]:
@todd.ModelRegistry.register()
class RunnerModel(todd.Module):
    """Toy model that multiplies its input by one learnable scalar."""

    def __init__(self) -> None:
        super().__init__()
        # The scalar starts at zero.
        initial_weight = torch.tensor(0.0)
        self._weight = torch.nn.Parameter(initial_weight)

    @property
    def weight(self) -> torch.nn.Parameter:
        """The scalar parameter, exposed without a setter."""
        return self._weight

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` multiplied by the scalar weight."""
        scale = self._weight
        return x * scale

### Datasets

In [4]:
class Sample(TypedDict):
    """One dataset item: an integer input and its integer target."""

    # Input value.
    x: int
    # Target value.
    y: int


In [5]:
@todd.DatasetRegistry.register()
class RunnerDataset(torch.utils.data.Dataset[Sample]):
    """Toy dataset of the integers ``1..n``, each paired with its double.

    The generic parameter is ``Sample`` because that is what
    ``__getitem__`` returns.
    """

    def __init__(self, n: int) -> None:
        """Store the inputs ``1, 2, ..., n``.

        Args:
            n: number of samples; ``n <= 0`` gives an empty dataset.
        """
        self._data = list(range(1, n + 1))

    def __len__(self) -> int:
        """Return the number of samples."""
        return len(self._data)

    def __getitem__(self, index: int) -> Sample:
        """Return the sample at ``index``, where ``y`` is ``2 * x``."""
        x = self._data[index]
        return Sample(x=x, y=x * 2)

In [6]:
class Batch(TypedDict):
    """A batch as consumed by ``_run_iter``: tensors under ``x`` and ``y``."""

    # Inputs fed to the model.
    x: torch.Tensor
    # Targets the model output is compared against.
    y: torch.Tensor

### Runners

In [7]:
class RunnerMixin(todd.runners.BaseRunner):
    """Shared iteration step: run the model and record the L1 loss."""

    def _run_iter(self, batch: Batch, memo: Memo) -> None:
        """Compute the L1 loss for ``batch`` and store it in ``memo``.

        The loss tensor is stored under ``memo['loss']``. When ``memo``
        contains a ``'log'`` entry, a three-decimal string of the loss is
        also written to ``memo['log']['loss']``.
        """
        model = self._strategy.model
        prediction: torch.Tensor = model(batch['x'])
        loss = F.l1_loss(prediction, batch['y'])
        memo['loss'] = loss
        if 'log' not in memo:
            return
        memo['log']['loss'] = f'{loss.item():.3f}'

In [8]:
class TrainerMixin(RunnerMixin):
    """Iteration step for trainers: also logs the weight and the batch."""

    def _run_iter(self, batch: Batch, memo: Memo) -> None:
        """Run the base step, then extend the log entry if there is one.

        When ``memo`` has a ``'log'`` entry, the model weight (three
        decimals) and the string form of ``batch`` are added to it.
        """
        super()._run_iter(batch, memo)
        if 'log' not in memo:
            return
        log = memo['log']
        model = cast(RunnerModel, self._strategy.model)
        log['weight'] = f'{model.weight.item():.3f}'
        log['batch'] = str(batch)

In [9]:
@todd.RunnerRegistry.register()
class CustomValidator(RunnerMixin, todd.runners.Validator):
    """Validator whose iteration step comes from ``RunnerMixin``."""

In [10]:
@todd.RunnerRegistry.register()
class CustomIterBasedTrainer(TrainerMixin, todd.runners.IterBasedTrainer):
    """Iteration-based trainer whose step comes from ``TrainerMixin``."""

In [11]:
@todd.RunnerRegistry.register()
class CustomEpochBasedTrainer(TrainerMixin, todd.runners.EpochBasedTrainer):
    """Epoch-based trainer whose step comes from ``TrainerMixin``."""

## Validation

In [12]:
# Base config reused by the later demos: a validator over 20 samples with
# batch size 1, logging every 5 iterations.
validator_demo = todd.Config(
    type='CustomValidator',
    name='custom_validator',
    dataloader={
        'batch_size': 1,
        'dataset': {'type': 'RunnerDataset', 'n': 20},
    },
    strategy={'type': 'VanillaStrategy', 'model': {'type': 'RunnerModel'}},
    callbacks={'type': 'LogCallback', 'interval': 5},
)

In [13]:
with tempfile.TemporaryDirectory() as work_dirs:
    runner: CustomValidator = todd.RunnerRegistry.build(
        validator_demo, 
        todd.Config(work_dir=dict(root=work_dirs)),
    )
    runner.run()
    
    !echo
    !tree $work_dirs

[2m[2023-07-13 13:45:01,166 45313:140704293135936][log.py:37 todd.CustomValidator.custom_validator connect] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-13 13:45:01,169 45313:140704293135936][log.py:52 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [5/20] loss=10.000
[2023-07-13 13:45:01,171 45313:140704293135936][log.py:52 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [10/20] loss=20.000
[2023-07-13 13:45:01,172 45313:140704293135936][log.py:52 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [15/20] loss=30.000
[2023-07-13 13:45:01,174 45313:140704293135936][log.py:52 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [20/20] loss=40.000



[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpt_pf_hxe[0m
└── [1;36mcustom_validator[0m

2 directories, 0 files


## Train

In [14]:
# Shared trainer config derived from the validator one. The runner type is
# removed here because each trainer demo sets its own.
trainer_demo = validator_demo.copy()
trainer_demo.pop('type')
trainer_demo.dataloader = todd.Config(
    batch_size=2,
    shuffle=True,
    dataset={'type': 'RunnerDataset', 'n': 67},
)
trainer_demo.optimizer = todd.Config(type='SGD', lr=0.005)


### Iteration Based

In [15]:
# Specialise the shared trainer config into an iteration-based run.
iter_based_trainer_demo = trainer_demo.copy()
iter_based_trainer_demo.type = 'CustomIterBasedTrainer'
iter_based_trainer_demo.name = 'custom_iter_based_trainer'
# Total iteration count; the run's log lines read "Iter [k/53]".
iter_based_trainer_demo.iters = 53

In [16]:
with tempfile.TemporaryDirectory() as work_dirs:
    # Only the log callback is configured here; the logged weight stays at
    # 0.000 throughout this run.
    runner: CustomIterBasedTrainer = todd.RunnerRegistry.build(
        iter_based_trainer_demo,
        todd.Config(work_dir={'root': work_dirs}),
    )
    runner.run()

[2m[2023-07-13 13:45:01,478 45313:140704293135936][log.py:37 todd.CustomIterBasedTrainer.custom_iter_based_trainer connect] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-13 13:45:01,482 45313:140704293135936][log.py:52 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [5/53] loss=91.000 weight=0.000 batch={'x': tensor([31, 60]), 'y': tensor([ 62, 120])}
[2023-07-13 13:45:01,484 45313:140704293135936][log.py:52 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [10/53] loss=86.000 weight=0.000 batch={'x': tensor([65, 21]), 'y': tensor([130,  42])}
[2023-07-13 13:45:01,486 45313:140704293135936][log.py:52 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [15/53] loss=17.000 weight=0.000 batch={'x': tensor([ 2, 15]), 'y': tensor([ 4, 30])}
[2023-07-13 13:45:01,488 45313:140704293135936][log.py:52 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [20

### Epoch Based

In [17]:
# Specialise the shared trainer config into an epoch-based run.
epoch_based_trainer_demo = trainer_demo.copy()
epoch_based_trainer_demo.type = 'CustomEpochBasedTrainer'
epoch_based_trainer_demo.name = 'custom_epoch_based_trainer'
# Number of epochs; the run's log lines read "Epoch [k/3]".
epoch_based_trainer_demo.epochs = 3

In [18]:
with tempfile.TemporaryDirectory() as work_dirs:
    # Three epochs over the 67-sample dataset; the log counts 102 iterations
    # in total.
    runner: CustomEpochBasedTrainer = todd.RunnerRegistry.build(
        epoch_based_trainer_demo,
        todd.Config(work_dir={'root': work_dirs}),
    )
    runner.run()

[2m[2023-07-13 13:45:01,514 45313:140704293135936][log.py:37 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer connect] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-13 13:45:01,515 45313:140704293135936][log.py:61 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2023-07-13 13:45:01,518 45313:140704293135936][log.py:52 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [5/102] loss=125.000 weight=0.000 batch={'x': tensor([61, 64]), 'y': tensor([122, 128])}
[2023-07-13 13:45:01,519 45313:140704293135936][log.py:52 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [10/102] loss=80.000 weight=0.000 batch={'x': tensor([50, 30]), 'y': tensor([100,  60])}
[2023-07-13 13:45:01,521 45313:140704293135936][log.py:52 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [15/102] loss=53.000 weight=0.000 batch={'x': tensor([49,  4]), 'y'

## Callbacks

### Log

In [19]:
# Build a validator config whose log callback also writes to a file.
log_callback_demo = validator_demo.copy()
# Copy the nested callback config before changing it. If `copy()` on the demo
# config is shallow (as `dict.copy` is), `log_callback_demo.callbacks` is the
# same object as `validator_demo.callbacks`, and setting `with_file_handler`
# on it would also change `validator_demo`.
log_callback = log_callback_demo.callbacks.copy()
log_callback.with_file_handler = True
# Later demos reuse `log_callback` inside their own callback lists.
log_callback_demo.callbacks = [log_callback]

In [20]:
with tempfile.TemporaryDirectory() as work_dirs:
    runner: CustomValidator = todd.RunnerRegistry.build(
        log_callback_demo, 
        todd.Config(work_dir=dict(root=work_dirs)),
    )
    runner.run()

    !echo
    !tree {work_dirs}
    !echo
    !cat {work_dirs}/custom_validator/*.log

[2m[2023-07-13 13:45:01,577 45313:140704293135936][log.py:37 todd.CustomValidator.custom_validator connect] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-13 13:45:01,580 45313:140704293135936][log.py:52 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [5/20] loss=10.000
[2023-07-13 13:45:01,582 45313:140704293135936][log.py:52 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [10/20] loss=20.000
[2023-07-13 13:45:01,584 45313:140704293135936][log.py:52 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [15/20] loss=30.000
[2023-07-13 13:45:01,587 45313:140704293135936][log.py:52 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [20/20] loss=40.000



[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpp6ovipjj[0m
└── [1;36mcustom_validator[0m
    └── 2023-07-13T13-45-01_576398-08-00.log

2 directories, 1 file

[2023-07-13 13:45:01,577 45313:140704293135936][log.py:37 todd.CustomValidator.custom_validator connect] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R
[2023-07-13 13:45:01,580 45313:140704293135936][log.py:52 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [5/20] loss=10.000
[2023-07-13 13:45:01,582 45313:140704293135936][log.py:52 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [10/20] loss=20.000
[2023-07-13 13:45:01,584 45313:140704293135936][log.py:52 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [15/20] loss=30.000
[2023-07-13 13:45:01,587 45313:140704293135936][log.py:52 todd.CustomValidator.custom_validator after_run_iter] INFO: Iter [20/20] loss=40.000


### Optimize

In [21]:
# Start from the iteration-based trainer config and replace its callbacks
# with an OptimizeCallback followed by the file-logging `log_callback`.
optimize_callback_demo = iter_based_trainer_demo.copy()
optimize_callback = todd.Config(type='OptimizeCallback')
optimize_callback_demo.callbacks = [optimize_callback, log_callback]

In [22]:
with tempfile.TemporaryDirectory() as work_dirs:
    # With the OptimizeCallback in place the logged weight moves away from
    # 0.000 as the run proceeds.
    runner: CustomIterBasedTrainer = todd.RunnerRegistry.build(
        optimize_callback_demo,
        todd.Config(work_dir={'root': work_dirs}),
    )
    runner.run()

[2m[2023-07-13 13:45:02,195 45313:140704293135936][log.py:37 todd.CustomIterBasedTrainer.custom_iter_based_trainer connect] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-13 13:45:02,199 45313:140704293135936][log.py:52 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [5/53] loss=33.994 weight=0.612 batch={'x': tensor([37, 12]), 'y': tensor([74, 24])}
[2023-07-13 13:45:02,205 45313:140704293135936][log.py:52 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [10/53] loss=26.705 weight=1.510 batch={'x': tensor([56, 53]), 'y': tensor([112, 106])}
[2023-07-13 13:45:02,208 45313:140704293135936][log.py:52 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [15/53] loss=2.827 weight=1.805 batch={'x': tensor([ 8, 21]), 'y': tensor([16, 42])}
[2023-07-13 13:45:02,212 45313:140704293135936][log.py:52 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [20/53

### Learning Rate Schedule

In [23]:
# Iteration-based trainer with a LinearLR schedule (`total_iters=10`)
# inserted between the optimize and log callbacks.
lr_schedule_callback_demo = iter_based_trainer_demo.copy()
lr_schedule_callback = todd.Config(
    type='LrScheduleCallback',
    lr_scheduler={'type': 'LinearLR', 'total_iters': 10},
)
lr_schedule_callback_demo.callbacks = [
    optimize_callback,
    lr_schedule_callback,
    log_callback,
]


In [24]:
with tempfile.TemporaryDirectory() as work_dirs:
    # The log gains an `lr` field: 3.333e-03 at iteration 5 and 5.000e-03
    # from iteration 10 on.
    runner: CustomIterBasedTrainer = todd.RunnerRegistry.build(
        lr_schedule_callback_demo,
        todd.Config(work_dir={'root': work_dirs}),
    )
    runner.run()

[2m[2023-07-13 13:45:02,266 45313:140704293135936][log.py:37 todd.CustomIterBasedTrainer.custom_iter_based_trainer connect] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-13 13:45:02,270 45313:140704293135936][log.py:52 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [5/53] loss=108.950 weight=0.184 batch={'x': tensor([53, 67]), 'y': tensor([106, 134])} lr=['3.333e-03']
[2023-07-13 13:45:02,273 45313:140704293135936][log.py:52 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [10/53] loss=43.540 weight=0.951 batch={'x': tensor([46, 37]), 'y': tensor([92, 74])} lr=['5.000e-03']
[2023-07-13 13:45:02,277 45313:140704293135936][log.py:52 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [15/53] loss=0.226 weight=1.850 batch={'x': tensor([1, 2]), 'y': tensor([2, 4])} lr=['5.000e-03']
[2023-07-13 13:45:02,280 45313:140704293135936][log.py:52 todd.CustomIterBasedTrainer.custom_iter

In [25]:
# Same schedule config as the iteration-based demo, but with `by_epoch` set
# and attached to the epoch-based trainer.
lr_schedule_by_epoch_callback_demo = epoch_based_trainer_demo.copy()
lr_schedule_by_epoch_callback = lr_schedule_callback.copy()
lr_schedule_by_epoch_callback.by_epoch = True
lr_schedule_by_epoch_callback_demo.callbacks = [
    optimize_callback,
    lr_schedule_by_epoch_callback,
    log_callback,
]


In [26]:
with tempfile.TemporaryDirectory() as work_dirs:
    # In the logged part of epoch 1 the lr stays at 1.667e-03 from one
    # iteration to the next.
    runner: CustomEpochBasedTrainer = todd.RunnerRegistry.build(
        lr_schedule_by_epoch_callback_demo,
        todd.Config(work_dir={'root': work_dirs}),
    )
    runner.run()

[2m[2023-07-13 13:45:02,315 45313:140704293135936][log.py:37 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer connect] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-13 13:45:02,316 45313:140704293135936][log.py:61 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2023-07-13 13:45:02,321 45313:140704293135936][log.py:52 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [5/102] loss=42.940 weight=0.211 batch={'x': tensor([38, 10]), 'y': tensor([76, 20])} lr=['1.667e-03']
[2023-07-13 13:45:02,324 45313:140704293135936][log.py:52 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [10/102] loss=54.719 weight=0.541 batch={'x': tensor([36, 39]), 'y': tensor([72, 78])} lr=['1.667e-03']
[2023-07-13 13:45:02,327 45313:140704293135936][log.py:52 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [15/102] loss=53.474 weight=0.798 batc

In [27]:
# Iteration-based schedule demo with an `lr_scaler` entry added to the
# schedule callback config.
lr_schedule_with_scaler_callback_demo = iter_based_trainer_demo.copy()
lr_schedule_with_scaler_callback = lr_schedule_callback.copy()
lr_schedule_with_scaler_callback.lr_scaler = {'base_batch_size': 1}
lr_schedule_with_scaler_callback_demo.callbacks = [
    optimize_callback,
    lr_schedule_with_scaler_callback,
    log_callback,
]

In [28]:
with tempfile.TemporaryDirectory() as work_dirs:
    # The run logs `base_batch_size=1 batch_size=2 lr_scaler=2.0`, and the
    # logged lr values are twice those of the unscaled schedule.
    runner: CustomIterBasedTrainer = todd.RunnerRegistry.build(
        lr_schedule_with_scaler_callback_demo,
        todd.Config(work_dir={'root': work_dirs}),
    )
    runner.run()

[2023-07-13 13:45:02,427 45313:140704293135936][lr_schedule.py:54 todd.CustomIterBasedTrainer.custom_iter_based_trainer _scale_lr] INFO: base_batch_size=1 batch_size=2 lr_scaler=2.0
[2m[2023-07-13 13:45:02,428 45313:140704293135936][log.py:37 todd.CustomIterBasedTrainer.custom_iter_based_trainer connect] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-13 13:45:02,432 45313:140704293135936][log.py:52 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [5/53] loss=30.421 weight=0.677 batch={'x': tensor([16, 30]), 'y': tensor([32, 60])} lr=['6.667e-03']
[2023-07-13 13:45:02,436 45313:140704293135936][log.py:52 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [10/53] loss=3.282 weight=2.068 batch={'x': tensor([49, 48]), 'y': tensor([98, 96])} lr=['1.000e-02']
[2023-07-13 13:45:02,439 45313:140704293135936][log.py:52 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [15/53] loss=1.32

### Checkpoint

In [29]:
# Iteration-based trainer with a CheckpointCallback (interval=10) ahead of
# the file-logging `log_callback`.
checkpoint_callback_demo = iter_based_trainer_demo.copy()
checkpoint_callback = todd.Config(type='CheckpointCallback', interval=10)
checkpoint_callback_demo.callbacks = [checkpoint_callback, log_callback]

In [30]:
with tempfile.TemporaryDirectory() as work_dirs:
    runner: CustomIterBasedTrainer = todd.RunnerRegistry.build(
        checkpoint_callback_demo, 
        todd.Config(work_dir=dict(root=work_dirs)),
    )
    runner.run()

    !echo
    !tree {work_dirs}
    !echo

    checkpoint_path = os.path.join(work_dirs, 'custom_iter_based_trainer', 'iter_50.pth')
    checkpoint: dict[str, Any] = torch.load(checkpoint_path, 'cpu')
    print(checkpoint.keys())
    print(checkpoint['meta'])

[2m[2023-07-13 13:45:02,481 45313:140704293135936][log.py:37 todd.CustomIterBasedTrainer.custom_iter_based_trainer connect] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-13 13:45:02,484 45313:140704293135936][log.py:52 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [5/53] loss=97.000 weight=0.000 batch={'x': tensor([45, 52]), 'y': tensor([ 90, 104])}
[2023-07-13 13:45:02,486 45313:140704293135936][checkpoint.py:34 todd.CustomIterBasedTrainer.custom_iter_based_trainer _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpew7rag0_/custom_iter_based_trainer/iter_10.pth
[2023-07-13 13:45:02,488 45313:140704293135936][log.py:52 todd.CustomIterBasedTrainer.custom_iter_based_trainer after_run_iter] INFO: Iter [10/53] loss=98.000 weight=0.000 batch={'x': tensor([56, 42]), 'y': tensor([112,  84])}
[2023-07-13 13:45:02,491 45313:140704293135936][log.py:52 todd.CustomIterBasedTrainer.custom_iter_based_trainer aft


[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpew7rag0_[0m
└── [1;36mcustom_iter_based_trainer[0m
    ├── 2023-07-13T13-45-02_480950-08-00.log
    ├── iter_10.pth
    ├── iter_20.pth
    ├── iter_30.pth
    ├── iter_40.pth
    ├── iter_50.pth
    └── latest.pth

2 directories, 7 files

dict_keys(['meta', 'strategy', 'optimizer'])
{'iter_': 50}


In [31]:
# Epoch-based variant of the checkpoint demo: reuse the checkpoint callback
# config but switch it to `interval=1` with `by_epoch=True`.
checkpoint_by_epoch_callback_demo = epoch_based_trainer_demo.copy()
checkpoint_by_epoch_callback = checkpoint_callback.copy()
checkpoint_by_epoch_callback.update(interval=1, by_epoch=True)
checkpoint_by_epoch_callback_demo.callbacks = [
    checkpoint_by_epoch_callback,
    log_callback,
]

In [32]:
with tempfile.TemporaryDirectory() as work_dirs:
    runner: CustomEpochBasedTrainer = todd.RunnerRegistry.build(
        checkpoint_by_epoch_callback_demo, 
        todd.Config(work_dir=dict(root=work_dirs)),
    )
    runner.run()

    !echo
    !tree {work_dirs}
    !echo

    checkpoint_path = os.path.join(work_dirs, 'custom_epoch_based_trainer', 'epoch_2.pth')
    checkpoint: dict[str, Any] = torch.load(checkpoint_path, 'cpu')
    print(checkpoint.keys())
    print(checkpoint['meta'])

[2m[2023-07-13 13:45:02,965 45313:140704293135936][log.py:37 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer connect] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2023-07-13 13:45:02,966 45313:140704293135936][log.py:61 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2023-07-13 13:45:02,969 45313:140704293135936][log.py:52 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [5/102] loss=90.000 weight=0.000 batch={'x': tensor([36, 54]), 'y': tensor([ 72, 108])}
[2023-07-13 13:45:02,972 45313:140704293135936][log.py:52 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [10/102] loss=74.000 weight=0.000 batch={'x': tensor([67,  7]), 'y': tensor([134,  14])}
[2023-07-13 13:45:02,974 45313:140704293135936][log.py:52 todd.CustomEpochBasedTrainer.custom_epoch_based_trainer after_run_iter] INFO: Iter [15/102] loss=36.000 weight=0.000 batch={'x': tensor([12, 24]), 'y':


[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpwjl6ntis[0m
└── [1;36mcustom_epoch_based_trainer[0m
    ├── 2023-07-13T13-45-02_965224-08-00.log
    ├── epoch_1.pth
    ├── epoch_2.pth
    ├── epoch_3.pth
    └── latest.pth

2 directories, 5 files

dict_keys(['meta', 'strategy', 'optimizer'])
{'iter_': 68, 'epoch': 2}


### Monitor

In [33]:
class CustomError(RuntimeError):
    """Error raised on purpose by the faulty runners in this section."""

In [34]:
class FaultyRunnerMixin(todd.runners.BaseRunner):
    """Runner mixin whose iteration step always fails."""

    def _run_iter(self, *args, **kwargs) -> None:
        """Ignore all arguments and raise ``CustomError``."""
        message = 'faulty runner'
        raise CustomError(message)

In [35]:
@todd.RunnerRegistry.register()
class FaultyValidator(FaultyRunnerMixin, todd.runners.Validator):
    """Validator whose iteration step always raises ``CustomError``."""

In [36]:
@todd.RunnerRegistry.register()
class FaultyIterBasedTrainer(FaultyRunnerMixin, todd.runners.IterBasedTrainer):
    """Iteration-based trainer whose step always raises ``CustomError``."""

In [37]:
@todd.RunnerRegistry.register()
class FaultyEpochBasedTrainer(FaultyRunnerMixin, todd.runners.EpochBasedTrainer):
    """Epoch-based trainer whose step always raises ``CustomError``."""

In [38]:
# Reuse the validator config but build a FaultyValidator, whose `_run_iter`
# always raises. The runner name is left as 'custom_validator'.
monitor_callback_demo = validator_demo.copy()
monitor_callback_demo.type = 'FaultyValidator'
monitor_callback = todd.Config(type='MonitorCallback')
monitor_callback_demo.callbacks = [monitor_callback, log_callback]

In [39]:
with tempfile.TemporaryDirectory() as work_dirs:
    runner: CustomValidator = todd.RunnerRegistry.build(
        monitor_callback_demo, 
        todd.Config(work_dir=dict(root=work_dirs)),
    )
    try:
        runner.run()
    except CustomError as e:
        pass

    !echo
    !cat {work_dirs}/custom_validator/*.log

[2m[2023-07-13 13:45:03,460 45313:140704293135936][log.py:37 todd.FaultyValidator.custom_validator connect] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[1;31m[2023-07-13 13:45:03,462 45313:140704293135936][monitor.py:28 todd.FaultyValidator.custom_validator __exit__] ERROR: Unable to run iter_=1
batch={'x': tensor([1]), 'y': tensor([2])}
memo={'dataloader': <torch.utils.data.dataloader.DataLoader object at 0x14e4cda50>}
Traceback (most recent call last):
  File "/Users/bytedance/.local/share/virtualenvs/todd-ARrcnwyq/lib/python3.11/site-packages/todd/runners/runners.py", line 169, in _run
    self._run_iter(batch, memo)
  File "/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/ipykernel_45313/3253115116.py", line 4, in _run_iter
    raise CustomError('faulty runner')
CustomError: faulty runner[m



[2023-07-13 13:45:03,460 45313:140704293135936][log.py:37 todd.FaultyValidator.custom_validator connect] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R
[2023-07-13 13:45:03,462 45313:140704293135936][monitor.py:28 todd.FaultyValidator.custom_validator __exit__] ERROR: Unable to run iter_=1
batch={'x': tensor([1]), 'y': tensor([2])}
memo={'dataloader': <torch.utils.data.dataloader.DataLoader object at 0x14e4cda50>}
Traceback (most recent call last):
  File "/Users/bytedance/.local/share/virtualenvs/todd-ARrcnwyq/lib/python3.11/site-packages/todd/runners/runners.py", line 169, in _run
    self._run_iter(batch, memo)
  File "/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/ipykernel_45313/3253115116.py", line 4, in _run_iter
    raise CustomError('faulty runner')
CustomError: faulty runner


## Dry Run

In [40]:
# Set todd's global DRY_RUN flag.
# NOTE(review): nothing in this notebook shows its effect yet; confirm what
# DRY_RUN changes before relying on it.
todd.Store.DRY_RUN = True

## State Dicts

## Evaluation

## Strategies