# Best Practices for Using Runners

In [1]:
%pip uninstall -y todd_ai
%pip install --no-build-isolation --extra-index-url https://pypi.org/simple .. > /dev/null


Found existing installation: todd-ai 0.4.0
Uninstalling todd-ai-0.4.0:
  Successfully uninstalled todd-ai-0.4.0
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pathlib
import tempfile
import time
from pprint import pprint
from typing import Any, NoReturn, TypedDict

import torch
import torch.nn.functional as F
import torch.utils.data
from torch import nn

import todd
from todd.runners import Memo

[2024-03-14 12:17:47,682 62058:140704275689088][patches.py:9 todd <module>] INFO: `ipdb` is installed. Using it for debugging.


## Preparation

### Models

In [3]:
@todd.ModelRegistry.register_()
class RunnerModel(nn.Module):
    """Toy model that multiplies its input by one learnable scalar.

    The scalar is initialised to ``0.0``. ``forward`` takes the runner, a
    batch and a memo, stores the L1 loss in the memo and returns the memo.
    """

    def __init__(self) -> None:
        super().__init__()
        # A 0-dim parameter that starts at zero.
        self._weight = nn.Parameter(torch.tensor(0.0))

    @property
    def weight(self) -> torch.nn.Parameter:
        """The learnable scalar."""
        return self._weight

    def _forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` scaled by the weight."""
        return x * self._weight

    def forward(
        self,
        runner: todd.runners.BaseRunner,
        batch,
        memo: Memo,
        *args,
        **kwargs,
    ) -> Memo:
        """Compute the L1 loss for ``batch`` and record it in ``memo``.

        Args:
            runner: the calling runner; not used here.
            batch: mapping read through its ``"x"`` and ``"y"`` entries.
            memo: receives the loss under ``"loss"``. If it holds a ``"log"``
                mapping, that mapping is filled with string summaries of the
                batch, the weight and the loss.
            *args: ignored.
            **kwargs: ignored.

        Returns:
            The same ``memo`` object.
        """
        log: dict[str, Any] | None = memo.get("log")

        prediction = self._forward(batch["x"])
        loss = F.l1_loss(prediction, batch["y"])
        memo["loss"] = loss

        # Nothing to report when no log mapping was supplied.
        if log is None:
            return memo

        log.update(
            batch=str(batch),
            weight=f"{self._weight.item():.3f}",
            loss=f"{loss:.3f}",
        )
        return memo

### Datasets

In [4]:
class Sample(TypedDict, total=True):
    """A single dataset item with integer fields ``x`` and ``y``."""

    x: int
    y: int

In [5]:
@todd.DatasetRegistry.register_()
class RunnerDataset(torch.utils.data.Dataset[Sample]):
    """Toy dataset over the integers ``1..n``.

    Each item is a ``Sample`` whose ``x`` is the stored integer and whose
    ``y`` is twice that value. The generic parameter is ``Sample`` so that it
    agrees with what ``__getitem__`` returns.

    Args:
        n: number of items; the stored values are ``1, 2, ..., n``.
    """

    def __init__(self, n: int) -> None:
        self._data = list(range(1, n + 1))

    def __len__(self) -> int:
        """Return the number of stored integers."""
        return len(self._data)

    def __getitem__(self, index: int) -> Sample:
        """Return the sample at ``index``, with ``y = 2 * x``."""
        x = self._data[index]
        return Sample(x=x, y=x * 2)

In [6]:
class Batch(TypedDict, total=True):
    """Mapping with tensor-valued ``x`` and ``y`` fields.

    NOTE(review): presumably describes the batches a dataloader builds from
    ``Sample`` items; it is not referenced in the visible cells -- confirm.
    """

    x: torch.Tensor
    y: torch.Tensor

## Validators

In [7]:
config = todd.Config(
    type='Validator',
    name='validator',
    strategy=dict(type='BaseStrategy'),
    dataset=dict(type='RunnerDataset', n=20),
    model=dict(type='RunnerModel'),
    dataloader=dict(batch_size=1),
)
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.BaseRunner = \
        todd.RunnerRegistry.build(config, work_dir=dict(root=work_dirs))
    runner.run()

    !echo
    !tree $work_dirs


[2m[2024-03-14 12:17:49,375 62058:140704275689088][base.py:56 todd.Validator.validator __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m



[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpjms97dk6[0m
└── [1;36mvalidator[0m

2 directories, 0 files


In [8]:
config = todd.Config(
    type='Validator',
    name='validator',
    strategy=dict(type='BaseStrategy'),
    dataset=dict(type='RunnerDataset', n=20),
    model=dict(type='RunnerModel'),
    dataloader=dict(batch_size=1),
    callbacks=[dict(type='LogCallback', interval=5)],
)
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.BaseRunner = \
        todd.RunnerRegistry.build(config, work_dir=dict(root=work_dirs))
    runner.run()

    !echo
    !tree $work_dirs


[2m[2024-03-14 12:17:49,723 62058:140704275689088][base.py:56 todd.Validator.validator __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2024-03-14 12:17:49,728 62058:140704275689088][log.py:93 todd.Validator.validator after_run_iter] INFO: Iter [5/20] batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000
[2024-03-14 12:17:49,731 62058:140704275689088][log.py:93 todd.Validator.validator after_run_iter] INFO: Iter [10/20] batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000
[2024-03-14 12:17:49,734 62058:140704275689088][log.py:93 todd.Validator.validator after_run_iter] INFO: Iter [15/20] batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000
[2024-03-14 12:17:49,736 62058:140704275689088][log.py:93 todd.Validator.validator after_run_iter] INFO: Iter [20/20] batch={'x': tensor([20]), 'y': tensor([40])} weight=0.000 loss=40.000



[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpv4q6cgrv[0m
└── [1;36mvalidator[0m

2 directories, 0 files


## Trainers

### Iteration Based

In [9]:
# Iteration-based trainer that runs for a fixed number of iterations.
# NOTE(review): the recorded output keeps weight at 0.000 -- presumably
# because no OptimizeCallback is configured here; confirm.
config = todd.Config(
    type="IterBasedTrainer",
    name="iter_based_trainer",
    strategy={'type': 'BaseStrategy'},
    dataset={'type': 'RunnerDataset', 'n': 10},
    model={'type': 'RunnerModel'},
    dataloader={'batch_size': 2, 'shuffle': True},
    callbacks=[{'type': "LogCallback", 'interval': 1}],
    optimizer={'type': "SGD", 'lr': 0.005},
    iters=8,
)
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.BaseRunner = todd.RunnerRegistry.build(
        config, work_dir={'root': work_dirs}
    )
    runner.run()

[2m[2024-03-14 12:17:50,032 62058:140704275689088][base.py:56 todd.IterBasedTrainer.iter_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2024-03-14 12:17:50,035 62058:140704275689088][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [1/8] batch={'x': tensor([ 7, 10]), 'y': tensor([14, 20])} weight=0.000 loss=17.000
[2024-03-14 12:17:50,037 62058:140704275689088][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [2/8] batch={'x': tensor([4, 6]), 'y': tensor([ 8, 12])} weight=0.000 loss=10.000
[2024-03-14 12:17:50,038 62058:140704275689088][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [3/8] batch={'x': tensor([3, 8]), 'y': tensor([ 6, 16])} weight=0.000 loss=11.000
[2024-03-14 12:17:50,039 62058:140704275689088][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [4/8] batch={'x': tensor([9, 2]), 'y': tensor([18,  4])} weight=0.000 loss=11.0

### Epoch Based

In [10]:
# Epoch-based trainer: 3 epochs over 10 samples in batches of 2, which the
# recorded output reports as 15 iterations in total.
config = todd.Config(
    type="EpochBasedTrainer",
    name="epoch_based_trainer",
    strategy={'type': 'BaseStrategy'},
    dataset={'type': 'RunnerDataset', 'n': 10},
    model={'type': 'RunnerModel'},
    dataloader={'batch_size': 2, 'shuffle': True},
    callbacks=[{'type': "LogCallback", 'interval': 1}],
    optimizer={'type': "SGD", 'lr': 0.005},
    epochs=3,
)
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.BaseRunner = todd.RunnerRegistry.build(
        config, work_dir={'root': work_dirs}
    )
    runner.run()

[2m[2024-03-14 12:17:50,057 62058:140704275689088][base.py:56 todd.EpochBasedTrainer.epoch_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2024-03-14 12:17:50,058 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]
[2024-03-14 12:17:50,061 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [1/15] batch={'x': tensor([4, 1]), 'y': tensor([8, 2])} weight=0.000 loss=5.000
[2024-03-14 12:17:50,064 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [2/15] batch={'x': tensor([8, 6]), 'y': tensor([16, 12])} weight=0.000 loss=14.000
[2024-03-14 12:17:50,066 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [3/15] batch={'x': tensor([5, 9]), 'y': tensor([10, 18])} weight=0.000 loss=14.000
[2024-03-14 12:17:50,068 62058:140704275689088][log.py:93 todd

## Callbacks

### Log

In [11]:
# LogCallback with `collect_env`: the recorded output starts with an
# environment summary (platform, library versions, git status).
config = todd.Config(
    type="Validator",
    name="log_callback",
    strategy={'type': 'BaseStrategy'},
    dataset={'type': 'RunnerDataset', 'n': 20},
    model={'type': 'RunnerModel'},
    dataloader={'batch_size': 1},
    callbacks=[
        {
            'type': "LogCallback",
            'interval': 5,
            'collect_env': {'verbose': False},
        },
    ],
)
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.BaseRunner = todd.RunnerRegistry.build(
        config, work_dir={'root': work_dirs}
    )
    runner.run()

[2024-03-14 12:17:50,216 62058:140704275689088][log.py:55 todd.Validator.log_callback init] INFO: 
platform: macOS-14.0
nvidia_smi: None
python_version: 3.11.7 (main, Dec  4 2023, 18:10:11) [Clang 15.0.0 (clang-1500.1.0.2.5)]
pytorch_version: 2.0.1
torchvision_version: 0.15.2
opencv_version: 4.7.0
todd_version: 0.4.0
cuda_home: None
git_commit_id: 0a7955a
git_status: 
M todd/runners/callbacks/checkpoint.py
 M todd/runners/callbacks/composed.py
 M todd/runners/callbacks/git.py
 M todd/runners/callbacks/interval.py
 M todd/runners/callbacks/log.py
 M todd/runners/callbacks/lr.py
 M todd/runners/callbacks/monitor.py
 M todd/runners/callbacks/tensorboard.py
 M todd/runners/epoch_based_trainer.py
 M todd/runners/iter_based_trainer.py
 M todd/runners/strategies/base.py
 M todd/runners/strategies/ddp.py
 M todd/runners/strategies/fsdp.py
 M todd/runners/utils.py
 M todd/utils/__init__.py
 M todd/utils/mixins.py
?? todd/utils/constants.py
[2m[2024-03-14 12:17:50,217 62058:140704275689088][bas

In [12]:
config = todd.Config(
    type='Validator',
    name='log_callback',
    strategy=dict(type='BaseStrategy'),
    dataset=dict(type='RunnerDataset', n=20),
    model=dict(type='RunnerModel'),
    dataloader=dict(batch_size=1),
    callbacks=[
        dict(
            type='LogCallback',
            interval=5,
            with_file_handler=True,
        ),
    ],
)
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.BaseRunner = \
        todd.RunnerRegistry.build(config, work_dir=dict(root=work_dirs))
    runner.run()

    !echo
    !tree {work_dirs}
    !echo
    !cat {work_dirs}/log_callback/*.log


[2m[2024-03-14 12:17:50,291 62058:140704275689088][base.py:56 todd.Validator.log_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2024-03-14 12:17:50,294 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [5/20] batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000
[2024-03-14 12:17:50,296 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [10/20] batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000
[2024-03-14 12:17:50,299 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [15/20] batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000
[2024-03-14 12:17:50,301 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [20/20] batch={'x': tensor([20]), 'y': tensor([40])} weight=0.000 loss=40.000



[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpypx13e0i[0m
└── [1;36mlog_callback[0m
    └── 2024-03-14T12-17-50_290843-08-00.log

2 directories, 1 file

[2024-03-14 12:17:50,291 62058:140704275689088][base.py:56 todd.Validator.log_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R
[2024-03-14 12:17:50,294 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [5/20] batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000
[2024-03-14 12:17:50,296 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [10/20] batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000
[2024-03-14 12:17:50,299 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [15/20] batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000
[2024-03-14 12:17:50,301 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: It

In [13]:
# LogCallback with an `AverageETA` estimator; the recorded log lines carry
# an "ETA" field.
config = todd.Config(
    type="Validator",
    name="log_callback",
    strategy={'type': 'BaseStrategy'},
    dataset={'type': 'RunnerDataset', 'n': 20},
    model={'type': 'RunnerModel'},
    dataloader={'batch_size': 1},
    callbacks=[
        {
            'type': "LogCallback",
            'interval': 5,
            'eta': {'type': "AverageETA"},
        },
    ],
)
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.BaseRunner = todd.RunnerRegistry.build(
        config, work_dir={'root': work_dirs}
    )
    # Sleep 0.1 s after every forward pass -- presumably so that the ETA is
    # visibly non-zero. The hook returns None.
    runner.strategy.module.register_forward_hook(
        lambda *args, **kwargs: time.sleep(0.1),
    )
    runner.run()

[2m[2024-03-14 12:17:50,870 62058:140704275689088][base.py:56 todd.Validator.log_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2024-03-14 12:17:51,391 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [5/20] ETA 0:00:01 batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000
[2024-03-14 12:17:51,912 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [10/20] ETA 0:00:01 batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000
[2024-03-14 12:17:52,431 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [15/20] ETA 0:00:00 batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000
[2024-03-14 12:17:52,945 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [20/20] ETA 0:00:00 batch={'x': tensor([20]), 'y': tensor([40])} weight=0.000 loss=40.000


In [14]:
# LogCallback with an `EMA_ETA` estimator configured with decay 0.2.
config = todd.Config(
    type="Validator",
    name="log_callback",
    strategy={'type': 'BaseStrategy'},
    dataset={'type': 'RunnerDataset', 'n': 20},
    model={'type': 'RunnerModel'},
    dataloader={'batch_size': 1},
    callbacks=[
        {
            'type': "LogCallback",
            'interval': 5,
            'eta': {'type': "EMA_ETA", 'ema': {'decay': 0.2}},
        },
    ],
)
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.BaseRunner = todd.RunnerRegistry.build(
        config, work_dir={'root': work_dirs}
    )
    # The sleep grows with the runner's `iter_` counter and is capped at
    # 1 s (0.1 * 10), so per-iteration time is not constant.
    runner.strategy.module.register_forward_hook(
        lambda *args, **kwargs: time.sleep(0.1 * min(10, runner.iter_)),
    )
    runner.run()

[2m[2024-03-14 12:17:52,955 62058:140704275689088][base.py:56 todd.Validator.log_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2024-03-14 12:17:54,468 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [5/20] ETA 0:00:04 batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000
[2024-03-14 12:17:58,481 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [10/20] ETA 0:00:05 batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000
[2024-03-14 12:18:03,499 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [15/20] ETA 0:00:03 batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000
[2024-03-14 12:18:08,518 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [20/20] ETA 0:00:00 batch={'x': tensor([20]), 'y': tensor([40])} weight=0.000 loss=40.000


In [15]:
# LogCallback with `collect_env`, `with_file_handler` and `eta` combined in
# a single configuration.
config = todd.Config(
    type="Validator",
    name="log_callback",
    strategy={'type': 'BaseStrategy'},
    dataset={'type': 'RunnerDataset', 'n': 20},
    model={'type': 'RunnerModel'},
    dataloader={'batch_size': 1},
    callbacks=[
        {
            'type': "LogCallback",
            'interval': 5,
            'collect_env': {'verbose': False},
            'with_file_handler': True,
            'eta': {'type': "AverageETA"},
        },
    ],
)
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.BaseRunner = todd.RunnerRegistry.build(
        config, work_dir={'root': work_dirs}
    )
    runner.run()

[2024-03-14 12:18:08,647 62058:140704275689088][log.py:55 todd.Validator.log_callback init] INFO: 
platform: macOS-14.0
nvidia_smi: None
python_version: 3.11.7 (main, Dec  4 2023, 18:10:11) [Clang 15.0.0 (clang-1500.1.0.2.5)]
pytorch_version: 2.0.1
torchvision_version: 0.15.2
opencv_version: 4.7.0
todd_version: 0.4.0
cuda_home: None
git_commit_id: 0a7955a
git_status: 
M todd/runners/callbacks/checkpoint.py
 M todd/runners/callbacks/composed.py
 M todd/runners/callbacks/git.py
 M todd/runners/callbacks/interval.py
 M todd/runners/callbacks/log.py
 M todd/runners/callbacks/lr.py
 M todd/runners/callbacks/monitor.py
 M todd/runners/callbacks/tensorboard.py
 M todd/runners/epoch_based_trainer.py
 M todd/runners/iter_based_trainer.py
 M todd/runners/strategies/base.py
 M todd/runners/strategies/ddp.py
 M todd/runners/strategies/fsdp.py
 M todd/runners/utils.py
 M todd/utils/__init__.py
 M todd/utils/mixins.py
?? todd/utils/constants.py
[2m[2024-03-14 12:18:08,648 62058:140704275689088][bas

### Git

In [16]:
config = todd.Config(
    type="Validator",
    name="git_callback",
    strategy=dict(type='BaseStrategy'),
    dataset=dict(type='RunnerDataset', n=20),
    model=dict(type='RunnerModel'),
    dataloader=dict(batch_size=1),
    callbacks=[
        dict(type="GitCallback", diff='HEAD -- ":(exclude)*.ipynb"'),
    ],
)
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.BaseRunner = todd.RunnerRegistry.build(
        config,
        work_dir=dict(root=work_dirs),
    )

    !echo
    !cat {work_dirs}/git_callback/*.log

[2024-03-14 12:18:08,732 62058:140704275689088][git.py:41 todd.Validator.git_callback init] INFO: Saving git diff to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpotx2rbek/git_callback/git_diff_2024-03-14T12-18-08_732484-08-00.log
[2m[2024-03-14 12:18:08,735 62058:140704275689088][base.py:56 todd.Validator.git_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m



diff --git a/todd/runners/callbacks/checkpoint.py b/todd/runners/callbacks/checkpoint.py
index 09548a2..caba779 100644
--- a/todd/runners/callbacks/checkpoint.py
+++ b/todd/runners/callbacks/checkpoint.py
@@ -36,27 +36,27 @@ class CheckpointCallback(IntervalMixin, BaseCallback):
 
     def init(self, *args, **kwargs) -> None:
         super().init(*args, **kwargs)
-        self._checkpoint_dir = self._runner.work_dir / 'checkpoints'
+        self._checkpoint_dir = self.runner.work_dir / 'checkpoints'
         self._latest_checkpoint_dir = self._checkpoint_dir / 'latest'
 
         self._checkpoint_dir.mkdir(parents=True, exist_ok=True)
 
-        if self._runner._auto_resume and self._latest_checkpoint_dir.exists():
+        if self.runner._auto_resume and self._latest_checkpoint_dir.exists():
             load_from = self._latest_checkpoint_dir
-        elif self._runner.load_from is not None:
-            load_from = pathlib.Path(self._runner.load_from)
+        elif self.runner.loa

### Optimize

In [17]:
# Trainer with an OptimizeCallback; in the recorded output the logged weight
# changes from one iteration to the next.
config = todd.Config(
    type="IterBasedTrainer",
    name="optimize_callback",
    strategy={'type': 'BaseStrategy'},
    dataset={'type': 'RunnerDataset', 'n': 10},
    model={'type': 'RunnerModel'},
    dataloader={'batch_size': 2, 'shuffle': True},
    callbacks=[
        {'type': 'OptimizeCallback'},
        {'type': 'LogCallback', 'interval': 1},
    ],
    optimizer={'type': "SGD", 'lr': 0.005},
    iters=8,
)
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.BaseRunner = todd.RunnerRegistry.build(
        config,
        work_dir={'root': work_dirs},
    )
    runner.run()

[2m[2024-03-14 12:18:09,033 62058:140704275689088][base.py:56 todd.IterBasedTrainer.optimize_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2024-03-14 12:18:09,037 62058:140704275689088][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [1/8] batch={'x': tensor([4, 7]), 'y': tensor([ 8, 14])} weight=0.000 loss=11.000
[2024-03-14 12:18:09,040 62058:140704275689088][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [2/8] batch={'x': tensor([6, 3]), 'y': tensor([12,  6])} weight=0.027 loss=8.876
[2024-03-14 12:18:09,042 62058:140704275689088][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [3/8] batch={'x': tensor([1, 2]), 'y': tensor([2, 4])} weight=0.050 loss=2.925
[2024-03-14 12:18:09,043 62058:140704275689088][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [4/8] batch={'x': tensor([ 5, 10]), 'y': tensor([10, 20])} weight=0.057 loss=14.569
[2024-

### Learning Rate Schedule

In [18]:
# Iteration-based trainer with an LRScheduleCallback wrapping `LinearLR`;
# the recorded log lines gain an `lr=[...]` field.
config = todd.Config(
    type="IterBasedTrainer",
    name="lr_schedule_callback",
    strategy={'type': 'BaseStrategy'},
    dataset={'type': 'RunnerDataset', 'n': 10},
    model={'type': 'RunnerModel'},
    dataloader={'batch_size': 2, 'shuffle': True},
    callbacks=[
        {'type': "OptimizeCallback"},
        {
            'type': "LRScheduleCallback",
            'lr_scheduler': {'type': "LinearLR", 'total_iters': 5},
        },
        {'type': "LogCallback", 'interval': 1},
    ],
    optimizer={'type': "SGD", 'lr': 0.005},
    iters=8,
)
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.BaseRunner = todd.RunnerRegistry.build(
        config, work_dir={'root': work_dirs}
    )
    runner.run()

[2m[2024-03-14 12:18:09,063 62058:140704275689088][base.py:56 todd.IterBasedTrainer.lr_schedule_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2024-03-14 12:18:09,066 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [1/8] batch={'x': tensor([6, 2]), 'y': tensor([12,  4])} weight=0.000 loss=8.000 lr=['1.667e-03']
[2024-03-14 12:18:09,068 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [2/8] batch={'x': tensor([ 9, 10]), 'y': tensor([18, 20])} weight=0.007 loss=18.937 lr=['2.333e-03']
[2024-03-14 12:18:09,070 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [3/8] batch={'x': tensor([3, 8]), 'y': tensor([ 6, 16])} weight=0.029 loss=10.841 lr=['3.000e-03']
[2024-03-14 12:18:09,073 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [4/8] batch={'x': t

In [19]:
# Epoch-based trainer with `by_epoch=True` on the LRScheduleCallback; in the
# recorded output both iterations of epoch 1 log the same learning rate.
config = todd.Config(
    type="EpochBasedTrainer",
    name="lr_schedule_callback",
    strategy={'type': 'BaseStrategy'},
    dataset={'type': 'RunnerDataset', 'n': 4},
    model={'type': 'RunnerModel'},
    dataloader={'batch_size': 2, 'shuffle': True},
    callbacks=[
        {'type': "OptimizeCallback"},
        {
            'type': "LRScheduleCallback",
            'lr_scheduler': {'type': "LinearLR", 'total_iters': 3},
            'by_epoch': True,
        },
        {'type': "LogCallback", 'interval': 1},
    ],
    optimizer={'type': "SGD", 'lr': 0.005},
    epochs=5,
)
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.BaseRunner = todd.RunnerRegistry.build(
        config, work_dir={'root': work_dirs}
    )
    runner.run()

[2m[2024-03-14 12:18:09,092 62058:140704275689088][base.py:56 todd.EpochBasedTrainer.lr_schedule_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2024-03-14 12:18:09,094 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.lr_schedule_callback before_run_epoch] INFO: Epoch [1/5]
[2024-03-14 12:18:09,096 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [1/10] batch={'x': tensor([3, 2]), 'y': tensor([6, 4])} weight=0.000 loss=5.000 lr=['1.667e-03']
[2024-03-14 12:18:09,098 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [2/10] batch={'x': tensor([4, 1]), 'y': tensor([8, 2])} weight=0.004 loss=4.990 lr=['1.667e-03']
[2024-03-14 12:18:09,099 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.lr_schedule_callback before_run_epoch] INFO: Epoch [2/5]
[2024-03-14 12:18:09,101 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.lr_schedule_callbac

### Learning Rate Scale

In [20]:
# Trainer with an LRScaleCallback using `base_batch_size=1`; with a batch
# size of 2 the recorded output reports `lr_scaler=2.000`.
config = todd.Config(
    type="IterBasedTrainer",
    name="lr_scale_callback",
    strategy={'type': 'BaseStrategy'},
    dataset={'type': 'RunnerDataset', 'n': 10},
    model={'type': 'RunnerModel'},
    dataloader={'batch_size': 2, 'shuffle': True},
    callbacks=[
        {'type': "OptimizeCallback"},
        {
            'type': "LRScaleCallback",
            'lr_scaler': {'base_batch_size': 1},
        },
        {'type': "LogCallback", 'interval': 1},
    ],
    optimizer={'type': "SGD", 'lr': 0.005},
    iters=8,
)
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.BaseRunner = todd.RunnerRegistry.build(
        config, work_dir={'root': work_dirs}
    )
    runner.run()

[2024-03-14 12:18:09,131 62058:140704275689088][lr.py:93 todd.IterBasedTrainer.lr_scale_callback _scale_lr] INFO: base_batch_size=1 batch_size=2 lr_scaler=2.000
[2m[2024-03-14 12:18:09,132 62058:140704275689088][base.py:56 todd.IterBasedTrainer.lr_scale_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2024-03-14 12:18:09,134 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_scale_callback after_run_iter] INFO: Iter [1/8] batch={'x': tensor([3, 4]), 'y': tensor([6, 8])} weight=0.000 loss=7.000
[2024-03-14 12:18:09,136 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_scale_callback after_run_iter] INFO: Iter [2/8] batch={'x': tensor([6, 7]), 'y': tensor([12, 14])} weight=0.035 loss=12.773
[2024-03-14 12:18:09,138 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_scale_callback after_run_iter] INFO: Iter [3/8] batch={'x': tensor([10,  2]), 'y': tensor([20,  4])} weight=0.100 loss=11.400
[2024-03-14 12:18:09,140 62058:140704275689088][log

### Checkpoint

In [21]:
config = todd.Config(
    type="IterBasedTrainer",
    name="checkpoint_callback",
    strategy=dict(type='BaseStrategy'),
    dataset=dict(type='RunnerDataset', n=10),
    model=dict(type='RunnerModel'),
    dataloader=dict(batch_size=2, shuffle=True),
    callbacks=[
        dict(type='OptimizeCallback'),
        dict(type='LogCallback', interval=1),
        dict(type="CheckpointCallback", interval=1),
    ],
    optimizer=dict(type="SGD", lr=0.005),
    iters=8,
)
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.BaseRunner = \
        todd.RunnerRegistry.build(config, work_dir=dict(root=work_dirs))
    runner.run()

    !echo
    !tree {work_dirs}
    !echo

    iter_5 = pathlib.Path(work_dirs) / 'checkpoint_callback' / 'checkpoints' / 'iter_5'
    for f in iter_5.glob('*.pth'):
        print(f'{f.name}:')
        pprint(torch.load(f, 'cpu'))
        print()

    runner: todd.runners.BaseRunner = \
        todd.RunnerRegistry.build(
            config,
            work_dir=dict(root=work_dirs),
            load_from=str(iter_5),
        )
    runner.run()


[2m[2024-03-14 12:18:09,174 62058:140704275689088][base.py:56 todd.IterBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2024-03-14 12:18:09,177 62058:140704275689088][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [1/8] batch={'x': tensor([6, 5]), 'y': tensor([12, 10])} weight=0.000 loss=11.000
[2024-03-14 12:18:09,195 62058:140704275689088][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpkhq8r460/checkpoint_callback/checkpoints/iter_1
[2024-03-14 12:18:09,224 62058:140704275689088][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [2/8] batch={'x': tensor([2, 8]), 'y': tensor([ 4, 16])} weight=0.027 loss=9.863
[2024-03-14 12:18:09,233 62058:140704275689088][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rw

[2024-03-14 12:18:09,274 62058:140704275689088][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [7/8] batch={'x': tensor([1, 3]), 'y': tensor([2, 6])} weight=0.177 loss=3.645
[2024-03-14 12:18:09,276 62058:140704275689088][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpkhq8r460/checkpoint_callback/checkpoints/iter_7
[2024-03-14 12:18:09,281 62058:140704275689088][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [8/8] batch={'x': tensor([5, 8]), 'y': tensor([10, 16])} weight=0.187 loss=11.781
[2024-03-14 12:18:09,282 62058:140704275689088][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpkhq8r460/checkpoint_callback/checkpoints/iter_8



[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpkhq8r460[0m
└── [1;36mcheckpoint_callback[0m
    └── [1;36mcheckpoints[0m
        ├── [1;36miter_1[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36miter_2[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36miter_3[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36miter_4[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36miter_5[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36miter_6[0m
 

[2024-03-14 12:18:09,715 62058:140704275689088][checkpoint.py:54 todd.IterBasedTrainer.checkpoint_callback init] INFO: Loading from /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpkhq8r460/checkpoint_callback/checkpoints/iter_5
[2024-03-14 12:18:09,718 62058:140704275689088][base.py:65 todd.IterBasedTrainer.checkpoint_callback load_model_state_dict] INFO: <All keys matched successfully>
[2m[2024-03-14 12:18:09,719 62058:140704275689088][base.py:56 todd.IterBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2024-03-14 12:18:09,722 62058:140704275689088][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [6/8] batch={'x': tensor([10,  1]), 'y': tensor([20,  2])} weight=0.137 loss=10.244
[2024-03-14 12:18:09,723 62058:140704275689088][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpkhq8r460/checkpoint_callback/che

strategy.pth:
{}

optim.pth:
{'param_groups': [{'dampening': 0,
                   'differentiable': False,
                   'foreach': None,
                   'lr': 0.005,
                   'maximize': False,
                   'momentum': 0,
                   'nesterov': False,
                   'params': [0],
                   'weight_decay': 0}],
 'state': {0: {'momentum_buffer': None}}}

meta.pth:
{'iter_': 5}

model.pth:
OrderedDict([('_weight', tensor(0.1375))])

callbacks.pth:
{'callbacks': [{}, {}, {}]}



In [22]:
config = todd.Config(
    type="EpochBasedTrainer",
    name="checkpoint_callback",
    strategy=dict(type='BaseStrategy'),
    dataset=dict(type='RunnerDataset', n=10),
    model=dict(type='RunnerModel'),
    dataloader=dict(batch_size=2, shuffle=True),
    callbacks=[
        dict(type='OptimizeCallback'),
        dict(type='LogCallback', interval=1),
        dict(type="CheckpointCallback", interval=2),
    ],
    optimizer=dict(type="SGD", lr=0.005),
    epochs=3,
)
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.BaseRunner = \
        todd.RunnerRegistry.build(config, work_dir=dict(root=work_dirs))
    runner.run()

    !echo
    !tree {work_dirs}
    !echo

    iter_8 = pathlib.Path(work_dirs) / 'checkpoint_callback' / 'checkpoints' / 'iter_8'
    runner: todd.runners.BaseRunner = \
        todd.RunnerRegistry.build(
            config,
            work_dir=dict(root=work_dirs),
            load_from=str(iter_8),
        )
    runner.run()

    !echo
    !echo {'-' * 20}
    !echo

    iter_10 = pathlib.Path(work_dirs) / 'checkpoint_callback' / 'checkpoints' / 'iter_10'
    runner: todd.runners.BaseRunner = \
        todd.RunnerRegistry.build(
            config,
            work_dir=dict(root=work_dirs),
            load_from=str(iter_10),
        )
    runner.run()


[2m[2024-03-14 12:18:09,769 62058:140704275689088][base.py:56 todd.EpochBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2024-03-14 12:18:09,769 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [1/3]
[2024-03-14 12:18:09,772 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [1/15] batch={'x': tensor([5, 3]), 'y': tensor([10,  6])} weight=0.000 loss=8.000
[2024-03-14 12:18:09,774 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [2/15] batch={'x': tensor([2, 6]), 'y': tensor([ 4, 12])} weight=0.020 loss=7.920
[2024-03-14 12:18:09,775 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpn0pz4ax9/checkpoint_callback/checkpoints/iter_2
[2024-03-14 12:18:09,779 62058


[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpn0pz4ax9[0m
└── [1;36mcheckpoint_callback[0m
    └── [1;36mcheckpoints[0m
        ├── [1;36miter_10[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36miter_12[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36miter_14[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36miter_2[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36miter_4[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36miter_6[0

[2024-03-14 12:18:10,289 62058:140704275689088][checkpoint.py:54 todd.EpochBasedTrainer.checkpoint_callback init] INFO: Loading from /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpn0pz4ax9/checkpoint_callback/checkpoints/iter_8
[2024-03-14 12:18:10,293 62058:140704275689088][base.py:65 todd.EpochBasedTrainer.checkpoint_callback load_model_state_dict] INFO: <All keys matched successfully>
[2m[2024-03-14 12:18:10,294 62058:140704275689088][base.py:56 todd.EpochBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2024-03-14 12:18:10,295 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [2/3]
[2024-03-14 12:18:10,298 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [9/15] batch={'x': tensor([3, 8]), 'y': tensor([ 6, 16])} weight=0.215 loss=9.818
[2024-03-14 12:18:10,300 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.check


--------------------



[2024-03-14 12:18:10,730 62058:140704275689088][checkpoint.py:54 todd.EpochBasedTrainer.checkpoint_callback init] INFO: Loading from /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpn0pz4ax9/checkpoint_callback/checkpoints/iter_10
[2024-03-14 12:18:10,734 62058:140704275689088][base.py:65 todd.EpochBasedTrainer.checkpoint_callback load_model_state_dict] INFO: <All keys matched successfully>
[2m[2024-03-14 12:18:10,734 62058:140704275689088][base.py:56 todd.EpochBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2024-03-14 12:18:10,736 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [3/3]
[2024-03-14 12:18:10,740 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [11/15] batch={'x': tensor([7, 6]), 'y': tensor([14, 12])} weight=0.265 loss=11.278
[2024-03-14 12:18:10,742 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.ch

In [23]:
config = todd.Config(
    type="EpochBasedTrainer",
    name="checkpoint_callback",
    strategy=dict(type='BaseStrategy'),
    dataset=dict(type='RunnerDataset', n=10),
    model=dict(type='RunnerModel'),
    dataloader=dict(batch_size=2, shuffle=True),
    callbacks=[
        dict(type='OptimizeCallback'),
        dict(type='LogCallback', interval=1),
        dict(type="CheckpointCallback", interval=1, by_epoch=True),
    ],
    optimizer=dict(type="SGD", lr=0.005),
    epochs=3,
)
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.BaseRunner = \
        todd.RunnerRegistry.build(config, work_dir=dict(root=work_dirs))
    runner.run()

    !echo
    !tree {work_dirs}
    !echo

    epoch_2 = pathlib.Path(work_dirs) / 'checkpoint_callback' / 'checkpoints' / 'epoch_2'
    runner: todd.runners.BaseRunner = \
        todd.RunnerRegistry.build(
            config,
            work_dir=dict(root=work_dirs),
            load_from=str(epoch_2),
        )
    runner.run()


[2m[2024-03-14 12:18:10,784 62058:140704275689088][base.py:56 todd.EpochBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2024-03-14 12:18:10,785 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [1/3]
[2024-03-14 12:18:10,789 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [1/15] batch={'x': tensor([4, 5]), 'y': tensor([ 8, 10])} weight=0.000 loss=9.000
[2024-03-14 12:18:10,793 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [2/15] batch={'x': tensor([3, 6]), 'y': tensor([ 6, 12])} weight=0.022 loss=8.899
[2024-03-14 12:18:10,797 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [3/15] batch={'x': tensor([ 9, 10]), 'y': tensor([18, 20])} weight=0.045 loss=18.572
[2024-03-14 12:18:10,799 62058:140704275689088][log.py:93 t


[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmplwjo14oz[0m
└── [1;36mcheckpoint_callback[0m
    └── [1;36mcheckpoints[0m
        ├── [1;36mepoch_1[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36mepoch_2[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        ├── [1;36mepoch_3[0m
        │   ├── callbacks.pth
        │   ├── meta.pth
        │   ├── model.pth
        │   ├── optim.pth
        │   └── strategy.pth
        └── [35mlatest[0m -> [1;36mepoch_3[0m

7 directories, 15 files



[2024-03-14 12:18:11,264 62058:140704275689088][checkpoint.py:54 todd.EpochBasedTrainer.checkpoint_callback init] INFO: Loading from /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmplwjo14oz/checkpoint_callback/checkpoints/epoch_2
[2024-03-14 12:18:11,267 62058:140704275689088][base.py:65 todd.EpochBasedTrainer.checkpoint_callback load_model_state_dict] INFO: <All keys matched successfully>
[2m[2024-03-14 12:18:11,268 62058:140704275689088][base.py:56 todd.EpochBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2024-03-14 12:18:11,269 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [3/3]
[2024-03-14 12:18:11,279 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [11/15] batch={'x': tensor([6, 2]), 'y': tensor([12,  4])} weight=0.275 loss=6.900
[2024-03-14 12:18:11,291 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.che

### Monitor

In [24]:
class CustomError(RuntimeError):
    """Marker exception raised on purpose by the faulty runner demo."""

In [25]:
@todd.RunnerRegistry.register_()
class FaultyValidator(todd.runners.Validator):
    """Validator whose `_run_iter` always raises `CustomError`.

    Registered in `todd.RunnerRegistry` so that a config can select it with
    `type='FaultyValidator'`.
    """

    def _run_iter(self, *args, **kwargs) -> NoReturn:
        """Ignore all arguments and fail unconditionally.

        Raises:
            CustomError: always, with the message "faulty runner".
        """
        raise CustomError("faulty runner")

In [26]:
config = todd.Config(
    type='FaultyValidator',
    name='monitor_callback',
    strategy=dict(type='BaseStrategy'),
    dataset=dict(type='RunnerDataset', n=20),
    model=dict(type='RunnerModel'),
    dataloader=dict(batch_size=1),
    callbacks=[
        dict(type='MonitorCallback'),
        dict(type='LogCallback', interval=5, with_file_handler=True),
    ],
)
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.BaseRunner = \
        todd.RunnerRegistry.build(config, work_dir=dict(root=work_dirs))
    try:
        runner.run()
    except CustomError as e:
        pass

    !echo
    !cat {work_dirs}/monitor_callback/*.log


[2m[2024-03-14 12:18:11,358 62058:140704275689088][base.py:56 todd.FaultyValidator.monitor_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[1;31m[2024-03-14 12:18:11,359 62058:140704275689088][monitor.py:26 todd.FaultyValidator.monitor_callback __exit__] ERROR: Unable to run iter_=1
batch={'x': tensor([1]), 'y': tensor([2])}
memo={'dataloader': <torch.utils.data.dataloader.DataLoader object at 0x152023210>}
Traceback (most recent call last):
  File "/Users/bytedance/.local/share/virtualenvs/todd-ARrcnwyq/lib/python3.11/site-packages/todd/runners/base.py", line 246, in _run
    memo = self._run_iter(batch, memo)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/ipykernel_62058/1715875531.py", line 5, in _run_iter
    raise CustomError("faulty runner")
CustomError: faulty runner[m



[2024-03-14 12:18:11,358 62058:140704275689088][base.py:56 todd.FaultyValidator.monitor_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R
[2024-03-14 12:18:11,359 62058:140704275689088][monitor.py:26 todd.FaultyValidator.monitor_callback __exit__] ERROR: Unable to run iter_=1
batch={'x': tensor([1]), 'y': tensor([2])}
memo={'dataloader': <torch.utils.data.dataloader.DataLoader object at 0x152023210>}
Traceback (most recent call last):
  File "/Users/bytedance/.local/share/virtualenvs/todd-ARrcnwyq/lib/python3.11/site-packages/todd/runners/base.py", line 246, in _run
    memo = self._run_iter(batch, memo)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/ipykernel_62058/1715875531.py", line 5, in _run_iter
    raise CustomError("faulty runner")
CustomError: faulty runner


### Priorities

## Strategies

In [27]:
config = todd.Config(
    type="EpochBasedTrainer",
    name="strategy_load_model_from",
    strategy=dict(type='BaseStrategy'),
    dataset=dict(type='RunnerDataset', n=10),
    model=dict(type='RunnerModel'),
    dataloader=dict(batch_size=2, shuffle=True),
    callbacks=[
        dict(type='OptimizeCallback'),
        dict(type='LogCallback', interval=1),
        dict(type="CheckpointCallback", interval=1, by_epoch=True),
    ],
    optimizer=dict(type="SGD", lr=0.005),
    epochs=3,
)
with tempfile.TemporaryDirectory() as work_dirs:
    runner: todd.runners.BaseRunner = \
        todd.RunnerRegistry.build(config, work_dir=dict(root=work_dirs))
    runner.run()

    !echo
    !echo {'-' * 20}
    !echo

    epoch_2 = (pathlib.Path(work_dirs) / 'strategy_load_model_from' / 'checkpoints' / 'epoch_2' / 'model.pth')
    runner: todd.runners.BaseRunner = \
        todd.RunnerRegistry.build(config, work_dir=dict(root=work_dirs))
    runner.strategy.load_model_from(epoch_2)
    runner.run()


[2m[2024-03-14 12:18:11,674 62058:140704275689088][base.py:56 todd.EpochBasedTrainer.strategy_load_model_from __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2024-03-14 12:18:11,675 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.strategy_load_model_from before_run_epoch] INFO: Epoch [1/3]
[2024-03-14 12:18:11,679 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [1/15] batch={'x': tensor([2, 5]), 'y': tensor([ 4, 10])} weight=0.000 loss=7.000
[2024-03-14 12:18:11,682 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [2/15] batch={'x': tensor([ 7, 10]), 'y': tensor([14, 20])} weight=0.018 loss=16.851
[2024-03-14 12:18:11,684 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [3/15] batch={'x': tensor([1, 3]), 'y': tensor([2, 6])} weight=0.060 loss=3.880
[2024-03-14 12:18:11,687 62058:14070


--------------------



[2m[2024-03-14 12:18:12,135 62058:140704275689088][base.py:56 todd.EpochBasedTrainer.strategy_load_model_from __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R[m
[2024-03-14 12:18:12,135 62058:140704275689088][base.py:80 todd.EpochBasedTrainer.strategy_load_model_from load_model_from] INFO: Loading model from /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpj2udcidm/strategy_load_model_from/checkpoints/epoch_2/model.pth
[2024-03-14 12:18:12,138 62058:140704275689088][base.py:65 todd.EpochBasedTrainer.strategy_load_model_from load_model_state_dict] INFO: <All keys matched successfully>
[2024-03-14 12:18:12,139 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.strategy_load_model_from before_run_epoch] INFO: Epoch [1/3]
[2024-03-14 12:18:12,142 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [1/15] batch={'x': tensor([10,  1]), 'y': tensor([20,  2])} weight=0.275 loss=9.488
[2024-03-14 12:18:12,143 62058:1

## Dry Run

In [28]:
todd.Store.DRY_RUN = True