In [1]:
import os
from clearml import Task

from dotenv import load_dotenv

# Treat the parent of the current working directory as the project root and
# store it as an absolute, normalized path.
PROJECT_ROOT = os.path.abspath(os.pardir)

print(f"Project root: {PROJECT_ROOT}")

Project root: /home/aleksey/Dev/MLOpsSandbox/clearml


In [2]:
# Load environment variables via python-dotenv's `load_dotenv` (presumably from
# a `.env` file holding the ClearML settings - confirm against the project
# setup). The trailing `;` keeps the call's return value out of the cell output.
load_dotenv();

# Alternatively, uncomment the next line to point ClearML at a project-local
# config file (credentials) for local execution; note that there are some
# issues with this approach under remote execution.
# os.environ["CLEARML_CONFIG_FILE"] = os.path.join(PROJECT_ROOT, "clearml.conf")

In [None]:
# Modified example from https://github.com/catalyst-team/catalyst#getting-started

import os
from torch import nn, optim
from torch.utils.data import DataLoader
from catalyst import dl, utils
from catalyst.contrib.datasets import MNIST


def start_training(
    clearml_task: Task,
    num_epochs: int = 100,
    batch_size: int = 32,
    learning_rate: float = 0.02,
    logdir: str = "./logs",
):
    """Train a linear MNIST classifier with Catalyst under a ClearML task.

    The model is a single linear layer applied to the flattened 28x28 input
    and producing 10 outputs. It is optimized with Adam and cross-entropy
    loss. The MNIST train/valid splits are loaded from the current working
    directory.

    Args:
        clearml_task: ClearML task used to detect whether the code is running
            locally or remotely.
        num_epochs: Number of epochs to train for when running remotely.
            Ignored (forced to 0) when the task is running locally.
        batch_size: Batch size used for both the train and valid loaders.
        learning_rate: Learning rate passed to the Adam optimizer.
        logdir: Directory handed to the Catalyst runner for its logs.

    Returns:
        None.
    """
    model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    data_root = os.getcwd()
    loaders = {
        "train": DataLoader(MNIST(data_root, train=True), batch_size=batch_size),
        "valid": DataLoader(MNIST(data_root, train=False), batch_size=batch_size),
    }

    runner = dl.SupervisedRunner(
        input_key="features", output_key="logits", target_key="targets", loss_key="loss"
    )

    if clearml_task.running_locally():
        print('Running locally')
        # The local run is given zero epochs regardless of `num_epochs`.
        # NOTE(review): presumably a dry run before the task is sent for
        # remote execution - confirm against the calling cell.
        num_epochs = 0
    else:
        print('Running remotely')

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        num_epochs=num_epochs,
        callbacks=[
            dl.AccuracyCallback(input_key="logits", target_key="targets", topk=(1, 3, 5)),
            dl.PrecisionRecallF1SupportCallback(input_key="logits", target_key="targets"),
        ],
        logdir=logdir,
        valid_loader="valid",
        # Model selection is driven by the lowest validation loss.
        valid_metric="loss",
        minimize_valid_metric=True,
        verbose=True,
    )

In [None]:
# Create a ClearML task for this run; `reuse_last_task_id=False` asks ClearML
# not to reuse the previous task id.
clearml_task = Task.init(project_name="sample_project", reuse_last_task_id=False)

try:
    start_training(clearml_task=clearml_task)
    # Hand the task over to the "my-queue" queue for remote execution.
    clearml_task.execute_remotely(queue_name="my-queue")
    # Should be after remote execution under `try`
    clearml_task.mark_completed()
except Exception as e:
    print(f"Failed to execute: {e}")
    # Attach the failure cause to the task itself so it is recorded in ClearML
    # and not only in this cell's output.
    clearml_task.mark_failed(status_message=f"{type(e).__name__}: {e}")