Define settings

In [1]:
from pathlib import Path

from hyperopt import hp, space_eval
from loguru import logger
from mltrainer import ReportTypes, Trainer, TrainerSettings, metrics
from torch import optim, nn

# Samples per batch; passed to the data streamers and used as the batch
# dimension of the model's default input size.
batch_size = 64


  import pkg_resources


Get dataset

In [2]:
from mads_datasets import DatasetFactoryProvider, DatasetType
from mltrainer.preprocessors import BasePreprocessor
from typing import Iterator

def _get_streamers(dataset_type: "DatasetType", batchsize: int) -> tuple[Iterator, Iterator]:
    """Build the train and validation batch iterators for one dataset.

    Args:
        dataset_type: Dataset selector handed to
            ``DatasetFactoryProvider.create_factory``.
        batchsize: Batch size forwarded to ``create_datastreamer``.

    Returns:
        ``(trainstreamer, validstreamer)``: the results of calling ``.stream()``
        on the ``"train"`` and ``"valid"`` entries of the created streamers.
    """
    factory = DatasetFactoryProvider.create_factory(dataset_type)
    streamers = factory.create_datastreamer(
        batchsize=batchsize, preprocessor=BasePreprocessor()
    )
    return streamers["train"].stream(), streamers["valid"].stream()


def get_fashion_streamers(batchsize: int) -> tuple[Iterator, Iterator]:
    """Return ``(train, valid)`` batch iterators for ``DatasetType.FASHION``."""
    return _get_streamers(DatasetType.FASHION, batchsize)


def get_flowers_streamers(batchsize: int) -> tuple[Iterator, Iterator]:
    """Return ``(train, valid)`` batch iterators for ``DatasetType.FLOWERS``."""
    return _get_streamers(DatasetType.FLOWERS, batchsize)

In [3]:
# Build the train/valid streamers for both datasets with the shared batch size.
trainstreamer_fashion, validstreamer_fashion = get_fashion_streamers(batch_size)
trainstreamer_flowers, validstreamer_flowers = get_flowers_streamers(batch_size)

# Pull one batch from each training streamer to inspect the tensor shapes.
# Note: next() consumes that batch from the iterator. x_fashion is reused
# later by objective() for its layer-by-layer shape trace.
x_fashion, y_fashion = next(trainstreamer_fashion)
x_flowers, y_flowers = next(trainstreamer_flowers)

logger.info(f"Fashion images shape: {x_fashion.shape}, labels shape: {y_fashion.shape}")
logger.info(f"Flowers images shape: {x_flowers.shape}, labels shape: {y_flowers.shape}")

[32m2025-09-21 11:12:08.941[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m121[0m - [1mFolder already exists at /home/KIEI/.cache/mads_datasets/fashionmnist[0m
[32m2025-09-21 11:12:08.942[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m124[0m - [1mFile already exists at /home/KIEI/.cache/mads_datasets/fashionmnist/fashionmnist.pt[0m
[32m2025-09-21 11:12:08.980[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m121[0m - [1mFolder already exists at /home/KIEI/.cache/mads_datasets/flowers[0m
[32m2025-09-21 11:12:22.811[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mFashion images shape: torch.Size([64, 1, 28, 28]), labels shape: torch.Size([64])[0m
[32m2025-09-21 11:12:22.812[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mFlowers images shape: torch.Size([64, 3, 224, 224]), labels shape: torch.Size([64])[0m


Select compute device (MPS, CUDA, or CPU)

In [4]:
import torch

def get_device() -> str:
    """Pick the torch device string to run on.

    Preference order: ``"mps"`` when the MPS backend is both available and
    built, then the first CUDA device ``"cuda:0"``, otherwise ``"cpu"``.
    """
    mps_backend = torch.backends.mps
    if mps_backend.is_available() and mps_backend.is_built():
        return "mps"
    return "cuda:0" if torch.cuda.is_available() else "cpu"

# Resolve the device once for the whole notebook; the trailing bare expression
# displays the chosen device as the cell output.
device = get_device()
device

'cpu'

Set up MLflow & working directory

In [5]:
import mlflow
from pathlib import Path

def setup_mlflow(experiment_path: str) -> None:
    """Point MLflow at the local SQLite store and activate an experiment.

    Args:
        experiment_path: Value passed to ``mlflow.set_experiment`` to select
            the active experiment.
    """
    tracking_uri = "sqlite:///mlflow.db"
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment_path)

# Configure the tracking store and activate the experiment named "experiment".
setup_mlflow(experiment_path="experiment")

def set_model_dir(model_dir: str) -> Path:
    """Ensure the model output directory exists and return its absolute path.

    Args:
        model_dir: Directory path, absolute or relative to the current
            working directory.

    Returns:
        The resolved ``Path`` of the directory. It is created, including any
        missing parents, when it does not exist yet.
    """
    modeldir = Path(model_dir).resolve()
    if not modeldir.exists():
        # exist_ok covers the directory appearing between the check and mkdir.
        modeldir.mkdir(parents=True, exist_ok=True)
        logger.info(f"Created {modeldir}")
    return modeldir

# Directory for trained models and trainer logs: "models", resolved against
# the current working directory.
model_dir = set_model_dir(model_dir="models")

2025/09/21 11:12:24 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/09/21 11:12:24 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

Set up model

In [None]:
class CustomCNN(nn.Module):
    """Three-stage convolutional classifier with a pooled dense head.

    Each stage is ``Conv2d -> ReLU -> pooling_layer(kernel_size=2)``. The final
    activation map is averaged over its full spatial extent, so the dense head
    receives ``filters`` features per sample regardless of the input size.

    Args:
        filters: Number of output channels of every convolution.
        units1: Width of the first hidden dense layer.
        units2: Width of the second hidden dense layer.
        kernel_size: Kernel size shared by all three convolutions.
        pooling_layer: Pooling class, instantiated as ``pooling_layer(kernel_size=2)``.
        padding: Padding argument forwarded to every ``nn.Conv2d``.
        input_size: Expected input shape ``(batch, channels, height, width)``.
            Only the channel and spatial dimensions affect the layers.
        num_classes: Number of output logits. Defaults to 10.
    """

    def __init__(self, 
            filters, 
            units1=128, 
            units2=64,
            kernel_size=3, 
            pooling_layer=nn.MaxPool2d,
            padding="valid",
            input_size=(batch_size, 1, 28, 28),
            num_classes=10,
        ):
        super().__init__()
        self.in_channels = input_size[1]
        self.input_size = input_size
        self.filters = filters
        self.units1 = units1
        self.units2 = units2
        self.num_classes = num_classes

        self.convolutions = nn.Sequential(
            nn.Conv2d(self.in_channels, filters, kernel_size=kernel_size, stride=1, padding=padding),
            nn.ReLU(),
            pooling_layer(kernel_size=2),
            nn.Conv2d(filters, filters, kernel_size=kernel_size, stride=1, padding=padding),
            nn.ReLU(),
            pooling_layer(kernel_size=2),
            nn.Conv2d(filters, filters, kernel_size=kernel_size, stride=1, padding=padding),
            nn.ReLU(),
            pooling_layer(kernel_size=2),
        )

        # Probe the convolutional stack once so the aggregation layer can pool
        # over the whole remaining activation map (global average pooling).
        activation_map_size = self._conv_test(input_size)
        logger.info(f"Aggregating activationmap with size {activation_map_size}")
        self.agg = nn.AvgPool2d(activation_map_size)

        self.dense = nn.Sequential(
            nn.Flatten(),
            nn.Linear(filters, units1),
            nn.ReLU(),
            nn.Linear(units1, units2),
            nn.ReLU(),
            nn.Linear(units2, num_classes),
        )

    def _conv_test(self, input_size=(batch_size, 1, 28, 28)):
        """Return the ``(height, width)`` of the convolutional output.

        The probe uses a single dummy sample and no gradient tracking: the
        spatial output size does not depend on the batch dimension, so pushing
        a full batch or building an autograd graph here is wasted work.
        """
        probe = torch.ones((1, *input_size[1:]))
        with torch.no_grad():
            activation = self.convolutions(probe)
        return activation.shape[-2:]

    def forward(self, x):
        """Compute class logits for a batch ``x`` shaped like ``input_size``."""
        x = self.convolutions(x)
        x = self.agg(x)
        logits = self.dense(x)
        return logits


Set up training

In [None]:
from fastapi import params
from hyperopt import fmin, STATUS_OK, tpe, Trials
from datetime import datetime

def _log_layer_shapes(model, batch):
    """Log the output shape after every layer of ``model`` for one input batch.

    Walks ``model.convolutions``, then ``model.agg``, then ``model.dense``,
    the same order as ``CustomCNN.forward``, without tracking gradients.
    """
    with torch.no_grad():
        x = batch.to(device)
        for i, layer in enumerate(model.convolutions):
            x = layer(x)
            logger.info(f"After convolutions[{i}] ({layer.__class__.__name__}): {x.shape}")
        x = model.agg(x)
        logger.info(f"After agg (AvgPool2d): {x.shape}")
        for i, layer in enumerate(model.dense):
            x = layer(x)
            logger.info(f"After dense[{i}] ({layer.__class__.__name__}): {x.shape}")


def objective(params):
    """Train one ``CustomCNN`` configuration and report its loss to hyperopt.

    Reads the module-level ``batch_size``, ``device``, ``experiment_name``,
    ``model_dir``, ``x_fashion`` and the fashion train/valid streamers.

    Args:
        params: Sampled hyperparameters with keys ``filters``, ``kernel_size``,
            ``pooling_method``, ``padding``, ``units1``, ``units2`` and ``epochs``.

    Returns:
        Dict with ``"loss"`` (``trainer.test_loss`` after training) and
        ``"status"`` (``STATUS_OK``), the result format passed back to ``fmin``.
    """
    filters = params["filters"]
    kernel_size = params["kernel_size"]
    pooling_layer = params["pooling_method"]
    padding = params["padding"]
    units1 = params["units1"]
    units2 = params["units2"]
    epochs = params["epochs"]

    # End any previous MLflow run if still active
    if mlflow.active_run() is not None:
        mlflow.end_run()
    # Select the experiment BEFORE starting the run: a run is attached to the
    # experiment that is active when start_run() is called, so switching
    # afterwards would file this run under the previously active experiment.
    mlflow.set_experiment(experiment_name)
    with mlflow.start_run():
        mlflow.set_tag("model", "CNN")
        # Log every hyperparameter of this trial, including padding, which is
        # part of the search space and needed to reproduce the run.
        mlflow.log_params({
            "batch_size": batch_size,
            "epochs": epochs,
            "filters": filters,
            "kernel_size": kernel_size,
            "pooling_method": pooling_layer,
            "padding": padding,
            "units1": units1,
            "units2": units2,
        })

        model = CustomCNN(
            filters=filters,
            units1=units1,
            units2=units2,
            kernel_size=kernel_size,
            pooling_layer=pooling_layer,
            padding=padding,
            input_size=(batch_size, 1, 28, 28),
        ).to(device)

        # Print model summary
        logger.info(model)
        # Print input size
        logger.info(f"Input size: {x_fashion.shape}")
        # Pass a batch through each layer and log the shape
        _log_layer_shapes(model, x_fashion)

        train_settings = TrainerSettings(
            epochs=epochs,
            reporttypes=[ReportTypes.MLFLOW],
            metrics=[metrics.Accuracy()],
            logdir=model_dir,
            train_steps=100,
            valid_steps=100,
        )

        # Initialize the Trainer with the current model and settings
        trainer = Trainer(
            model=model,
            optimizer=optim.Adam,
            loss_fn=torch.nn.CrossEntropyLoss(),  # CrossEntropyLoss
            scheduler=optim.lr_scheduler.ReduceLROnPlateau,  # Learning rate scheduler
            traindataloader=trainstreamer_fashion,  # Training data streamer
            validdataloader=validstreamer_fashion,  # Validation data streamer
            settings=train_settings,  # Training settings
            device=device,  # Device to run on (cpu/cuda/mps)
        )
        trainer.loop()  # Run the training loop

        # Save the trained model with a timestamp. Seconds are included because
        # a trial finishes in well under a minute; a minute-resolution tag lets
        # consecutive trials overwrite each other's file.
        tag = datetime.now().strftime("%Y%m%d-%H%M%S")
        modelpath = model_dir / (tag + "model.pt")
        logger.info(f"Saving model to {modelpath}")
        torch.save(model, modelpath)

        # Log the saved model as an artifact in MLflow
        mlflow.log_artifact(local_path=str(modelpath), artifact_path="pytorch_models")
        return {"loss": trainer.test_loss, "status": STATUS_OK}

# Hyperparameter search space; every entry is a categorical hp.choice.
# Single-option choices (units1, units2, epochs) are effectively fixed values.
search_space = {
    "filters": hp.choice("filters", [8, 16, 32, 64]),
    "kernel_size": hp.choice("kernel_size", [2, 3]),
    "pooling_method": hp.choice("pooling_method", [nn.MaxPool2d, nn.AvgPool2d]),
    "padding": hp.choice("padding", ["same", "valid"]),
    "units1": hp.choice("units1", [128]),
    "units2": hp.choice("units2", [64]),
    "epochs": hp.choice("epochs", [10]),
}

# Read by objective() at call time to select the MLflow experiment.
experiment_name = "Experiment gridsearch CNN 28x28"

# Keep a handle on the trials so per-trial results stay inspectable afterwards.
trials = Trials()
best_result = fmin(
    fn=objective,
    space=search_space,
    max_evals=20,
    algo=tpe.suggest,
    trials=trials,
)

logger.info(f"Best result: {best_result}")
# For hp.choice, fmin reports the index of the selected option, not the value.
# space_eval maps those indices back onto the actual hyperparameter values.
logger.info(f"Best hyperparameters: {space_eval(search_space, best_result)}")

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

[32m2025-09-21 11:22:30.795[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m30[0m - [1mAggregating activationmap with size torch.Size([1, 1])[0m
[32m2025-09-21 11:22:30.796[0m | [1mINFO    [0m | [36m__main__[0m:[36mobjective[0m:[36m40[0m - [1mCustomCNN(
  (convolutions): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (agg): AvgPool2d(kernel_size=torch.Size([1, 1]), stride=torch.Size([1, 1]), padding=0)
  (dense): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=64, out_features=128,

 10%|█         | 1/10 [00:28<04:19, 28.86s/trial, best loss: 0.5117604771256447]

[32m2025-09-21 11:22:59.670[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m30[0m - [1mAggregating activationmap with size torch.Size([1, 1])[0m
[32m2025-09-21 11:22:59.671[0m | [1mINFO    [0m | [36m__main__[0m:[36mobjective[0m:[36m40[0m - [1mCustomCNN(
  (convolutions): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU()
    (2): AvgPool2d(kernel_size=2, stride=2, padding=0)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (4): ReLU()
    (5): AvgPool2d(kernel_size=2, stride=2, padding=0)
    (6): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (7): ReLU()
    (8): AvgPool2d(kernel_size=2, stride=2, padding=0)
  )
  (agg): AvgPool2d(kernel_size=torch.Size([1, 1]), stride=torch.Size([1, 1]), padding=0)
  (dense): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=64, out_features=128, bias=True)
    (2): ReLU()
    (3): Linear(in_features=128, out_features=64, bias=True

 20%|██        | 2/10 [00:56<03:43, 27.94s/trial, best loss: 0.5117604771256447]

[32m2025-09-21 11:23:26.966[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m30[0m - [1mAggregating activationmap with size torch.Size([1, 1])[0m
[32m2025-09-21 11:23:26.967[0m | [1mINFO    [0m | [36m__main__[0m:[36mobjective[0m:[36m40[0m - [1mCustomCNN(
  (convolutions): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU()
    (2): AvgPool2d(kernel_size=2, stride=2, padding=0)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (4): ReLU()
    (5): AvgPool2d(kernel_size=2, stride=2, padding=0)
    (6): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (7): ReLU()
    (8): AvgPool2d(kernel_size=2, stride=2, padding=0)
  )
  (agg): AvgPool2d(kernel_size=torch.Size([1, 1]), stride=torch.Size([1, 1]), padding=0)
  (dense): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=64, out_features=128, bias=True)
    (2): ReLU()
    (3): Linear(in_features=128, out_features=64, bias=True

 30%|███       | 3/10 [01:23<03:12, 27.47s/trial, best loss: 0.5117604771256447]

[32m2025-09-21 11:23:53.879[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m30[0m - [1mAggregating activationmap with size torch.Size([1, 1])[0m
[32m2025-09-21 11:23:53.880[0m | [1mINFO    [0m | [36m__main__[0m:[36mobjective[0m:[36m40[0m - [1mCustomCNN(
  (convolutions): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU()
    (2): AvgPool2d(kernel_size=2, stride=2, padding=0)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (4): ReLU()
    (5): AvgPool2d(kernel_size=2, stride=2, padding=0)
    (6): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (7): ReLU()
    (8): AvgPool2d(kernel_size=2, stride=2, padding=0)
  )
  (agg): AvgPool2d(kernel_size=torch.Size([1, 1]), stride=torch.Size([1, 1]), padding=0)
  (dense): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=64, out_features=128, bias=True)
    (2): ReLU()
    (3): Linear(in_features=128, out_features=64, bias=True

 40%|████      | 4/10 [01:49<02:42, 27.12s/trial, best loss: 0.5117604771256447]

[32m2025-09-21 11:24:20.463[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m30[0m - [1mAggregating activationmap with size torch.Size([2, 2])[0m
[32m2025-09-21 11:24:20.464[0m | [1mINFO    [0m | [36m__main__[0m:[36mobjective[0m:[36m40[0m - [1mCustomCNN(
  (convolutions): Sequential(
    (0): Conv2d(1, 64, kernel_size=(2, 2), stride=(1, 1))
    (1): ReLU()
    (2): AvgPool2d(kernel_size=2, stride=2, padding=0)
    (3): Conv2d(64, 64, kernel_size=(2, 2), stride=(1, 1))
    (4): ReLU()
    (5): AvgPool2d(kernel_size=2, stride=2, padding=0)
    (6): Conv2d(64, 64, kernel_size=(2, 2), stride=(1, 1))
    (7): ReLU()
    (8): AvgPool2d(kernel_size=2, stride=2, padding=0)
  )
  (agg): AvgPool2d(kernel_size=torch.Size([2, 2]), stride=torch.Size([2, 2]), padding=0)
  (dense): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=64, out_features=128, bias=True)
    (2): ReLU()
    (3): Linear(in_features=128, out_features=64, bias=True

 50%|█████     | 5/10 [02:15<02:12, 26.49s/trial, best loss: 0.5117604771256447]

[32m2025-09-21 11:24:45.850[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m30[0m - [1mAggregating activationmap with size torch.Size([1, 1])[0m
[32m2025-09-21 11:24:45.852[0m | [1mINFO    [0m | [36m__main__[0m:[36mobjective[0m:[36m40[0m - [1mCustomCNN(
  (convolutions): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU()
    (2): AvgPool2d(kernel_size=2, stride=2, padding=0)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (4): ReLU()
    (5): AvgPool2d(kernel_size=2, stride=2, padding=0)
    (6): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (7): ReLU()
    (8): AvgPool2d(kernel_size=2, stride=2, padding=0)
  )
  (agg): AvgPool2d(kernel_size=torch.Size([1, 1]), stride=torch.Size([1, 1]), padding=0)
  (dense): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=64, out_features=128, bias=True)
    (2): ReLU()
    (3): Linear(in_features=128, out_features=64, bias=True

 50%|█████     | 5/10 [02:18<02:18, 27.64s/trial, best loss: 0.5117604771256447]


KeyboardInterrupt: 