In [1]:
import os
import sys
import logging
import datetime
from functools import partial

import torch
import numpy as np
import numpy.typing as npt
from ray import tune, train
from ray.tune.schedulers import ASHAScheduler
from torch.utils.tensorboard.writer import SummaryWriter

from utils.stoppers import EarlyStopper
from models.delightcnn.dataset import DelightDataset, DelightDatasetOptions
from models.delightcnn.training import (
    ray_wrapper_training_function,
    TrainingOptions,
)

logging.basicConfig(
    format="[%(asctime)s %(levelname)s]: %(message)s",
    level=logging.INFO,
    stream=sys.stderr,
)

In [2]:
class TrainingSetProcessor:
    def __init__(self, source: str, balance: bool = False):
        self._source = source
        self._balanced_indexes: npt.NDArray[np.int32] | None = None
        if balance:
            self._balanced_indexes = np.random.shuffle(self._get_balanced_indexes())

    def _get_balanced_indexes(self) -> npt.NDArray[np.int32]:
        id_train_filepath = os.path.join(self._source, "id_train.npy")
        id_train: npt.NDArray[np.str_] = np.load(id_train_filepath, allow_pickle=True)
        idxAsiago = np.array(
            [i for i in range(id_train.shape[0]) if id_train[i][:2] == "SN"]
        )
        idxZTF = np.array(
            [i for i in range(id_train.shape[0]) if id_train[i][:3] == "ZTF"]
        )
        nimb = int(idxZTF.shape[0] / idxAsiago.shape[0])

        idxbal = np.array([], dtype=int)
        for i in range(nimb + 1):
            idxbal = np.concatenate([idxbal, idxAsiago])
            idxbal = np.concatenate(
                [
                    idxbal,
                    idxZTF[
                        i * idxAsiago.shape[0] : min(
                            idxZTF.shape[0], (i + 1) * idxAsiago.shape[0]
                        )
                    ],
                ]
            )

        return idxbal

    @property
    def X(self) -> npt.NDArray[np.float32]:
        x_train_filepath = os.path.join(self._source, "X_train.npy")
        X_train: npt.NDArray[np.float32] = np.load(x_train_filepath)

        if self._balanced_indexes is not None:
            X_train = X_train[self._balanced_indexes]

        return X_train.swapaxes(3, 1).swapaxes(2, 3)

    @property
    def y(self) -> npt.NDArray[np.float32]:
        y_train_filepath = os.path.join(self._source, "y_train.npy")
        y_train: npt.NDArray[np.float32] = np.load(y_train_filepath)

        if self._balanced_indexes is not None:
            y_train = y_train[self._balanced_indexes]

        return y_train


class ValidationSetProcessor:
    def __init__(self, source: str, pixscale_mask_value: float | None = None):
        self._source = source
        self._pixscale_mask: npt.NDArray[np.int32] | None = None
        if pixscale_mask_value is not None:
            self._pixscale_mask = self._get_distance_mask(pixscale_mask_value)

    def _get_distance_mask(self, pixscale: float) -> npt.NDArray[np.int32]:
        y_validation_filepath = os.path.join(self._source, "y_validation.npy")
        y_validation: npt.NDArray[np.float32] = np.load(y_validation_filepath)

        distance = np.sqrt(np.sum(y_validation**2, axis=1))
        return (distance * pixscale) < 60

    @property
    def X(self) -> npt.NDArray[np.float32]:
        x_validation_filepath = os.path.join(self._source, "X_validation.npy")
        X_validation: npt.NDArray[np.float32] = np.load(x_validation_filepath)

        if self._pixscale_mask is not None:
            X_validation = X_validation[self._pixscale_mask]

        return X_validation.swapaxes(3, 1).swapaxes(2, 3)

    @property
    def y(self) -> npt.NDArray[np.float32]:
        y_validation_filepath = os.path.join(self._source, "y_validation.npy")
        y_validation: npt.NDArray[np.float32] = np.load(y_validation_filepath)

        if self._pixscale_mask is not None:
            y_validation = y_validation[self._pixscale_mask]

        return y_validation


class TestingSetProcessor:
    def __init__(self, source: str):
        self._source = source

    @property
    def X(self) -> npt.NDArray[np.float32]:
        x_test_filepath = os.path.join(self._source, "X_test.npy")
        x_test: npt.NDArray[np.float32] = np.load(x_test_filepath)
        return x_test.swapaxes(3, 1).swapaxes(2, 1)

    @property
    def y(self) -> npt.NDArray[np.float32]:
        y_test_filepath = os.path.join(self._source, "y_test.npy")
        return np.load(y_test_filepath)


class ProductionTrainingSetProcessor:
    def __init__(self, source: str):
        self._source = source
        self._training_set = TrainingSetProcessor(source)
        self._validation_set = ValidationSetProcessor(source)

    @property
    def X(self) -> npt.NDArray[np.float32]:
        return np.concatenate((self._training_set.X, self._validation_set.X))

    @property
    def y(self) -> npt.NDArray[np.float32]:
        return np.concatenate((self._training_set.y, self._validation_set.y))

In [3]:
# Dataset settigns
source = os.path.join(os.getcwd(), "data")
dataset_options = DelightDatasetOptions(channels=1, levels=5, rot=True, flip=True)
balance_training_set = True
validation_pixscale_mask_value = 0.25

# Training settings
device: torch.device = torch.device("mps")
epochs = 10
batch_size = 32
adam_learning_rate = 0.0014
adam_weight_decay = 1e-4
criterion = torch.nn.MSELoss()
optimizer = partial(
    torch.optim.Adam,  # type: ignore
    lr=adam_learning_rate,
    weight_decay=adam_weight_decay,
)
stopper = EarlyStopper(patience=3, min_delta=0)
writter = SummaryWriter()

train_dataset = DelightDataset(
    processor=TrainingSetProcessor(source, balance=balance_training_set),
    options=dataset_options,
)
val_dataset = DelightDataset(
    processor=ValidationSetProcessor(
        source, pixscale_mask_value=validation_pixscale_mask_value
    ),
    options=dataset_options,
)

training_options = TrainingOptions(
    criterion=criterion,
    dataset_options=dataset_options,
    optimizer=optimizer,  # type: ignore
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    epochs=epochs,
    device=device,
)

In [4]:
def run_ray_tune(
    *,
    name: str,
    num_samples: int,
    gpus_per_trial: float,
    training_options: TrainingOptions,
):
    param_space = {
        "nconv1": tune.lograndint(16, 64 + 1),
        "nconv2": tune.lograndint(16, 64 + 1),
        "nconv3": tune.lograndint(16, 64 + 1),
        "ndense": tune.lograndint(256, 2048 + 1),
        "dropout": tune.uniform(0, 0.4),
        "batch_size": tune.lograndint(16, 64 + 1),
    }

    scheduler = ASHAScheduler(
        grace_period=20,  # epochs before evaluate early stop
        reduction_factor=3,  # the worst 1/3 trials will be terminated
        brackets=1,  # we don't want to decrease resources
    )

    train_fn = partial(ray_wrapper_training_function, training_options=training_options)

    tuner = tune.Tuner(
        tune.with_resources(train_fn, resources={"gpu": gpus_per_trial}),  # type: ignore
        tune_config=tune.TuneConfig(
            metric="val_loss", mode="min", scheduler=scheduler, num_samples=num_samples
        ),
        run_config=train.RunConfig(name=name),
        param_space=param_space,
    )
    return tuner.fit()

In [5]:
now = datetime.datetime.now()
name = f"ray_experiment_{now.strftime('%d_%m_%Y-%H_%M_%S')}"
num_samples = 200

result = run_ray_tune(
    name=name,
    num_samples=num_samples,
    gpus_per_trial=0.2,
    training_options=training_options,
)

0,1
Current time:,2024-10-28 18:17:06
Running for:,00:10:38.41
Memory:,12.2/16.0 GiB

Trial name,status,loc,batch_size,dropout,nconv1,nconv2,nconv3,ndense
ray_wrapper_training_function_7fbee_00000,PENDING,,23,0.275361,49,34,48,1159
ray_wrapper_training_function_7fbee_00001,PENDING,,18,0.24228,34,32,54,450
ray_wrapper_training_function_7fbee_00002,PENDING,,35,0.375676,30,41,37,409
ray_wrapper_training_function_7fbee_00003,PENDING,,21,0.284327,19,51,29,689
ray_wrapper_training_function_7fbee_00004,PENDING,,59,0.256329,42,32,46,1379
ray_wrapper_training_function_7fbee_00005,PENDING,,30,0.351876,48,26,31,333
ray_wrapper_training_function_7fbee_00006,PENDING,,45,0.0165516,16,33,30,507
ray_wrapper_training_function_7fbee_00007,PENDING,,37,0.165303,17,39,57,382
ray_wrapper_training_function_7fbee_00008,PENDING,,29,0.21753,24,22,53,521
ray_wrapper_training_function_7fbee_00009,PENDING,,45,0.106236,55,16,19,1472


[36m(autoscaler +2m55s)[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.
[33m(autoscaler +2m55s)[0m Error: No available node types can fulfill resource request {'GPU': 0.2}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +3m30s)[0m Error: No available node types can fulfill resource request {'GPU': 0.2}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +4m5s)[0m Error: No available node types can fulfill resource request {'GPU': 0.2}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +4m40s)[0m Error: No available node types can fulfill resource request {'GPU': 0.2}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +5m16s)[0m Error: No available node types can fulfill resource request {'GPU': 0.2}. Add suitable node types to this cluster to resolve this issue.




[33m(autoscaler +5m51s)[0m Error: No available node types can fulfill resource request {'GPU': 0.2}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +6m26s)[0m Error: No available node types can fulfill resource request {'GPU': 0.2}. Add suitable node types to this cluster to resolve this issue.




[33m(autoscaler +7m1s)[0m Error: No available node types can fulfill resource request {'GPU': 0.2}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +7m36s)[0m Error: No available node types can fulfill resource request {'GPU': 0.2}. Add suitable node types to this cluster to resolve this issue.




[33m(autoscaler +8m11s)[0m Error: No available node types can fulfill resource request {'GPU': 0.2}. Add suitable node types to this cluster to resolve this issue.




[33m(autoscaler +8m47s)[0m Error: No available node types can fulfill resource request {'GPU': 0.2}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +9m27s)[0m Error: No available node types can fulfill resource request {'GPU': 0.2}. Add suitable node types to this cluster to resolve this issue.




[33m(autoscaler +10m2s)[0m Error: No available node types can fulfill resource request {'GPU': 0.2}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +10m37s)[0m Error: No available node types can fulfill resource request {'GPU': 0.2}. Add suitable node types to this cluster to resolve this issue.


2024-10-28 18:17:06,814	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Users/keviinplz/ray_results/ray_experiment_28_10_2024-18_06_24' in 0.0130s.
2024-10-28 18:17:06,901	INFO tune.py:1041 -- Total run time: 639.45 seconds (638.39 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/Users/keviinplz/ray_results/ray_experiment_28_10_2024-18_06_24", trainable=...)
- ray_wrapper_training_function_7fbee_00000: FileNotFoundError('Could not fetch metrics for ray_wrapper_training_function_7fbee_00000: both result.json and progress.csv were not found at /Users/keviinplz/ray_results/ray_experiment_28_10_2024-18_06_24/ray_wrapper_training_function_7fbee_00000_0_batch_size=23,dropout=0.2754,nconv1=49,nconv2=34,nconv3=48,ndense=1159_2024-10-28_18-09-10')
- ray_wrapper_training_function_7fbee_00001: FileNotFoundError('Could not fetch metrics for ray_wrapper_training_function_7fbee_00001: both result.json and progress.csv were not fou