In [1]:
from functools import partial
import os
import tempfile
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
import ray
from ray import tune
from ray import train
from ray.train import Checkpoint, get_checkpoint
from ray.tune.schedulers import ASHAScheduler
import ray.cloudpickle as pickle
from ray.tune.syncer import SyncConfig
import shutil

from DataObjects import DataLoader
from Architectures.SimpleCNN import SimpleCNN
from Architectures.OptimalCNN import OptimalCNN
from Architectures.StochasticDepthCNN import StochasticDepthCNN

from typing import Optional, Tuple

from utils import save_model, load_model


In [2]:
def train_model(model: nn.Module, train_loader: DataLoader, val_loader: DataLoader,
                num_epochs: int = 10, lr: float = 0.001,
                device: torch.device = None) -> None:
    """Fit ``model`` with Adam and cross-entropy, printing metrics once per epoch.

    Every epoch performs one optimisation pass over ``train_loader`` followed
    by a gradient-free pass over ``val_loader``. Reported losses are per-sample
    means (each batch loss is weighted by its batch size) and accuracies are
    the fraction of highest-scoring class indices that equal the labels.

    Args:
        model: Network to optimise; it is moved to ``device`` in place.
        train_loader: Iterable of batches exposing ``.data`` and ``.labels``.
        val_loader: Iterable with the same batch interface, used for validation.
        num_epochs: Number of passes over ``train_loader``.
        lr: Learning rate handed to ``optim.Adam``.
        device: Target device; when ``None`` CUDA is picked if available,
            otherwise the CPU.

    Returns:
        None. Progress is reported through ``print`` only.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    loss_fn: nn.Module = nn.CrossEntropyLoss()
    adam: torch.optim.Optimizer = optim.Adam(model.parameters(), lr=lr)

    def run_pass(loader, update_weights):
        """Iterate ``loader`` once and return (mean per-sample loss, accuracy)."""
        loss_sum = 0.0
        hits = 0
        seen = 0
        for batch in loader:
            inputs = batch.data.to(device)
            labels = batch.labels.to(device)
            if update_weights:
                adam.zero_grad()
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            if update_weights:
                loss.backward()
                adam.step()

            # The criterion returns a batch mean; scale it back to a batch sum
            # so the final division yields a true per-sample average.
            loss_sum += loss.item() * inputs.size(0)
            preds = torch.max(outputs, 1)[1]
            hits += (preds == labels).sum().item()
            seen += labels.size(0)
        return loss_sum / seen, hits / seen

    for epoch in range(num_epochs):
        model.train()
        avg_train_loss, train_acc = run_pass(train_loader, update_weights=True)

        model.eval()
        with torch.no_grad():
            avg_val_loss, val_acc = run_pass(val_loader, update_weights=False)

        print(f"Epoch {epoch+1}/{num_epochs} - Train loss: {avg_train_loss:.4f}, Train acc: {train_acc:.4f} | Val loss: {avg_val_loss:.4f}, Val acc: {val_acc:.4f}")

def infer(model: nn.Module, data_loader: DataLoader,
          device: torch.device = None) -> list:
    """Return the predicted class index for every sample in ``data_loader``.

    Args:
        model: Trained network; it is moved to ``device`` in place and switched
            to evaluation mode.
        data_loader: Iterable of batches exposing a ``.data`` tensor. Labels,
            if present, are not read.
        device: Device to run on; when ``None`` CUDA is picked if available,
            otherwise the CPU.

    Returns:
        A flat list of integer class indices, in the order the loader yields
        its samples. An empty loader yields an empty list.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Inputs are moved to `device` below, so the model must live there as well;
    # otherwise a CPU-resident model fails with a device mismatch as soon as
    # CUDA is auto-selected. This matches what train_model/evaluate already do.
    model.to(device)
    model.eval()
    predictions: list = []

    with torch.no_grad():
        for batch in data_loader:
            inputs = batch.data.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            # Bring indices back to the host before converting to Python ints.
            predictions.extend(preds.cpu().tolist())

    return predictions


def evaluate(model: nn.Module,
             test_loader: Optional[DataLoader] = None,
             device: Optional[torch.device] = None) -> Tuple[float, float]:
    """Compute the mean cross-entropy loss and accuracy of ``model`` on a loader.

    Args:
        model: Network to score; it is moved to ``device`` in place and put in
            evaluation mode.
        test_loader: Iterable of batches exposing ``.data`` and ``.labels``.
            When ``None`` a loader over ``Data/Data_converted/test`` is built
            with ``batch_size=64`` and ``shuffle=True``.
        device: Device to run on; when ``None`` CUDA is picked if available,
            otherwise the CPU.

    Returns:
        ``(avg_test_loss, test_acc)``: the per-sample mean loss and the
        fraction of correctly classified samples. Both are also printed.
    """
    if test_loader is None:
        test_loader = DataLoader(os.path.join("Data", "Data_converted", "test"),
                                 batch_size=64, shuffle=True)

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)
    loss_fn = nn.CrossEntropyLoss()
    model.eval()

    loss_sum, hits, seen = 0.0, 0, 0
    with torch.no_grad():
        for batch in test_loader:
            inputs, labels = batch.data.to(device), batch.labels.to(device)
            outputs = model(inputs)
            # Weight the batch-mean loss by batch size to average per sample.
            loss_sum += loss_fn(outputs, labels).item() * inputs.size(0)
            hits += (torch.max(outputs, 1)[1] == labels).sum().item()
            seen += labels.size(0)

    avg_test_loss = loss_sum / seen
    test_acc = hits / seen

    print(f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_acc:.4f}")
    return avg_test_loss, test_acc

In [3]:
# Resolve the train/validation folders to absolute paths and build one loader per split.
train_dir = os.path.abspath("Data/Data_converted/train")
val_dir = os.path.abspath("Data/Data_converted/valid")

# Training batches are shuffled; validation keeps a fixed order.
# NOTE(review): max_per_class presumably caps the samples taken per class —
# confirm against DataObjects.DataLoader.
train_loader = DataLoader(
    train_dir,
    batch_size=64,
    shuffle=True,
    max_per_class=150,
)
val_loader = DataLoader(
    val_dir,
    batch_size=64,
    shuffle=False,
    max_per_class=150,
)

In [4]:
def train_model_tune(config, train_loader, val_loader, num_epochs=10, device=None):
    """Ray Tune trainable: train a fresh ``OptimalCNN`` and report validation metrics.

    A new ``OptimalCNN(num_classes=10)`` is optimised with Adam at
    ``config["lr"]``. After every epoch the validation accuracy and per-sample
    mean validation loss are sent to Tune under the keys ``"accuracy"`` and
    ``"loss"``. Training metrics are computed but not reported.

    NOTE(review): ``config["lr"]`` is the only key read here. Any other sampled
    hyperparameter (e.g. ``"batch_size"``) has no effect on a trial, because the
    loaders arrive pre-built with their own batch size.

    Args:
        config: Hyperparameter dict supplied by Tune; must contain ``"lr"``.
        train_loader: Iterable of batches exposing ``.data`` and ``.labels``.
        val_loader: Iterable with the same batch interface, used for validation.
        num_epochs: Number of epochs, i.e. number of reports per full trial.
        device: Device to run on; when ``None`` CUDA is picked if available,
            otherwise the CPU.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The architecture is fixed; only the learning rate comes from `config`.
    net = OptimalCNN(num_classes=10).to(device)
    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(net.parameters(), lr=config["lr"])

    def run_pass(loader, update_weights):
        """Iterate ``loader`` once and return (mean per-sample loss, accuracy)."""
        loss_sum, hits, seen = 0.0, 0, 0
        for batch in loader:
            inputs = batch.data.to(device)
            labels = batch.labels.to(device)

            if update_weights:
                adam.zero_grad()
            outputs = net(inputs)
            loss = loss_fn(outputs, labels)
            if update_weights:
                loss.backward()
                adam.step()

            # Weight the batch-mean loss by batch size to average per sample.
            loss_sum += loss.item() * inputs.size(0)
            preds = torch.max(outputs, 1)[1]
            hits += (preds == labels).sum().item()
            seen += labels.size(0)
        return loss_sum / seen, hits / seen

    for _ in range(num_epochs):
        # Optimisation pass; its metrics are computed but intentionally unused.
        net.train()
        run_pass(train_loader, update_weights=True)

        # Validation pass without gradient tracking.
        net.eval()
        with torch.no_grad():
            avg_val_loss, val_acc = run_pass(val_loader, update_weights=False)

        # One report per epoch is what the scheduler counts as an iteration.
        tune.report({"accuracy": val_acc, "loss": avg_val_loss })

In [5]:
@ray.remote
def setup_worker():
    """Copy the converted dataset into ``<cwd>/worker_data`` of the executing process.

    Runs as a Ray remote task, so "cwd" is the working directory of whichever
    worker process executes this call; one ``.remote()`` invocation prepares
    that one location only.

    NOTE(review): nothing in the visible notebook reads from ``worker_data`` —
    the trials receive ready-made loaders — so confirm this copy is still needed.

    Returns:
        A human-readable status string containing the ``worker_data`` path.

    Raises:
        OSError: propagated from ``shutil.copytree`` if the source dataset
            directory is missing or cannot be copied.
    """
    worker_data_path = os.path.join(os.getcwd(), "worker_data")

    # exist_ok=True replaces the racy "check, then create" pair: two tasks
    # starting at the same time can no longer fail with FileExistsError.
    os.makedirs(worker_data_path, exist_ok=True)

    # Copy the dataset only if it is not already present at the destination.
    source_data_path = os.path.abspath("Data/Data_converted")
    target_data_path = os.path.join(worker_data_path, "Data_converted")
    if not os.path.exists(target_data_path):
        shutil.copytree(source_data_path, target_data_path)

    return f"Worker setup complete: {worker_data_path}"

In [6]:
# Search space handed to Ray Tune: log-uniform learning rate plus a categorical batch size.
# NOTE(review): train_model_tune only reads config["lr"]; "batch_size" is sampled and shown
# in the results table but does not change how a trial trains.
config = {
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([2, 4, 8, 16, 32, 64, 128]),
}

# ASHA stops weak trials early, ranking them by the reported "accuracy" (higher is better).
scheduler = ASHAScheduler(
    metric="accuracy",
    mode="max",
    max_t=10,
    grace_period=2,
    reduction_factor=2,
)

In [7]:
# Test split: fixed order, and no max_per_class argument is passed here.
test_dir = os.path.abspath("Data/Data_converted/test")
test_loader = DataLoader(
    test_dir,
    batch_size=64,
    shuffle=False,
)

In [8]:
# Start a Ray runtime; ignore_reinit_error lets this cell be re-run in the same kernel.
ray.init(ignore_reinit_error=True)

# Run this before training: block until the dataset-copy task has finished.
ray.get(setup_worker.remote())

# Launch hyperparameter search. with_parameters binds the pre-built loaders to the trainable.
# Note: despite its name, `tuner` holds whatever tune.run returns (the finished experiment's
# result object), which is what get_best_trial is called on below.
tuner = tune.run(
    tune.with_parameters(
        train_model_tune,
        train_loader=train_loader,
        val_loader=val_loader,
    ),
    config=config,
    num_samples=10,  # Number of trials
    scheduler=scheduler,
)

# Get the best configuration; scope="all" ranks trials over every reported epoch.
best_trial = tuner.get_best_trial(metric="accuracy", mode="max", scope="all")
best_config = best_trial.config
print("Best hyperparameters:", best_config)

2025-03-22 19:29:59,313	INFO worker.py:1852 -- Started a local Ray instance.
2025-03-22 19:30:00,015	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2025-03-22 19:30:25
Running for:,00:00:25.38
Memory:,18.1/31.3 GiB

Trial name,status,loc,batch_size,lr
train_model_tune_a9bde_00000,RUNNING,192.168.1.43:88898,32,0.000260397
train_model_tune_a9bde_00001,RUNNING,192.168.1.43:88900,16,0.00391419
train_model_tune_a9bde_00002,RUNNING,192.168.1.43:88902,4,0.000175241
train_model_tune_a9bde_00003,RUNNING,192.168.1.43:88903,8,0.00105326
train_model_tune_a9bde_00004,RUNNING,192.168.1.43:88905,32,0.00104536
train_model_tune_a9bde_00005,RUNNING,192.168.1.43:88904,32,0.00657373
train_model_tune_a9bde_00006,RUNNING,192.168.1.43:88901,32,0.00765738
train_model_tune_a9bde_00007,RUNNING,192.168.1.43:88906,16,0.00684927
train_model_tune_a9bde_00008,RUNNING,192.168.1.43:88899,16,0.000137185
train_model_tune_a9bde_00009,RUNNING,192.168.1.43:88907,8,0.00200625


0,1
Current time:,2025-03-22 19:34:13
Running for:,00:04:13.60
Memory:,12.2/31.3 GiB

Trial name,status,loc,batch_size,lr,iter,total time (s),accuracy,loss
train_model_tune_a9bde_00000,TERMINATED,192.168.1.43:88898,32,0.000260397,10,247.957,0.506667,1.4842
train_model_tune_a9bde_00001,TERMINATED,192.168.1.43:88900,16,0.00391419,2,53.6121,0.265333,2.10645
train_model_tune_a9bde_00002,TERMINATED,192.168.1.43:88902,4,0.000175241,8,205.745,0.471333,1.54122
train_model_tune_a9bde_00003,TERMINATED,192.168.1.43:88903,8,0.00105326,10,250.899,0.486667,1.74761
train_model_tune_a9bde_00004,TERMINATED,192.168.1.43:88905,32,0.00104536,10,243.51,0.484,1.79236
train_model_tune_a9bde_00005,TERMINATED,192.168.1.43:88904,32,0.00657373,2,61.9092,0.28,2.4336
train_model_tune_a9bde_00006,TERMINATED,192.168.1.43:88901,32,0.00765738,2,65.701,0.276667,2.12934
train_model_tune_a9bde_00007,TERMINATED,192.168.1.43:88906,16,0.00684927,2,65.6468,0.262,2.15395
train_model_tune_a9bde_00008,TERMINATED,192.168.1.43:88899,16,0.000137185,4,115.861,0.440667,1.61387
train_model_tune_a9bde_00009,TERMINATED,192.168.1.43:88907,8,0.00200625,4,119.48,0.383333,1.81545


2025-03-22 19:34:13,639	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/piotr/ray_results/train_model_tune_2025-03-22_19-30-00' in 0.0041s.
2025-03-22 19:34:13,643	INFO tune.py:1041 -- Total run time: 253.63 seconds (253.60 seconds for the tuning loop).


Best hyperparameters: {'lr': 0.0002603965420459722, 'batch_size': 32}


In [9]:
# Retrain a fresh OptimalCNN from scratch with the learning rate of the best trial.
best_model = OptimalCNN(num_classes=10)
train_model(
    best_model,
    train_loader,
    val_loader,
    num_epochs=10,
    lr=best_config["lr"],
)

Epoch 1/10 - Train loss: 2.0566, Train acc: 0.2653 | Val loss: 2.2685, Val acc: 0.2033
Epoch 2/10 - Train loss: 1.4425, Train acc: 0.5180 | Val loss: 1.6962, Val acc: 0.3873
Epoch 3/10 - Train loss: 1.0205, Train acc: 0.7033 | Val loss: 1.6205, Val acc: 0.4273
Epoch 4/10 - Train loss: 0.6930, Train acc: 0.8300 | Val loss: 1.5131, Val acc: 0.4633
Epoch 5/10 - Train loss: 0.3994, Train acc: 0.9527 | Val loss: 1.5211, Val acc: 0.4667
Epoch 6/10 - Train loss: 0.2119, Train acc: 0.9847 | Val loss: 1.4825, Val acc: 0.4820
Epoch 7/10 - Train loss: 0.1126, Train acc: 0.9967 | Val loss: 1.5343, Val acc: 0.4780
Epoch 8/10 - Train loss: 0.0632, Train acc: 0.9993 | Val loss: 1.4564, Val acc: 0.5053
Epoch 9/10 - Train loss: 0.0418, Train acc: 1.0000 | Val loss: 1.4863, Val acc: 0.5067
Epoch 10/10 - Train loss: 0.0288, Train acc: 1.0000 | Val loss: 1.4263, Val acc: 0.5133


In [10]:
# Score the retrained model on the test loader; the returned (loss, accuracy) tuple is the cell output.
evaluate(best_model, test_loader=test_loader)

Test Loss: 1.9096, Test Accuracy: 0.3955


(1.9096140019628736, 0.3955111111111111)