In [1]:
from functools import partial
import os
import tempfile
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
import ray
from ray import tune
from ray import train
from ray.train import Checkpoint, get_checkpoint
from ray.tune.schedulers import ASHAScheduler
import ray.cloudpickle as pickle
from ray.tune.syncer import SyncConfig
import shutil

from DataObjects import DataLoader
from Architectures.SimpleCNN import SimpleCNN
from Architectures.OptimalCNN import OptimalCNN
from Architectures.StochasticDepthCNN import StochasticDepthCNN

from typing import Optional, Tuple

from utils import save_model, load_model


In [2]:
def _run_epoch(model: nn.Module, loader, criterion: nn.Module, device: torch.device,
               optimizer: Optional[torch.optim.Optimizer] = None,
               split_name: str = "data") -> Tuple[float, float]:
    """Run one pass over ``loader`` and return ``(mean per-sample loss, accuracy)``.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch; otherwise the model is put in eval mode and gradients
    are disabled.

    Raises:
        ValueError: if ``loader`` yields no samples (the averages would be undefined).
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is equivalent to eval()

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.set_grad_enabled(is_training):
        for batch in loader:
            inputs = batch.data.to(device)
            labels = batch.labels.to(device)

            if is_training:
                optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            if is_training:
                loss.backward()
                optimizer.step()

            # The criterion returns a batch mean; re-weight by batch size so the
            # final average is per-sample even when the last batch is smaller.
            loss_sum += loss.item() * inputs.size(0)
            _, preds = torch.max(outputs, 1)
            n_correct += torch.sum(preds == labels).item()
            n_seen += labels.size(0)

    if n_seen == 0:
        raise ValueError(f"{split_name} loader yielded no samples")
    return loss_sum / n_seen, n_correct / n_seen


def train_model(model: nn.Module, train_loader: DataLoader, val_loader: DataLoader,
                num_epochs: int = 10, lr: float = 0.001,
                device: Optional[torch.device] = None) -> None:
    """Train ``model`` with Adam and cross-entropy, printing metrics after each epoch.

    Args:
        model: Network to train; it is moved to ``device`` and updated in place.
        train_loader: Iterable of batches exposing ``.data`` and ``.labels`` tensors.
        val_loader: Iterable of batches (same shape of object) used for validation.
        num_epochs: Number of full passes over ``train_loader``.
        lr: Learning rate passed to ``optim.Adam``.
        device: Target device; defaults to CUDA when available, otherwise CPU.

    Raises:
        ValueError: if either loader yields no samples.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    criterion: nn.Module = nn.CrossEntropyLoss()
    optimizer: torch.optim.Optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        avg_train_loss, train_acc = _run_epoch(
            model, train_loader, criterion, device, optimizer=optimizer, split_name="train")
        avg_val_loss, val_acc = _run_epoch(
            model, val_loader, criterion, device, split_name="validation")

        print(f"Epoch {epoch+1}/{num_epochs} - Train loss: {avg_train_loss:.4f}, Train acc: {train_acc:.4f} | Val loss: {avg_val_loss:.4f}, Val acc: {val_acc:.4f}")

def infer(model: nn.Module, data_loader: DataLoader,
          device: Optional[torch.device] = None) -> list:
    """Return the predicted class index for every sample in ``data_loader``.

    Args:
        model: Trained network; it is moved to ``device`` and put in eval mode.
        data_loader: Iterable of batches exposing a ``.data`` tensor.
        device: Target device; defaults to CUDA when available, otherwise CPU.

    Returns:
        A flat list of integer class indices, in the order the loader yields samples.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Inputs are moved to `device` below, so the model must live there too;
    # otherwise the forward pass fails with a device mismatch.
    model.to(device)
    model.eval()
    predictions: list = []

    with torch.no_grad():
        for batch in data_loader:
            inputs = batch.data.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            predictions.extend(preds.cpu().tolist())

    return predictions


def evaluate(model: nn.Module,
             test_loader: Optional[DataLoader] = None,
             device: Optional[torch.device] = None) -> Tuple[float, float]:
    """Compute the cross-entropy loss and accuracy of ``model`` on a test set.

    Args:
        model: Network to evaluate; it is moved to ``device`` and put in eval mode.
        test_loader: Iterable of batches exposing ``.data`` and ``.labels`` tensors.
            When omitted, a loader over ``Data/Data_converted/test`` is created.
        device: Target device; defaults to CUDA when available, otherwise CPU.

    Returns:
        ``(average per-sample loss, accuracy)``; the same values are also printed.

    Raises:
        ValueError: if the loader yields no samples.
    """
    if test_loader is None:
        test_dir = os.path.join("Data", "Data_converted", "test")
        # The metrics do not depend on sample order, so shuffling is unnecessary.
        test_loader = DataLoader(test_dir, batch_size=64, shuffle=False)

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)
    criterion = nn.CrossEntropyLoss()
    model.eval()

    test_loss = 0.0
    test_correct = 0
    total_test = 0

    with torch.no_grad():
        for batch in test_loader:
            inputs = batch.data.to(device)
            labels = batch.labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            # Re-weight the batch-mean loss by batch size to get a per-sample average.
            test_loss += loss.item() * inputs.size(0)
            _, preds = torch.max(outputs, 1)
            test_correct += torch.sum(preds == labels).item()
            total_test += labels.size(0)

    if total_test == 0:
        raise ValueError("test loader yielded no samples")

    avg_test_loss = test_loss / total_test
    test_acc = test_correct / total_test

    print(f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_acc:.4f}")
    return avg_test_loss, test_acc

In [3]:
# Resolve the dataset folders to absolute paths (relative to the current working directory).
_data_root = os.path.join("Data", "Data_converted")
train_dir = os.path.abspath(os.path.join(_data_root, "train"))
val_dir = os.path.abspath(os.path.join(_data_root, "valid"))

# Settings shared by both loaders. `max_per_class` is presumably a per-class
# sample cap — confirm against DataObjects.DataLoader.
_loader_kwargs = dict(batch_size=64, max_per_class=150)

# Only the training loader shuffles.
train_loader = DataLoader(train_dir, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_dir, shuffle=False, **_loader_kwargs)

In [4]:
def train_model_tune(config, train_loader, val_loader, num_epochs=10, device=None):
    """Ray Tune trainable: train a fresh ``SimpleCNN`` and report metrics every epoch.

    Args:
        config: Trial hyperparameters. Only ``config["lr"]`` (Adam learning rate) is read.
        train_loader: Iterable of batches exposing ``.data`` and ``.labels`` tensors.
        val_loader: Iterable of batches (same shape of object) used for validation.
        num_epochs: Number of epochs, i.e. the number of reports sent to Tune.
        device: Target device; defaults to CUDA when available, otherwise CPU.

    Raises:
        ValueError: if either loader yields no samples.

    NOTE(review): the loaders arrive already built, so a ``batch_size`` entry in
    ``config`` has no effect on training here.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # A new model is created for every trial so trials do not share weights.
    model = SimpleCNN(num_classes=10).to(device)

    # Loss function & optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config["lr"])

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        train_correct = 0
        total_train = 0

        for batch in train_loader:
            inputs = batch.data.to(device)
            labels = batch.labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Re-weight the batch-mean loss by batch size for a per-sample average.
            train_loss += loss.item() * inputs.size(0)
            _, preds = torch.max(outputs, 1)
            train_correct += torch.sum(preds == labels).item()
            total_train += labels.size(0)

        if total_train == 0:
            raise ValueError("train loader yielded no samples")
        avg_train_loss = train_loss / total_train
        train_acc = train_correct / total_train

        # Validation phase
        model.eval()
        val_loss = 0.0
        val_correct = 0
        total_val = 0

        with torch.no_grad():
            for batch in val_loader:
                inputs = batch.data.to(device)
                labels = batch.labels.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                val_loss += loss.item() * inputs.size(0)
                _, preds = torch.max(outputs, 1)
                val_correct += torch.sum(preds == labels).item()
                total_val += labels.size(0)

        if total_val == 0:
            raise ValueError("validation loader yielded no samples")
        avg_val_loss = val_loss / total_val
        val_acc = val_correct / total_val

        # Log to Ray Tune. "accuracy" and "loss" keep their original meaning
        # (validation metrics); the training metrics are reported as well so
        # over-fitting is visible per trial instead of being discarded.
        tune.report({
            "accuracy": val_acc,
            "loss": avg_val_loss,
            "train_accuracy": train_acc,
            "train_loss": avg_train_loss,
        })

In [5]:
@ray.remote
def setup_worker():
    """Copy the dataset into a ``worker_data`` folder under the task's working directory.

    The copy is skipped when ``worker_data/Data_converted`` already exists.

    Returns:
        A status string containing the ``worker_data`` path.

    NOTE(review): a single ``setup_worker.remote()`` call runs this once, on
    whichever worker Ray schedules it — confirm that is sufficient.
    """
    worker_data_path = os.path.join(os.getcwd(), "worker_data")

    # exist_ok avoids the check-then-create race if several tasks run this at once.
    os.makedirs(worker_data_path, exist_ok=True)

    # Copy the dataset only if it is not already in place.
    source_data_path = os.path.abspath("Data/Data_converted")
    target_data_path = os.path.join(worker_data_path, "Data_converted")
    if not os.path.exists(target_data_path):
        shutil.copytree(source_data_path, target_data_path)

    return f"Worker setup complete: {worker_data_path}"

In [6]:
# Hyperparameter search space sampled once per trial.
# NOTE(review): `train_model_tune` only reads config["lr"]; the sampled
# "batch_size" is never applied because the loaders are built beforehand.
config = dict(
    lr=tune.loguniform(1e-4, 1e-1),
    batch_size=tune.choice([2, 4, 8, 16, 32, 64, 128]),
)

# ASHA scheduler settings: maximise the reported "accuracy", with at most 10
# training iterations per trial, a grace period of 2 and a reduction factor of 2.
_asha_settings = dict(
    metric="accuracy",
    mode="max",
    max_t=10,
    grace_period=2,
    reduction_factor=2,
)
scheduler = ASHAScheduler(**_asha_settings)

In [7]:
# Held-out test split, loaded in a fixed order.
test_dir = os.path.abspath(os.path.join("Data", "Data_converted", "test"))
test_loader = DataLoader(test_dir, shuffle=False, batch_size=64)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

In [8]:
# Start Ray; a repeated call (e.g. when re-running this cell) is tolerated.
ray.init(ignore_reinit_error=True)

# Run the dataset setup task and wait for it to finish before training starts.
_setup_message = ray.get(setup_worker.remote())

# Bind the pre-built loaders to the trainable so each trial receives them.
_trainable = tune.with_parameters(
    train_model_tune,
    train_loader=train_loader,
    val_loader=val_loader,
)

# Launch the hyperparameter search: 10 sampled trials under the configured scheduler.
tuner = tune.run(
    _trainable,
    config=config,
    num_samples=10,
    scheduler=scheduler,
)

# Pick the trial with the highest "accuracy" over all of its reported iterations.
best_trial = tuner.get_best_trial("accuracy", mode="max", scope="all")
best_config = best_trial.config
print("Best hyperparameters:", best_config)

2025-03-22 18:53:35,420	INFO worker.py:1852 -- Started a local Ray instance.
2025-03-22 18:53:36,145	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2025-03-22 18:53:41
Running for:,00:00:05.21
Memory:,15.7/31.3 GiB

Trial name,status,loc,batch_size,lr
train_model_tune_940e0_00000,RUNNING,192.168.1.43:72280,64,0.00165554
train_model_tune_940e0_00001,RUNNING,192.168.1.43:72284,4,0.0638518
train_model_tune_940e0_00002,RUNNING,192.168.1.43:72281,32,0.0222563
train_model_tune_940e0_00003,RUNNING,192.168.1.43:72286,64,0.00027778
train_model_tune_940e0_00004,RUNNING,192.168.1.43:72285,4,0.000675632
train_model_tune_940e0_00005,RUNNING,192.168.1.43:72287,8,0.00105389
train_model_tune_940e0_00006,RUNNING,192.168.1.43:72283,2,0.000194524
train_model_tune_940e0_00007,RUNNING,192.168.1.43:72282,8,0.000317707
train_model_tune_940e0_00008,RUNNING,192.168.1.43:72289,4,0.000310958
train_model_tune_940e0_00009,RUNNING,192.168.1.43:72288,8,0.00079008


0,1
Current time:,2025-03-22 18:54:08
Running for:,00:00:32.15
Memory:,12.0/31.3 GiB

Trial name,status,loc,batch_size,lr,iter,total time (s),accuracy,loss
train_model_tune_940e0_00000,TERMINATED,192.168.1.43:72280,64,0.00165554,4,14.4199,0.254,2.24607
train_model_tune_940e0_00001,TERMINATED,192.168.1.43:72284,4,0.0638518,10,22.7939,0.1,2.303
train_model_tune_940e0_00002,TERMINATED,192.168.1.43:72281,32,0.0222563,2,7.73849,0.1,2.35757
train_model_tune_940e0_00003,TERMINATED,192.168.1.43:72286,64,0.00027778,4,13.5786,0.205333,2.17458
train_model_tune_940e0_00004,TERMINATED,192.168.1.43:72285,4,0.000675632,10,28.5338,0.321333,2.76285
train_model_tune_940e0_00005,TERMINATED,192.168.1.43:72287,8,0.00105389,8,24.3436,0.310667,2.8227
train_model_tune_940e0_00006,TERMINATED,192.168.1.43:72283,2,0.000194524,10,26.4257,0.359333,1.89345
train_model_tune_940e0_00007,TERMINATED,192.168.1.43:72282,8,0.000317707,10,29.315,0.368667,1.94307
train_model_tune_940e0_00008,TERMINATED,192.168.1.43:72289,4,0.000310958,2,8.35809,0.148,2.27066
train_model_tune_940e0_00009,TERMINATED,192.168.1.43:72288,8,0.00079008,2,7.75542,0.155333,2.27468


2025-03-22 18:54:08,312	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/piotr/ray_results/train_model_tune_2025-03-22_18-53-36' in 0.0041s.
2025-03-22 18:54:08,317	INFO tune.py:1041 -- Total run time: 32.17 seconds (32.14 seconds for the tuning loop).


Best hyperparameters: {'lr': 0.0003177066563316669, 'batch_size': 8}


In [9]:
# Retrain a fresh SimpleCNN from scratch using the learning rate of the best trial.
_best_lr = best_config["lr"]
best_model = SimpleCNN(num_classes=10).to(device)
train_model(
    best_model,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=10,
    lr=_best_lr,
)

Epoch 1/10 - Train loss: 8.1223, Train acc: 0.1113 | Val loss: 2.5757, Val acc: 0.1273
Epoch 2/10 - Train loss: 2.2834, Train acc: 0.1507 | Val loss: 2.1264, Val acc: 0.2187
Epoch 3/10 - Train loss: 2.0303, Train acc: 0.2573 | Val loss: 2.0488, Val acc: 0.2300
Epoch 4/10 - Train loss: 1.8522, Train acc: 0.3420 | Val loss: 1.9103, Val acc: 0.2993
Epoch 5/10 - Train loss: 1.7056, Train acc: 0.3860 | Val loss: 1.9201, Val acc: 0.3047
Epoch 6/10 - Train loss: 1.5467, Train acc: 0.4600 | Val loss: 1.9053, Val acc: 0.3053
Epoch 7/10 - Train loss: 1.4185, Train acc: 0.5187 | Val loss: 1.8441, Val acc: 0.3567
Epoch 8/10 - Train loss: 1.2454, Train acc: 0.5987 | Val loss: 1.8454, Val acc: 0.3480
Epoch 9/10 - Train loss: 1.1524, Train acc: 0.6180 | Val loss: 1.9262, Val acc: 0.3440
Epoch 10/10 - Train loss: 1.0402, Train acc: 0.6640 | Val loss: 1.8627, Val acc: 0.3553


In [10]:
# Final check on the held-out test split; the cell displays (loss, accuracy).
evaluate(model=best_model, test_loader=test_loader)

Test Loss: 2.2780, Test Accuracy: 0.3001


(2.2779569869995115, 0.30006666666666665)