In [100]:
# Shut down the current Ray runtime; a later cell calls ray.init() again.
# NOTE(review): presumably required so that re-running ray.init() in the same
# kernel does not fail on an already-initialized runtime -- confirm.
ray.shutdown()


In [102]:
import argparse
import os
import tempfile
import time  # Import the time module

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from filelock import FileLock
from torchvision import datasets, transforms

import ray
from ray import train, tune
from ray.train import Checkpoint
from ray.tune.schedulers import AsyncHyperBandScheduler

# Per-call sample budgets: train_func stops consuming batches once
# batch_idx * len(batch) exceeds EPOCH_SIZE, and test_func does the same
# with TEST_SIZE.
# Change these values if you want the training to run quicker or slower.
EPOCH_SIZE = 6000
TEST_SIZE = 1000


class ConvNet(nn.Module):
    """Small classifier: one 3x3 convolution followed by one linear layer.

    ``forward`` reshapes the pooled feature maps into rows of 192 values,
    which is what a 1x28x28 input produces (3 channels of 8x8 features).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=3, kernel_size=3)
        self.fc = nn.Linear(in_features=192, out_features=10)

    def forward(self, x):
        """Return per-class log-probabilities (10 columns) for batch ``x``."""
        pooled = F.max_pool2d(self.conv1(x), kernel_size=3)
        features = F.relu(pooled).view(-1, 192)
        logits = self.fc(features)
        return F.log_softmax(logits, dim=1)


def train_func(model, optimizer, train_loader, device=None):
    """Run one budgeted pass of SGD-style updates over ``train_loader``.

    Args:
        model: Module producing log-probabilities (fed to ``F.nll_loss``).
        optimizer: Optimizer stepping ``model``'s parameters.
        train_loader: Iterable yielding ``(data, target)`` batches.
        device: Device each batch is moved to; CPU is used when falsy.

    The pass ends when the loader is exhausted or as soon as
    ``batch_idx * len(batch)`` exceeds the module-level ``EPOCH_SIZE``.
    """
    target_device = device if device else torch.device("cpu")
    model.train()
    for step, batch in enumerate(train_loader):
        inputs, labels = batch
        # Budget check happens before any work is done on this batch.
        if step * len(inputs) > EPOCH_SIZE:
            break
        inputs = inputs.to(target_device)
        labels = labels.to(target_device)
        optimizer.zero_grad()
        batch_loss = F.nll_loss(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()


def test_func(model, data_loader, device=None):
    """Estimate the classification accuracy of ``model`` on ``data_loader``.

    Args:
        model: Module mapping an input batch to per-class scores.
        data_loader: Iterable yielding ``(data, target)`` batches.
        device: Device each batch is moved to; CPU is used when falsy.

    Returns:
        Fraction of evaluated samples whose arg-max prediction equals the
        target. Evaluation stops once ``batch_idx * len(batch)`` exceeds the
        module-level ``TEST_SIZE``. Returns ``0.0`` when no sample was
        evaluated (e.g. an empty loader).
    """
    device = device or torch.device("cpu")
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(data_loader):
            if batch_idx * len(data) > TEST_SIZE:
                break
            data, target = data.to(device), target.to(device)
            # Gradients are already disabled here, so the legacy ``.data``
            # detour is unnecessary; argmax yields the predicted class index.
            predicted = model(data).argmax(dim=1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    # Guard the division: with nothing evaluated the ratio is undefined and
    # would otherwise raise ZeroDivisionError.
    return correct / total if total else 0.0


def get_data_loaders(batch_size=64):
    """Build shuffled MNIST train and test loaders.

    Args:
        batch_size: Number of samples per batch for both loaders.

    Returns:
        ``(train_loader, test_loader)``.
    """
    to_tensor = transforms.ToTensor()
    normalize = transforms.Normalize((0.1307,), (0.3081,))
    mnist_transforms = transforms.Compose([to_tensor, normalize])

    # Several workers may try to download the dataset at the same time;
    # the file lock serializes that so they do not overwrite each other.
    lock_path = os.path.expanduser("~/data.lock")
    with FileLock(lock_path):
        train_loader, test_loader = (
            torch.utils.data.DataLoader(
                datasets.MNIST(
                    "~/data",
                    train=is_train_split,
                    download=True,
                    transform=mnist_transforms,
                ),
                batch_size=batch_size,
                shuffle=True,
            )
            for is_train_split in (True, False)
        )
    return train_loader, test_loader


def train_mnist(config):
    """Trainable that alternates training and evaluation, reporting each round.

    Args:
        config: Mapping with required keys ``"lr"`` and ``"momentum"`` and an
            optional ``"should_checkpoint"`` flag (default ``False``).

    The loop itself never exits; every iteration reports
    ``{"mean_accuracy": ...}`` through ``train.report`` and, when
    checkpointing is enabled, attaches the model's state dict.
    """
    should_checkpoint = config.get("should_checkpoint", False)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_loader, test_loader = get_data_loaders()
    net = ConvNet().to(device)
    sgd = optim.SGD(net.parameters(), lr=config["lr"], momentum=config["momentum"])

    while True:
        train_func(net, sgd, train_loader, device)
        metrics = {"mean_accuracy": test_func(net, test_loader, device)}

        if not should_checkpoint:
            train.report(metrics)
            continue

        # The report must happen while the temporary directory still exists.
        with tempfile.TemporaryDirectory() as ckpt_dir:
            weights_path = os.path.join(ckpt_dir, "model.pt")
            torch.save(net.state_dict(), weights_path)
            train.report(metrics, checkpoint=Checkpoint.from_directory(ckpt_dir))


if __name__ == "__main__":
    # Entry point: run a Ray Tune hyperparameter search over train_mnist and
    # print how long each phase took.
    start_time = time.time()  # Start measuring time

    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument(
        "--cuda", action="store_true", default=False, help="Enables GPU training"
    )
    parser.add_argument(
        "--smoke-test", action="store_true", help="Finish quickly for testing"
    )
    # parse_known_args ignores unrecognized arguments instead of exiting
    # (e.g. ones supplied by whatever launched the interpreter).
    args, _ = parser.parse_known_args()

    # Honor --smoke-test: shrink the search to one short trial so the script
    # can be exercised quickly. Without the flag the full budget is used.
    num_samples = 1 if args.smoke_test else 50
    max_iterations = 5 if args.smoke_test else 100

    # Initialize Ray, capped at 16 logical CPUs (hard-coded, not detected).
    ray_init_start = time.time()  # Time for Ray initialization
    ray.init(num_cpus=16)
    ray_init_end = time.time()
    print(f"Ray initialization time: {ray_init_end - ray_init_start:.2f} seconds")

    # for early stopping
    sched = AsyncHyperBandScheduler()

    # Allocate resources for each trial: 2 CPUs, plus one GPU when --cuda is
    # given. With the 16-CPU cap above, at most 8 trials fit concurrently.
    resources_per_trial = {"cpu": 2, "gpu": int(args.cuda)}

    tuner_start = time.time()  # Time for tuning
    tuner = tune.Tuner(
        tune.with_resources(train_mnist, resources=resources_per_trial),
        tune_config=tune.TuneConfig(
            metric="mean_accuracy",
            mode="max",
            scheduler=sched,
            num_samples=num_samples,  # Number of trials to run
        ),
        run_config=train.RunConfig(
            name="exp",
            # A trial ends at 98% accuracy or after max_iterations reports.
            stop={
                "mean_accuracy": 0.98,
                "training_iteration": max_iterations,
            },
        ),
        param_space={
            "lr": tune.loguniform(1e-4, 1e-2),
            "momentum": tune.uniform(0.1, 0.9),
        },
    )
    results = tuner.fit()
    tuner_end = time.time()
    print(f"Tuning time: {tuner_end - tuner_start:.2f} seconds")

    best_result = results.get_best_result()
    print("Best config is:", best_result.config)
    print("Best accuracy is:", best_result.metrics["mean_accuracy"])

    assert not results.errors

    total_time = time.time() - start_time  # Total script time
    print(f"Total execution time: {total_time:.2f} seconds")


0,1
Current time:,2025-01-09 13:53:13
Running for:,00:05:43.01
Memory:,17.1/31.9 GiB

Trial name,status,loc,lr,momentum,acc,iter,total time (s)
train_mnist_e33e1_00000,TERMINATED,127.0.0.1:38528,0.000792714,0.469882,0.196289,1,2.42563
train_mnist_e33e1_00001,TERMINATED,127.0.0.1:14064,0.00166862,0.155109,0.288086,1,2.2048
train_mnist_e33e1_00002,TERMINATED,127.0.0.1:24988,0.00219065,0.269254,0.791992,4,8.22459
train_mnist_e33e1_00003,TERMINATED,127.0.0.1:43528,0.000373667,0.561524,0.547852,16,31.4575
train_mnist_e33e1_00004,TERMINATED,127.0.0.1:18032,0.000204919,0.171918,0.129883,1,2.23506
train_mnist_e33e1_00005,TERMINATED,127.0.0.1:40156,0.00173261,0.231578,0.921875,100,197.447
train_mnist_e33e1_00006,TERMINATED,127.0.0.1:21384,0.00419464,0.136926,0.942383,100,198.187
train_mnist_e33e1_00007,TERMINATED,127.0.0.1:38140,0.00258761,0.624098,0.901367,16,31.9017
train_mnist_e33e1_00008,TERMINATED,127.0.0.1:33724,0.000236157,0.281247,0.0966797,1,2.27079
train_mnist_e33e1_00009,TERMINATED,127.0.0.1:38672,0.0017552,0.881146,0.972656,100,198.793


2025-01-09 13:53:13,026	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'C:/Users/froti/ray_results/exp' in 0.0552s.
2025-01-09 13:53:13,038	INFO tune.py:1041 -- Total run time: 343.04 seconds (342.95 seconds for the tuning loop).


Tuning time: 343.14 seconds
Best config is: {'lr': 0.0017551968239715395, 'momentum': 0.8811457906279782}
Best accuracy is: 0.97265625
Total execution time: 348.24 seconds


In [120]:
import argparse
import os
import tempfile
import time
import random
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from filelock import FileLock
from torchvision import datasets, transforms

# Per-call sample budgets: train_func stops consuming batches once
# batch_idx * len(batch) exceeds EPOCH_SIZE, and test_func does the same
# with TEST_SIZE.
# Change these values if you want the training to run quicker or slower.
EPOCH_SIZE = 6000
TEST_SIZE = 1000


class ConvNet(nn.Module):
    """Small classifier: one 3x3 convolution followed by one linear layer.

    ``forward`` reshapes the pooled feature maps into rows of 192 values,
    which is what a 1x28x28 input produces (3 channels of 8x8 features).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=3, kernel_size=3)
        self.fc = nn.Linear(in_features=192, out_features=10)

    def forward(self, x):
        """Return per-class log-probabilities (10 columns) for batch ``x``."""
        pooled = F.max_pool2d(self.conv1(x), kernel_size=3)
        features = F.relu(pooled).view(-1, 192)
        logits = self.fc(features)
        return F.log_softmax(logits, dim=1)


def train_func(model, optimizer, train_loader, device=None):
    """Run one budgeted pass of SGD-style updates over ``train_loader``.

    Args:
        model: Module producing log-probabilities (fed to ``F.nll_loss``).
        optimizer: Optimizer stepping ``model``'s parameters.
        train_loader: Iterable yielding ``(data, target)`` batches.
        device: Device each batch is moved to; CPU is used when falsy.

    The pass ends when the loader is exhausted or as soon as
    ``batch_idx * len(batch)`` exceeds the module-level ``EPOCH_SIZE``.
    """
    target_device = device if device else torch.device("cpu")
    model.train()
    for step, batch in enumerate(train_loader):
        inputs, labels = batch
        # Budget check happens before any work is done on this batch.
        if step * len(inputs) > EPOCH_SIZE:
            break
        inputs = inputs.to(target_device)
        labels = labels.to(target_device)
        optimizer.zero_grad()
        batch_loss = F.nll_loss(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()


def test_func(model, data_loader, device=None):
    """Estimate the classification accuracy of ``model`` on ``data_loader``.

    Args:
        model: Module mapping an input batch to per-class scores.
        data_loader: Iterable yielding ``(data, target)`` batches.
        device: Device each batch is moved to; CPU is used when falsy.

    Returns:
        Fraction of evaluated samples whose arg-max prediction equals the
        target. Evaluation stops once ``batch_idx * len(batch)`` exceeds the
        module-level ``TEST_SIZE``. Returns ``0.0`` when no sample was
        evaluated (e.g. an empty loader).
    """
    device = device or torch.device("cpu")
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(data_loader):
            if batch_idx * len(data) > TEST_SIZE:
                break
            data, target = data.to(device), target.to(device)
            # Gradients are already disabled here, so the legacy ``.data``
            # detour is unnecessary; argmax yields the predicted class index.
            predicted = model(data).argmax(dim=1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    # Guard the division: with nothing evaluated the ratio is undefined and
    # would otherwise raise ZeroDivisionError.
    return correct / total if total else 0.0


def get_data_loaders(batch_size=64):
    """Build shuffled MNIST train and test loaders.

    Args:
        batch_size: Number of samples per batch for both loaders.

    Returns:
        ``(train_loader, test_loader)``.
    """
    to_tensor = transforms.ToTensor()
    normalize = transforms.Normalize((0.1307,), (0.3081,))
    mnist_transforms = transforms.Compose([to_tensor, normalize])

    # Several workers may try to download the dataset at the same time;
    # the file lock serializes that so they do not overwrite each other.
    lock_path = os.path.expanduser("~/data.lock")
    with FileLock(lock_path):
        train_loader, test_loader = (
            torch.utils.data.DataLoader(
                datasets.MNIST(
                    "~/data",
                    train=is_train_split,
                    download=True,
                    transform=mnist_transforms,
                ),
                batch_size=batch_size,
                shuffle=True,
            )
            for is_train_split in (True, False)
        )
    return train_loader, test_loader


def train_mnist(config):
    """Train a ConvNet on MNIST with patience-based early stopping.

    Args:
        config: Mutable dict of hyperparameters. Required keys: ``"lr"`` and
            ``"momentum"``. Optional keys: ``"epochs"`` (maximum number of
            epochs, default 10), ``"patience"`` (epochs without improvement
            tolerated before stopping, default 3) and ``"should_checkpoint"``
            (default ``False``).

    Returns:
        ``{"mean_accuracy": acc}`` for the last epoch that ran, which is not
        necessarily the best one. When no epoch runs (``"epochs"`` <= 0) the
        accuracy is reported as ``0.0``.

    Side effects:
        Overwrites ``config["epochs"]`` with the number of epochs actually
        run (callers read it back) and prints the accuracy of every epoch.
    """
    should_checkpoint = config.get("should_checkpoint", False)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    train_loader, test_loader = get_data_loaders()
    model = ConvNet().to(device)

    optimizer = optim.SGD(
        model.parameters(), lr=config["lr"], momentum=config["momentum"]
    )

    # Early stopping parameters
    patience = config.get("patience", 3)  # How many epochs to wait before stopping
    best_acc = 0.0  # Start with the best accuracy being 0
    epochs_without_improvement = 0
    epochs_run = 0  # Number of epochs actually executed

    # Bound before the loop so that a non-positive epoch count still returns
    # a well-formed result instead of raising UnboundLocalError.
    metrics = {"mean_accuracy": 0.0}

    # Training loop
    for epoch in range(config.get("epochs", 10)):  # Default to 10 epochs
        epochs_run = epoch + 1
        train_func(model, optimizer, train_loader, device)
        acc = test_func(model, test_loader, device)
        metrics = {"mean_accuracy": acc}
        print(f"Epoch {epoch + 1}, Accuracy: {acc:.4f}")

        # Early stopping check
        if acc > best_acc:
            best_acc = acc
            epochs_without_improvement = 0  # Reset counter if accuracy improves
        else:
            epochs_without_improvement += 1

        # If no improvement for 'patience' epochs, stop early
        if epochs_without_improvement >= patience:
            print(f"Early stopping triggered after {patience} epochs with no improvement.")
            break

        if should_checkpoint:
            # NOTE: the temporary directory is removed when the ``with``
            # block exits, so the saved state dict is not persisted unless
            # it is handed to a checkpoint consumer inside this block.
            with tempfile.TemporaryDirectory() as tempdir:
                torch.save(model.state_dict(), os.path.join(tempdir, "model.pt"))

    # Save the actual number of epochs run for the trial
    config["epochs"] = epochs_run

    return metrics


def sample_hyperparameters(num_samples):
    """Draw ``num_samples`` random ``{"lr", "momentum"}`` configurations.

    Learning rates are log-uniform on [1e-4, 1e-2] and momenta are uniform
    on [0.1, 0.9]. Both are drawn from numpy's global random state, with all
    learning rates drawn before any momentum.
    """
    log_low, log_high = np.log(1e-4), np.log(1e-2)
    # Uniform in log space, then exponentiate -> log-uniform learning rates.
    learning_rates = np.exp(np.random.uniform(log_low, log_high, size=num_samples))
    momenta = np.random.uniform(0.1, 0.9, size=num_samples)  # uniform
    keys = ("lr", "momentum")
    return [dict(zip(keys, pair)) for pair in zip(learning_rates, momenta)]


if __name__ == "__main__":
    # Sequential random search: each sampled configuration is trained in
    # turn and the one with the highest returned accuracy is remembered.
    start_time = time.time()

    num_trials = 50
    best_accuracy, best_config = 0, None

    # Sample 50 hyperparameter configurations
    hyperparameter_configs = sample_hyperparameters(num_samples=num_trials)

    for trial_idx, config in enumerate(hyperparameter_configs):
        trial_number = trial_idx + 1
        print(f"Trial {trial_number}/{num_trials}")

        # Upper bound on epochs; train_mnist overwrites this entry with the
        # number of epochs it actually ran.
        config["epochs"] = 100
        metrics = train_mnist(config)

        # Keep the configuration only if it strictly beats the best so far.
        trial_accuracy = metrics["mean_accuracy"]
        if trial_accuracy > best_accuracy:
            best_accuracy, best_config = trial_accuracy, config

        # Report how many epochs this trial executed.
        epochs_executed = config["epochs"]
        print(f"Trial {trial_number} ran for {epochs_executed} epochs.")

    print(f"\nBest configuration: {best_config}")
    print(f"Best accuracy: {best_accuracy:.4f}")

    total_time = time.time() - start_time  # Total script time
    print(f"Total execution time: {total_time:.2f} seconds")



Trial 1/50
Epoch 1, Accuracy: 0.8047
Epoch 2, Accuracy: 0.8809
Epoch 3, Accuracy: 0.8828
Epoch 4, Accuracy: 0.8857
Epoch 5, Accuracy: 0.8945
Epoch 6, Accuracy: 0.8984
Epoch 7, Accuracy: 0.9121
Epoch 8, Accuracy: 0.9170
Epoch 9, Accuracy: 0.9150
Epoch 10, Accuracy: 0.9219
Epoch 11, Accuracy: 0.9219
Epoch 12, Accuracy: 0.9268
Epoch 13, Accuracy: 0.9404
Epoch 14, Accuracy: 0.9238
Epoch 15, Accuracy: 0.9258
Epoch 16, Accuracy: 0.9365
Early stopping triggered after 3 epochs with no improvement.
Trial 1 ran for 16 epochs.
Trial 2/50
Epoch 1, Accuracy: 0.7812
Epoch 2, Accuracy: 0.8506
Epoch 3, Accuracy: 0.8633
Epoch 4, Accuracy: 0.8887
Epoch 5, Accuracy: 0.8789
Epoch 6, Accuracy: 0.9102
Epoch 7, Accuracy: 0.8994
Epoch 8, Accuracy: 0.9004
Epoch 9, Accuracy: 0.9121
Epoch 10, Accuracy: 0.9033
Epoch 11, Accuracy: 0.9160
Epoch 12, Accuracy: 0.9170
Epoch 13, Accuracy: 0.8955
Epoch 14, Accuracy: 0.9209
Epoch 15, Accuracy: 0.9307
Epoch 16, Accuracy: 0.9150
Epoch 17, Accuracy: 0.9346
Epoch 18, Accurac