# RayTune - Epochs, Learning Rate, Batch Size Tuning

## Imports

In [2]:
from functools import partial
import os
import random 

import matplotlib.pyplot as plt
import numpy as np

import torch
import torchvision
import torchvision.transforms as transforms

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import random_split

from ray import tune
from ray.air import Checkpoint, session
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.basic_variant import BasicVariantGenerator

In [3]:
torch.manual_seed(40)
random.seed(40)
np.random.seed(40)

## Run Tensorboard

In [4]:
%load_ext tensorboard
%tensorboard --logdir runs

# !tensorboard --logdir=runs

Reusing TensorBoard on port 6006 (pid 39724), started 1 day, 10:44:02 ago. (Use '!kill 39724' to kill it.)

## Load Fashion MNIST Dataset

In [5]:
def load_data(data_dir="./data"):
    # transforms
    transform = transforms.Compose(
        [transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))])

    # datasets
    trainset = torchvision.datasets.FashionMNIST(data_dir,
        download=True,
        train=True,
        transform=transform)
    testset = torchvision.datasets.FashionMNIST(data_dir,
        download=True,
        train=False,
        transform=transform)
    return trainset, testset

In [6]:
# Load the data outside the training function
data_dir = "./data"
trainset, testset = load_data(data_dir)

# Split the training set into subsets
test_abs = int(len(trainset) * 0.8)
train_subset, val_subset = random_split(trainset, [test_abs, len(trainset) - test_abs])

# Create data loaders for subsets
trainloader = torch.utils.data.DataLoader(
    train_subset, batch_size=64, shuffle=True, num_workers=2
)
valloader = torch.utils.data.DataLoader(
    val_subset, batch_size=64, shuffle=True, num_workers=2
)

## CNN Model Setup

In [7]:
class FashionCNN(nn.Module):
    def __init__(self):
        super(FashionCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.pool(nn.functional.relu(self.conv1(x)))
        x = self.pool(nn.functional.relu(self.conv2(x)))
        x = x.view(-1, 64 * 7 * 7)
        x = nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x

## Training

### Defining training function

In [8]:
def train_fashion_mnist(config, trainloader, valloader):
    net = FashionCNN() 

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=config["learning_rate"])

    for epoch in range(config["epochs"]):
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            epoch_steps += 1

            if i % 2000 == 1999:
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / epoch_steps))
                running_loss = 0.0

        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        with torch.no_grad():
            for data in valloader:
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.item()
                val_steps += 1

        tune.report(
            mean_accuracy=correct / total,
            mean_val_loss=val_loss / val_steps
        )

### Config for hyperparameter tuning

In [9]:
# configuration for hyperparameter tuning 
config = {
    "epochs": tune.choice([5, 10, 15]),
    "learning_rate": tune.loguniform(1e-4, 1e-2), 
    "batch_size": tune.choice([16, 32, 64, 128])
}

### Defining basic experiment parameters

In [10]:
max_num_epochs = 15
num_samples = 20

### Defining ASHA scheduler

In [11]:
# scheduler for early stopping
scheduler = ASHAScheduler(
    metric="mean_val_loss",
    mode="min",
    max_t=max_num_epochs,
    grace_period=1,
    reduction_factor=2,
)

### Running Hyperparameter Tuning - Basic Version

In [8]:
# run hyperparameter tuning
result = tune.run(
    partial(train_fashion_mnist, trainloader=trainloader, valloader=valloader),
    resources_per_trial={"cpu": 8, "gpu": 0},
    config=config,
    num_samples=num_samples,
    storage_path='./tune_runs/',
    search_alg=BasicVariantGenerator(random_state=40))

# Get the best trial
best_trial = result.get_best_trial("mean_val_loss", mode="min")

# Get the best configuration and other relevant information
best_config = best_trial.config
best_metrics = best_trial.metric_analysis

print("Best trial config:", best_config)
print("Best trial metrics:", best_metrics)

2023-08-15 23:05:15,415	INFO worker.py:1621 -- Started a local Ray instance.
2023-08-15 23:05:19,540	INFO tune.py:226 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2023-08-15 23:05:19,544	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2023-08-16 15:55:34
Running for:,16:50:14.91
Memory:,11.4/15.7 GiB

Trial name,status,loc,batch_size,epochs,learning_rate,acc,iter,total time (s),mean_val_loss
train_fashion_mnist_6bd66_00000,TERMINATED,127.0.0.1:9436,64,10,0.00243686,0.837833,10,693.586,0.453342
train_fashion_mnist_6bd66_00001,TERMINATED,127.0.0.1:9436,128,10,0.000131615,0.510083,10,886.625,2.17948
train_fashion_mnist_6bd66_00002,TERMINATED,127.0.0.1:9436,64,10,0.00699512,0.865917,10,31628.8,0.367655
train_fashion_mnist_6bd66_00003,TERMINATED,127.0.0.1:9436,32,15,0.000475096,0.7825,15,1606.65,0.605706
train_fashion_mnist_6bd66_00004,TERMINATED,127.0.0.1:9436,64,10,0.00289238,0.833667,10,1397.83,0.448237
train_fashion_mnist_6bd66_00005,TERMINATED,127.0.0.1:9436,16,15,0.000164914,0.6705,15,2196.61,1.16143
train_fashion_mnist_6bd66_00006,TERMINATED,127.0.0.1:9436,64,10,0.000121523,0.477167,10,1471.25,2.16276
train_fashion_mnist_6bd66_00007,TERMINATED,127.0.0.1:9436,16,10,0.000435055,0.756333,10,1376.24,0.697447
train_fashion_mnist_6bd66_00008,TERMINATED,127.0.0.1:9436,16,10,0.00356855,0.84175,10,1409.65,0.434199
train_fashion_mnist_6bd66_00009,TERMINATED,127.0.0.1:9436,128,5,0.000220585,0.563833,5,724.526,1.98157




Trial name,mean_accuracy,mean_val_loss
train_fashion_mnist_6bd66_00000,0.837833,0.453342
train_fashion_mnist_6bd66_00001,0.510083,2.17948
train_fashion_mnist_6bd66_00002,0.865917,0.367655
train_fashion_mnist_6bd66_00003,0.7825,0.605706
train_fashion_mnist_6bd66_00004,0.833667,0.448237
train_fashion_mnist_6bd66_00005,0.6705,1.16143
train_fashion_mnist_6bd66_00006,0.477167,2.16276
train_fashion_mnist_6bd66_00007,0.756333,0.697447
train_fashion_mnist_6bd66_00008,0.84175,0.434199
train_fashion_mnist_6bd66_00009,0.563833,1.98157


2023-08-16 15:55:34,724	INFO tune.py:1148 -- Total run time: 60615.18 seconds (60614.73 seconds for the tuning loop).


Best trial config: {'epochs': 10, 'learning_rate': 0.006995117888270908, 'batch_size': 64}
Best trial metrics: {'mean_accuracy': {'max': 0.8659166666666667, 'min': 0.7673333333333333, 'avg': 0.8328249999999999, 'last': 0.8659166666666667, 'last-5-avg': 0.8576333333333335, 'last-10-avg': 0.832825}, 'mean_val_loss': {'max': 0.6570790953775669, 'min': 0.367654533858629, 'avg': 0.46445752327587997, 'last': 0.367654533858629, 'last-5-avg': 0.3921157313471145, 'last-10-avg': 0.46445752327588}, 'time_this_iter_s': {'max': 30485.775593280792, 'min': 82.62707567214966, 'avg': 3162.8791247129443, 'last': 30485.775593280792, 'last-5-avg': 6165.350281667709, 'last-10-avg': 3162.879124712944}, 'done': {'max': False, 'min': False, 'avg': 0.0, 'last': False, 'last-5-avg': 0.0, 'last-10-avg': 0.0}, 'training_iteration': {'max': 10, 'min': 1, 'avg': 5.5, 'last': 10, 'last-5-avg': 8.0, 'last-10-avg': 5.5}, 'time_total_s': {'max': 31628.79124712944, 'min': 92.33721613883972, 'avg': 3847.8442382335666, 'l

### Training with 20 trials and ASHA scheduler

In [None]:
result2 = tune.run(
    partial(train_fashion_mnist, trainloader=trainloader, valloader=valloader),
    resources_per_trial={"cpu": 8, "gpu": 0},
    config=config,
    num_samples=num_samples,
    storage_path='./tune_runs/',
    search_alg=BasicVariantGenerator(random_state=40),
    scheduler=scheduler
)

best_trial = result2.get_best_trial("mean_val_loss", mode="min")
best_config = best_trial.config
best_metrics = best_trial.metric_analysis

print("Best trial config:", best_config)
print("Best trial metrics:", best_metrics)

### Training with 20 trials, ASHA scheduler and time stopper

In [11]:
result3 = tune.run(
    partial(train_fashion_mnist, trainloader=trainloader, valloader=valloader),
    resources_per_trial={"cpu": 8, "gpu": 0},
    config=config,
    num_samples=num_samples,
    storage_path='./tune_runs/',
    search_alg=BasicVariantGenerator(random_state=40),
    scheduler=scheduler,
    time_budget_s=21600 # Total time budget in seconds (6 hours)
)

best_trial = result3.get_best_trial("mean_val_loss", mode="min")
best_config = best_trial.config
best_metrics = best_trial.metric_analysis

print("Best trial config:", best_config)
print("Best trial metrics:", best_metrics)

2023-08-17 09:11:10,566	INFO worker.py:1621 -- Started a local Ray instance.
2023-08-17 09:11:15,078	INFO tune.py:226 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2023-08-17 09:11:15,081	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2023-08-17 09:12:52
Running for:,00:01:36.65
Memory:,14.0/15.7 GiB

Trial name,status,loc,batch_size,epochs,learning_rate,acc,iter,total time (s),mean_val_loss
train_fashion_mnist_3b7e0_00000,RUNNING,127.0.0.1:41776,64,10,0.00243686,0.7015,1.0,65.9708,1.03978
train_fashion_mnist_3b7e0_00001,PENDING,,128,10,0.000131615,,,,
train_fashion_mnist_3b7e0_00002,PENDING,,64,10,0.00699512,,,,
train_fashion_mnist_3b7e0_00003,PENDING,,32,15,0.000475096,,,,
train_fashion_mnist_3b7e0_00004,PENDING,,64,10,0.00289238,,,,
train_fashion_mnist_3b7e0_00005,PENDING,,16,15,0.000164914,,,,
train_fashion_mnist_3b7e0_00006,PENDING,,64,10,0.000121523,,,,
train_fashion_mnist_3b7e0_00007,PENDING,,16,10,0.000435055,,,,
train_fashion_mnist_3b7e0_00008,PENDING,,16,10,0.00356855,,,,
train_fashion_mnist_3b7e0_00009,PENDING,,128,5,0.000220585,,,,




Trial name,mean_accuracy,mean_val_loss
train_fashion_mnist_3b7e0_00000,0.7015,1.03978


2023-08-17 09:13:02,159	INFO tune.py:1148 -- Total run time: 107.08 seconds (96.63 seconds for the tuning loop).
Resume experiment with: tune.run(..., resume=True)
- C:\Users\ibele\ray_results\train_fashion_mnist_2023-08-17_09-11-15\train_fashion_mnist_3b7e0_00001_1_batch_size=128,epochs=10,learning_rate=0.0001_2023-08-17_09-11-24
- C:\Users\ibele\ray_results\train_fashion_mnist_2023-08-17_09-11-15\train_fashion_mnist_3b7e0_00002_2_batch_size=64,epochs=10,learning_rate=0.0070_2023-08-17_09-11-24
- C:\Users\ibele\ray_results\train_fashion_mnist_2023-08-17_09-11-15\train_fashion_mnist_3b7e0_00003_3_batch_size=32,epochs=15,learning_rate=0.0005_2023-08-17_09-11-25
- C:\Users\ibele\ray_results\train_fashion_mnist_2023-08-17_09-11-15\train_fashion_mnist_3b7e0_00004_4_batch_size=64,epochs=10,learning_rate=0.0029_2023-08-17_09-11-25
- C:\Users\ibele\ray_results\train_fashion_mnist_2023-08-17_09-11-15\train_fashion_mnist_3b7e0_00005_5_batch_size=16,epochs=15,learning_rate=0.0002_2023-08-17_09-1

Best trial config: {'epochs': 10, 'learning_rate': 0.002436856264971207, 'batch_size': 64}
Best trial metrics: {'mean_accuracy': {'max': 0.7015, 'min': 0.7015, 'avg': 0.7015, 'last': 0.7015, 'last-5-avg': 0.7015, 'last-10-avg': 0.7015}, 'mean_val_loss': {'max': 1.039784792572894, 'min': 1.039784792572894, 'avg': 1.039784792572894, 'last': 1.039784792572894, 'last-5-avg': 1.039784792572894, 'last-10-avg': 1.039784792572894}, 'time_this_iter_s': {'max': 65.97083377838135, 'min': 65.97083377838135, 'avg': 65.97083377838135, 'last': 65.97083377838135, 'last-5-avg': 65.97083377838135, 'last-10-avg': 65.97083377838135}, 'done': {'max': False, 'min': False, 'avg': False, 'last': False, 'last-5-avg': False, 'last-10-avg': False}, 'training_iteration': {'max': 1, 'min': 1, 'avg': 1, 'last': 1, 'last-5-avg': 1, 'last-10-avg': 1}, 'time_total_s': {'max': 65.97083377838135, 'min': 65.97083377838135, 'avg': 65.97083377838135, 'last': 65.97083377838135, 'last-5-avg': 65.97083377838135, 'last-10-avg'

### Training with ASHA scheduler and time stopper

In [12]:
result4 = tune.run(
    partial(train_fashion_mnist, trainloader=trainloader, valloader=valloader),
    resources_per_trial={"cpu": 8, "gpu": 0},
    config=config,
    storage_path='./tune_runs/',
    search_alg=BasicVariantGenerator(random_state=40),
    scheduler=scheduler,
    time_budget_s=21600
)

best_trial = result4.get_best_trial("mean_val_loss", mode="min")
best_config = best_trial.config
best_metrics = best_trial.metric_analysis

print("Best trial config:", best_config)
print("Best trial metrics:", best_metrics)

2023-08-17 09:13:42,084	INFO worker.py:1621 -- Started a local Ray instance.
2023-08-17 09:13:46,219	INFO tune.py:226 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2023-08-17 09:13:46,221	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2023-08-17 09:23:47
Running for:,00:10:01.43
Memory:,13.2/15.7 GiB

Trial name,status,loc,batch_size,epochs,learning_rate,acc,iter,total time (s),mean_val_loss
train_fashion_mnist_963da_00000,TERMINATED,127.0.0.1:40032,64,10,0.00243686,0.81425,7,514.925,0.512289




Trial name,mean_accuracy,mean_val_loss
train_fashion_mnist_963da_00000,0.81425,0.512289


2023-08-17 09:23:47,915	INFO timeout.py:54 -- Reached timeout of 600 seconds. Stopping all trials.
2023-08-17 09:23:51,447	INFO tune.py:1148 -- Total run time: 605.23 seconds (601.41 seconds for the tuning loop).


Best trial config: {'epochs': 10, 'learning_rate': 0.002436856264971207, 'batch_size': 64}
Best trial metrics: {'mean_accuracy': {'max': 0.81425, 'min': 0.7018333333333333, 'avg': 0.7789880952380953, 'last': 0.81425, 'last-5-avg': 0.7995666666666668, 'last-10-avg': 0.7789880952380954}, 'mean_val_loss': {'max': 1.00255689754131, 'min': 0.5122888692198916, 'avg': 0.6415388885104186, 'last': 0.5122888692198916, 'last-5-avg': 0.559422624079471, 'last-10-avg': 0.6415388885104186}, 'time_this_iter_s': {'max': 84.08389258384705, 'min': 61.38485527038574, 'avg': 73.56075232369558, 'last': 79.46110844612122, 'last-5-avg': 76.68750743865967, 'last-10-avg': 73.5607523236956}, 'done': {'max': False, 'min': False, 'avg': 0.0, 'last': False, 'last-5-avg': 0.0, 'last-10-avg': 0.0}, 'training_iteration': {'max': 7, 'min': 1, 'avg': 4.0, 'last': 7, 'last-5-avg': 5.0, 'last-10-avg': 4.0}, 'time_total_s': {'max': 514.9252662658691, 'min': 69.07238030433655, 'avg': 281.4378817762647, 'last': 514.925266265