# RayTune - Epochs, Learning Rate, Batch Size Tuning

### Imports

In [2]:
from functools import partial
import os
import random 

import matplotlib.pyplot as plt
import numpy as np

import torch
import torchvision
import torchvision.transforms as transforms

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import random_split

from ray import tune
from ray.air import Checkpoint, session
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.basic_variant import BasicVariantGenerator

In [3]:
torch.manual_seed(40)
random.seed(40)
np.random.seed(40)

### Run Tensorboard

In [4]:
%load_ext tensorboard
%tensorboard --logdir runs

# !tensorboard --logdir=runs

Launching TensorBoard...

### Load Fashion MNIST Dataset

In [5]:
def load_data(data_dir="./data"):
    # transforms
    transform = transforms.Compose(
        [transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))])

    # datasets
    trainset = torchvision.datasets.FashionMNIST(data_dir,
        download=True,
        train=True,
        transform=transform)
    testset = torchvision.datasets.FashionMNIST(data_dir,
        download=True,
        train=False,
        transform=transform)
    return trainset, testset

In [6]:
# Load the data outside the training function
data_dir = "./data"
trainset, testset = load_data(data_dir)

# Split the training set into subsets
test_abs = int(len(trainset) * 0.8)
train_subset, val_subset = random_split(trainset, [test_abs, len(trainset) - test_abs])

# Create data loaders for subsets
trainloader = torch.utils.data.DataLoader(
    train_subset, batch_size=64, shuffle=True, num_workers=2
)
valloader = torch.utils.data.DataLoader(
    val_subset, batch_size=64, shuffle=True, num_workers=2
)

### CNN Model Setup

In [7]:
class FashionCNN(nn.Module):
    def __init__(self):
        super(FashionCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.pool(nn.functional.relu(self.conv1(x)))
        x = self.pool(nn.functional.relu(self.conv2(x)))
        x = x.view(-1, 64 * 7 * 7)
        x = nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x

### Training

In [8]:
def train_fashion_mnist(config, trainloader, valloader):
    net = FashionCNN() 

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=config["learning_rate"])

    for epoch in range(config["epochs"]):
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            epoch_steps += 1

            if i % 2000 == 1999:
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / epoch_steps))
                running_loss = 0.0

        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        with torch.no_grad():
            for data in valloader:
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.item()
                val_steps += 1

        tune.report(
            mean_accuracy=correct / total,
            mean_val_loss=val_loss / val_steps
        )

In [10]:
# configuration for hyperparameter tuning 
config = {
    "epochs": tune.choice([5, 10, 15]),
    "learning_rate": tune.loguniform(1e-4, 1e-2), 
    "batch_size": tune.choice([16, 32, 64, 128])
}

max_num_epochs = 15
num_samples = 2

# scheduler for early stopping
scheduler = ASHAScheduler(
    metric="mean_val_loss",
    mode="min",
    max_t=max_num_epochs,
    grace_period=1,
    reduction_factor=2,
)

# run hyperparameter tuning
result = tune.run(
    partial(train_fashion_mnist, trainloader=trainloader, valloader=valloader),
    resources_per_trial={"cpu": 8, "gpu": 0},
    config=config,
    num_samples=num_samples,
    storage_path='./tune_runs/',
    search_alg=BasicVariantGenerator(random_state=40))

# Get the best trial
best_trial = result.get_best_trial("mean_val_loss", mode="min")

# Get the best configuration and other relevant information
best_config = best_trial.config
best_metrics = best_trial.metric_analysis

print("Best trial config:", best_config)
print("Best trial metrics:", best_metrics)

2023-08-15 22:39:51,519	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2023-08-15 23:03:58
Running for:,00:24:06.50
Memory:,12.3/15.7 GiB

Trial name,status,loc,batch_size,epochs,learning_rate,acc,iter,total time (s),mean_val_loss
train_fashion_mnist_e22e4_00000,TERMINATED,127.0.0.1:33432,64,10,0.00243686,0.823333,10,616.425,0.481248
train_fashion_mnist_e22e4_00001,TERMINATED,127.0.0.1:33432,128,10,0.000131615,0.548417,10,822.289,1.85334




Trial name,mean_accuracy,mean_val_loss
train_fashion_mnist_e22e4_00000,0.823333,0.481248
train_fashion_mnist_e22e4_00001,0.548417,1.85334


2023-08-15 23:03:58,296	INFO tune.py:1148 -- Total run time: 1446.78 seconds (1446.48 seconds for the tuning loop).


Best trial config: {'epochs': 10, 'learning_rate': 0.002436856264971207, 'batch_size': 64}
Best trial metrics: {'mean_accuracy': {'max': 0.8233333333333334, 'min': 0.6785833333333333, 'avg': 0.7846083333333334, 'last': 0.8233333333333334, 'last-5-avg': 0.8134499999999999, 'last-10-avg': 0.7846083333333334}, 'mean_val_loss': {'max': 1.0534942457650571, 'min': 0.48124806535370807, 'avg': 0.6104780557149267, 'last': 0.48124806535370807, 'last-5-avg': 0.5106281901610659, 'last-10-avg': 0.6104780557149267}, 'time_this_iter_s': {'max': 67.97813439369202, 'min': 54.93714642524719, 'avg': 61.642497754096986, 'last': 65.33535933494568, 'last-5-avg': 63.13486022949219, 'last-10-avg': 61.642497754096986}, 'done': {'max': False, 'min': False, 'avg': 0.0, 'last': False, 'last-5-avg': 0.0, 'last-10-avg': 0.0}, 'training_iteration': {'max': 10, 'min': 1, 'avg': 5.5, 'last': 10, 'last-5-avg': 8.0, 'last-10-avg': 5.5}, 'time_total_s': {'max': 616.4249775409698, 'min': 59.11028170585632, 'avg': 335.6392