# RayTune - Epochs, Learning Rate, Batch Size Tuning

### Imports

In [1]:
from functools import partial
import os
import random 

import matplotlib.pyplot as plt
import numpy as np

import torch
import torchvision
import torchvision.transforms as transforms

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import random_split

from ray import tune
from ray.air import Checkpoint, session
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.basic_variant import BasicVariantGenerator

In [2]:
torch.manual_seed(40)
random.seed(40)
np.random.seed(40)

### Run Tensorboard

In [3]:
%load_ext tensorboard
%tensorboard --logdir runs

# !tensorboard --logdir=runs

Reusing TensorBoard on port 6006 (pid 39724), started 0:35:52 ago. (Use '!kill 39724' to kill it.)

### Load Fashion MNIST Dataset

In [4]:
def load_data(data_dir="./data"):
    # transforms
    transform = transforms.Compose(
        [transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))])

    # datasets
    trainset = torchvision.datasets.FashionMNIST(data_dir,
        download=True,
        train=True,
        transform=transform)
    testset = torchvision.datasets.FashionMNIST(data_dir,
        download=True,
        train=False,
        transform=transform)
    return trainset, testset

In [5]:
# Load the data outside the training function
data_dir = "./data"
trainset, testset = load_data(data_dir)

# Split the training set into subsets
test_abs = int(len(trainset) * 0.8)
train_subset, val_subset = random_split(trainset, [test_abs, len(trainset) - test_abs])

# Create data loaders for subsets
trainloader = torch.utils.data.DataLoader(
    train_subset, batch_size=64, shuffle=True, num_workers=2
)
valloader = torch.utils.data.DataLoader(
    val_subset, batch_size=64, shuffle=True, num_workers=2
)

### CNN Model Setup

In [6]:
class FashionCNN(nn.Module):
    def __init__(self):
        super(FashionCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.pool(nn.functional.relu(self.conv1(x)))
        x = self.pool(nn.functional.relu(self.conv2(x)))
        x = x.view(-1, 64 * 7 * 7)
        x = nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x

### Training

In [7]:
def train_fashion_mnist(config, trainloader, valloader):
    net = FashionCNN() 

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=config["learning_rate"])

    for epoch in range(config["epochs"]):
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            epoch_steps += 1

            if i % 2000 == 1999:
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / epoch_steps))
                running_loss = 0.0

        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        with torch.no_grad():
            for data in valloader:
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.item()
                val_steps += 1

        tune.report(
            mean_accuracy=correct / total,
            mean_val_loss=val_loss / val_steps
        )

In [8]:
# configuration for hyperparameter tuning 
config = {
    "epochs": tune.choice([5, 10, 15]),
    "learning_rate": tune.loguniform(1e-4, 1e-2), 
    "batch_size": tune.choice([16, 32, 64, 128])
}

max_num_epochs = 15
num_samples = 20

# scheduler for early stopping
scheduler = ASHAScheduler(
    metric="mean_val_loss",
    mode="min",
    max_t=max_num_epochs,
    grace_period=1,
    reduction_factor=2,
)

# run hyperparameter tuning
result = tune.run(
    partial(train_fashion_mnist, trainloader=trainloader, valloader=valloader),
    resources_per_trial={"cpu": 8, "gpu": 0},
    config=config,
    num_samples=num_samples,
    storage_path='./tune_runs/',
    search_alg=BasicVariantGenerator(random_state=40))

# Get the best trial
best_trial = result.get_best_trial("mean_val_loss", mode="min")

# Get the best configuration and other relevant information
best_config = best_trial.config
best_metrics = best_trial.metric_analysis

print("Best trial config:", best_config)
print("Best trial metrics:", best_metrics)

2023-08-15 23:05:15,415	INFO worker.py:1621 -- Started a local Ray instance.
2023-08-15 23:05:19,540	INFO tune.py:226 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2023-08-15 23:05:19,544	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2023-08-16 08:25:18
Running for:,09:19:58.99
Memory:,12.3/15.7 GiB

Trial name,status,loc,batch_size,epochs,learning_rate,acc,iter,total time (s),mean_val_loss
train_fashion_mnist_6bd66_00003,RUNNING,127.0.0.1:9436,32,15,0.000475096,0.541167,3.0,298.583,1.62951
train_fashion_mnist_6bd66_00004,PENDING,,64,10,0.00289238,,,,
train_fashion_mnist_6bd66_00005,PENDING,,16,15,0.000164914,,,,
train_fashion_mnist_6bd66_00006,PENDING,,64,10,0.000121523,,,,
train_fashion_mnist_6bd66_00007,PENDING,,16,10,0.000435055,,,,
train_fashion_mnist_6bd66_00008,PENDING,,16,10,0.00356855,,,,
train_fashion_mnist_6bd66_00009,PENDING,,128,5,0.000220585,,,,
train_fashion_mnist_6bd66_00010,PENDING,,16,15,0.00054965,,,,
train_fashion_mnist_6bd66_00011,PENDING,,64,10,0.00121543,,,,
train_fashion_mnist_6bd66_00012,PENDING,,32,15,0.000729985,,,,




Trial name,mean_accuracy,mean_val_loss
train_fashion_mnist_6bd66_00000,0.837833,0.453342
train_fashion_mnist_6bd66_00001,0.510083,2.17948
train_fashion_mnist_6bd66_00002,0.865917,0.367655
train_fashion_mnist_6bd66_00003,0.541167,1.62951
