In [1]:
from copy import deepcopy
from time import time

import torch
import torch.nn.functional as F

from torch import nn
from torch.utils.data import DataLoader, TensorDataset

import numpy as np
import pandas as pd

from torchvision.datasets import FashionMNIST
from torchvision import transforms

from ray import tune



In [2]:
def acc(net_output, labels):
    predicted = net_output.argmax(dim=1)
    correct = (predicted == labels).sum()
    examples = len(labels)
    return (correct / examples).item()


class FMnistNet(nn.Module):
    def __init__(self, l1_size, l2_size):
        super().__init__()
        self.fc1 = nn.Linear(784, l1_size)
        self.fc2 = nn.Linear(l1_size, l2_size)
        self.fc3 = nn.Linear(l2_size, 10)
        
    def forward(self, x):
        x = x.reshape(-1, 28*28)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.softmax(self.fc3(x), dim=1)
        return x
    
    
def training_function(config):
    lr, batch_size, l1_size, l2_size = config['lr'], config['batch_size'], config['l1_size'], config['l2_size']
    
    fmnist_transforms = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(0, 1)
        ])
    
    ds_train = FashionMNIST(
        root=FMNIST_DATA_DIR, 
        transform=fmnist_transforms,
        download=True, 
        train=True)

    ds_test = FashionMNIST(
        root=FMNIST_DATA_DIR, 
        transform=fmnist_transforms, 
        download=True,
        train=False)

    net = FMnistNet(l1_size, l2_size)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    
    train_set = DataLoader(ds_train, batch_size=batch_size, shuffle=True, num_workers=4)
    test_set = DataLoader(ds_test, batch_size=len(ds_train))

    
    for epoch in range(EPOCHS_PER_RUN):
        for data, labels in train_set:
            optimizer.zero_grad()
            net_output = net(data)
            loss = loss_fn(net_output, labels)
            loss.backward()
            optimizer.step()

    test_data, test_labels = next(iter(test_set))
    with torch.no_grad():
        test_output = net(test_data)
        test_acc = acc(test_output, test_labels)     
        tune.report(test_acc=test_acc)

In [3]:
EPOCHS_PER_RUN=5
FMNIST_DATA_DIR='~/Documents/lunar-landing/src/fmnist/data'

In [4]:
analysis = tune.run(
    training_function,
    config={
        "lr": tune.grid_search([0.001, 0.01, 0.1]),
        "batch_size": tune.grid_search([32, 64]),
        "l1_size": tune.grid_search([64, 128, 256]),
        "l2_size": tune.grid_search([64, 128, 256]),
    },
    local_dir='./fmnist_logs',
    verbose=False
)

2021-07-30 11:47:53,483	INFO services.py:1247 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
[2m[36m(pid=189973)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=189974)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=189973)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[2m[36m(pid=189972)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=189974)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[2m[36m(pid=189972)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[2m[36m(pid=189975)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=189975)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[2m[36m(pid=190752)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


[2m[36m(pid=190752)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[2m[36m(pid=190750)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=190750)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[2m[36m(pid=191076)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=191076)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[2m[36m(pid=191210)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=191210)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[2m[36m(pid=191684)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=191684)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag


[2m[36m(pid=191951)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=191951)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[2m[36m(pid=192060)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=192060)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[2m[36m(pid=192193)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=192193)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[2m[36m(pid=192573)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


[2m[36m(pid=192573)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[2m[36m(pid=192716)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=192716)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[2m[36m(pid=192883)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=192883)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[2m[36m(pid=193282)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=193282)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[2m[36m(pid=193463)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


[2m[36m(pid=193463)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[2m[36m(pid=193642)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=193642)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[2m[36m(pid=193969)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=193970)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=193969)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[2m[36m(pid=193970)[0m   allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[2m[36m(pid=193969)[0m 2021-07-30 12:05:03,498	ERROR worker.py:421 -- SystemExit was raised from the worker
[2m[36m(pid=193969)[0m Traceback (most recent call last):
[2m[36m(pid=193969)[0m   File "python/ray/_raylet.pyx", line 632, in ray._raylet.task_execution_handler
[2m[36m(pid=193969)[0m   File "pytho

[2m[36m(pid=193969)[0m Traceback (most recent call last):
[2m[36m(pid=193969)[0m   File "/usr/lib/python3.7/multiprocessing/resource_sharer.py", line 142, in _serve
[2m[36m(pid=193969)[0m     with self._listener.accept() as conn:
[2m[36m(pid=193969)[0m   File "/usr/lib/python3.7/multiprocessing/connection.py", line 455, in accept
[2m[36m(pid=193969)[0m     deliver_challenge(c, self._authkey)
[2m[36m(pid=193969)[0m   File "/usr/lib/python3.7/multiprocessing/connection.py", line 730, in deliver_challenge
[2m[36m(pid=193969)[0m     response = connection.recv_bytes(256)        # reject large message
[2m[36m(pid=193969)[0m   File "/usr/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
[2m[36m(pid=193969)[0m     buf = self._recv_bytes(maxlength)
[2m[36m(pid=193969)[0m   File "/usr/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
[2m[36m(pid=193969)[0m     buf = self._recv(4)
[2m[36m(pid=193969)[0m   File "/usr/lib/

In [5]:
print("Best config: ", analysis.get_best_config(
    metric="test_acc", mode="max")
     )

# Get a dataframe for analyzing trial results.
df = analysis.results_df

Best config:  {'lr': 0.1, 'batch_size': 32, 'l1_size': 128, 'l2_size': 64}


Traceback (most recent call last):
  File "/home/patryk/.local/share/virtualenvs/lunar-landing-q6dpKwgz/lib/python3.7/site-packages/ray/autoscaler/_private/monitor.py", line 317, in run
    self._run()
  File "/home/patryk/.local/share/virtualenvs/lunar-landing-q6dpKwgz/lib/python3.7/site-packages/ray/autoscaler/_private/monitor.py", line 207, in _run
    self.update_load_metrics()
  File "/home/patryk/.local/share/virtualenvs/lunar-landing-q6dpKwgz/lib/python3.7/site-packages/ray/autoscaler/_private/monitor.py", line 170, in update_load_metrics
    request, timeout=4)
  File "/home/patryk/.local/share/virtualenvs/lunar-landing-q6dpKwgz/lib/python3.7/site-packages/grpc/_channel.py", line 946, in __call__
    return _end_unary_response_blocking(state, call, False, None)
  File "/home/patryk/.local/share/virtualenvs/lunar-landing-q6dpKwgz/lib/python3.7/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking
    raise _InactiveRpcError(state)
grpc._channel._InactiveRpcE

In [6]:
tune.run?