In [1]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas

import torch
import torch.optim as optim
from torch.nn import CrossEntropyLoss, Conv2d, Sequential, BatchNorm2d
from torch.utils.data import DataLoader

import torchvision
import torchvision.datasets as datasets
from torchvision import transforms

from torchinfo import summary

from ActiveShiftLayer import ASL
from util import test_loss, train_NN

from ray import tune
from ray.tune.search.bayesopt import BayesOptSearch
from ray.tune.search import ConcurrencyLimiter

In [2]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
# Mini-batch size shared by the training and the evaluation loader.
batch_size = 100

# Turn each image into a tensor, then normalise every RGB channel with
# mean 0.5 and standard deviation 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Training split (downloaded on first use); batches are drawn in shuffled order.
train_dataset = torchvision.datasets.CIFAR10(
    root='./data/CIFAR10',
    train=True,
    download=True,
    transform=transform,
)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)

# Held-out split; iterated in a fixed order (no shuffling).
test_dataset = torchvision.datasets.CIFAR10(
    root='./data/CIFAR10',
    train=False,
    download=True,
    transform=transform,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
)

# Human-readable label names and the number of target classes.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

num_labels = 10

Files already downloaded and verified
Files already downloaded and verified


In [4]:
from Models import LeNet
# Loss object handed to both train_NN and test_loss by the trainable below.
criterion = CrossEntropyLoss()
# Batch shape passed to the LeNet constructor; presumably
# (batch, channels, height, width) for 32x32 RGB images -- TODO confirm against Models.LeNet.
input_shape = (batch_size, 3, 32, 32)

def train_mnist(config, data = None):
    """Ray Tune trainable: fit one LeNet for a sampled config and report its accuracy.

    Despite the name, the test loader used here is the notebook's CIFAR-10
    ``test_dataloader``.

    Args:
        config: Hyperparameter sample; must contain the keys ``"lr"`` and
            ``"momentum"``.
        data: Training dataloader, expected to be injected through
            ``tune.with_parameters(train_mnist, data=...)``. The ``None``
            default only keeps the signature callable by Tune and is rejected.

    Raises:
        ValueError: If ``data`` is ``None``.
    """
    if data is None:
        # Fail early with an actionable message instead of forwarding None to train_NN.
        raise ValueError(
            "train_mnist needs a training dataloader; pass it with "
            "tune.with_parameters(train_mnist, data=<dataloader>)"
        )

    print(config)

    # Do not assume CUDA: use the GPU only when this worker process can see one.
    test_device = "cuda" if torch.cuda.is_available() else "cpu"

    model = LeNet(
        input_shape,
        10,
        initial_lr=config["lr"],
        momentum=config["momentum"],
        weight_decay=0,
    ).to(test_device)

    train_NN(
        model, criterion, data, test_dataloader,
        epochs=4, batches_to_test=100, patience=2,
        device=test_device, print_test=False, verbose=False,
    )

    # Element 1 of test_loss's result is what gets reported as accuracy.
    acc = test_loss(model, test_dataloader, criterion, test_device)[1]
    tune.report(mean_accuracy=acc)

In [5]:
# Continuous search space sampled by the optimiser.
config = {
    "lr": tune.uniform(0.001, 0.1),
    "momentum": tune.uniform(0.5, 1),
}

# Bayesian optimisation that maximises the reported "mean_accuracy",
# wrapped so that at most one trial is in flight at a time.
bayesopt = ConcurrencyLimiter(
    BayesOptSearch(
        metric="mean_accuracy",
        mode="max",
        verbose=1,
        random_state=36,
        random_search_steps=10,
    ),
    max_concurrent=1,
)

# The training loader is attached with with_parameters; every trial requests one GPU.
tuner = tune.Tuner(
    tune.with_resources(
        tune.with_parameters(train_mnist, data=train_dataloader),
        {"gpu": 1},
    ),
    tune_config=tune.TuneConfig(search_alg=bayesopt, num_samples=30),
    param_space=config,
)

analysis = tuner.fit()

2022-08-31 01:34:42,769	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


Trial name,status,loc,lr,momentum,acc,iter,total time (s)
train_mnist_544d74ca,TERMINATED,192.168.178.40:92676,0.0731222,0.800807,60.7,1,39.3578
train_mnist_6d3c8458,TERMINATED,192.168.178.40:93556,0.0952659,0.571587,58.71,1,47.0122
train_mnist_8ae4f5f8,TERMINATED,192.168.178.40:94443,0.0809286,0.681607,58.81,1,46.3837
train_mnist_a82693d8,TERMINATED,192.168.178.40:95335,0.0194691,0.629576,55.28,1,46.7675
train_mnist_c5950c88,TERMINATED,192.168.178.40:96230,0.0633465,0.51628,58.93,1,47.5955
train_mnist_e35746c8,TERMINATED,192.168.178.40:97147,0.094776,0.952159,51.0,1,47.7754
train_mnist_01372b40,TERMINATED,192.168.178.40:98037,0.0554629,0.598504,56.35,1,48.5605
train_mnist_1f81a5b2,TERMINATED,192.168.178.40:98945,0.0239713,0.501183,54.03,1,47.453
train_mnist_3d3c2bcc,TERMINATED,192.168.178.40:99787,0.0233795,0.687672,57.57,1,47.2905
train_mnist_5af06c96,TERMINATED,192.168.178.40:100606,0.0458216,0.62304,58.18,1,46.51




[2m[36m(train_mnist pid=92676)[0m {'lr': 0.07312221196797684, 'momentum': 0.8008071061851066}
Result for train_mnist_544d74ca:
  date: 2022-08-31_01-35-26
  done: false
  experiment_id: f437961348ba4a5a919b4d10992263a7
  hostname: max-Latitude-5401
  iterations_since_restore: 1
  mean_accuracy: 60.7
  node_ip: 192.168.178.40
  pid: 92676
  time_since_restore: 39.35782337188721
  time_this_iter_s: 39.35782337188721
  time_total_s: 39.35782337188721
  timestamp: 1661902526
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 544d74ca
  warmup_time: 0.0027205944061279297
  
Result for train_mnist_544d74ca:
  date: 2022-08-31_01-35-26
  done: true
  experiment_id: f437961348ba4a5a919b4d10992263a7
  experiment_tag: 1_lr=0.0731,momentum=0.8008
  hostname: max-Latitude-5401
  iterations_since_restore: 1
  mean_accuracy: 60.7
  node_ip: 192.168.178.40
  pid: 92676
  time_since_restore: 39.35782337188721
  time_this_iter_s: 39.35782337188721
  time_total_s: 39.35782337188721
  ti

2022-08-31 01:59:27,471	INFO tune.py:758 -- Total run time: 1482.88 seconds (1482.19 seconds for the tuning loop).


In [6]:
# Look the winning trial up once and reuse it for both printouts.
best_result = analysis.get_best_result(metric="mean_accuracy", mode="max")
print(best_result)
print(best_result.config)
# Last expression of the cell: the per-trial results table is displayed.
analysis.get_dataframe()

Result(metrics={'mean_accuracy': 61.36, 'done': True, 'trial_id': 'a26a36aa', 'experiment_tag': '21_lr=0.0531,momentum=0.7540'}, error=None, log_dir=PosixPath('/home/max/ray_results/train_mnist_2022-08-31_01-34-39/train_mnist_a26a36aa_21_lr=0.0531,momentum=0.7540_2022-08-31_01-51-15'))
{'lr': 0.05309612987790999, 'momentum': 0.7539545167256022}


Unnamed: 0,mean_accuracy,time_this_iter_s,done,timesteps_total,episodes_total,training_iteration,trial_id,experiment_id,date,timestamp,...,pid,hostname,node_ip,time_since_restore,timesteps_since_restore,iterations_since_restore,warmup_time,config/lr,config/momentum,logdir
0,60.7,39.357823,False,,,1,544d74ca,f437961348ba4a5a919b4d10992263a7,2022-08-31_01-35-26,1661902526,...,92676,max-Latitude-5401,192.168.178.40,39.357823,0,1,0.002721,0.073122,0.800807,/home/max/ray_results/train_mnist_2022-08-31_0...
1,58.71,47.012231,False,,,1,6d3c8458,97ccec76a144431cbd225ec79c2da1a9,2022-08-31_01-36-16,1661902576,...,93556,max-Latitude-5401,192.168.178.40,47.012231,0,1,0.002213,0.095266,0.571587,/home/max/ray_results/train_mnist_2022-08-31_0...
2,58.81,46.383714,False,,,1,8ae4f5f8,523f6c1718194d528282be1751123ed5,2022-08-31_01-37-05,1661902625,...,94443,max-Latitude-5401,192.168.178.40,46.383714,0,1,0.002062,0.080929,0.681607,/home/max/ray_results/train_mnist_2022-08-31_0...
3,55.28,46.767542,False,,,1,a82693d8,a6061dd12ef54db8b3d27112ec6d157a,2022-08-31_01-37-55,1661902675,...,95335,max-Latitude-5401,192.168.178.40,46.767542,0,1,0.002105,0.019469,0.629576,/home/max/ray_results/train_mnist_2022-08-31_0...
4,58.93,47.595483,False,,,1,c5950c88,ebd18e8eb4204c8d9158fc00c4a1df93,2022-08-31_01-38-45,1661902725,...,96230,max-Latitude-5401,192.168.178.40,47.595483,0,1,0.002391,0.063346,0.51628,/home/max/ray_results/train_mnist_2022-08-31_0...
5,51.0,47.775352,False,,,1,e35746c8,93b2d3c9a105418ea2033ece289e0237,2022-08-31_01-39-35,1661902775,...,97147,max-Latitude-5401,192.168.178.40,47.775352,0,1,0.002074,0.094776,0.952159,/home/max/ray_results/train_mnist_2022-08-31_0...
6,56.35,48.560475,False,,,1,01372b40,ad40b2569f9f47e4bc17cdbf5410f437,2022-08-31_01-40-26,1661902826,...,98037,max-Latitude-5401,192.168.178.40,48.560475,0,1,0.00231,0.055463,0.598504,/home/max/ray_results/train_mnist_2022-08-31_0...
7,54.03,47.453028,False,,,1,1f81a5b2,94792e79caa14997abec6bba4470652d,2022-08-31_01-41-15,1661902875,...,98945,max-Latitude-5401,192.168.178.40,47.453028,0,1,0.00271,0.023971,0.501183,/home/max/ray_results/train_mnist_2022-08-31_0...
8,57.57,47.290463,False,,,1,3d3c2bcc,84af7e7993aa47c082059f48090a7029,2022-08-31_01-42-05,1661902925,...,99787,max-Latitude-5401,192.168.178.40,47.290463,0,1,0.00216,0.023379,0.687672,/home/max/ray_results/train_mnist_2022-08-31_0...
9,58.18,46.510027,False,,,1,5af06c96,77f0ecec617b43679139966ed3a3128f,2022-08-31_01-42-54,1661902974,...,100606,max-Latitude-5401,192.168.178.40,46.510027,0,1,0.002199,0.045822,0.62304,/home/max/ray_results/train_mnist_2022-08-31_0...


In [6]:
from Models import LeNet
# input_shape is not redefined in this cell, so the trainable below uses
# whatever value is already bound in the kernel; the alternative shape on the
# next line is left disabled.
#input_shape = (batch_size, 1, 28, 28)
# Loss object handed to both train_NN and test_loss by the trainable below.
criterion = CrossEntropyLoss()

def train_mnist(config):
    """Ray Tune trainable for the grid search: train on CPU and report accuracy twice.

    NOTE(review): this redefinition shadows the earlier
    ``train_mnist(config, data=None)`` in the same notebook.

    Args:
        config: Hyperparameter sample; must contain ``"lr"``, ``"momentum"``
            and ``"weight_decay"``.
    """
    test_device = "cpu"

    optimiser_settings = dict(
        initial_lr=config["lr"],
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
    )
    model = LeNet(input_shape, 10, **optimiser_settings).to(test_device)

    # Two rounds of two epochs each, reporting to Tune after every round.
    for _ in range(2):
        train_NN(
            model, criterion, train_dataloader, test_dataloader,
            epochs=2, batches_to_test=100, patience=2,
            device=test_device, print_test=False, verbose=False,
        )
        evaluation = test_loss(model, test_dataloader, criterion, test_device)
        # Element 1 of test_loss's result is what gets reported as accuracy.
        tune.report(mean_accuracy=evaluation[1])

# Grid over 3 learning rates x 2 momenta x 3 weight-decay values.
analysis = tune.run(
    train_mnist,
    config={
        "lr": tune.grid_search([0.001, 0.01, 0.1]),
        "momentum": tune.grid_search([0.9, 0.99]),
        "weight_decay": tune.grid_search([0, 0.01, 0.1]),
    },
)

# Configuration of the trial with the highest reported accuracy.
best_config = analysis.get_best_config(metric="mean_accuracy", mode="max")
print("Best config: ", best_config)

# Per-trial results as a DataFrame for further analysis.
df = analysis.dataframe()

Trial name,status,loc,lr,momentum,weight_decay
train_mnist_ced3b_00001,RUNNING,,0.01,0.9,0.0
train_mnist_ced3b_00002,RUNNING,,0.1,0.9,0.0
train_mnist_ced3b_00003,RUNNING,,0.001,0.99,0.0
train_mnist_ced3b_00004,RUNNING,,0.01,0.99,0.0
train_mnist_ced3b_00005,RUNNING,,0.1,0.99,0.0
train_mnist_ced3b_00006,RUNNING,,0.001,0.9,0.01
train_mnist_ced3b_00007,RUNNING,,0.01,0.9,0.01
train_mnist_ced3b_00008,PENDING,,0.1,0.9,0.01
train_mnist_ced3b_00009,PENDING,,0.001,0.99,0.01
train_mnist_ced3b_00010,PENDING,,0.01,0.99,0.01

Trial name,# failures,error file
train_mnist_ced3b_00000,1,"/home/max/ray_results/train_mnist_2022-08-30_12-59-24/train_mnist_ced3b_00000_0_lr=0.0010,momentum=0.9000,weight_decay=0_2022-08-30_12-59-46/error.txt"


2022-08-30 12:59:47,080	ERROR ray_trial_executor.py:562 -- Trial train_mnist_ced3b_00000: Unexpected error starting runner.
Traceback (most recent call last):
  File "/home/max/anaconda3/lib/python3.9/site-packages/ray/tune/execution/ray_trial_executor.py", line 555, in start_trial
    return self._start_trial(trial)
  File "/home/max/anaconda3/lib/python3.9/site-packages/ray/tune/execution/ray_trial_executor.py", line 458, in _start_trial
    runner = self._setup_remote_runner(trial)
  File "/home/max/anaconda3/lib/python3.9/site-packages/ray/tune/execution/ray_trial_executor.py", line 399, in _setup_remote_runner
    return full_actor_class.remote(**kwargs)
  File "/home/max/anaconda3/lib/python3.9/site-packages/ray/actor.py", line 637, in remote
    return actor_cls._remote(args=args, kwargs=kwargs, **updated_options)
  File "/home/max/anaconda3/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 387, in _invocation_actor_class_remote_span
    return method(self, a

Best config:  None
