In [3]:
import os
import numpy as np
import time
import argparse
import utils
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import ray
from ray import tune
from ray.tune import Trainable
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.suggest import BayesOptSearch
from model import Vgg11, Resnet18, MobileNet, MobileNetV2
from torch.utils.data import DataLoader
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler
from torch.autograd import Variable
np.set_printoptions(threshold=np.nan)
torch.cuda.manual_seed_all(50)

parser = argparse.ArgumentParser()
parser.add_argument('--Dataset_name', type=str, default='')
parser.add_argument('--Network_name', type=str, default='')

args, unparsed = parser.parse_known_args()

class HyperTrain(Trainable):

    def _get_dataset(self, name):

        normalize = transforms.Normalize(
            mean=[0.4914, 0.4822, 0.4465],
            std=[0.2023, 0.1994, 0.2010],
        )

        if name == 'FashionMNIST':

            data_transforms = transforms.Compose([
                transforms.Grayscale(num_output_channels=3),
                transforms.ToTensor(),
                normalize])
            dataset = torchvision.datasets.FashionMNIST(root="/home/willy-huang/workspace/data/FashionMNIST",
                                                        transform=data_transforms)
            num_classes = 10
            input_size = 512 * 1 * 1

            return dataset, num_classes, input_size

        elif name == 'CIFAR10':

            data_transforms = transforms.Compose([
                transforms.ToTensor(),
                normalize])
            dataset = torchvision.datasets.CIFAR10(root="/home/willy-huang/workspace/data/CIFAR10/",
                                                   transform=data_transforms)
            num_classes = 10
            input_size = 512 * 1 * 1

            return dataset, num_classes, input_size

        elif name == 'SVHN':

            data_transforms = transforms.Compose([
                transforms.ToTensor(),
                normalize])
            dataset = torchvision.datasets.SVHN(root="/home/willy-huang/workspace/data/SVHN/",
                                                transform=data_transforms)
            num_classes = 10
            input_size = 512 * 1 * 1

            return dataset, num_classes, input_size

        elif name == 'STL10':

            data_transforms = transforms.Compose([
                transforms.ToTensor(),
                normalize])
            dataset = torchvision.datasets.STL10(root="/home/willy-huang/workspace/data/STL10/",
                                                 transform=data_transforms)
            num_classes = 10
            input_size = 512 * 3 * 3

            return dataset, num_classes, input_size

        # elif name == 'Food':
        #
        #     class Food(Dataset):
        #
        #         def __init__(self, files, class_names, transform=transforms.ToTensor()):
        #
        #             self.data = files
        #             self.transform = transform
        #             self.class_names = class_names
        #
        #         def __getitem__(self, idx):
        #             img = Image.open(self.data[idx]).convert('RGB')
        #             name = self.data[idx].split('/')[-2]
        #             y = self.class_names.index(name)
        #             img = self.transform(img)
        #             return img, y
        #
        #         def __len__(self):
        #             return len(self.data)
        #
        #     data_transforms = transforms.Compose([
        #         transforms.RandomHorizontalFlip(),
        #         transforms.RandomVerticalFlip(),
        #         transforms.Resize((224, 224)),
        #         transforms.ToTensor(),
        #         normalize])
        #
        #     path = '/home/willy-huang/workspace/data/food'
        #     files_training = glob(os.path.join(path, '*/*.jpg'))
        #     class_names = []
        #
        #     for folder in os.listdir(os.path.join(path)):
        #         class_names.append(folder)
        #
        #     num_classes = len(class_names)
        #     dataset = Food(files_training, class_names, data_transforms)
        #     input_size = 512 * 7 * 7
        #
        #     return dataset, num_classes, input_size
        #
        # elif name == 'Stanford_dogs':
        #
        #     class Stanford_dogs(Dataset):
        #
        #         def __init__(self, files, class_names, transform=transforms.ToTensor()):
        #
        #             self.data = files
        #             self.transform = transform
        #             self.class_names = class_names
        #
        #         def __getitem__(self, idx):
        #             img = Image.open(self.data[idx]).convert('RGB')
        #             name = self.data[idx].split('/')[-2]
        #             y = self.class_names.index(name)
        #             img = self.transform(img)
        #             return img, y
        #
        #         def __len__(self):
        #             return len(self.data)
        #
        #
        #     data_transforms = transforms.Compose([
        #         transforms.RandomHorizontalFlip(),
        #         transforms.RandomVerticalFlip(),
        #         transforms.Resize((224, 224)),
        #         transforms.ToTensor(),
        #         normalize])
        #
        #     path = '/home/willy-huang/workspace/data/stanford_dogs'
        #     files_training = glob(os.path.join(path, '*/*.jpg'))
        #     class_names = []
        #
        #     for folder in os.listdir(os.path.join(path)):
        #         class_names.append(folder)
        #
        #     num_classes = len(class_names)
        #     dataset = Stanford_dogs(files_training, class_names, data_transforms)
        #     input_size = 512 * 7 * 7
        #
        #     return dataset, num_classes, input_size

    def _setup(self, config):
        self.start_time = time.time()
        self.name = args.Dataset_name
        nnArchitecture = args.Network_name

        dataset, num_class, input_size = self._get_dataset(self.name)

        num_total = len(dataset)
        shuffle = np.random.permutation(num_total)
        split_val = int(num_total * 0.2)

        train_idx, valid_idx = shuffle[split_val:], shuffle[:split_val]

        train_sampler = SubsetRandomSampler(train_idx)
        valid_sampler = SubsetRandomSampler(valid_idx)

        self.trainset_ld = DataLoader(dataset, batch_size=128, sampler=train_sampler, num_workers=4)
        self.validset_ld = DataLoader(dataset, batch_size=128, sampler=valid_sampler, num_workers=4)

        self.modelname = '{}--{}.pth.tar'.format(self.name, nnArchitecture)
        loggername = self.modelname.replace("pth.tar", "log")
        self.logger = utils.buildLogger(loggername)

        self.seed_table = np.array(["","epoch","lr","momentum","weight_decay","factor","outLoss","accuracy"])

        # ---- hyperparameters ----
        self.lr = config["lr"]
        self.momentum = config["momentum"]
        self.weight_decay = config["weight_decay"]
        self.factor = config["factor"]

        self.epochID = 0
        self.loss = nn.CrossEntropyLoss()
        self.accuracy = -999999999999.0

        # -------------------- SETTINGS: NETWORK ARCHITECTURE

        if nnArchitecture == 'Vgg11':
            self.model = Vgg11(num_class, input_size).cuda()

        elif nnArchitecture == 'Resnet18':
            self.model = Resnet18(num_class, input_size).cuda()

        elif nnArchitecture == 'MobileNet':
            self.model = MobileNet(num_class, input_size).cuda()

        elif nnArchitecture == 'MobileNetV2':
            self.model = MobileNetV2(num_class, input_size).cuda()

        else:
            self.model = None
            assert 0

        self.model = torch.nn.DataParallel(self.model).cuda()
        self.logger.info("Build Model Done")

        # -------------------- SETTINGS: OPTIMIZER & SCHEDULER --------------------
        self.optimizer = optim.SGD(filter(lambda x: x.requires_grad, self.model.parameters()),
                                   lr=self.lr,
                                   momentum=self.momentum,
                                   weight_decay=self.weight_decay,
                                   nesterov=False)

        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer,
                                                              factor=self.factor,
                                                              patience=10, mode='min')

        self.logger.info("Build Optimizer Done")

    def _train_iteration(self):

        self.model.train()

        for batchID, (input, target) in enumerate(self.trainset_ld):
            varInput = Variable(input).cuda(async=True)
            varTarget = Variable(target).cuda(async=True)
            varOutput = self.model(varInput)

            lossvalue = self.loss(varOutput, varTarget)
            self.optimizer.zero_grad()
            lossvalue.backward()
            self.optimizer.step()

    def _test(self):


        self.model.eval()

        lossVal = 0
        lossValNorm = 0
        correct = 0

        num_samples = 0
        for batchID, (input, target) in enumerate(self.validset_ld):
            with torch.no_grad():
                varInput = Variable(input).cuda(async=True)
                varTarget = Variable(target).cuda(async=True)
                varOutput = self.model(varInput)

                losstensor = self.loss(varOutput, varTarget)

                pred = varOutput.argmax(1)
                correct += (pred == varTarget).sum().cpu()

                lossVal += losstensor.item()
                lossValNorm += 1
                num_samples += len(input)

        self.outLoss = lossVal / lossValNorm
        accuracy = correct.item() / num_samples

        self.scheduler.step(self.outLoss, epoch=self.epochID)

        if accuracy > self.accuracy:
            self.accuracy = accuracy

            torch.save({'epoch': self.epochID + 1,
                        'state_dict': self.model.state_dict(),
                        'loss': self.outLoss,
                        'best_accuracy': self.accuracy,
                        'optimizer': self.optimizer.state_dict(),
                        }, "./best_"+self.modelname)

            save = np.array([self.seed_table,
                             [str(self.name), str(self.epochID+1), str(self.lr),
                              str(self.momentum), str(self.weight_decay), str(self.factor),
                              str(self.outLoss), str(self.accuracy)]])

            np.savetxt("./seed(50).csv", save, delimiter=',', fmt="%s")

        self.logger.info('Epoch [' + str(self.epochID + 1) + '] loss= {:.5f}'.format(self.outLoss) +
                         ' ---- accuracy= {:.5f}'.format(accuracy) +
                         ' ---- best_accuracy= {:.5f}'.format(self.accuracy) +
                         ' ---- model: {}'.format(self.modelname) +
                         ' ---- time: {:.1f} s'.format((time.time() - self.start_time)))

        self.epochID += 1

        return {"mean_loss": self.outLoss, "mean_accuracy": accuracy, "epoch": self.epochID}

    def _train(self):
        self._train_iteration()
        return self._test()

    def _save(self, checkpoint_dir):
        checkpoint_path = os.path.join(checkpoint_dir, "final_model.pth")
        torch.save({
            "epoch": self.epochID,
            "best_accuracy": self.accuracy,
            'loss': self.outLoss,
            "state_dict": self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
        }, checkpoint_path)
        return checkpoint_path

    def _restore(self, checkpoint_path):
        self.model.load_state_dict(checkpoint_path)


if __name__ == "__main__":

    ray.init()

    sched = AsyncHyperBandScheduler(
        time_attr="training_iteration",
        reward_attr="neg_mean_loss",
        max_t=100,
        grace_period=5)

    space = {
        "lr": (0.001, 0.1),
        "momentum": (0.10001, 0.900001)
    }


    algo = BayesOptSearch(
        space,
        max_concurrent=4,
        reward_attr="neg_mean_loss",
        utility_kwargs={
            "kind": "ucb",
            "kappa": 2.5,
            "xi": 0.0
        }
    )
    exp_name = "{}.{}_result".format(args.Dataset_name, args.Network_name)
    tune.run_experiments(
        {
            exp_name: {
                "stop": {
                    # "mean_accuracy": 0.98,
                    # "training_iteration": 100
                    "epoch": 50
                },
                "resources_per_trial": {
                    "cpu": 8,
                    "gpu": 1
                },
                "run": HyperTrain,
                "checkpoint_at_end": True,
                # "num_samples": 20,
                "config": {
                    "lr": tune.grid_search([0.01, 0.1]),
                    "momentum": tune.grid_search([0.1, 0.9]),
                    "weight_decay": tune.grid_search([1e-4, 1e-6]),
                    "factor": tune.grid_search([0.1, 0.5])
                    # "lr": tune.sample_from(
                    #     lambda spec: np.random.uniform(0.001, 0.1)),
                    # "momentum": tune.sample_from(
                    #     lambda spec: np.random.uniform(0.1, 0.9))
                },
                "local_dir" : "/home/willy-huang/workspace/research/ray_results",
            }
        },
        verbose=1,
        # search_alg=algo,
        # scheduler=sched
    )

Exception: Perhaps you called ray.init twice by accident? This error can be suppressed by passing in 'ignore_reinit_error=True' or by calling 'ray.shutdown()' prior to 'ray.init()'.