<a href="https://colab.research.google.com/github/Longcodedao/NAS-With-RL/blob/main/NAS_Total.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive when running on Colab. The google.colab package is only
# importable inside a Colab runtime, so the import is guarded to keep the
# notebook runnable (Restart & Run All) on a local kernel as well.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder on the mounted Drive.
# NOTE(review): the recorded output of this cell reports "No such file or
# directory" and the kernel stayed in /content — confirm the path before relying on it.
%cd /content/drive/MyDrive/AutoML/NAS/

[Errno 2] No such file or directory: '/content/drive/MyDrive/AutoML/NAS/'
/content


In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn

In [None]:
class Params:
    """Central place for the experiment's tunable constants.

    NOTE(review): none of the cells visible in this notebook read these values
    yet; the meanings below that are not spelled out are assumptions to confirm.
    """

    # Controller dimensions.
    HIDDEN_SIZE = 64    # Number of hidden units in the controller
    INPUT_SIZE = 3
    ACTION_SPACE = 2
    NUM_STEPS = 4

    # Training schedule.
    NUM_EPOCHS = 50
    BATCH_SIZE = 64

    # Optimisation coefficients.
    ALPHA = 0.005       # presumably the learning rate — TODO confirm
    GAMMA = 0.99        # presumably the reward discount factor — TODO confirm
    BETA = 0.1          # The entropy bonus multiplier


In [None]:
# Convert MNIST images to tensors and rescale pixel values from [0, 1] to [-1, 1].
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5,), (0.5, ))]
)

trainset = torchvision.datasets.MNIST(root = './data', train = True,
                                      download = True, transform = transform)
testset = torchvision.datasets.MNIST(root = './data', train = False,
                                     download = True, transform = transform)

# The batch size comes from the shared config instead of a repeated literal.
trainloader = torch.utils.data.DataLoader(trainset, batch_size = Params.BATCH_SIZE,
                                          shuffle = True, num_workers = 2)
testloader = torch.utils.data.DataLoader(testset, batch_size = Params.BATCH_SIZE,
                                         shuffle = False, num_workers = 2)

# Backward-compatible alias: other cells still refer to the loader by its
# original (misspelled) name.
testlaoder = testloader

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 143049319.76it/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 37433774.36it/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 58481877.58it/s]

Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw






Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 7040106.71it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw






In [None]:
class Controller(nn.Module):
    """Recurrent controller that emits logits for every search-space dimension.

    Each search dimension owns one ``LSTMCell`` and one linear head. For every
    generated layer the dimensions are visited in order, and the logits produced
    for one dimension are fed as the input of the next one (wrapping around from
    the last dimension to the first on the following layer).

    Args:
        search_space: Mapping from dimension name to the list of candidate values.
        hidden_size: Number of hidden units in every ``LSTMCell``.
        max_layer: Number of layers generated by one forward pass.
        device: Torch device string; an empty string falls back to ``'cpu'``.

    Raises:
        ValueError: If ``search_space`` is empty.
    """

    def __init__(self, search_space,
                hidden_size = 64, max_layer = 4, device = ''):

        super(Controller, self).__init__()

        if len(search_space) == 0:
            raise ValueError("search_space must contain at least one dimension")

        self.search_space = search_space
        # An empty device string is not accepted by torch, so the default
        # argument would otherwise fail on the first `.to(...)` call.
        self.DEVICE = device if device else 'cpu'
        self.hidden_size = hidden_size
        self.length_search = len(search_space)     # num_steps = max_layer * length_search_space
        self.list_length = [len(space) for space in search_space.values()]
        self.max_layer = max_layer
        # forward() always generates `max_layer` layers. Cells that consume the
        # controller read `total_layer`, so it is exposed here as the same number.
        self.total_layer = max_layer

        self.lstm = nn.ModuleList()
        self.fc = nn.ModuleList()

        # The first cell receives the logits of the LAST dimension (wrap-around),
        # every other cell receives the logits of the dimension right before it.
        self.lstm.append(nn.LSTMCell(self.list_length[-1], self.hidden_size).to(self.DEVICE))
        for i in range(1, self.length_search):
            self.lstm.append(nn.LSTMCell(self.list_length[i - 1], self.hidden_size).to(self.DEVICE))

        # One linear head per dimension, mapping the hidden state to its logits.
        for i in range(self.length_search):
            self.fc.append(nn.Linear(self.hidden_size, self.list_length[i]).to(self.DEVICE))

    def init_hidden(self):
        """Return a zero-initialised ``(h, c)`` pair of shape ``(1, hidden_size)``."""
        h_t = torch.zeros(1, self.hidden_size, dtype = torch.float, device = self.DEVICE)
        c_t = torch.zeros(1, self.hidden_size, dtype = torch.float, device = self.DEVICE)

        return (h_t, c_t)

    def forward(self, input):
        """Generate logits for ``max_layer`` layers.

        Args:
            input: Tensor of shape ``(1, len(last search dimension))`` used as the
                input of the very first LSTM step.

        Returns:
            Dict mapping each search-dimension name to a tensor of shape
            ``(max_layer, number of candidates for that dimension)``. The dict is
            empty when ``max_layer`` is 0.
        """
        outputs = {}

        # Fresh recurrent state for every forward pass (one state per dimension).
        self.hidden = [self.init_hidden() for _ in range(self.length_search)]

        for _ in range(self.max_layer):
            for i, key in enumerate(self.search_space):
                h_t, c_t = self.lstm[i](input, self.hidden[i])
                self.hidden[i] = (h_t, c_t)
                output = self.fc[i](h_t)
                # The logits of this step drive the next step.
                input = output
                outputs.setdefault(key, []).append(output)

        # Each step yields (1, C); stack to (max_layer, 1, C), then drop the batch axis.
        return {key: torch.stack(steps).squeeze(1) for key, steps in outputs.items()}


In [None]:
# Activation codes used in the search space: 0: nn.ReLU, 1: nn.Tanh, 2: nn.Sigmoid

search_space = {
    "hidden_units": [8, 16, 32, 64],
    "activation": [0, 1, 2]
}


device = 'cuda' if torch.cuda.is_available() else 'cpu'

controller = Controller(search_space, max_layer = 4, device = device)
# `max_layer` is the number of layers one forward pass of the controller generates.
print(f"Total Layer: {controller.max_layer}")
print(f"List Length: {controller.list_length}")
print(controller)
# Seed input for the first LSTM step; its width (3) matches the size of the
# last search dimension. The name `input` is kept because a later cell reads it.
input = torch.tensor([[1.0, 2.0, 3.0]]).to(device)
outputs = controller(input)
# print(outputs)

Total Layer: 2
List Length: [4, 3]
Controller(
  (lstm): ModuleList(
    (0): LSTMCell(3, 64)
    (1): LSTMCell(4, 64)
  )
  (fc): ModuleList(
    (0): Linear(in_features=64, out_features=4, bias=True)
    (1): Linear(in_features=64, out_features=3, bias=True)
  )
)


In [None]:
class NASModel(nn.Module):
    """Fully connected child network built from a sampled architecture.

    Args:
        architectures: Mapping with two equally long sequences (tensors or plain
            lists): ``'hidden_units'`` gives the width of each hidden layer and
            ``'activation'`` gives its activation code
            (0: ``nn.ReLU``, 1: ``nn.Tanh``, 2: ``nn.Sigmoid``).
        input_size: Number of input features.
        output_size: Number of output classes.

    Raises:
        ValueError: If an activation code is not 0, 1 or 2.

    The network ends with a plain ``nn.Linear`` and therefore returns raw logits.
    ``nn.CrossEntropyLoss`` applies log-softmax itself, so a trailing softmax
    layer would normalise twice and flatten the gradients. Apply
    ``torch.softmax(logits, dim = 1)`` on the output when probabilities are needed;
    the arg-max prediction is the same either way.
    """

    # Activation code -> module class.
    _ACTIVATIONS = {0: nn.ReLU, 1: nn.Tanh, 2: nn.Sigmoid}

    def __init__(self, architectures, input_size, output_size):
        super(NASModel, self).__init__()
        self.architectures = architectures
        self.length_layers = len(self.architectures['hidden_units'])
        self.output_size = output_size

        layers = []
        # Width feeding the next Linear layer; starts at the input size so an
        # architecture with zero hidden layers still yields a valid classifier.
        in_features = input_size

        for layer in range(self.length_layers):
            # int(...) accepts both 0-dim tensors and plain Python integers.
            hidden_units = int(self.architectures['hidden_units'][layer])
            activation = int(self.architectures['activation'][layer])

            if activation not in self._ACTIVATIONS:
                raise ValueError(
                    f"Unknown activation code {activation}; expected one of "
                    f"{sorted(self._ACTIVATIONS)}"
                )

            layers.append(nn.Linear(in_features, hidden_units))
            layers.append(self._ACTIVATIONS[activation]())
            in_features = hidden_units

        # Output head: raw logits, no softmax (see class docstring).
        layers.append(nn.Linear(in_features, self.output_size))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits for a batch ``x`` of shape ``(batch, input_size)``."""
        return self.model(x)

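Define an architecture: sample one choice per layer from the controller's logits and record the log-probability of each sampled action.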

In [None]:
from torch.distributions import Categorical
from torch.nn.functional import one_hot, log_softmax, softmax, normalize

architecture = {}
episode_total_log_probs = {}
controller = Controller(search_space, max_layer = 4, device = device)
episode_logits = controller(input)

# The controller returns one row of logits per generated layer, so the row
# count of any entry is the number of layers in the sampled architecture.
num_layers = next(iter(episode_logits.values())).shape[0]
print(f"Number of layers is: {num_layers}")
for key, space in search_space.items():
    logits = episode_logits[key]                    # (num_layers, len(space))

    policy = Categorical(logits = logits)
    action_index = policy.sample()                  # (num_layers,)

    # Translate the sampled indices into the actual candidate values.
    choices = torch.tensor(space, device = device)
    architecture[key] = choices[action_index]

    # log_prob reads, for every layer, the log-probability of that layer's own
    # sampled action. A one-hot mask built from an (num_layers, 1) index would
    # carry an extra axis and broadcast across layers, mixing their log-probs.
    episode_total_log_probs[key] = policy.log_prob(action_index)


print(architecture)

print(episode_total_log_probs)

Number of layers is: 2
{'hidden_units': tensor([16, 16], device='cuda:0'), 'activation': tensor([1, 2], device='cuda:0')}
{'hidden_units': tensor([[ 0.0000, -2.5870,  0.0000,  0.0000],
        [ 0.0000, -2.5870,  0.0000,  0.0000]], device='cuda:0',
       grad_fn=<SumBackward1>), 'activation': tensor([[ 0.0000, -2.3228,  0.0000],
        [ 0.0000,  0.0000, -2.1969]], device='cuda:0', grad_fn=<SumBackward1>)}


In [None]:
model = NASModel(architecture, 784, 10)
print(model)

NASModel(
  (model): Sequential(
    (0): Linear(in_features=784, out_features=16, bias=True)
    (1): Tanh()
    (2): Linear(in_features=16, out_features=16, bias=True)
    (3): Sigmoid()
    (4): Linear(in_features=16, out_features=10, bias=True)
    (5): Softmax(dim=1)
  )
)


In [None]:
from torch.distributions import Categorical
from torch.nn.functional import one_hot, log_softmax, softmax, normalize
import torch.optim as optim
import tqdm

def _sample_architecture(controller):
    """Sample one candidate per layer for every search dimension.

    Returns:
        (architecture, log_probs): two dicts keyed like ``search_space``. Each
        architecture entry holds the sampled candidate values, one per layer;
        each log_probs entry holds the log-probability of the matching sampled
        action, one per layer, and stays attached to the controller's graph.
    """
    seed_input = torch.tensor([[1.0, 2.0, 3.0]]).to(device)
    episode_logits = controller(seed_input)

    architecture = {}
    log_probs = {}
    for key, space in search_space.items():
        policy = Categorical(logits = episode_logits[key])   # one row of logits per layer
        action_index = policy.sample()                       # one index per layer

        choices = torch.tensor(space, device = device)
        architecture[key] = choices[action_index]

        # log_prob picks, per layer, the log-probability of that layer's own
        # action. Multiplying by a one-hot mask with an extra singleton axis
        # would broadcast across layers and sum unrelated log-probs together.
        log_probs[key] = policy.log_prob(action_index)

    return architecture, log_probs


def _train_child(model, num_epochs = 10):
    """Train ``model`` on ``trainloader`` with SGD and print the mean loss per epoch."""
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr = 0.005, momentum = 0.9)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in trainloader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            # Flatten each 28x28 image into a 784-long vector.
            outputs = model(inputs.view(-1, 784))
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        running_loss /= len(trainloader)
        print(f"Epoch {epoch + 1}: Loss = {running_loss}")


def _evaluate_child(model):
    """Return the accuracy of ``model`` on the test loader, in percent."""
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in testlaoder:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images.view(-1, 784))
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return 100 * correct / total


def play_episode(controller):
    """Run one REINFORCE episode: sample, train and score a child network.

    Args:
        controller: Module returning, for each key of ``search_space``, a tensor
            with one row of logits per generated layer.

    Returns:
        (sum_weighted_loss, episode_total_log_probs, reward):
            sum_weighted_loss: tensor of shape ``(1,)`` holding
                ``-reward * sum(log-probs of all sampled actions)``.
            episode_total_log_probs: dict of per-layer log-probabilities of the
                sampled actions, keyed like ``search_space``.
            reward: 0-dim tensor with the child's test accuracy in percent.
    """
    architecture, episode_total_log_probs = _sample_architecture(controller)

    model = NASModel(architecture, 784, 10).to(device)
    print(f'{model}\n')

    _train_child(model)

    acc = _evaluate_child(model)
    print('Accuracy of the network on the 10000 test images: {}'.format(acc))

    # The reward is a constant with respect to the controller parameters.
    reward = torch.tensor(acc, device = device)

    # Sum the reward-weighted negative log-probabilities over every search
    # dimension (not only hard-coded keys), so new dimensions are included.
    per_dimension = [torch.sum(-log_probs * reward)
                     for log_probs in episode_total_log_probs.values()]
    sum_weighted_loss = torch.stack(per_dimension).sum().unsqueeze(0)

    return sum_weighted_loss, episode_total_log_probs, reward

In [None]:

controller = Controller(search_space, max_layer = 4, device = device)
print(controller)
optimizer = optim.Adam(controller.parameters(), lr = 0.001)
# Reward of every episode played, in order, so the search can be inspected afterwards.
total_rewards = []

controller.train()
for epoch in range(10):

  optimizer.zero_grad()
  # Losses of the episodes played for this controller update.
  episode_losses = []

  for _ in range(3):
    sum_weighted_loss, episode_log_probs, reward = play_episode(controller)
    print(sum_weighted_loss)
    episode_losses.append(sum_weighted_loss)
    total_rewards.append(reward.item())

  # Average the per-episode losses (each has shape (1,)) into one scalar.
  epoch_losses = torch.cat(episode_losses)
  loss = torch.mean(epoch_losses)

  loss.backward()
  optimizer.step()

  print(f"Loss in {epoch} is: {loss}")



Controller(
  (lstm): ModuleList(
    (0): LSTMCell(3, 64)
    (1): LSTMCell(4, 64)
  )
  (fc): ModuleList(
    (0): Linear(in_features=64, out_features=4, bias=True)
    (1): Linear(in_features=64, out_features=3, bias=True)
  )
)
NASModel(
  (model): Sequential(
    (0): Linear(in_features=784, out_features=32, bias=True)
    (1): Sigmoid()
    (2): Linear(in_features=32, out_features=10, bias=True)
    (3): Softmax(dim=1)
  )
)

Epoch 1: Loss = 2.246454812316244
Epoch 2: Loss = 2.0350197571427073
Epoch 3: Loss = 1.8988345485252103
Epoch 4: Loss = 1.8306730727651226
Epoch 5: Loss = 1.8100813667911457
Epoch 6: Loss = 1.8001896073060757
Epoch 7: Loss = 1.7934741138903572
Epoch 8: Loss = 1.7858665119102006
Epoch 9: Loss = 1.7576661290390405
Epoch 10: Loss = 1.7105653771459421
Accuracy of the network on the 10000 test images: 87.82
tensor([211.0522], device='cuda:0', grad_fn=<AddBackward0>)
NASModel(
  (model): Sequential(
    (0): Linear(in_features=784, out_features=32, bias=True)
    

KeyboardInterrupt: ignored

In [None]:
# Seed input for the controller's first LSTM step.
input = torch.tensor([[1.0, 2.0, 3.0]]).to(device)

# One forward pass is enough; its result holds one row of logits per layer.
episode_logits = controller(input)
architecture = {}
episode_total_log_probs = {}

for key, space in search_space.items():
    logits = episode_logits[key]

    policy = Categorical(logits = logits)
    action_index = policy.sample()                  # one sampled index per layer

    # Translate the sampled indices into the actual candidate values.
    architecture[key] = torch.tensor(space, device = device)[action_index]

    # Per-layer log-probability of the sampled action. A one-hot mask with an
    # extra singleton axis would broadcast across layers and mix their values.
    episode_total_log_probs[key] = policy.log_prob(action_index)
architecture

{'hidden_units': tensor([64,  8], device='cuda:0'),
 'activation': tensor([1, 0], device='cuda:0')}

In [None]:
# Build the child network described by `architecture` on the active device.
model = NASModel(architecture, 784, 10).to(device)
print(f'{model}\n')

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr = 0.005, momentum = 0.9)

# --- Training: 10 passes over the training loader -------------------------
for epoch in range(10):
    model.train()
    running_loss = 0.0

    for i, (inputs, labels) in enumerate(trainloader):
        # Move the batch to the device and flatten each image to 784 features.
        inputs = inputs.to(device).view(-1, 784)
        labels = labels.to(device)

        # Reset gradients, then forward, backward and update.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Accumulate the batch loss for the epoch average.
        running_loss += loss.item()

    running_loss /= len(trainloader)
    print(f"Epoch {epoch + 1}: Loss = {running_loss}")

# --- Evaluation: accuracy over the test loader ------------------------------
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in testlaoder:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images.view(-1, 784))
        # Index of the largest output per row is the predicted class.
        predicted = torch.max(outputs.data, 1)[1]
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

acc = 100 * correct / total
print('Accuracy of the network on the 10000 test images: {}'.format(acc))

NASModel(
  (model): Sequential(
    (0): Linear(in_features=784, out_features=64, bias=True)
    (1): Tanh()
    (2): Linear(in_features=64, out_features=8, bias=True)
    (3): ReLU()
    (4): Linear(in_features=8, out_features=10, bias=True)
    (5): Softmax(dim=1)
  )
)

Epoch 1: Loss = 2.1128573410038247
Epoch 2: Loss = 1.7757354270674781
Epoch 3: Loss = 1.7230752162587668
Epoch 4: Loss = 1.7097431104828809
Epoch 5: Loss = 1.7022325784158605
Epoch 6: Loss = 1.6970807165225177
Epoch 7: Loss = 1.6929669145073718
Epoch 8: Loss = 1.6894590497525261
Epoch 9: Loss = 1.6868822404316492
Epoch 10: Loss = 1.6845199286556447
Accuracy of the network on the 10000 test images: 77.66
