In [2]:
# imports
import os
import json
import re

import torch 
from torch.utils import data
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

import numpy as np

from MDP import MDP

In [3]:
# check torch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.cuda.get_device_name()

'NVIDIA GeForce RTX 3080'

In [4]:
#load MDP
mdp = MDP(dir = "data_medium", type = "val", name = "100595")
# show grid
mdp.print_grid()
for a in ["move", "turnRight", "pickMarker", "move"]:
    mdp.get_next_state(a)
mdp.print_grid()

[[['v' '.' '.' '#']
  ['.' '#' '#' '#']
  ['.' '#' '.' '#']
  ['#' '#' '#' '#']]

 [['.' '.' '.' '#']
  ['O' '#' '#' '#']
  ['v' '#' '.' '#']
  ['#' '#' '#' '#']]]
[[['.' '.' '.' '#']
  ['<' '#' '#' '#']
  ['.' '#' '.' '#']
  ['#' '#' '#' '#']]

 [['.' '.' '.' '#']
  ['O' '#' '#' '#']
  ['v' '#' '.' '#']
  ['#' '#' '#' '#']]]


## Initialize Model with Imitation learning

In [5]:
# data tests
os.listdir("datasets/data/train")

['seq', 'task']

In [6]:
# create dataset

#numerical representation of actions
getNumAction = {
    "move" : 0,
    "turnRight": 1,
    "turnLeft" : 2,
    "pickMarker": 3,
    "putMarker" : 4,
    "finish" : 5
}

class Dataset(data.Dataset):
    """
    attributes:
    dir : str list :=  accepted directories (data, data_easy, data_medium)
    type : str list := train and/or val
    grid : tensor := a tensor of all available grids
    actions : tensor := vector of the optimal action for each
    """
    def data_generator(self):
        for dir in self.dir:
            for type in self.type:
                for i in os.listdir(os.sep.join(["datasets", dir, type, "task"]))[:-4]:
                    i = re.sub(r"\D", "", i)
                    # load MDP and optimal sequence
                    currMDP = MDP(dir = dir, type = type, name = str(i))

                    with open(os.sep.join(["datasets", dir, type, "seq", str(i) + "_seq.json"])) as seq:
                        sequence = json.load(seq)["sequence"]
                    
                    for action in sequence:
                        yield currMDP.get_current_state().copy(), action
                        currMDP.get_next_state(action)


    def __init__(self, dir = ["data", "data_easy", "data_medium"], type = ["train"]) -> None:
        """
        dir : str list :=  accepted directories (data, data_easy, data_medium)
        type : str list := train and/or val
        """
        self.dir = dir
        self.type = type
        lstActionsAndGrids = list(self.data_generator())
        self.grid = torch.tensor(np.array([x[0] for x in lstActionsAndGrids]) / 10)
        self.actions = torch.tensor(np.array([getNumAction[x[1]] for x in lstActionsAndGrids]))
    
    def __len__(self):
        return len(self.grid)

    def __getitem__(self, idx):
        return self.grid[idx], self.actions[idx]

trainDataset = Dataset()
valDataset = Dataset(type = ["val"])

In [70]:
testGrid = trainDataset[0][0]

tensor([[[1.0000, 0.0000, 0.0000, 1.0000],
         [1.0000, 0.0000, 1.0000, 0.0000],
         [0.0000, 0.4000, 0.9000, 0.0000],
         [0.0000, 0.0000, 1.0000, 0.0000]],

        [[1.0000, 0.0000, 0.0000, 1.0000],
         [1.0000, 0.4000, 1.0000, 0.0000],
         [0.0000, 0.0000, 0.9000, 0.0000],
         [0.0000, 0.0000, 1.0000, 0.0000]]], dtype=torch.float64)


tensor([[1.0000, 1.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.4000, 0.0000],
        [0.0000, 1.0000, 0.9000, 1.0000],
        [1.0000, 0.0000, 0.0000, 0.0000]], dtype=torch.float64)

In [97]:
# create Neural Network

class Net(nn.Module):
    """
    input : 2 X 4 X 4 grid
    label : Move [0;6]
    """
    def __init__(self):
        super(Net, self).__init__()
        # first layer: input
        self.conv1 = nn.Conv2d(2, 8, 2)

        #second layer : 2nd convolution
        self.conv2 = nn.Conv2d(8, 16, 2)

        self.conv3 = nn.Conv2d(16, 32, 2)

        self.fc1 = nn.Linear(32, 16)

        self.out = nn.Linear(16, 6)


    def forward(self, x):
        x = x.float()

        x = F.relu(self.conv1(x))

        x = F.relu(self.conv2(x))

        x = F.relu(self.conv3(x))

        x = torch.flatten(x,start_dim=1)

        x = F.relu(self.fc1(x))

        x = self.out(x)
    
        return x    

In [98]:
#creating model
net = Net()
print(net)

params = list(net.parameters())
print(f"number of parameters: {len(params)}")

#loss function
loss = nn.CrossEntropyLoss()

#optimizer
optimizer = torch.optim.Adam(net.parameters())
optimizer

Net(
  (conv1): Conv2d(2, 8, kernel_size=(2, 2), stride=(1, 1))
  (conv2): Conv2d(8, 16, kernel_size=(2, 2), stride=(1, 1))
  (conv3): Conv2d(16, 32, kernel_size=(2, 2), stride=(1, 1))
  (fc1): Linear(in_features=32, out_features=16, bias=True)
  (out): Linear(in_features=16, out_features=6, bias=True)
)
number of parameters: 10


Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)

In [99]:
BATCH_SIZE = 16
dataloader = data.DataLoader(trainDataset, BATCH_SIZE)
validationLoader = data.DataLoader(valDataset, BATCH_SIZE)

def train_one_epoch(epoch_index, tb_writer):
    running_loss = 0
    last_loss = 0

    for i, data in enumerate(dataloader):
        inputs, labels = data

        optimizer.zero_grad()

        outputs = net(inputs)

        lossVal = loss(outputs, labels)
        lossVal.backward()

        optimizer.step()

        running_loss += lossVal.item()

        if i % 1000 == 999:
            last_loss = running_loss / 1000
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(dataloader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss

In [100]:
timestamp = datetime.now().strftime("%d%m%Y_%H%M%S")
writer = SummaryWriter("runs/imitation_learning_{}".format(timestamp))
epoch_num = 0

torch.manual_seed(1)

EPOCHS = 15

for epoch in range(EPOCHS):
    print("EPOCH {}".format(epoch_num + 1))

    net.train(True)
    avg_loss = train_one_epoch(epoch_num, writer)

    net.train(False)

    running_vloss = 0.0
    for i, vdata in enumerate(validationLoader):
        vinputs, vlabels = vdata
        vout = net(vinputs)
        vloss = loss(vout, vlabels)
        running_vloss += vloss
    avg_vloss = running_vloss / (i+1)
    print(f"Loss train {avg_loss} validation {avg_vloss}")


    writer.add_scalars("Training Loss", {"Training" : avg_loss, "validation" : avg_vloss})
    writer.flush()

    epoch_num += 1

EPOCH 1
  batch 1000 loss: 1.5842277437448502
  batch 2000 loss: 1.5768979654312134
  batch 3000 loss: 1.5450552382469178
  batch 4000 loss: 1.3469123649001122
  batch 5000 loss: 1.2608326620459556
  batch 6000 loss: 1.225639100253582
  batch 7000 loss: 1.2060368137359618
  batch 8000 loss: 0.928632303237915
  batch 9000 loss: 1.1530471400022506
  batch 10000 loss: 1.1261671338677406
Loss train 1.1261671338677406 validation 1.104140043258667
EPOCH 2
  batch 1000 loss: 1.100673337340355
  batch 2000 loss: 1.0736198697686195
  batch 3000 loss: 1.0347667849063873
  batch 4000 loss: 1.0116595275700093
  batch 5000 loss: 0.9962882779240608
  batch 6000 loss: 0.9725821277499199
  batch 7000 loss: 0.9538533962965011
  batch 8000 loss: 0.7226288927346468
  batch 9000 loss: 0.8231359447538853
  batch 10000 loss: 0.7964871341437101
Loss train 0.7964871341437101 validation 0.8486955165863037
EPOCH 3
  batch 1000 loss: 0.9036184303462506
  batch 2000 loss: 0.8607337115108967
  batch 3000 loss: 0.8

In [101]:
# test model
actions = ["move", "turnRight", "turnLeft", "pickMarker", "putMarker", "finish"]

testMDP = MDP(dir = "data", type = "val", name = "100112")

def apply_to_grid(MDP, show_grid):
    reward = 0
    steps = 0
    if show_grid:
        MDP.print_grid()
    while True:
        current_grid = torch.tensor(MDP.get_current_state()) / 10
        out = net(current_grid.unsqueeze(0))
        nextAction = actions[torch.argmax(out)]
        reward += MDP.gamma**steps * MDP.reward(nextAction)
        steps += 1
        if MDP.get_next_state(nextAction) == "Terminal" or steps > 100:
            return reward, steps, MDP.task_solved()
        if show_grid:
            print("action: {}, reward: {}".format(nextAction, reward))
            MDP.print_grid()


apply_to_grid(testMDP, True)

[[['>' '.' '.' 'O']
  ['#' '#' '.' '#']
  ['.' '#' '.' '#']
  ['.' '#' '#' '#']]

 [['.' '.' '.' '^']
  ['#' '#' '.' '#']
  ['.' '#' '.' '#']
  ['.' '#' '#' '#']]]
action: move, reward: 0.0
[[['.' '>' '.' 'O']
  ['#' '#' '.' '#']
  ['.' '#' '.' '#']
  ['.' '#' '#' '#']]

 [['.' '.' '.' '^']
  ['#' '#' '.' '#']
  ['.' '#' '.' '#']
  ['.' '#' '#' '#']]]
action: move, reward: 0.0
[[['.' '.' '>' 'O']
  ['#' '#' '.' '#']
  ['.' '#' '.' '#']
  ['.' '#' '#' '#']]

 [['.' '.' '.' '^']
  ['#' '#' '.' '#']
  ['.' '#' '.' '#']
  ['.' '#' '#' '#']]]
action: move, reward: 0.0
[[['.' '.' '.' 'r']
  ['#' '#' '.' '#']
  ['.' '#' '.' '#']
  ['.' '#' '#' '#']]

 [['.' '.' '.' '^']
  ['#' '#' '.' '#']
  ['.' '#' '.' '#']
  ['.' '#' '#' '#']]]
action: turnLeft, reward: 0.0
[[['.' '.' '.' 'u']
  ['#' '#' '.' '#']
  ['.' '#' '.' '#']
  ['.' '#' '#' '#']]

 [['.' '.' '.' '^']
  ['#' '#' '.' '#']
  ['.' '#' '.' '#']
  ['.' '#' '#' '#']]]


(-0.625, 5, False)

In [102]:
# check accuracy of solved tasks
sucesses = 0
for dir in ["data", "data_easy", "data_medium"]:
    for type in ["val"]:
        for i in os.listdir(os.sep.join(["datasets", dir, type, "task"]))[:-4]:
            i = re.sub(r"\D", "", i)
            curr_MDP = MDP(dir, type, i)
            if apply_to_grid(curr_MDP, False)[-1]:
                sucesses += 1

print("tasks solved with only imitation learning:")
print(f"Total : {sucesses}, accuracy: {sucesses / len(valDataset)}")


tasks solved with only imitation learning:
Total : 2246, accuracy: 0.13035403366221707


As it is clearly seen here, imitation learning with the limited amount of training data is not able to solve most of the provided tasks. Therefore, in the following I will use the described PPO approach and after that will create the final model that uses imitation learning to initiate the model and then trains it with maskless PPO to achieve the best possible performance.