In [91]:
# imports
import os
import json
import re

import torch 
from torch.utils import data
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

import numpy as np

from MDP import MDP

In [92]:
# check torch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.cuda.get_device_name()

'NVIDIA GeForce RTX 3080'

In [93]:
#load MDP
mdp = MDP(dir = "data_medium", type = "val", name = "100595")
# show grid
mdp.print_grid()
for a in ["move", "turnRight", "pickMarker", "move"]:
    mdp.get_next_state(a)
mdp.print_grid()

[[['v' '.' '.' '#']
  ['.' '#' '#' '#']
  ['.' '#' '.' '#']
  ['#' '#' '#' '#']]

 [['.' '.' '.' '#']
  ['O' '#' '#' '#']
  ['v' '#' '.' '#']
  ['#' '#' '#' '#']]]
[[['.' '.' '.' '#']
  ['<' '#' '#' '#']
  ['.' '#' '.' '#']
  ['#' '#' '#' '#']]

 [['.' '.' '.' '#']
  ['O' '#' '#' '#']
  ['v' '#' '.' '#']
  ['#' '#' '#' '#']]]


## Initialize Model with Imitation learning

In [94]:
# data tests
os.listdir("datasets/data/train")

['seq', 'task']

In [95]:
# create dataset

#numerical representation of actions
getNumAction = {
    "move" : 0,
    "turnRight": 1,
    "turnLeft" : 2,
    "pickMarker": 3,
    "putMarker" : 4,
    "finish" : 5
}

class Dataset(data.Dataset):
    """
    attributes:
    dir : str list :=  accepted directories (data, data_easy, data_medium)
    type : str list := train and/or val
    grid : tensor := a tensor of all available grids
    actions : tensor := vector of the optimal action for each
    """
    def data_generator(self):
        for dir in self.dir:
            for type in self.type:
                for i in os.listdir(os.sep.join(["datasets", dir, type, "task"]))[:-4]:
                    i = re.sub(r"\D", "", i)
                    # load MDP and optimal sequence
                    currMDP = MDP(dir = dir, type = type, name = str(i))

                    with open(os.sep.join(["datasets", dir, type, "seq", str(i) + "_seq.json"])) as seq:
                        sequence = json.load(seq)["sequence"]
                    
                    for action in sequence:
                        yield currMDP.get_current_state().copy(), action
                        currMDP.get_next_state(action)


    def __init__(self, dir = ["data", "data_easy", "data_medium"], type = ["train"]) -> None:
        """
        dir : str list :=  accepted directories (data, data_easy, data_medium)
        type : str list := train and/or val
        """
        self.dir = dir
        self.type = type
        lstActionsAndGrids = list(self.data_generator())
        self.grid = torch.tensor(np.array([x[0] for x in lstActionsAndGrids]) / 10)
        self.actions = torch.tensor(np.array([getNumAction[x[1]] for x in lstActionsAndGrids]))
    
    def __len__(self):
        return len(self.grid)

    def __getitem__(self, idx):
        return self.grid[idx], self.actions[idx]

trainDataset = Dataset()
valDataset = Dataset(type = ["val"])

In [96]:
trainDataset[0][0]

tensor([[[1.0000, 0.0000, 0.0000, 1.0000],
         [1.0000, 0.0000, 1.0000, 0.0000],
         [0.0000, 0.4000, 0.9000, 0.0000],
         [0.0000, 0.0000, 1.0000, 0.0000]],

        [[1.0000, 0.0000, 0.0000, 1.0000],
         [1.0000, 0.4000, 1.0000, 0.0000],
         [0.0000, 0.0000, 0.9000, 0.0000],
         [0.0000, 0.0000, 1.0000, 0.0000]]], dtype=torch.float64)

In [115]:
# create Neural Network

class Net(nn.Module):
    """
    input : 2 X 4 X 4 grid
    label : Move [0;6]
    """
    def __init__(self):
        super(Net, self).__init__()
        # first layer: input
        self.conv1 = nn.Conv2d(2, 8, 2)

        #second layer : 2nd convolution
        self.conv2 = nn.Conv2d(8, 16, 2)

        self.conv3 = nn.Conv2d(16, 32, 2)

        self.fc1 = nn.Linear(32, 64)

        self.out = nn.Linear(64, 6)


    def forward(self, x):
        x = x.float()

        x = F.relu(self.conv1(x))

        x = F.relu(self.conv2(x))

        x = F.relu(self.conv3(x))

        x = torch.flatten(x,start_dim=1)

        x = F.relu(self.fc1(x))

        x = self.out(x)
        #x = F.softmax(x, dim = 0)
        return x    

In [116]:
#creating model
net = Net()
print(net)

params = list(net.parameters())
print(f"number of parameters: {len(params)}")

#loss function
loss = nn.CrossEntropyLoss()

#optimizer
optimizer = torch.optim.Adam(net.parameters())
optimizer

Net(
  (conv1): Conv2d(2, 8, kernel_size=(2, 2), stride=(1, 1))
  (conv2): Conv2d(8, 16, kernel_size=(2, 2), stride=(1, 1))
  (conv3): Conv2d(16, 32, kernel_size=(2, 2), stride=(1, 1))
  (fc1): Linear(in_features=32, out_features=64, bias=True)
  (out): Linear(in_features=64, out_features=6, bias=True)
)
number of parameters: 10


Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)

In [117]:
BATCH_SIZE = 32
dataloader = data.DataLoader(trainDataset, BATCH_SIZE)

def train_one_epoch(epoch_index, tb_writer):
    running_loss = 0
    last_loss = 0

    for i, data in enumerate(dataloader):
        inputs, labels = data

        optimizer.zero_grad()

        outputs = net(inputs)

        lossVal = loss(outputs, labels)
        lossVal.backward()

        optimizer.step()

        running_loss += lossVal.item()

        if i % 1000 == 999:
            last_loss = running_loss / 1000
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(dataloader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss

In [118]:
timestamp = datetime.now().strftime("%d%m%Y_%H%M%S")
writer = SummaryWriter("runs/imitation_learning_{}".format(timestamp))
epoch_num = 0

EPOCHS = 13

best_vloss = 1000000

for epoch in range(EPOCHS):
    print("EPOCH {}".format(epoch_num + 1))

    net.train(True)
    avg_loss = train_one_epoch(epoch_num, writer)

    net.train(False)

    running_vloss = 0.0
    # TODO: implement validation


    writer.add_scalars("Training Loss", {"Training" : avg_loss})
    writer.flush()

    epoch_num += 1

EPOCH 1
  batch 1000 loss: 1.5727730317115785
  batch 2000 loss: 1.3680681443214417
  batch 3000 loss: 1.3102844228744508
  batch 4000 loss: 1.1738843717575074
  batch 5000 loss: 1.2002523643970489
EPOCH 2
  batch 1000 loss: 1.1881384137868882
  batch 2000 loss: 1.1391257843971252
  batch 3000 loss: 1.0889491180181503
  batch 4000 loss: 0.9354054371416569
  batch 5000 loss: 0.9163639705181121
EPOCH 3
  batch 1000 loss: 0.9582524493336677
  batch 2000 loss: 0.9409482676386833
  batch 3000 loss: 0.898476741194725
  batch 4000 loss: 0.8154534488022327
  batch 5000 loss: 0.737059116512537
EPOCH 4
  batch 1000 loss: 0.8632445269525051
  batch 2000 loss: 0.8447897163331508
  batch 3000 loss: 0.8374769319295883
  batch 4000 loss: 0.709097480431199
  batch 5000 loss: 0.6351571732163429
EPOCH 5
  batch 1000 loss: 0.8166584567129612
  batch 2000 loss: 0.7976168520152569
  batch 3000 loss: 0.7644486688673496
  batch 4000 loss: 0.6564892259538173
  batch 5000 loss: 0.5683356807678938
EPOCH 6
  bat

In [141]:
# test model
actions = ["move", "turnRight", "turnLeft", "pickMarker", "putMarker", "finish"]

testMDP = MDP(dir = "data", type = "val", name = "101778")

def apply_to_grid(MDP):
    reward = 0
    steps = 0
    MDP.print_grid()
    while True:
        current_grid = torch.tensor(MDP.get_current_state()) / 10
        out = net(current_grid.unsqueeze(0))
        nextAction = actions[torch.argmax(out)]
        reward = MDP.gamma * reward + MDP.reward(nextAction)
        steps += 1
        print("action: {}, reward: {}".format(nextAction, reward))
        if MDP.get_next_state(nextAction) == "Terminal":
            return reward, steps
        MDP.print_grid()


apply_to_grid(testMDP)

[[['#' '.' '.' '#']
  ['u' 'O' '.' '#']
  ['#' '.' '#' '.']
  ['.' '.' '.' '#']]

 [['#' '.' '.' '#']
  ['.' 'O' '>' '#']
  ['#' '.' '#' '.']
  ['.' '.' '.' '#']]]
action: pickMarker, reward: 1.0
[[['#' '.' '.' '#']
  ['^' 'O' '.' '#']
  ['#' '.' '#' '.']
  ['.' '.' '.' '#']]

 [['#' '.' '.' '#']
  ['.' 'O' '>' '#']
  ['#' '.' '#' '.']
  ['.' '.' '.' '#']]]
action: move, reward: 0.5


(0.5, 2)