In [1]:
# imports
import os
import json
import re

import torch 
from torch.utils import data
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

import numpy as np

from MDP import MDP

import stable_baselines3
import sb3_contrib

import gym

In [2]:
# check torch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.cuda.get_device_name()

'NVIDIA GeForce RTX 3080'

In [3]:
# create Neural Network

class Net(nn.Module):
    """
    input : 2 X 4 X 4 grid
    label : Move [0;6]
    """
    def __init__(self):
        super(Net, self).__init__()
        
        self.conv1 = nn.Conv2d(11, 32, 2, padding = 1)

        self.conv2 = nn.Conv2d(32, 64, 3, padding= 1)

        self.conv3 = nn.Conv2d(64, 128, 3)

        self.conv4 = nn.Conv2d(128, 256, 3)

        self.fc1 = nn.Linear(256, 128)

        self.fc2 = nn.Linear(128, 64)

        self.out = nn.Linear(128, 6)


    def forward(self, x):
        x = x.float()

        x = F.relu(self.lstm(x))

        x = F.relu(self.conv1(x))

        x = F.relu(self.conv2(x))

        x = F.relu(self.conv3(x))

        x = F.relu(self.conv4(x))

        x = torch.flatten(x,start_dim=1)

        x = F.relu(self.fc1(x))

        x = F.relu(self.fc2(x))

        x = self.out(x)
    
        return x   

In [4]:
#creating model
net = Net()
net.cuda()
print(net)

params = list(net.parameters())
print(f"number of parameters: {len(params)}")

#loss function
loss = nn.CrossEntropyLoss()

#optimizer
optimizer = torch.optim.Adam(net.parameters())
optimizer

Net(
  (conv1): Conv2d(11, 32, kernel_size=(2, 2), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
  (conv4): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (out): Linear(in_features=128, out_features=6, bias=True)
)
number of parameters: 14


Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)

In [5]:
#custom environment
from gym import spaces

class Gridworld(gym.Env):

    metadata = {"render.modes" : ["human"]}

    def __init__(self, episodes, dir = ["data_easy", "generated_easy", "data_medium", "generated_med", "data", "generated_imitation"], type = "train", 
                p = [0.1, 0.1, 0.15, 0.1, 0.15, 0.4], lambda1 = 0.01, lambda2 = 0.1, lambda3 = 1, load_optimal = False, epsilon = 0.1) -> None:
        super(Gridworld, self).__init__()
        self.action_space = spaces.Discrete(6)
        self.observation_space = spaces.Box(low = 0, high = 1, shape = (11, 4, 4))

        #available MDPs
        self.dir = dir
        self.type = type
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.lambda3 = lambda3

        self.actions = ["move", "turnLeft", "turnRight", "pickMarker", "putMarker","finish"]
        self.actionsDict = {"move" : 0, "turnLeft" : 1, "turnRight" : 2, "pickMarker" : 3, "putMarker" : 4, "finish" : 5}

        self.epsilon = epsilon

        self.load_optimal = load_optimal

        #generate curriculum
        #{"data_easy" : 15% ,"generated_easy" : 20% , "data_medium" : 15% , "generated_med" : 15% "data" : 30%, "generated_imitation" : 15%}
        self.curriculum = []
        #data_easy
        for directory,amount in zip(dir, p):
            tasks = list(np.random.choice(os.listdir(os.sep.join(["datasets", directory, type, "task"])), int(episodes * amount)))
            tasks = zip([directory for _ in range(int(episodes * amount))], tasks)
            self.curriculum += tasks
        self.currentTask = 0
    

    def construct_feature_matrices(self, matrix):
        """
        creates the input for the Neural Network: 11 x 4 x 4 boolean tensor for:
        - if a wall is present
        - if a marker must be picked up
        - if a marker must be put down
        - if the agent facing left, down, right, up in the pregrid (one for each direction)
        - if the agent facing left, down, right, up in the postgrid (one for each direction)
        """
        wallMat = matrix[0] == 10

        marker_post = (matrix[0] > 4) & (matrix[0] < 10)
 
        marker_pre = (matrix[1] > 4) & (matrix[1] < 10)

        #using a xor operation
        marker_pick = marker_post & np.logical_not(marker_pre)
        marker_put = marker_pre & np.logical_not(marker_post)

        #print("pick")
        #print(marker_pick)
        #print("put")
        #print(marker_put)


        agent_pre = np.zeros((4,4,4))
        agent_dir = (matrix[(0, *self.currentMDP.agentPosition)] - 1) % 4
        agent_pre[(int(agent_dir), *self.currentMDP.agentPosition)] = 1
        
        agent_post = np.zeros((4,4,4))
        agent_pos_post = ((matrix[1] > 0) & (matrix[1] < 9)).nonzero()
        agent_dir = (matrix[(1, *agent_pos_post)] - 1) % 4
        agent_post[(int(agent_dir), *agent_pos_post)] = 1
        
        return np.array([wallMat, marker_pick, marker_put, *agent_pre, *agent_post])


    def reset(self):
        nextDir, self.nexti = self.curriculum[self.currentTask]
        self.nexti = re.sub(r"\D", "", self.nexti)
        self.currentTask += 1
        self.currentMDP = MDP(nextDir, self.type, self.nexti, lambda1= self.lambda1, lambda2 = self.lambda2, lambda3 =self.lambda3)

        # load optimal sequence if possible
        try:
            with open(os.sep.join(["datasets", nextDir, self.type, "seq", self.nexti + "_seq.json"])) as grid:
                grid = json.load(grid)
                self.optimal_seq = grid["sequence"]
                # current step
                assert self.optimal_seq[-1] == "finish"
        except:  
            self.optimal_seq = []
        self.steps = 0

        #with probability 1 - epsilon use either the optimal sequence or better masks
        self.use_optimal =  (np.random.rand() < (1 - self.epsilon)) and self.load_optimal

        return self.construct_feature_matrices(self.currentMDP.get_current_state())

    def step(self, action):
        nextState, rew, done, info = self.currentMDP.sample_next_state_and_reward(self.actions[action])
        self.steps += 1
        if self.steps > 500:
            return nextState, -self.lambda3, True, info 

        return self.construct_feature_matrices(nextState), rew - 0.01, done, info 
        
    def render(self):
        self.currentMDP.print_grid()

    def close(self):
        pass
    
    def action_masks(self):

        mat = self.currentMDP.get_current_state()
        if np.array_equal(mat[0], mat[1]):
            return np.array([0,0,0,0,0,1])

        # force agent to take optimal action if possible
        mask = self.currentMDP.action_mask()

        if self.use_optimal and self.optimal_seq:
            mask = np.zeros(6)
            mask[self.actionsDict[self.optimal_seq[self.steps]]] = 1
            return mask
        
        return mask

    # functions bellow are only used for inheritance 
    def get_MDP(self):
        return self.currentMDP

    def get_MDP_name(self):
        return self.nextDir, self.nextType, self.nexti

In [6]:
testEnv = Gridworld(100, dir = ["data"], load_optimal= True)

In [7]:
m = testEnv.reset()
testEnv.render()
print(testEnv.action_masks())
m, r, _, _ = testEnv.step(np.argmax(testEnv.action_masks()))
print(r)
testEnv.render()
testEnv.action_masks()
_, r, _, _ = testEnv.step(np.argmax(testEnv.action_masks()))
print(r)
testEnv.render()

[[['.' '#' '.' '.']
  ['O' '.' '<' '#']
  ['#' '#' '#' '#']
  ['.' '#' '.' '#']]

 [['^' '#' '.' '.']
  ['.' '.' '.' '#']
  ['#' '#' '#' '#']
  ['.' '#' '.' '#']]]
[1. 0. 0. 0. 0. 0.]
0.0
[[['.' '#' '.' '.']
  ['O' '<' '.' '#']
  ['#' '#' '#' '#']
  ['.' '#' '.' '#']]

 [['^' '#' '.' '.']
  ['.' '.' '.' '#']
  ['#' '#' '#' '#']
  ['.' '#' '.' '#']]]
0.0
[[['.' '#' '.' '.']
  ['l' '.' '.' '#']
  ['#' '#' '#' '#']
  ['.' '#' '.' '#']]

 [['^' '#' '.' '.']
  ['.' '.' '.' '#']
  ['#' '#' '#' '#']
  ['.' '#' '.' '#']]]


In [8]:
np.random.seed(123)

def test_RL_models(model):
    totalCorrect, totalOptimal = 0,0
    for dir, num in zip(["data_easy", "data_medium", "data"], [80, 24, 480]):
        print("current data: " + dir)
        total_steps = 0
        valDataset = Gridworld(num * 10, dir = [dir], p = [1], type = "val", lambda1=0, lambda2=0, load_optimal= False)
        correct, total, optimal = 0, num*10, 0
        for task in range(int(total)):
            if task % num == num-1:
                print(f"{(task+1) / total *100} %, running acc: {(correct*100)/(task+1)}, task solved optimaly: {optimal*100/task} %, average steps to solve: {total_steps / task}")
            currMDP = valDataset.reset()
            lenOptimalSeq = len(valDataset.optimal_seq)
            done = False
            steps = 0
            while not done and steps < 50:
                action = model.predict(currMDP, action_masks = valDataset.action_masks(), deterministic = False)[0]
                currMDP, rew, done, _ = valDataset.step(action)
                steps += 1
                if rew > 0:
                    correct += 1
                    total_steps += steps
                    if steps == lenOptimalSeq:
                        optimal += 1
        totalCorrect += correct
        totalOptimal += optimal
    print(f"Total Accuracy : {totalCorrect*100/5840}, Solved Optimally : {totalOptimal *100 / 5840}")

In [9]:
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from sb3_contrib.ppo_mask import MaskablePPO
from stable_baselines3.common.env_util import make_vec_env

import torch.nn.functional as F
from torch import nn

class CustomFeatureExtractorTorch(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.Space, features_dim: int = 64):
        super().__init__(observation_space, features_dim)

        self.conv1 = net.conv1
        self.conv2 = net.conv2
        self.conv3 = net.conv3
        self.conv4 = net.conv4
        
        self.fc1 = net.fc1
        self.fc2 = net.fc2
  
    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))

        x = torch.flatten(x, start_dim=1)
        
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))

        return x

net_arch = [
    64, 128, 64,
    dict(vf = [64, 32, 18], pi = [32, 16, 6])
]

policy_kwargs = dict(
    features_extractor_class = CustomFeatureExtractorTorch,
    net_arch = net_arch
)

In [10]:
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from sb3_contrib.ppo_mask import MaskablePPO
from stable_baselines3.common.env_util import make_vec_env
from sb3_contrib.common.maskable.policies import MaskableActorCriticPolicy

from stable_baselines3.ppo import PPO
from stable_baselines3.common.policies import ActorCriticPolicy

In [11]:
episodes = 5 * 1e5
 
FinalEnv = make_vec_env(Gridworld, n_envs= 2  , 
    env_kwargs={"episodes" : episodes, "lambda1" : 0.1, "lambda2" : 0.5, "lambda3": 1, 
            "dir" : ["data_easy", "data_medium", "data", "generated_imitation", "generated_easy", "generated_med"], "load_optimal" : True, "epsilon" : 0.5})

FinalModel = MaskablePPO(MaskableActorCriticPolicy, FinalEnv,  policy_kwargs = policy_kwargs, verbose = 1, n_steps= 500)#

FinalModel.learn(episodes)

Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 3.5      |
|    ep_rew_mean     | 0.965    |
| time/              |          |
|    fps             | 1110     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 1000     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 58.8       |
|    ep_rew_mean          | -7.48      |
| time/                   |            |
|    fps                  | 744        |
|    iterations           | 2          |
|    time_elapsed         | 2          |
|    total_timesteps      | 2000       |
| train/                  |            |
|    approx_kl            | 0.01174622 |
|    clip_fraction        | 0.0646     |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.24      |
|    explained_variance   | -0.0717    |
|    learning_rate        |

KeyboardInterrupt: 

In [821]:
test_RL_models(FinalModel)

current data: data_easy
10.0 %, running acc: 97.5, task solved optimaly: 69.62025316455696 %, average steps to solve: 4.3924050632911396
20.0 %, running acc: 97.5, task solved optimaly: 72.95597484276729 %, average steps to solve: 4.40251572327044
30.0 %, running acc: 98.33333333333333, task solved optimaly: 70.2928870292887 %, average steps to solve: 4.514644351464435
40.0 %, running acc: 98.4375, task solved optimaly: 70.21943573667711 %, average steps to solve: 4.263322884012539
50.0 %, running acc: 98.25, task solved optimaly: 68.92230576441102 %, average steps to solve: 4.37844611528822
60.0 %, running acc: 98.54166666666667, task solved optimaly: 69.51983298538622 %, average steps to solve: 4.288100208768268
70.0 %, running acc: 98.39285714285714, task solved optimaly: 69.94633273703042 %, average steps to solve: 4.264758497316637
80.0 %, running acc: 98.59375, task solved optimaly: 70.57902973395932 %, average steps to solve: 4.294209702660407
90.0 %, running acc: 98.61111111111

KeyboardInterrupt: 

In [13]:
FinalModel.save("RLModel-onlyForbiddenActionMasks")

In [822]:
testEnv = Gridworld(100,load_optimal= False, dir = ["data"])

In [823]:
curr = testEnv.reset()
testEnv.render()

[[['#' '.' '.' '.']
  ['#' '#' 'O' '.']
  ['.' '.' '>' 'O']
  ['.' '#' '.' '#']]

 [['#' '.' '^' '.']
  ['#' '#' '.' '.']
  ['.' '.' '.' 'O']
  ['.' '#' '.' '#']]]


In [870]:
a = FinalModel.predict(curr, action_masks= testEnv.action_masks(), deterministic= False)[0]
curr, _, done, _ = testEnv.step(a)
print(testEnv.action_masks())
testEnv.render()
if done:
    print("finished")

[0. 1. 1. 0. 1. 0.]
[[['#' '.' '.' '.']
  ['#' '#' 'O' '.']
  ['^' '.' '.' 'O']
  ['.' '#' '.' '#']]

 [['#' '.' '^' '.']
  ['#' '#' '.' '.']
  ['.' '.' '.' 'O']
  ['.' '#' '.' '#']]]
