In [1]:
# imports
import os
import json
import re

import torch 
from torch.utils import data
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

import numpy as np

from MDP import MDP

import stable_baselines3
import sb3_contrib

import gym

In [2]:
# check torch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.cuda.get_device_name()

'NVIDIA GeForce RTX 3080'

In [3]:
# create Neural Network

class Net(nn.Module):
    """
    input : 2 X 4 X 4 grid
    label : Move [0;6]
    """
    def __init__(self):
        super(Net, self).__init__()
        # first layer: input
        self.conv1 = nn.Conv2d(2, 8, 2)

        #second layer : 2nd convolution
        self.conv2 = nn.Conv2d(8, 16, 2)

        self.conv3 = nn.Conv2d(16, 32, 2)

        self.fc1 = nn.Linear(32, 32)

        self.out = nn.Linear(32, 6)


    def forward(self, x):
        x = x.float()

        x = F.relu(self.conv1(x))

        x = F.relu(self.conv2(x))

        x = F.relu(self.conv3(x))

        x = torch.flatten(x,start_dim=1)

        x = F.relu(self.fc1(x))

        x = self.out(x)
    
        return x    

In [4]:
#creating model
net = Net()
net.cuda()
print(net)

params = list(net.parameters())
print(f"number of parameters: {len(params)}")

#loss function
loss = nn.CrossEntropyLoss()

#optimizer
optimizer = torch.optim.Adam(net.parameters())
optimizer

Net(
  (conv1): Conv2d(2, 8, kernel_size=(2, 2), stride=(1, 1))
  (conv2): Conv2d(8, 16, kernel_size=(2, 2), stride=(1, 1))
  (conv3): Conv2d(16, 32, kernel_size=(2, 2), stride=(1, 1))
  (fc1): Linear(in_features=32, out_features=32, bias=True)
  (out): Linear(in_features=32, out_features=6, bias=True)
)
number of parameters: 10


Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)

In [5]:
net.load_state_dict(torch.load("Net"))

<All keys matched successfully>

In [6]:
#custom environment
from gym import spaces

class Gridworld(gym.Env):

    metadata = {"render.modes" : ["human"]}

    def __init__(self, dir = ["data", "data_easy", "data_medium"], type = ["train"], lambda1 = 0.01, lambda2 = 0.1, lambda3 = 1, load_optimal = False) -> None:
        super(Gridworld, self).__init__()
        self.action_space = spaces.Discrete(6)
        self.observation_space = spaces.Box(low = 0, high = 10, shape = (2, 4, 4))

        #available MDPs
        self.dir = dir
        self.type = type
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.lambda3 = lambda3

        self.actions = ["move", "turnLeft", "turnRight", "pickMarker", "putMarker","finish"]
        self.actionsDict = {"move" : 0, "turnLeft" : 1, "turnRight" : 2, "pickMarker" : 3, "putMarker" : 4, "finish" : 5}

        self.load_optimal = load_optimal

    def reset(self):
        nextDir, nextType = np.random.choice(self.dir), np.random.choice(self.type)
        self.nexti = np.random.choice(os.listdir(os.sep.join(["datasets", nextDir, nextType, "task"])))
        self.nexti = re.sub(r"\D", "", self.nexti)
        self.currentMDP = MDP(nextDir, nextType, self.nexti, lambda1= self.lambda1, lambda2 = self.lambda2, lambda3 =self.lambda3)

        # load optimal sequence if possible
        if self.load_optimal:
            try:
                with open(os.sep.join(["datasets", nextDir, nextType, "seq", self.nexti + "_seq.json"])) as grid:
                    grid = json.load(grid)
                    self.optimal_seq = grid["sequence"]
                    # current step
                    assert self.optimal_seq[-1] == "finish"
            except:
                self.optimal_seq = []
        self.steps = 0
        return self.currentMDP.get_current_state()

    def step(self, action):
        nextState, rew, done, info = self.currentMDP.sample_next_state_and_reward(self.actions[action])
        self.steps += 1
        if self.steps > 500:
            return nextState, -1, True, info 

        return nextState, rew -0.025, done, info 
        
    def render(self):
        self.currentMDP.print_grid()

    def close(self):
        pass
    
    def action_masks(self):
        # force agent to take optimal action if possible
        if self.load_optimal and self.optimal_seq:
            mask = np.zeros(6)
            mask[self.actionsDict[self.optimal_seq[self.steps]]] = 1
            return mask


        mat = self.currentMDP.get_current_state()
        if np.array_equal(mat[0], mat[1]):
            return np.array([0,0,0,0,0,1])
        
        return self.currentMDP.action_mask()

    # functions bellow are only used for inheritance 
    def get_MDP(self):
        return self.currentMDP

    def get_MDP_name(self):
        return self.nextDir, self.nextType, self.nexti

In [7]:
testEnv = Gridworld(dir = ["data"], load_optimal= True)

In [8]:
testEnv.reset()
testEnv.render()
print(testEnv.action_masks())
testEnv.step(np.argmax(testEnv.action_masks()))
testEnv.render()
testEnv.action_masks()

[[['#' '.' '#' '.']
  ['.' '.' '#' '#']
  ['O' '.' '#' '#']
  ['>' '.' '#' '#']]

 [['#' '.' '#' '.']
  ['.' '.' '#' '#']
  ['O' '.' '#' '#']
  ['.' '>' '#' '#']]]
[1. 0. 0. 0. 0. 0.]
[[['#' '.' '#' '.']
  ['.' '.' '#' '#']
  ['O' '.' '#' '#']
  ['.' '>' '#' '#']]

 [['#' '.' '#' '.']
  ['.' '.' '#' '#']
  ['O' '.' '#' '#']
  ['.' '>' '#' '#']]]


array([0., 0., 0., 0., 0., 1.])

In [9]:
np.random.seed(123)

def test_RL_models(model):
    for dir, num in zip(["data_easy", "data_medium", "data"], [80, 24, 480]):
        print("current data: " + dir)
        totalRew = 0
        valDataset = Gridworld(dir = [dir], type = ["val"], lambda1=0, lambda2=0, load_optimal= False)
        correct, total = 0,num*10
        for task in range(int(total)):
            if task % num == num-1:
                print(f"{(task+1) / total *100} %, running acc: {(correct*100)/(task+1)}, average reward: {totalRew / task}")
            currMDP = valDataset.reset()
            done = False
            steps = 0
            while not done and steps < 50:
                action = model.predict(currMDP, action_masks = valDataset.action_masks(), deterministic = True)[0]
                currMDP, rew, done, _ = valDataset.step(action)
                totalRew += rew
                if rew > 0:
                    correct += 1
                steps += 1
        

In [10]:
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from sb3_contrib.ppo_mask import MaskablePPO
from stable_baselines3.common.env_util import make_vec_env

import torch.nn.functional as F
from torch import nn

class CustomFeatureExtractorTorch(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.Space, features_dim: int = 32):
        super().__init__(observation_space, features_dim)

        self.conv1 = net.conv1
        self.conv2 = net.conv2
        self.conv3 = net.conv3
        

        #additional convolutions


    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = torch.flatten(x, start_dim=1)
        return x

net_arch = [
    32, 16, 8,
    dict(vf = [16, 8, 4], pi = [16, 8, 4])
]

policy_kwargs = dict(
    features_extractor_class = CustomFeatureExtractorTorch,
    net_arch = net_arch
)

In [11]:
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from sb3_contrib.ppo_mask import MaskablePPO
from stable_baselines3.common.env_util import make_vec_env
from sb3_contrib.common.maskable.policies import MaskableActorCriticPolicy

In [12]:
net_arch = [
    128, 64, 16, 8,
    dict(vf = [8, 4, 2], pi = [8, 4, 2])
]

policy_kwargs = dict(
    features_extractor_class = CustomFeatureExtractorTorch,
    net_arch = net_arch
)

In [49]:
FinalEnv = make_vec_env(Gridworld, n_envs= 4, env_kwargs={"lambda1" : 0, "lambda2" : 0, "lambda3": 1, "dir" : ["data_easy", "generated_easy", "data_medium", "generated_med", "data"], "load_optimal" : True})

FinalModel = MaskablePPO(MaskableActorCriticPolicy, FinalEnv, policy_kwargs= policy_kwargs,  verbose = 1, n_steps= 500)

FinalModel.learn(1e5)

Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22.9     |
|    ep_rew_mean     | 0.33     |
| time/              |          |
|    fps             | 132      |
|    iterations      | 1        |
|    time_elapsed    | 15       |
|    total_timesteps | 2000     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 37.8         |
|    ep_rew_mean          | -0.0505      |
| time/                   |              |
|    fps                  | 154          |
|    iterations           | 2            |
|    time_elapsed         | 25           |
|    total_timesteps      | 4000         |
| train/                  |              |
|    approx_kl            | 0.0005245792 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.866       |
|    explained_variance   | -0.28       

<sb3_contrib.ppo_mask.ppo_mask.MaskablePPO at 0x7efe544a69d0>

In [None]:
FinalModel.learn(1e7)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 46.9     |
|    ep_rew_mean     | 3.76     |
| time/              |          |
|    fps             | 688      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2000     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 144         |
|    ep_rew_mean          | -0.269      |
| time/                   |             |
|    fps                  | 471         |
|    iterations           | 2           |
|    time_elapsed         | 8           |
|    total_timesteps      | 4000        |
| train/                  |             |
|    approx_kl            | 0.008506195 |
|    clip_fraction        | 0.0579      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.659      |
|    explained_variance   | 0.542       |
|    learning_rate        | 0.

KeyboardInterrupt: 

In [50]:
test_RL_models(FinalModel)

current data: data_easy
10.0 %, running acc: 78.75, average reward: 0.4895569620253207
20.0 %, running acc: 76.25, average reward: 0.42216981132075326
30.0 %, running acc: 75.83333333333333, average reward: 0.40502092050206223
40.0 %, running acc: 75.9375, average reward: 0.40282131661437687
50.0 %, running acc: 75.5, average reward: 0.3913533834585942
60.0 %, running acc: 75.625, average reward: 0.39284968684754173
70.0 %, running acc: 75.53571428571429, average reward: 0.389713774597434
80.0 %, running acc: 76.40625, average reward: 0.409350547730773
90.0 %, running acc: 75.83333333333333, average reward: 0.39760083449234174
100.0 %, running acc: 74.5, average reward: 0.36921151439302985
current data: data_medium
10.0 %, running acc: 58.333333333333336, average reward: 0.03369565217391344
20.0 %, running acc: 66.66666666666667, average reward: 0.20531914893616826
30.0 %, running acc: 70.83333333333333, average reward: 0.28802816901408473
40.0 %, running acc: 71.875, average reward: 0

KeyboardInterrupt: 

In [51]:
FinalModel.save("RLModel")

In [52]:
testEnv = Gridworld(load_optimal= False)

In [116]:
curr = testEnv.reset()
testEnv.render()

[[['#' 'v' '#' '.']
  ['#' '.' '.' '.']
  ['#' 'O' '#' '#']
  ['#' '.' '.' '.']]

 [['#' '.' '#' '.']
  ['#' '.' '.' '.']
  ['#' '.' '#' '#']
  ['#' 'v' '.' '.']]]


In [121]:
a = FinalModel.predict(curr, action_masks= testEnv.action_masks(), deterministic= True)[0]
curr, _, done, _ = testEnv.step(a)
print(testEnv.action_masks())
testEnv.render()
if done:
    print("finished")

[0 0 0 0 0 1]
[[['#' '.' '#' '.']
  ['#' '.' '.' '.']
  ['#' '.' '#' '#']
  ['#' 'v' '.' '.']]

 [['#' '.' '#' '.']
  ['#' '.' '.' '.']
  ['#' '.' '#' '#']
  ['#' 'v' '.' '.']]]
finished
