Should be final

In [1]:
from gym import Env
import gym
# import pygame
from gym.spaces import Discrete, Box, Dict
import numpy as np
import random

In [2]:
class WarehouseAgent:
    def __init__(self):
        self.GRID_DIM = [7, 6]

        self.agent_position = [1, 2]

        self.box_location = [4, 3]
        self.goal_location = [3, 1]
        self._action_to_direction = {
            0: np.array([-1, 0]),
            1: np.array([1, 0]),
            2: np.array([0, -1]),
            3: np.array([0, 1]),
        }
        self._ACTIONLOOKUP = {
            0: "move up",
            1: "move down",
            2: "move left",
            3: "move right",
            4: "push",
        }
        self.GRID_DIM = np.asarray(self.GRID_DIM)
        self.GRID = np.zeros(
            self.GRID_DIM
        )  # The Boundaries are the walls, so playing space is only [:-2,:-2]
        self.GRID[:, [0, -1]] = 1
        self.GRID[[0, -1], :] = 1
        self.GRID[[1, 2, 5], 3:5] = 1
        self.walls = 1
        self.action_space = Discrete(len(self._ACTIONLOOKUP.keys()))
        self.state_space = Discrete(self.GRID_DIM[0] * self.GRID_DIM[1])
        self.observation_space = Dict(
            {
                "agent": Box(
                    np.array([0, 0]),
                    np.array([self.GRID_DIM[0] - 1, self.GRID_DIM[1] - 1]),
                    shape=(2,),
                    dtype=int,
                ),
                "box": Box(
                    np.array([0, 0]),
                    np.array([self.GRID_DIM[0] - 1, self.GRID_DIM[1] - 1]),
                    shape=(2,),
                    dtype=int,
                ),
                "target": Box(
                    np.array([0, 0]),
                    np.array([self.GRID_DIM[0] - 1, self.GRID_DIM[1] - 1]),
                    shape=(2,),
                    dtype=int,
                ),
            }
        )
        self._agent_location = np.array(self.agent_position)
        self._box_location = np.array(self.box_location)
        self._target_location = np.array(self.goal_location)

    #         print(self.GRID)

    def step(self, action):
        self._prev_agent_location = None
        self._prev_box_location = None
        moved_box = False

        if action < 4:
            moved_player = self._move(action)
        else:
            moved_player, moved_box = self._push(action)

        done, reward = self.is_over()
        observation = self._get_obs()
        info = self._get_info()

        return observation, reward, done, info

    def render(self):
        rend = self.GRID.copy().astype(dtype="U1")
        rend[self._agent_location[0], self._agent_location[1]] = "A"
        rend[self._box_location[0], self._box_location[1]] = "B"
        rend[self._target_location[0], self._target_location[1]] = "T"
        if np.array_equal(self._target_location, self._box_location):
            rend[self._target_location[0], self._target_location[1]] = "D"
        return print(rend)

    def reset(self, seed=None, return_info=False, options=None):
        self._agent_location = np.array(self.agent_position)
        self._box_location = np.array(self.box_location)
        self._target_location = np.array(self.goal_location)

        observation = self._get_obs()
        info = self._get_info()
        return (observation, info) if return_info else observation

    def _get_obs(self):
        return {
            "agent": self._agent_location,
            "box": self._box_location,
            "target": self._target_location,
        }

    def _get_info(self):
        return {
            "distance": np.linalg.norm(
                self._box_location - self._target_location, ord=1
            )
        }

    def _state_in_seq(self):
        m, n = self._agent_location
        seq = m * self.GRID.shape[1] + n
        return seq

    def _push(self, action):
        loc = self._box_location - self._agent_location
        #         print(f'loc{loc}, box :{self._box_location}, agent:{self._agent_location}')
        push_dir = None
        for idx, val in enumerate(self._action_to_direction.values()):
            if np.array_equal(loc, val):
                valid = True
                push_dir = idx
                break
            else:
                valid = False

        if valid:
            self._prev_agent_location = self._agent_location
            self._prev_box_location = self._box_location
            self._box_location = (
                self._box_location + self._action_to_direction[push_dir]
            )
            if self.GRID[self._box_location[0], self._box_location[1]] == 1:
                self._box_location = self._prev_box_location
                return False, False
            else:
                self._agent_location = (
                    self._agent_location + self._action_to_direction[push_dir]
                )
                return True, True

        return False, False

    def _move(self, action):
        self._prev_agent_location = self._agent_location
        self._prev_box_location = self._box_location
        self._agent_location = self._agent_location + self._action_to_direction[action]
        #             print(self.GRID[self._agent_location],self._agent_location,self.GRID)
        if self.GRID[self._agent_location[0], self._agent_location[1]] == 1:
            self._agent_location = self._prev_agent_location
            return False
        elif np.array_equal(self._agent_location, self._box_location):
            self._agent_location = self._prev_agent_location
            return False
        return True

    def is_over(self):
        if np.array_equal(
            self._box_location, self._target_location
        ):  # checking if the box is at the target already
            done = True
            reward = 0
        elif (
            sum(
                a := np.array(
                    [
                        True
                        if self.GRID[
                            (self._box_location + val)[0], (self._box_location + val)[1]
                        ]
                        == 1
                        else False
                        for val in self._action_to_direction.values()
                    ]
                )
            )
            >= 1
        ):
            # basically checking if there are atleast 1 wall adjacent to box
            if sum(a) > 1:
                done = True
                reward = -1
            elif sum(a) == 1:
                if ~(self._box_location - self._target_location).all():
                    done = False
                    reward = -1
                    return done, reward
                else:
                    #                 print(a)
                    direc = np.where(a == True)
                    #                 print(direc)
                    direc = direc[0][0]
                    left = self._box_location + self._action_to_direction[direc]
                    right = left.copy()
                    if direc in [0, 1]:
                        count = 0
                        while (self.GRID[left[0], left[1]] != 0) and (
                            self.GRID[right[0], right[1]] != 0
                        ):

                            left = np.clip(
                                left + self._action_to_direction[2],
                                [0, 0],
                                [self.GRID_DIM[0] - 1, self.GRID_DIM[1] - 1],
                            )
                            right = np.clip(
                                right + self._action_to_direction[3],
                                [0, 0],
                                [self.GRID_DIM[0] - 1, self.GRID_DIM[1] - 1],
                            )
                            count += 1
                            if count >= self.GRID_DIM[1]:
                                done = True
                                reward = -1
                                return done, reward
                                break
                    #                         right = right + self._action_to_direction[3]

                    else:
                        count = 0
                        while (self.GRID[left[0], left[1]] != 0) and (
                            self.GRID[right[0], right[1]] != 0
                        ):
                            left = np.clip(
                                left + self._action_to_direction[1],
                                [0, 0],
                                [self.GRID_DIM[0] - 1, self.GRID_DIM[1] - 1],
                            )
                            right = np.clip(
                                right + self._action_to_direction[0],
                                [0, 0],
                                [self.GRID_DIM[0] - 1, self.GRID_DIM[1] - 1],
                            )
                            count += 1
                            if count >= self.GRID_DIM[0]:
                                done = True
                                reward = -1
                                return done, reward
                                break

                    done = False
                    reward = -1
                    return done, reward
        #         np.where([True if self.GRID[(self._box_location + val)[0], (self._box_location + val)[1] ] == 1 else False for val in self._action_to_direction.values() ] == True)[0][0]: # gotta check if the box is not adjacent to 2 walls but still is terminating state like the boundary walls
        else:
            done = False
            reward = -1
        return done, reward

In [3]:
env  = WarehouseAgent()

In [4]:
env.state_space.n

42

In [67]:
def ep_soft(env):
    policy = np.ones([env.state_space.n,env.action_space.n])/5
    return policy
    
def gen_epsiode(env,policy):
    env.reset()
    done = False
    episode = []
    count = 0
    while not done and count<100:
        s = env._state_in_seq()
        prob = policy[s]
        action = np.random.choice(range(env.action_space.n),p=prob)
        observation, reward, done ,_ = env.step(action)
        episode.append([s,action,reward])
        count+=1
    return episode
def visit_to_s(env,episode,Returns):
    visited = []
    R = None
    for ind,s in enumerate(episode):
        if s not in visited:
            act = s[1]
            for p in episode[ind:]:
                R+=p[2]
            Returns[ind,act] = R
        else:
            continue
    return Returns
def Q_val(Returns):
    Q = np.average(Returns,axis=2)
    return Q
            
def MC(env,ep = 0.1):
    Q = np.zeros([env.state_space.n,env.action_space.n])
    Returns = np.zeros([env.state_space.n,env.action_space.n])
    policy = ep_soft(env)
    for k in range(1):
        episode = gen_epsiode(env,policy)
        print(len(episode))
        Returns  = visit_to_s(env,episode,Returns)
        Q = Q_val(Returns)
        for ind,step in enumerate(episode):
            a_star = np.argmax(Q,axis = 1)
            for a in range(env.action_space.n):
                if a == a_star:
                    policy[step[0],step[a]] = (1 - ep) + (ep/env.action_space.n)
                else:
                    policy[step[0],step[a]] = (ep/env.action_space.n)
            
    
    return policy


In [68]:
env = WarehouseAgent()

In [69]:
# policy = ep_soft(env)
# gen_epsiode(env,policy)
MC(env)

100


TypeError: unsupported operand type(s) for +=: 'NoneType' and 'int'

In [23]:
np.average(np.zeros([env.state_space.n,env.action_space.n])+1,axis=1)
# np.zeros([env._state_in_seq(),env.action_space.n])+1
# env.state_space.n
a = np.array([[[2,2],[1,1]],[[2,2],[5,10]]])
a

array([[[ 2,  2],
        [ 1,  1]],

       [[ 2,  2],
        [ 5, 10]]])

In [28]:
np.argmax(np.average(a,axis=2),axis = 1)

array([0, 1], dtype=int64)

In [59]:
np.random.choice([1,2,3,5],p = [0.1,0.5,0.3,0.1])

3