In [1]:
import numpy as np
from gym import Env
import gym
from gym.spaces import Discrete, Box, Dict
import random

In [2]:
class WarehouseAgent:
    def __init__(self):
        self.GRID_DIM = [7, 6]

        self.agent_position = [1, 2]

        self.box_location = [4, 3]
        self.goal_location = [3, 1]
        self._action_to_direction = {
            0: np.array([-1, 0]),
            1: np.array([1, 0]),
            2: np.array([0, -1]),
            3: np.array([0, 1]),
        }
        self._ACTIONLOOKUP = {
            0: "move up",
            1: "move down",
            2: "move left",
            3: "move right",
            4: "push",
        }
        self.GRID_DIM = np.asarray(self.GRID_DIM)
        self.GRID = np.zeros(
            self.GRID_DIM
        )  # The Boundaries are the walls, so playing space is only [:-2,:-2]
        self.GRID[:, [0, -1]] = 1
        self.GRID[[0, -1], :] = 1
        self.GRID[[1, 2, 5], 3:5] = 1
        self.walls = 1
        self.action_space = Discrete(len(self._ACTIONLOOKUP.keys()))
        self.state_space = Discrete(self.GRID_DIM[0] * self.GRID_DIM[1])
        self.observation_space = Dict(
            {
                "agent": Box(
                    np.array([0, 0]),
                    np.array([self.GRID_DIM[0] - 1, self.GRID_DIM[1] - 1]),
                    shape=(2,),
                    dtype=int,
                ),
                "box": Box(
                    np.array([0, 0]),
                    np.array([self.GRID_DIM[0] - 1, self.GRID_DIM[1] - 1]),
                    shape=(2,),
                    dtype=int,
                ),
                "target": Box(
                    np.array([0, 0]),
                    np.array([self.GRID_DIM[0] - 1, self.GRID_DIM[1] - 1]),
                    shape=(2,),
                    dtype=int,
                ),
            }
        )
        self._agent_location = np.array(self.agent_position)
        self._box_location = np.array(self.box_location)
        self._target_location = np.array(self.goal_location)

    #         print(self.GRID)

    def step(self, action):
        self._prev_agent_location = None
        self._prev_box_location = None
        moved_box = False

        if action < 4:
            moved_player = self._move(action)
        else:
            moved_player, moved_box = self._push(action)

        done, reward = self.is_over()
        observation = self._get_obs()
        info = self._get_info()

        return observation, reward, done, info

    def render(self):
        rend = self.GRID.copy().astype(dtype="U1")
        rend[self._agent_location[0], self._agent_location[1]] = "A"
        rend[self._box_location[0], self._box_location[1]] = "B"
        rend[self._target_location[0], self._target_location[1]] = "T"
        if np.array_equal(self._target_location, self._box_location):
            rend[self._target_location[0], self._target_location[1]] = "D"
        return print(rend)

    def reset(self, seed=None, return_info=False, options=None):
        self._agent_location = np.array(self.agent_position)
        self._box_location = np.array(self.box_location)
        self._target_location = np.array(self.goal_location)

        observation = self._get_obs()
        info = self._get_info()
        return (observation, info) if return_info else observation

    def _get_obs(self):
        return {
            "agent": self._agent_location,
            "box": self._box_location,
            "target": self._target_location,
        }

    def _get_info(self):
        return {
            "distance": np.linalg.norm(
                self._box_location - self._target_location, ord=1
            )
        }

    def _state_in_seq(self):
        m, n = self._agent_location
        seq = m * self.GRID.shape[1] + n
        return seq

    def _push(self, action):
        loc = self._box_location - self._agent_location
        #         print(f'loc{loc}, box :{self._box_location}, agent:{self._agent_location}')
        push_dir = None
        for idx, val in enumerate(self._action_to_direction.values()):
            if np.array_equal(loc, val):
                valid = True
                push_dir = idx
                break
            else:
                valid = False

        if valid:
            self._prev_agent_location = self._agent_location
            self._prev_box_location = self._box_location
            self._box_location = (
                self._box_location + self._action_to_direction[push_dir]
            )
            if self.GRID[self._box_location[0], self._box_location[1]] == 1:
                self._box_location = self._prev_box_location
                return False, False
            else:
                self._agent_location = (
                    self._agent_location + self._action_to_direction[push_dir]
                )
                return True, True

        return False, False

    def _move(self, action):
        self._prev_agent_location = self._agent_location
        self._prev_box_location = self._box_location
        self._agent_location = self._agent_location + self._action_to_direction[action]
        #             print(self.GRID[self._agent_location],self._agent_location,self.GRID)
        if self.GRID[self._agent_location[0], self._agent_location[1]] == 1:
            self._agent_location = self._prev_agent_location
            return False
        elif np.array_equal(self._agent_location, self._box_location):
            self._agent_location = self._prev_agent_location
            return False
        return True

    def is_over(self):
        if np.array_equal(
            self._box_location, self._target_location
        ):  # checking if the box is at the target already
            done = True
            reward = 0
        elif (
            sum(
                a := np.array(
                    [
                        True
                        if self.GRID[
                            (self._box_location + val)[0], (self._box_location + val)[1]
                        ]
                        == 1
                        else False
                        for val in self._action_to_direction.values()
                    ]
                )
            )
            >= 1
        ):
            # basically checking if there are atleast 1 wall adjacent to box
            if sum(a) > 1:
                done = True
                reward = -1
            elif sum(a) == 1:
                if ~(self._box_location - self._target_location).all():
                    done = False
                    reward = -1
                    return done, reward
                else:
                    #                 print(a)
                    direc = np.where(a == True)
                    #                 print(direc)
                    direc = direc[0][0]
                    left = self._box_location + self._action_to_direction[direc]
                    right = left.copy()
                    if direc in [0, 1]:
                        count = 0
                        while (self.GRID[left[0], left[1]] != 0) and (
                            self.GRID[right[0], right[1]] != 0
                        ):

                            left = np.clip(
                                left + self._action_to_direction[2],
                                [0, 0],
                                [self.GRID_DIM[0] - 1, self.GRID_DIM[1] - 1],
                            )
                            right = np.clip(
                                right + self._action_to_direction[3],
                                [0, 0],
                                [self.GRID_DIM[0] - 1, self.GRID_DIM[1] - 1],
                            )
                            count += 1
                            if count >= self.GRID_DIM[1]:
                                done = True
                                reward = -1
                                return done, reward
                                break
                    #                         right = right + self._action_to_direction[3]

                    else:
                        count = 0
                        while (self.GRID[left[0], left[1]] != 0) and (
                            self.GRID[right[0], right[1]] != 0
                        ):
                            left = np.clip(
                                left + self._action_to_direction[1],
                                [0, 0],
                                [self.GRID_DIM[0] - 1, self.GRID_DIM[1] - 1],
                            )
                            right = np.clip(
                                right + self._action_to_direction[0],
                                [0, 0],
                                [self.GRID_DIM[0] - 1, self.GRID_DIM[1] - 1],
                            )
                            count += 1
                            if count >= self.GRID_DIM[0]:
                                done = True
                                reward = -1
                                return done, reward
                                break

                    done = False
                    reward = -1
                    return done, reward
        #         np.where([True if self.GRID[(self._box_location + val)[0], (self._box_location + val)[1] ] == 1 else False for val in self._action_to_direction.values() ] == True)[0][0]: # gotta check if the box is not adjacent to 2 walls but still is terminating state like the boundary walls
        else:
            done = False
            reward = -1
        return done, reward

  and should_run_async(code)


In [3]:
env = WarehouseAgent()

In [4]:
# actions = ['a1','a2','a3','a4']     # 4 actions
# states = np.array([[1,2,3,4,5,6],  # 6by6 grid  => 36 states
#                  [7,8,9,10,11,12],
#                  [13,14,15,16,17,18],
#                  [19,20,21,22,23,24],
#                  [25,26,27,28,29,30],
#                  [31,32,33,34,35,36]])


# #total_actions = len(actions)
# states_size = states.shape[0]*states.shape[1]
# print(total_actions,states_size)

In [5]:
#Defining the hyper parameters
alpha = 0.8
gamma = 0.9
epsilon = 0.9
total_episodes = 10
max_steps = 10000

#Initializing the Q-table with 0
Q = np.ones((env.state_space.n,env.action_space.n))  # (total no. of states * total no. of actions)

In [6]:
#Function to choose the next action from a state
def choose_action(state):
    action=0
    # epsilon greedy
    p = np.random.random()
    if p < epsilon:  # choose random action among 4 actions
        action =  np.random.randint(Q.shape[1]) # action index           
    else:
        action = np.argmax(Q[state, :])  # index of maximum action-state value for a state
    
    return action
    
# Function to update the Q-table
def Qupdate(initial_state, initial_action, next_state, reward, next_action,Q):
    Q[initial_state, initial_action] = Q[initial_state, initial_action] + alpha*(reward + (gamma * Q[next_state, next_action])- Q[initial_state, initial_action])

In [7]:
#Initializing the reward
episodes_reward = []  # total sum of rewards in a episode, for each episodes

# Starting the SARSA learning
for episode in range(total_episodes):
    #env.reset()
    initial_state = env._state_in_seq()    # initial state i.e reset to intial state for each episode  # env.reset()
    initial_action = choose_action(initial_state)  # initial action for this initial_state
    total_reward = 0
    
    #done = False        # done is for terminated or not
    step = 0
    
    # loop in the episode until the environment not terminated
    while step < max_steps:   # if we put limit on no. of steps instead of termination or not  while not done and 
        
        # Getting the next state,reward
        observation, reward, done, info = env.step(initial_action)   # Take one step in the environment
        next_state = env._state_in_seq()

        # Choosing the next action
        next_action = choose_action(next_state)
        
        # Q table update
        Qupdate(initial_state, initial_action, next_state, reward, next_action,Q)

        initial_state = next_state
        initial_action = next_action
        
        total_reward += reward
        step = step+1
        
    # At the end of learning process i.e termination
    episodes_reward.append(total_reward)        
    #print(Q)

In [8]:
step

10000

In [9]:
act = np.argmax(Q,axis = 1)

In [10]:
print(act)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0
 0 0 0 0 0]


In [11]:
len(act)

42

In [12]:
env.reset()

{'agent': array([1, 2]), 'box': array([4, 3]), 'target': array([3, 1])}

In [13]:
env._state_in_seq()

8

In [14]:
for a in act[8:]:
    print(env.step(a))

({'agent': array([1, 2]), 'box': array([4, 3]), 'target': array([3, 1])}, -1, False, {'distance': 3.0})
({'agent': array([1, 2]), 'box': array([4, 3]), 'target': array([3, 1])}, -1, False, {'distance': 3.0})
({'agent': array([1, 2]), 'box': array([4, 3]), 'target': array([3, 1])}, -1, False, {'distance': 3.0})
({'agent': array([1, 2]), 'box': array([4, 3]), 'target': array([3, 1])}, -1, False, {'distance': 3.0})
({'agent': array([1, 2]), 'box': array([4, 3]), 'target': array([3, 1])}, -1, False, {'distance': 3.0})
({'agent': array([1, 2]), 'box': array([4, 3]), 'target': array([3, 1])}, -1, False, {'distance': 3.0})
({'agent': array([1, 2]), 'box': array([4, 3]), 'target': array([3, 1])}, -1, False, {'distance': 3.0})
({'agent': array([1, 2]), 'box': array([4, 3]), 'target': array([3, 1])}, -1, False, {'distance': 3.0})
({'agent': array([1, 2]), 'box': array([4, 3]), 'target': array([3, 1])}, -1, False, {'distance': 3.0})
({'agent': array([1, 2]), 'box': array([4, 3]), 'target': array(

In [15]:
Q

array([[  1.        ,   1.        ,   1.        ,   1.        ,
          1.        ],
       [  1.        ,   1.        ,   1.        ,   1.        ,
          1.        ],
       [  1.        ,   1.        ,   1.        ,   1.        ,
          1.        ],
       [  1.        ,   1.        ,   1.        ,   1.        ,
          1.        ],
       [  1.        ,   1.        ,   1.        ,   1.        ,
          1.        ],
       [  1.        ,   1.        ,   1.        ,   1.        ,
          1.        ],
       [  1.        ,   1.        ,   1.        ,   1.        ,
          1.        ],
       [-10.        , -10.        , -10.        , -10.        ,
        -10.        ],
       [-10.        , -10.        , -10.        , -10.        ,
        -10.        ],
       [  1.        ,   1.        ,   1.        ,   1.        ,
          1.        ],
       [  1.        ,   1.        ,   1.        ,   1.        ,
          1.        ],
       [  1.        ,   1.        ,   1.   