# In [1]:
from gym import Env
import gym
import pygame
from gym.spaces import Discrete, Box,Dict
import numpy as np
import random

# In [None]:
class WarehouseAgent():
    """Single-box warehouse grid world in the style of Sokoban.

    The world is a 7x6 grid whose outer ring and a few interior cells are
    walls (value 1 in ``GRID``); every other cell is floor (value 0).  The
    agent has to get the box onto the target cell.

    Actions (see ``_ACTIONLOOKUP``):
        0-3  move the agent up / down / left / right by one cell.
        4    push: if the box is directly adjacent to the agent, shove it
             one cell further away and step into the cell it vacated.

    Positions are ``[row, col]`` integer arrays indexing into ``GRID``.
    """

    def __init__(self):
        # Start configuration; reset() restores these positions.
        self.agent_position = [1, 2]
        self.box_location = [4, 3]
        self.goal_location = [3, 1]

        # Row/column offset applied by each movement action.
        self._action_to_direction = {
            0: np.array([-1, 0]),
            1: np.array([1, 0]),
            2: np.array([0, -1]),
            3: np.array([0, 1]),
        }
        self._ACTIONLOOKUP = {
            0: 'move up',
            1: 'move down',
            2: 'move left',
            3: 'move right',
            4: 'push'
            }

        self.GRID_DIM = np.asarray([7, 6])
        # The outer ring of cells is wall, so the playable area is
        # GRID[1:-1, 1:-1] minus the interior wall cells set below.
        self.GRID = np.zeros(self.GRID_DIM)
        self.GRID[:, [0, -1]] = 1
        self.GRID[[0, -1], :] = 1
        self.GRID[[1, 2, 5], 3:5] = 1
        self.walls = 1

        self.action_space = Discrete(len(self._ACTIONLOOKUP))
        self.state_space = Discrete(self.GRID_DIM[0] * self.GRID_DIM[1])

        def cell_space():
            # Any [row, col] pair inside the grid, borders included.
            return Box(
                np.array([0, 0]),
                np.array([self.GRID_DIM[0] - 1, self.GRID_DIM[1] - 1]),
                shape=(2,),
                dtype=int,
            )

        self.observation_space = Dict(
            {
                "agent": cell_space(),
                'box': cell_space(),
                "target": cell_space(),
            })

        self._agent_location = np.array(self.agent_position)
        self._box_location = np.array(self.box_location)
        self._target_location = np.array(self.goal_location)

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done, info)``.

        Actions below 4 are movement actions; any other value is handled
        as a push.
        """
        self._prev_agent_location = None
        self._prev_box_location = None

        if action < 4:
            self._move(action)
        else:
            self._push(action)

        done, reward = self.is_over()
        return self._get_obs(), reward, done, self._get_info()

    def render(self):
        """Return the grid cast to one-character strings, with the agent,
        box and target cells overwritten by 'A', 'B' and 'T'.

        The target is written last, so it wins when positions coincide.
        """
        rend = self.GRID.copy().astype(dtype='U1')
        rend[self._agent_location[0], self._agent_location[1]] = 'A'
        rend[self._box_location[0], self._box_location[1]] = 'B'
        rend[self._target_location[0], self._target_location[1]] = 'T'
        return rend

    def reset(self, seed=None, return_info=False, options=None):
        """Restore the start positions and return the first observation.

        ``seed`` and ``options`` are accepted but not used.  With
        ``return_info=True`` the result is ``(observation, info)``.
        """
        self._agent_location = np.array(self.agent_position)
        self._box_location = np.array(self.box_location)
        self._target_location = np.array(self.goal_location)

        observation = self._get_obs()
        info = self._get_info()
        return (observation, info) if return_info else observation

    def _state_in_seq(self):
        """Return the agent's cell as a row-major flat index into GRID."""
        row, col = self._agent_location
        return row * self.GRID.shape[1] + col

    def _get_obs(self):
        """Return the agent, box and target positions as a dict.

        Copies are handed out so that a caller editing an observation in
        place cannot change the environment's own state.
        """
        return {
            "agent": self._agent_location.copy(),
            "box": self._box_location.copy(),
            "target": self._target_location.copy(),
        }

    def _get_info(self):
        """Return the Manhattan (L1) distance from the box to the target."""
        return {'distance': np.linalg.norm(self._box_location - self._target_location, ord=1)}

    def _is_wall(self, cell):
        """Return True when the ``[row, col]`` cell holds a wall."""
        return bool(self.GRID[cell[0], cell[1]] == 1)

    def _push(self, action):
        """Push the box one cell away from the agent, if possible.

        The push direction is not taken from ``action``; it is whichever
        movement direction points from the agent to the box.  Nothing
        happens when the box is not directly adjacent or the cell behind
        it is a wall.

        Returns:
            ``(moved_player, moved_box)`` - both True on a successful push,
            both False otherwise.
        """
        offset = self._box_location - self._agent_location
        direction = next(
            (d for d in self._action_to_direction.values() if np.array_equal(offset, d)),
            None,
        )
        if direction is None:
            # Box is not in one of the four neighbouring cells.
            return False, False

        self._prev_agent_location = self._agent_location
        self._prev_box_location = self._box_location

        pushed_to = self._box_location + direction
        if self._is_wall(pushed_to):
            return False, False

        self._box_location = pushed_to
        self._agent_location = self._agent_location + direction
        return True, True

    def _move(self, action):
        """Move the agent one cell in the direction of ``action``.

        The move is rejected when the destination is a wall or the box.

        Returns:
            True if the agent moved, False if it stayed in place.
        """
        self._prev_agent_location = self._agent_location
        self._prev_box_location = self._box_location

        destination = self._agent_location + self._action_to_direction[action]
        if self._is_wall(destination) or np.array_equal(destination, self._box_location):
            return False

        self._agent_location = destination
        return True

    def is_over(self):
        """Return ``(done, reward)`` for the current state.

        * Box on the target: ``(True, 0)``.
        * Box stuck in a corner: ``(True, -1)``.
        * Otherwise: ``(False, -1)``.

        A corner means a wall next to the box on the vertical axis AND a
        wall next to it on the horizontal axis.  Walls on opposite sides
        only (a corridor) do not end the episode, because the box can still
        be pushed along the corridor.
        """
        if np.array_equal(self._box_location, self._target_location):
            return True, 0

        directions = list(self._action_to_direction.values())
        blocked_vertically = any(
            self._is_wall(self._box_location + d) for d in directions if d[0] != 0
        )
        blocked_horizontally = any(
            self._is_wall(self._box_location + d) for d in directions if d[1] != 0
        )
        return (blocked_vertically and blocked_horizontally), -1
            
                
            
        
