In [1]:
from gym import Env
import gym
import pygame
from gym.spaces import Discrete, Box,Dict
import numpy as np
import random

In [48]:
class WarehouseAgent():
    def __init__(self):
        self.GRID_DIM = [7,6]

        self.agent_position = [1,2]

        self.box_location = [4,3]
        self.goal_location = [3,1]
        self._action_to_direction = {
            0: np.array([-1, 0]),
            1: np.array([1, 0]),
            2: np.array([0, -1]),
            3: np.array([0, 1]),
        }
        self._ACTIONLOOKUP = {
            0: 'move up',
            1: 'move down',
            2: 'move left',
            3: 'move right',
            4: 'push'
            }
        self.GRID_DIM = np.asarray(self.GRID_DIM)
        self.GRID = np.zeros(self.GRID_DIM ) # The Boundaries are the walls, so playing space is only [:-2,:-2] 
        self.GRID[:,[0,-1]] = 1
        self.GRID[[0,-1],:] = 1
        self.GRID[[1,2,5],3:5] = 1
        self.walls = 1
        self.action_space = Discrete(len(self._ACTIONLOOKUP.keys()))
        self.state_space = Discrete(self.GRID_DIM[0]*self.GRID_DIM[1])
        self.observation_space = Dict(
            {
                "agent": Box(np.array([0,0]), np.array([self.GRID_DIM[0]-1,self.GRID_DIM[1] - 1]), shape=(2,), dtype=int),
                'box' : Box( np.array([0,0]), np.array([self.GRID_DIM[0]-1,self.GRID_DIM[1] - 1]), shape=(2,), dtype=int),
                "target": Box( np.array([0,0]), np.array([self.GRID_DIM[0]-1,self.GRID_DIM[1] - 1]), shape=(2,), dtype=int),
            })
        self._agent_location = np.array(self.agent_position)
        self._box_location = np.array(self.box_location)
        self._target_location = np.array(self.goal_location) 
            
#         print(self.GRID)
     

    
    def step(self, action):
        self._prev_agent_location = None
        self._prev_box_location = None
        moved_box = False

        if action<4:
            moved_player = self._move(action)
        else:
            moved_player, moved_box = self._push(action)
            
        done, reward = self.is_over()            
        observation = self._get_obs()
        info = self._get_info()
        
        return observation, reward, done, info      
        
            
            

        
    
    def render(self):
        rend = self.GRID.copy().astype(dtype='U1')
        rend[self._agent_location[0],self._agent_location[1]] = 'A'
        rend[self._box_location[0],self._box_location[1]] = 'B'
        rend[self._target_location[0],self._target_location[1]] = 'T'
        return rend
        

    def reset(self,seed = None, return_info = False, options = None):
        self._agent_location = np.array(self.agent_position)
        self._box_location = np.array(self.box_location)
        self._target_location = np.array(self.goal_location)
        
        observation = self._get_obs()
        info = self._get_info()
        return (observation, info) if return_info else observation
        
    
    def _get_obs(self):
        return {"agent":self._agent_location,"box": self._box_location,"target":self._target_location}
    def _get_info(self):
        return {'distance': np.linalg.norm(self._box_location - self._target_location,ord = 1)}
    def _push(self,action):
        loc = self._box_location - self._agent_location
#         print(f'loc{loc}, box :{self._box_location}, agent:{self._agent_location}')
        push_dir = None
        for idx,val in enumerate(self._action_to_direction.values()):
            if np.array_equal(loc,val):
                valid = True
                push_dir = idx
                break
            else :
                valid = False
            
        if valid:
            self._prev_agent_location = self._agent_location
            self._prev_box_location = self._box_location
            self._box_location = self._box_location + self._action_to_direction[push_dir]
            if self.GRID[self._box_location[0],self._box_location[1]] == 1:
                self._box_location = self._prev_box_location
                return False, False
            else:
                self._agent_location = self._agent_location + self._action_to_direction[push_dir]
                return True, True
        
        return False, False
            
    def _move(self,action):
            self._prev_agent_location = self._agent_location
            self._prev_box_location = self._box_location
            self._agent_location = self._agent_location + self._action_to_direction[action]
#             print(self.GRID[self._agent_location],self._agent_location,self.GRID)
            if self.GRID[self._agent_location[0],self._agent_location[1]] == 1:
                self._agent_location = self._prev_agent_location
                return False
            elif np.array_equal(self._agent_location, self._box_location):
                self._agent_location = self._prev_agent_location
                return False
            return True
    def is_over(self):
        if np.array_equal(self._box_location, self._target_location):
            done = True
            reward = 0
        elif sum([True if self.GRID[(self._box_location + val)[0],(self._box_location + val)[1]] == 1 else False for val in self._action_to_direction.values()])>1 :
            done = True
            reward = -1
        else: 
            done = False
            reward = -1
        return done , reward
            
                
            
        


In [49]:
env = WarehouseAgent()

In [50]:
env._get_obs()

{'agent': array([1, 2]), 'box': array([4, 3]), 'target': array([3, 1])}

In [51]:
env.step(1)
env.render()

array([['1', '1', '1', '1', '1', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', '0', 'A', '1', '1', '1'],
       ['1', 'T', '0', '0', '0', '1'],
       ['1', '0', '0', 'B', '0', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', '1', '1', '1', '1', '1']], dtype='<U1')

In [52]:
env.step(1)
env.render()

array([['1', '1', '1', '1', '1', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', 'T', 'A', '0', '0', '1'],
       ['1', '0', '0', 'B', '0', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', '1', '1', '1', '1', '1']], dtype='<U1')

In [53]:
env.step(3)
env.render()

array([['1', '1', '1', '1', '1', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', 'T', '0', 'A', '0', '1'],
       ['1', '0', '0', 'B', '0', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', '1', '1', '1', '1', '1']], dtype='<U1')

In [54]:
env.step(3)
env.render()

array([['1', '1', '1', '1', '1', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', 'T', '0', '0', 'A', '1'],
       ['1', '0', '0', 'B', '0', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', '1', '1', '1', '1', '1']], dtype='<U1')

In [55]:
env.step(1)
env.render()

array([['1', '1', '1', '1', '1', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', 'T', '0', '0', '0', '1'],
       ['1', '0', '0', 'B', 'A', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', '1', '1', '1', '1', '1']], dtype='<U1')

In [56]:
env.step(4)
env.render()

array([['1', '1', '1', '1', '1', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', 'T', '0', '0', '0', '1'],
       ['1', '0', 'B', 'A', '0', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', '1', '1', '1', '1', '1']], dtype='<U1')

In [57]:
env.step(4)
env.render()

array([['1', '1', '1', '1', '1', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', 'T', '0', '0', '0', '1'],
       ['1', 'B', 'A', '0', '0', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', '1', '1', '1', '1', '1']], dtype='<U1')

In [58]:
env.step(1)
env.render()

array([['1', '1', '1', '1', '1', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', 'T', '0', '0', '0', '1'],
       ['1', 'B', '0', '0', '0', '1'],
       ['1', '0', 'A', '1', '1', '1'],
       ['1', '1', '1', '1', '1', '1']], dtype='<U1')

In [59]:
env.step(2)
env.render()

array([['1', '1', '1', '1', '1', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', 'T', '0', '0', '0', '1'],
       ['1', 'B', '0', '0', '0', '1'],
       ['1', 'A', '0', '1', '1', '1'],
       ['1', '1', '1', '1', '1', '1']], dtype='<U1')

In [60]:
env.step(4)
env.render()

array([['1', '1', '1', '1', '1', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', 'T', '0', '0', '0', '1'],
       ['1', 'A', '0', '0', '0', '1'],
       ['1', '0', '0', '1', '1', '1'],
       ['1', '1', '1', '1', '1', '1']], dtype='<U1')

In [61]:
env._get_obs()

{'agent': array([4, 1]), 'box': array([3, 1]), 'target': array([3, 1])}

In [62]:
env.reset()
env._get_obs()

{'agent': array([1, 2]), 'box': array([4, 3]), 'target': array([3, 1])}

In [82]:
ep = 1
for eps in range(ep):
    rw = 0
    
    for i in range(1000):
        act = np.random.randint(0,5)
#         print(act)
        observation, reward, done, info = env.step(act)
        rw = rw + reward
#         print(act)
    print(rw)
    print(env.render())
    env.reset()

-1000
[['1' '1' '1' '1' '1' '1']
 ['1' '0' '0' '1' '1' '1']
 ['1' '0' '0' '1' '1' '1']
 ['1' 'T' 'A' '0' '0' '1']
 ['1' '0' '0' '0' 'B' '1']
 ['1' '0' '0' '1' '1' '1']
 ['1' '1' '1' '1' '1' '1']]


In [64]:
np.random.randint(5)

2

In [86]:
def mc_control_epsilon_greedy(env, num_episodes, discount_factor=1.0, epsilon=0.1):
    """
    Monte Carlo Control using Epsilon-Greedy policies.
    Finds an optimal epsilon-greedy policy.
 
    Args:
        env: OpenAI gym environment.
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor.
        epsilon: Chance the sample a random action. Float betwen 0 and 1.
 
    Returns:
        A tuple (Q, policy).
        Q is a dictionary mapping state to action values.
        policy is a function that takes an observation as an argument and returns
        action probabilities
    """
 
    # Keeps track of sum and count of returns for each state
    # to calculate an average. We could use an array to save all
    # returns (like in the book) but that's memory inefficient.
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)
 
    # The final action-value function.
    # A nested dictionary that maps state to (action to action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    # A nested dictionary that maps state to (action to number of times state-action pair was encountered).
    N = defaultdict(lambda: np.zeros(env.action_space.n))
    iterations = 0
    # policy improvement: this function holds a reference to the Q_values
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)
    while iterations < num_episodes:
        done = False
        episode = []
        visited_states = {}
        s = env.reset()
        while not done:
            # choose an action based on a probability dist generated by policy(), epsilon/ |A(s)| chance of random action
            action = np.random.choice(range(env.action_space.n), p=policy(s))
            new_s, r, done, _ = env.step(action)
            episode.append((s, action, r))
        for state,action,reward in episode[::-1]:
            # first-visit monte carlo update
            if state not in visited_states:
                N[state][action] += 1
                # incremental update of Q value is more memory efficient than simply keeping a record of all rewards
                # and averaging after every new reward
                Q[state][action] += discount_factor * ( 1./ N[state][action] ) * (reward - Q[state][action])
                visited_states.add(state)
 
        iterations += 1
 
    return Q, policy

In [87]:
from collections import defaultdict
env = WarehouseAgent()
mc_control_epsilon_greedy(env,100,)

NameError: name 'make_epsilon_greedy_policy' is not defined