In [7]:
import numpy as np
from gym import Env
import gym
from gym.spaces import Discrete, Box, Dict
import random

In [27]:
class WarehouseAgent:
    """Grid-world warehouse in which an agent has to push one box onto a target.

    The world is a 7x6 array (``GRID``) where 1 marks a wall and 0 a free cell.
    Actions 0-3 move the agent up/down/left/right; action 4 pushes the box when
    it sits directly next to the agent.  The class offers gym-style ``reset``,
    ``step`` and ``render`` methods together with gym space descriptions.

    Rewards (see ``is_over``): 0 when the box reaches the target, -100 when the
    box is judged to be stuck, -1 for every other step.
    """

    def __init__(self):
        """Build the fixed grid, the action tables, the gym spaces and the start state."""
        self.GRID_DIM = np.asarray([7, 6])  # (rows, columns), outer wall included

        # Start configuration; reset() restores these positions.
        self.agent_position = [1, 2]
        self.box_location = [4, 3]
        self.goal_location = [3, 1]

        # Row/column offset applied by each movement action.
        self._action_to_direction = {
            0: np.array([-1, 0]),
            1: np.array([1, 0]),
            2: np.array([0, -1]),
            3: np.array([0, 1]),
        }
        self._ACTIONLOOKUP = {
            0: "move up",
            1: "move down",
            2: "move left",
            3: "move right",
            4: "push",
        }

        # 1 = wall, 0 = free.  The outermost rows and columns are walls, plus
        # three interior wall segments in rows 1, 2 and 5 (columns 3-4).
        self.GRID = np.zeros(self.GRID_DIM)
        self.GRID[:, [0, -1]] = 1
        self.GRID[[0, -1], :] = 1
        self.GRID[[1, 2, 5], 3:5] = 1
        self.walls = 1

        self.action_space = Discrete(len(self._ACTIONLOOKUP))
        # One state per grid cell; see _state_in_seq for the encoding.
        self.state_space = Discrete(self.GRID_DIM[0] * self.GRID_DIM[1])

        upper = self.GRID_DIM - 1
        self.observation_space = Dict(
            {
                name: Box(np.array([0, 0]), upper.copy(), shape=(2,), dtype=int)
                for name in ("agent", "box", "target")
            }
        )

        self._agent_location = np.array(self.agent_position)
        self._box_location = np.array(self.box_location)
        self._target_location = np.array(self.goal_location)

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done, info)``.

        Raises:
            ValueError: if ``action`` is not one of the keys of ``_ACTIONLOOKUP``.
        """
        if action not in self._ACTIONLOOKUP:
            raise ValueError(
                f"invalid action {action!r}; expected one of {sorted(self._ACTIONLOOKUP)}"
            )

        self._prev_agent_location = None
        self._prev_box_location = None

        if action < 4:
            self._move(action)
        else:
            self._push(action)

        done, reward = self.is_over()
        observation = self._get_obs()
        info = self._get_info()

        return observation, reward, done, info

    def render(self):
        """Print the grid with A (agent), B (box), T (target) or D (box on target).

        Remaining cells show the GRID value cut to one character.
        """
        rend = self.GRID.copy().astype(dtype="U1")
        rend[self._agent_location[0], self._agent_location[1]] = "A"
        rend[self._box_location[0], self._box_location[1]] = "B"
        rend[self._target_location[0], self._target_location[1]] = "T"
        if np.array_equal(self._target_location, self._box_location):
            rend[self._target_location[0], self._target_location[1]] = "D"
        print(rend)

    def reset(self, seed=None, return_info=False, options=None):
        """Restore the start configuration.

        ``seed`` and ``options`` are accepted but not used.  Returns the
        observation, or ``(observation, info)`` when ``return_info`` is true.
        """
        self._agent_location = np.array(self.agent_position)
        self._box_location = np.array(self.box_location)
        self._target_location = np.array(self.goal_location)

        observation = self._get_obs()
        info = self._get_info()
        return (observation, info) if return_info else observation

    def _get_obs(self):
        """Return the current agent, box and target coordinates as a dict."""
        return {
            "agent": self._agent_location,
            "box": self._box_location,
            "target": self._target_location,
        }

    def _get_info(self):
        """Return the L1 distance between box and target under the key ``distance``."""
        return {
            "distance": np.linalg.norm(
                self._box_location - self._target_location, ord=1
            )
        }

    def _state_in_seq(self):
        """Return the row-major index of the agent's cell (the box is not encoded)."""
        row, col = self._agent_location
        return row * self.GRID.shape[1] + col

    def _is_wall(self, cell):
        """Tell whether the grid cell at ``cell`` (row, col) is a wall."""
        return self.GRID[cell[0], cell[1]] == 1

    def _push(self, action):
        """Push the box one cell away from the agent if they are adjacent.

        ``action`` is not inspected; the push direction is derived from the
        relative position of box and agent.  Returns ``(moved_player,
        moved_box)``, which is ``(True, True)`` on success and
        ``(False, False)`` when the box is not adjacent or a wall blocks it.
        """
        offset = self._box_location - self._agent_location
        push_dir = next(
            (
                direction
                for direction, delta in self._action_to_direction.items()
                if np.array_equal(offset, delta)
            ),
            None,
        )
        if push_dir is None:
            return False, False

        delta = self._action_to_direction[push_dir]
        self._prev_agent_location = self._agent_location
        self._prev_box_location = self._box_location

        pushed_box = self._box_location + delta
        if self._is_wall(pushed_box):
            return False, False

        self._box_location = pushed_box
        self._agent_location = self._agent_location + delta
        return True, True

    def _move(self, action):
        """Move the agent one cell; return False when a wall or the box blocks it."""
        self._prev_agent_location = self._agent_location
        self._prev_box_location = self._box_location

        candidate = self._agent_location + self._action_to_direction[action]
        if self._is_wall(candidate) or np.array_equal(candidate, self._box_location):
            return False

        self._agent_location = candidate
        return True

    def _wall_is_unbroken(self, direc):
        """Scan the wall line touching the box on side ``direc`` for a gap.

        Starting at the wall cell next to the box, two cursors walk in opposite
        directions along the wall (clipped to the grid).  The scan stops as soon
        as either cursor reaches a free cell.  If that has not happened after as
        many steps as the grid is long along that axis, the wall is reported as
        unbroken.
        """
        if direc in (0, 1):
            # Wall above/below the box: walk left and right along the row.
            step_a = self._action_to_direction[2]
            step_b = self._action_to_direction[3]
            limit = self.GRID_DIM[1]
        else:
            # Wall left/right of the box: walk down and up along the column.
            step_a = self._action_to_direction[1]
            step_b = self._action_to_direction[0]
            limit = self.GRID_DIM[0]

        upper = np.asarray(self.GRID.shape) - 1
        cursor_a = self._box_location + self._action_to_direction[direc]
        cursor_b = cursor_a.copy()
        count = 0
        while (
            self.GRID[cursor_a[0], cursor_a[1]] != 0
            and self.GRID[cursor_b[0], cursor_b[1]] != 0
        ):
            cursor_a = np.clip(cursor_a + step_a, 0, upper)
            cursor_b = np.clip(cursor_b + step_b, 0, upper)
            count += 1
            if count >= limit:
                return True
        return False

    def is_over(self):
        """Return ``(done, reward)`` for the current box position.

        * box on target: ``(True, 0)``
        * box touching more than one wall: ``(True, -100)``
        * box touching exactly one wall, sharing neither row nor column with the
          target, and that wall has no gap (``_wall_is_unbroken``):
          ``(True, -100)``
        * anything else: ``(False, -1)``
        """
        if np.array_equal(self._box_location, self._target_location):
            return True, 0

        wall_dirs = [
            direction
            for direction, delta in self._action_to_direction.items()
            if self._is_wall(self._box_location + delta)
        ]

        if len(wall_dirs) > 1:
            return True, -100

        if len(wall_dirs) == 1:
            # True when the box shares a row or a column with the target.
            aligned = not (self._box_location - self._target_location).all()
            if not aligned and self._wall_is_unbroken(wall_dirs[0]):
                return True, -100

        return False, -1

In [28]:
# Single shared environment instance used by every training section below.
env = WarehouseAgent()

#### Q-learning

In [5]:
# Q-learning hyperparameters
alpha = 0.8            # step size of the Q-table update
gamma = 0.9            # discount factor
epsilon = 0.9          # choose_action takes its greedy branch when p < epsilon
total_episodes = 500   # number of training episodes

# Action-value table, all zeros: one row per state, one column per action
Q = np.zeros(shape=(env.state_space.n, env.action_space.n))
Q.shape

NameError: name 'env' is not defined

In [6]:
#Function to choose the next action from a state
def choose_action(state):
    """Pick an action for ``state`` from the global Q-table.

    With probability ``epsilon`` the greedy branch is taken: the first action
    whose Q-value is still exactly 0 if there is one, otherwise the argmax of
    the row.  With probability ``1 - epsilon`` a uniformly random action is
    drawn from the environment's action space.
    """
    if np.random.random() >= epsilon:
        return np.random.randint(env.action_space.n)

    q_row = Q[state, :]
    zero_valued = np.where(q_row == 0)[0]
    if zero_valued.size > 0:
        return zero_valued[0]
    return np.argmax(q_row)
    
# Function to update the Q-table
def Qupdate(current_state, current_action, next_state, reward, next_max):
    """Move Q[current_state, current_action] towards the one-step TD target.

    ``next_max`` is the bootstrap value supplied by the caller; ``next_state``
    is accepted for symmetry but not read here.
    """
    old_value = Q[current_state, current_action]
    td_target = reward + (gamma * next_max)
    Q[current_state, current_action] = old_value + alpha * (td_target - old_value)

In [None]:
# Sum of rewards collected in each episode, one entry per episode
episodes_reward = []

# Q-learning: one pass per episode, starting from the reset environment
for episode in range(total_episodes):
    env.reset()
    current_state = env._state_in_seq()
    total_reward = 0

    done = False
    while not done:
        current_action = choose_action(current_state)
        observation, reward, done, info = env.step(current_action)
        next_state = env._state_in_seq()

        # A terminal transition has no future return.  The state index only
        # encodes the agent's cell, so Q[next_state] may hold values learned
        # from non-terminal visits and must not be bootstrapped from here.
        next_max = 0.0 if done else np.max(Q[next_state])

        Qupdate(current_state, current_action, next_state, reward, next_max)

        current_state = next_state
        total_reward += reward

    episodes_reward.append(total_reward)

#### On-policy Monte Carlo

![image.png](attachment:image.png)

In [54]:
# On-policy Monte Carlo hyperparameters
gamma = 0.9          # discount factor
epsilon = 0.1        # probability mass spread uniformly over all actions
total_episodes = 2   # number of episodes to sample

# Zero action values and a uniform starting policy (one row per state)
Q = np.zeros(shape=(env.state_space.n, env.action_space.n))
policy = np.ones_like(Q) / env.action_space.n
Q.shape

(42, 5)

In [55]:
# Returns["<state>, <action>"] collects every return observed for that pair;
# one empty list is created per state and per action of the environment.
Returns = {}
for state in range(env.state_space.n):
    for action in range(env.action_space.n):
        Returns[f"{state}, {action}"] = []

In [None]:
# First-visit on-policy Monte Carlo control with an epsilon-soft policy.
# The loop runs for `total_episodes`, the name defined in the hyperparameter cell.
for ep in range(total_episodes):
    env.reset()
    current_state = env._state_in_seq()

    # Sample one full episode under the current policy.
    trajectory = []
    done = False
    while not done:
        current_action = np.random.choice(
            range(0, env.action_space.n), p=policy[current_state]
        )
        observation, reward, done, info = env.step(current_action)
        next_state = env._state_in_seq()

        trajectory.append((current_state, current_action, reward))
        current_state = next_state

    # Time index of the first occurrence of every (state, action) pair, built
    # once so the first-visit test below is a dictionary lookup.
    first_visit = {}
    for t, (s, a, r) in enumerate(trajectory):
        first_visit.setdefault((s, a), t)

    # Walk the episode backwards, accumulating the discounted return.
    G = 0
    for t in range(len(trajectory) - 1, -1, -1):
        s, a, r = trajectory[t]
        G = gamma * G + r

        if first_visit[(s, a)] != t:
            continue  # the pair occurs earlier in the episode

        key = f"{s}, {a}"
        Returns[key].append(G)
        Q[s][a] = np.mean(Returns[key])

        # Epsilon-soft improvement: epsilon/|A| everywhere, remainder on argmax.
        astar = np.argmax(Q[s])
        policy[s] = epsilon / env.action_space.n
        policy[s][astar] += 1 - epsilon

#### Off-policy Monte Carlo

![image.png](attachment:image.png)

In [None]:
# Off-policy Monte Carlo hyperparameters
gamma = 0.9          # discount factor
epsilon = 0.1
total_episodes = 2   # number of episodes to sample

# Zero action values, the greedy policy derived from them (one action per
# state) and a table of the same shape for the cumulative weights
Q = np.zeros(shape=(env.state_space.n, env.action_space.n))
policy = Q.argmax(axis=1)
C = np.zeros_like(Q)
Q.shape, policy.shape

In [None]:
# Off-policy Monte Carlo control with weighted importance sampling.
# Behaviour policy: uniformly random actions.  Target policy: greedy in Q.
# The loop runs for `total_episodes`, the name defined in the hyperparameter cell.
for ep in range(total_episodes):
    env.reset()
    current_state = env._state_in_seq()

    # Sample one full episode with the behaviour policy.
    trajectory = []
    done = False
    while not done:
        action_by_b_policy = np.random.randint(0, env.action_space.n)

        observation, reward, done, info = env.step(action_by_b_policy)
        next_state = env._state_in_seq()

        trajectory.append((current_state, action_by_b_policy, reward))

        # Advance the state so each step is recorded against the state it was
        # actually taken in.
        current_state = next_state

    G = 0
    W = 1
    for state, action, reward in reversed(trajectory):
        G = gamma * G + reward
        C[state][action] += W
        Q[state][action] += (W / C[state][action]) * (G - Q[state][action])
        policy[state] = np.argmax(Q[state])

        # Once the behaviour action differs from the greedy action, the
        # importance weight of all earlier steps is zero.
        if action != policy[state]:
            break

        # W <- W / b(a|s), with b(a|s) = 1 / |A| for the uniform behaviour policy.
        W = W * env.action_space.n