In [1]:
import numpy as np

## Part 1

Consider a simple 5 × 5 gridworld problem, described below. This is the simplest abstraction of a reinforcement learning problem that allows us to benchmark and compare various learning algorithms to one another and is known as the ‘gridworld’ environment.

![image.png](attachment:image.png)

Each of the 25 cells of the gridworld represent a possible state of the world. An agent in the gridworld environment can take a step up, down, left or right. If the agent attempts to step off the grid, the location of the agent remains unchanged.

The blue, green, red and yellow squares represent special states at which the behaviour of the system is as follows. At the blue square, any action yields a reward of 5 and causes the agent to jump to the red square. At the green square, any action yields a reward of 2.5 and causes the agent to jump to either the yellow square or the red square with probability 0.5.

An attempt to step off the grid yields a reward of −0.5 and any move from a white square to another white square yields a reward of 0. Intuitively, an agent with a good policy should try to find the states with a high value, and exploit the rewards available at those states.

1. Consider a reward discount of γ = 0.95 and a policy which simply moves to one of the four directions with equal probability of 0.25. Estimate the value function for each of the states using (1) solving the system of Bellman equations explicitly (2) iterative policy evaluation (3) value iteration. Which states have the highest value? Does this surprise you?

2. Determine the optimal policy for the gridworld problem by (1) explicitly solving the Bellman optimality equation (2) using policy iteration with iterative policy evaluation (3) policy improvement with value iteration.


In [2]:
Grid_size = 5

gridworld = np.zeros((Grid_size, Grid_size)) #Row, Columns

#Blue Square
gridworld[0][1] = 5 #Jump to Red

#Green Square
gridworld[0][4] = 2.5 #Jump to Yellow or Red with probability 0.5

step = np.array([[-1,0],[1,0],[0,-1],[0,1]]) #Up, #Down, #Left, #Right

Yellow_Red = np.array([[4,4],[3,2]])

gamma = 0.95

In [3]:
def Reward_And_Transition(Current_Step, Action):

    if Current_Step == [0,1]:
        Next_Step = [3,2]
        Reward = 5
        return  Next_Step, Reward
    
    elif Current_Step == [0,4]:
        Next_Step = Yellow_Red[np.random.randint(2)]
        Reward = 2.5
        return  Next_Step, Reward

    Next_Step = Current_Step + Action

    if Next_Step[0] < 0 or Next_Step[0] > 4 or Next_Step[1] < 0 or Next_Step[1] > 4:
        Reward = -0.5
        return Current_Step, Reward
    Reward = 0

    return Next_Step, Reward   

#### (1) Bellman equations

In [57]:
A = np.zeros((Grid_size**2, Grid_size**2))

b = np.zeros(Grid_size**2)

for i in range(Grid_size):
    for j in range(Grid_size):
        b_aux = 0
        for k in range(step.shape[0]): 
            New_Step, Reward = Reward_And_Transition([i,j], step[k])
            b_aux += Reward
            if i == 0 and j == 4: #This if is because of the 0.5 probability from the green block
                A[Grid_size*i + j, Grid_size*3 + 2] += gamma * 0.5 * 0.25
                A[Grid_size*i + j, Grid_size*4 + 4] += gamma * 0.5 * 0.25
            else:
                A[Grid_size*i + j, Grid_size*New_Step[0] + New_Step[1]] += gamma * 0.25 
        b_aux *= 0.25
        b[Grid_size*i+j] = b_aux

In [63]:
V = np.linalg.solve(A, b).reshape((Grid_size,Grid_size))
V.round(decimals=5)

array([[-1.05263,  4.     , -0.7193 ,  1.45614, -5.26316],
       [-2.94737, -2.75439, -5.26316,  4.     ,  8.01754],
       [ 6.22807,  4.21053, -0.52632, -4.21053, -7.2807 ],
       [-8.01754, -2.94737,  5.26316,  3.80702,  2.94737],
       [ 4.21053, -1.45614, -0.33333, -4.     ,  0.     ]])

In [83]:
np.max(V), np.argwhere(V == np.max(V))[0]

(8.017543859649125, array([1, 4], dtype=int64))

#### (2) Iterative Policy Evaluation

$V(s) \lhd \sum \pi(a|s) \sum P(s'|s,a)[R(s,a,s') + \gamma V(s')]$ 

In [15]:
Value_Policy = np.zeros((Grid_size, Grid_size))
threshold = 0.0001

while 1:
    delta = 0
    for i in range(Grid_size):
        for j in range(Grid_size):
            Value_Policy_Current = Value_Policy[i,j]
            aux = 0
            #There are 4 directions            
            for k in range(step.shape[0]): 
                New_Step, Reward = Reward_And_Transition([i,j], step[k])
                aux += 0.25 * (Reward + gamma * Value_Policy[New_Step[0], New_Step[1]])
            Value_Policy[i,j] = aux
            delta = max(delta, abs(Value_Policy_Current - aux))
    if delta < threshold:
        break

In [66]:
Value_Policy

array([[ 2.1710055 ,  4.73364232,  2.07036304,  1.26530797,  1.7790464 ],
       [ 1.11815949,  1.78220762,  1.17415628,  0.73917291,  0.56241467],
       [ 0.16297209,  0.47800865,  0.35202596,  0.1104237 , -0.18624681],
       [-0.54673687, -0.28459072, -0.28037911, -0.43997606, -0.7444249 ],
       [-1.10758287, -0.84921487, -0.80797594, -0.93807805, -1.23736921]])

In [82]:
np.max(Value_Policy), np.argwhere(Value_Policy == np.max(Value_Policy))[0]

(4.733642320167359, array([0, 1], dtype=int64))

![image.png](attachment:image.png)

#### (3) Value Iteration

In [75]:
Value_Iteration = np.zeros((Grid_size, Grid_size))
threshold = 1e-10

while 1:
    delta = 0
    for i in range(Grid_size):
        for j in range(Grid_size):
            Value_Policy_Current = Value_Iteration[i,j]
            aux = 0
            #There are 4 directions            
            for k in range(step.shape[0]): 
                New_Step, Reward = Reward_And_Transition([i,j], step[k])
                aux = max(aux, 0.25 * (Reward + gamma * Value_Iteration[New_Step[0], New_Step[1]]))
            Value_Iteration[i,j] = aux
            delta = max(delta, abs(Value_Policy_Current - aux))
    if delta < threshold:
        break

In [76]:
Value_Iteration.round(decimals=5)

array([[2.97100e-01, 1.25095e+00, 2.97100e-01, 1.48660e-01, 6.25950e-01],
       [7.05600e-02, 2.97100e-01, 7.05600e-02, 3.53100e-02, 1.48660e-01],
       [1.67600e-02, 7.05600e-02, 1.67600e-02, 8.39000e-03, 3.53100e-02],
       [3.98000e-03, 1.67600e-02, 3.98000e-03, 1.99000e-03, 8.39000e-03],
       [9.50000e-04, 3.98000e-03, 9.50000e-04, 4.70000e-04, 1.99000e-03]])

In [81]:
np.max(Value_Iteration), np.argwhere(Value_Iteration == np.max(Value_Iteration))[0]

(1.2509452710982731, array([0, 1], dtype=int64))