In [29]:
# Packages needed for this notebook
import gym
import gym_gridworld
import numpy as np
import time
import random
import matplotlib.pyplot as plt
from IPython.display import clear_output # Used to clear the ouput of a Jupyter cell.

In [30]:
discount = 0.5
R = np.zeros((3,1))
P = np.zeros((3,3))

R[0] = -8 # For state A
R[1] = 0 # For state B
R[2] = 6 # For state C

# Enter the probabilities going from state i to state j
P[0] = [0, 1, 0] # for i=0 (state A)
P[1] = [0.25, 0.25, 0.5] # for i=1 (state B)
P[2] = [0.25, 0.75, 0] # for i=2 (state C)

In [31]:
V = np.linalg.inv(np.eye(3) - discount*P)@R # V = (I - discount*P)^-1 * R
print(V)

[[-7.82222222]
 [ 0.35555556]
 [ 5.15555556]]


In [38]:
class RandomAgent():
    
    def __init__(self, nA=4, nS=16):
        self.nA = nA # Number of actions
        self.nS = nS # Number of states
        
        # Uniform probabilites in each state.
        # That is, in each of the nS states
        # each of the nA actions has probability
        # 1/nA.
        self.probs = np.ones((nS,nA))/nA 

    def act(self, state, done):
        action = np.random.choice(self.nA, p=self.probs[state]) 
        return action # a random policy

In [40]:
def compute_action_value(env, discount, s, a, v):
    
    action_value = 0
    
    for p, next_s, reward, _ in env.P[s][a]:
        # Loop through all possible (s', r) pairs
        action_value += p*(reward + (discount * v[next_s]))
    
    return action_value

In [41]:
def value_iteration(env, discount, agent, v0, max_iter=1000, tol=1e-6):
    
    v = v0
    
    for i in range(max_iter): # Loop
        delta = 0
        for s in range(env.observation_space.n):
            vs = v[s]
            
            # Code for updating v[s]
            action_values = np.zeros(env.action_space.n)
            for a in range(env.action_space.n):
                # Compute action value for all actions
                action_values[a] = (env, discount, s, a, v)
                
            v[s] = np.max(action_values)
            
            delta = np.max([delta, np.abs(vs-v[s])])
            
        if (delta < tol): # Until delta < tol
            break
            
    return v    

In [42]:
def greedy_policy(env, discount, agent, v):
    
    # The new policy will be a_probs
    # We start by setting all probabilities to 0
    # Then when we have found the greedy action in a state, 
    # we change the probability for that action to 1.0.
    
    a_probs = np.zeros((env.observation_space.n, env.action_space.n)) 
    
    for s in range(env.observation_space.n):
        
        action_values = np.zeros(env.action_space.n)
        
        for a in range(env.action_space.n):
            # Compute action value for all actions
            action_values[a] = compute_action_value(env, discount, s, a, v)
            
        a_max = np.argmax(action_values) # A greedy action
        a_probs[s][a_max] = 1.0 # Always choose the greedy action!
        
    return a_probs

In [44]:
env = gym.make('FrozenLake8x8-v0')
agent = RandomAgent()
discount = 1

v0 = np.zeros(env.observation_space.n)
v = value_iteration(env, discount, agent, v0)
print(v.reshape(8,8))

ValueError: setting an array element with a sequence.

In [47]:
agent.probs = greedy_policy(env, discount, agent, v)

In [48]:
print(v[18])
print(agent.probs[37])

0.9264147800954421
[0. 1. 0. 0.]
