In [87]:
# Packages needed for this notebook
import gymnasium as gym
import gym_RLcourse
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output # Used to clear the ouput of a Jupyter cell.

In [88]:
class RandomAgent():
    """Agent holding a tabular stochastic policy, initialised to uniform.

    Attributes:
        nA: number of actions.
        nS: number of states.
        probs: array of shape (nS, nA); probs[s][a] is the probability of
            choosing action a in state s. Every entry starts at 1/nA.
    """

    def __init__(self, nA=4, nS=64):
        self.nA = nA  # Number of actions
        self.nS = nS  # Number of states

        # Start from a table of ones and scale it so that every row is the
        # uniform distribution over the nA actions.
        uniform = np.ones(shape=(nS, nA))
        self.probs = uniform / nA

    def act(self, state):
        """Sample an action index for `state` from the row self.probs[state]."""
        return np.random.choice(self.nA, p=self.probs[state])

In [89]:
def run_agent(env, agent):
    """Run one episode of `agent` in `env`, printing progress after every step.

    The environment is reset, then `agent.act(state)` is called and the
    chosen action is passed to `env.step` until the step reports either
    `terminated` or `truncated`. After each step the cell output is cleared
    and the step count, new state, last action and accumulated reward are
    printed. A final line reports which of the two flags ended the episode.
    Nothing is returned.
    """
    state, info = env.reset()
    steps = 0
    episode_return = 0
    terminated = truncated = False

    while not (terminated or truncated):
        action = agent.act(state)
        state, reward, terminated, truncated, info = env.step(action)
        episode_return += reward
        steps += 1

        # Overwrite the previous status instead of appending to it.
        clear_output(wait=True)
        print("Time step:", steps)
        print("State:", state)
        print("Action:", action)
        print("Total reward:", episode_return)

    # Truncation is checked first, so it wins if both flags are set.
    if truncated:
        print("The environment was truncated even though a terminal state was not reached.")
    elif terminated:
        print("A terminal state was reached.")

In [90]:
# Integer codes for the four actions, in the order LEFT=0, DOWN=1, RIGHT=2, UP=3.
LEFT, DOWN, RIGHT, UP = range(4)

In [91]:
# Create the 8x8 FrozenLake environment with render_mode="human".
env = gym.make('FrozenLake8x8-v1', render_mode="human")
# reset() returns the initial state and an info object; neither is used in this cell.
state, info = env.reset()
# Show the sizes of the state and action spaces.
print("State space:", env.observation_space)
print("Action space:", env.action_space)

State space: Discrete(64)
Action space: Discrete(4)


In [92]:
# Inspect the transition model for one (state, action) pair:
# state 0 and the RIGHT action.
s = 0
a = RIGHT 
# env.unwrapped.P[s][a] is a list of 4-tuples, one per possible outcome.
print(env.unwrapped.P[s][a])
# Each tuple is unpacked as (probability, next state, reward, <ignored>);
# the fourth entry is a boolean flag in the printed output
# (presumably "episode ended" -- confirm against the environment docs).
for p, next_s, reward, _ in env.unwrapped.P[s][a]:
    print("With probability %.2f you will move to state %d and get reward %.1f." % (p, next_s, reward))

[(0.3333333333333333, 8, 0.0, False), (0.3333333333333333, 1, 0.0, False), (0.3333333333333333, 0, 0.0, False)]
With probability 0.33 you will move to state 8 and get reward 0.0.
With probability 0.33 you will move to state 1 and get reward 0.0.
With probability 0.33 you will move to state 0 and get reward 0.0.


In [93]:
# Run a single episode with a freshly created, uniformly random policy.
agent = RandomAgent()
run_agent(env, agent)

Time step: 24
State: 19
Action: 2
Total reward: 0.0
A terminal state was reached.


In [94]:
# Close the environment that was created with render_mode="human";
# later cells create their own environment with gym.make.
env.close()

In [95]:
def compute_action_value(env, discount, s, a, v):
    """Return the one-step lookahead value of taking action `a` in state `s`.

    Sums p * (reward + discount * v[next_s]) over every outcome listed in
    env.unwrapped.P[s][a].

    Args:
        env: environment exposing the transition table `env.unwrapped.P`.
        discount: discount factor applied to the next state's value.
        s: state index.
        a: action index.
        v: value estimates, indexable by state.

    Returns:
        The accumulated action value (0 if there are no listed outcomes).
    """
    q_value = 0

    # Each outcome is (probability, next state, reward, <unused flag>).
    for prob, s_next, r, _flag in env.unwrapped.P[s][a]:
        backup = r + discount*v[s_next]
        q_value += prob * backup

    return q_value

In [96]:
def Bellman_RHS(env, discount, agent, s, v):
    """Evaluate the right-hand side of the Bellman equation at state `s`.

    Weights the action value of every action (from compute_action_value)
    by the policy probability agent.probs[s][a] and adds them up.

    Args:
        env: environment; `env.action_space.n` gives the number of actions.
        discount: discount factor.
        agent: object whose `probs[s][a]` is the policy probability.
        s: state index.
        v: current value estimates, indexable by state.

    Returns:
        The policy-weighted sum of action values for state `s`.
    """
    expected_value = 0

    # Average the action values under the agent's policy in state s.
    for action in range(env.action_space.n):
        q_sa = compute_action_value(env, discount, s, action, v)
        expected_value += agent.probs[s][action]*q_sa

    return expected_value

In [97]:
def Bellman_RHS_all(env, discount, agent, v0):
    """Apply Bellman_RHS to every state and return the results as a new array.

    `v0` is the candidate value function and is only read. If it really is
    the value function of the agent's policy, the returned array equals it.

    Args:
        env: environment; `env.observation_space.n` gives the number of states.
        discount: discount factor.
        agent: object providing the policy probabilities.
        v0: value estimates used on the right-hand side.

    Returns:
        Float array with one Bellman right-hand-side value per state.
    """
    n_states = env.observation_space.n
    backed_up = [Bellman_RHS(env, discount, agent, state, v0)
                 for state in range(n_states)]
    return np.array(backed_up, dtype=float)

In [98]:
def policy_evaluation(env, discount, agent, v0, max_iter=10000, tol=1e-6):
    """Iterative policy evaluation using full sweeps.

    Repeatedly replaces the value estimate by Bellman_RHS_all(...) of the
    previous estimate until the largest absolute change between two
    consecutive estimates is below `tol`, or `max_iter` sweeps have run.

    Args:
        env: environment passed through to Bellman_RHS_all.
        discount: discount factor.
        agent: object providing the policy to evaluate.
        v0: initial value estimates (array-like); never modified.
        max_iter: maximum number of sweeps. With 0 no sweep is performed.
        tol: stopping threshold on the maximum absolute change.

    Returns:
        The most recent value estimate as a float array. If no sweep was
        performed (max_iter == 0) this is a float copy of `v0`.
    """
    v_old = np.array(v0, dtype=float)
    # Defined before the loop so that max_iter == 0 returns the initial
    # guess instead of failing on an unbound name.
    v_new = v_old

    for _ in range(max_iter):
        v_new = Bellman_RHS_all(env, discount, agent, v_old)

        if np.max(np.abs(v_new - v_old)) < tol:
            break

        v_old = v_new

    return v_new

In [99]:
env = gym.make('FrozenLake8x8-v1')
agent = RandomAgent()
discount = 1
# Compute the state-value function of the uniformly random policy,
# starting the iteration from an all-zero estimate.
v0 = np.zeros(env.observation_space.n) 
v = policy_evaluation(env, discount, agent, v0) 
# Show the values laid out as an 8x8 grid.
print(v.reshape(8,8))

[[1.88195473e-03 2.14943930e-03 2.79687854e-03 4.10390763e-03
  6.53056742e-03 9.78455641e-03 1.34273647e-02 1.59485493e-02]
 [1.61810514e-03 1.77291032e-03 2.14038650e-03 2.98712882e-03
  5.70621601e-03 9.39901211e-03 1.45525909e-02 1.84735439e-02]
 [1.20269457e-03 1.18668645e-03 1.00712544e-03 0.00000000e+00
  3.91044692e-03 7.55535622e-03 1.69135293e-02 2.49228339e-02]
 [8.05878684e-04 7.66260403e-04 7.02931249e-04 7.70972167e-04
  2.38133947e-03 0.00000000e+00 2.06255200e-02 3.93839677e-02]
 [4.50526176e-04 3.71089481e-04 2.68385702e-04 0.00000000e+00
  4.84438436e-03 1.15928695e-02 2.62057426e-02 7.26051893e-02]
 [1.75712022e-04 0.00000000e+00 0.00000000e+00 1.44822264e-03
  5.40352770e-03 1.53215855e-02 0.00000000e+00 1.52226788e-01]
 [7.70749990e-05 0.00000000e+00 1.09331519e-04 3.89386428e-04
  0.00000000e+00 4.42900346e-02 0.00000000e+00 3.84075551e-01]
 [5.57306603e-05 3.45388241e-05 4.79484018e-05 0.00000000e+00
  5.39461917e-02 1.61838580e-01 3.87279525e-01 0.00000000e+00]]

In [100]:
def policy_evaluation_ip(env, discount, agent, v0, max_iter=10000, tol=1e-6):
    """Iterative policy evaluation that updates one value array in place.

    Within a sweep each state's value is overwritten immediately, so states
    visited later in the same sweep already see the updated entries.
    Sweeps stop when the largest absolute change of any state within a
    sweep is below `tol`, or after `max_iter` sweeps.

    The work is done on a private float copy of `v0`. The caller's array is
    therefore never modified, and an integer-typed `v0` no longer truncates
    the backed-up values.

    Args:
        env: environment; `env.observation_space.n` gives the number of states.
        discount: discount factor.
        agent: object providing the policy to evaluate.
        v0: initial value estimates (array-like); not modified.
        max_iter: maximum number of sweeps.
        tol: stopping threshold on the per-sweep maximum absolute change.

    Returns:
        Float array with the final value estimates.
    """
    # Copy and force float dtype: writing into the caller's array would
    # mutate their data and, for integer arrays, round every update.
    v = np.array(v0, dtype=float)

    for _ in range(max_iter):
        delta = 0
        for s in range(env.observation_space.n):
            previous = v[s]

            v[s] = Bellman_RHS(env, discount, agent, s, v)

            # Largest change seen so far in this sweep.
            delta = np.max([delta, np.abs(previous - v[s])])

        if delta < tol:
            break

    return v

In [101]:
def greedy_policy(env, discount, agent, v):
    """Build the deterministic policy that is greedy with respect to `v`.

    For every state the action values are computed with
    compute_action_value, and the action with the largest value gets
    probability 1.0 (np.argmax picks the lowest index on ties). All other
    entries stay 0. `agent` is accepted but not used.

    Args:
        env: environment; `observation_space.n` and `action_space.n` give
            the table dimensions.
        discount: discount factor.
        agent: unused.
        v: value estimates, indexable by state.

    Returns:
        Array of shape (number of states, number of actions) with exactly
        one 1.0 per row.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # Start from an all-zero table and switch on one action per state.
    a_probs = np.zeros((n_states, n_actions))

    for s in range(n_states):
        q_values = np.array(
            [compute_action_value(env, discount, s, a, v) for a in range(n_actions)],
            dtype=float,
        )
        best_action = np.argmax(q_values)
        a_probs[s][best_action] = 1.0

    return a_probs

In [102]:
env = gym.make('FrozenLake8x8-v1', render_mode='human')
agent = RandomAgent()
discount = 1
# Policy iteration: alternate between evaluating the agent's current policy
# and making the policy greedy with respect to the evaluated values.
# The agent starts with the uniformly random policy.
v_old = np.zeros(env.observation_space.n) 
for i in range(10000):  # upper bound on the number of evaluate/improve rounds
    # Evaluate the current policy, starting from the previous round's values.
    v = policy_evaluation(env, discount, agent, v_old) 
    
    # Stop when the values changed by less than 1e-6 since the previous round.
    if (np.max(np.abs(v-v_old))<1e-6): 
        break 
        
    v_old = v 
    # Policy improvement: act greedily with respect to the new values.
    agent.probs = greedy_policy(env, discount, agent, v) 

# Show the final values laid out as an 8x8 grid.
print(v.reshape(8,8)) 

[[0.9999935  0.99999455 0.99999585 0.99999711 0.99999821 0.99999907
  0.99999967 1.        ]
 [0.99999327 0.99999406 0.99999522 0.99999648 0.99999765 0.99999863
  0.9999994  1.        ]
 [0.99998212 0.97818546 0.92641739 0.         0.85661389 0.9462295
  0.98207622 1.        ]
 [0.99997229 0.93458144 0.80107253 0.47489567 0.62361545 0.
  0.94467658 1.        ]
 [0.99996407 0.82558801 0.5422199  0.         0.53933722 0.61118521
  0.85195361 1.        ]
 [0.9999577  0.         0.         0.16803749 0.38321317 0.44226508
  0.         1.        ]
 [0.99995335 0.         0.19466372 0.12090033 0.         0.33239888
  0.         1.        ]
 [0.99995114 0.73152044 0.46309149 0.         0.27746574 0.5549319
  0.77746574 0.        ]]


In [103]:
# Run one episode with `agent` and `env` as left by the policy-iteration cell.
run_agent(env, agent)

Time step: 200
State: 15
Action: 2
Total reward: 0.0
The environment was truncated even though a terminal state was not reached.


In [104]:
def value_iteration(env, discount, agent, v0, max_iter=10000, tol=1e-6):
    """Value iteration that updates one value array in place.

    In every sweep each state's value is replaced by the largest action
    value returned by compute_action_value for that state. States visited
    later in the same sweep already see the updated entries. Sweeps stop
    when the largest absolute change of any state within a sweep is below
    `tol`, or after `max_iter` sweeps.

    The work is done on a private float copy of `v0`. The caller's array is
    therefore never modified, and an integer-typed `v0` no longer truncates
    the backed-up values. `agent` is accepted but not used.

    Side effect: prints the zero-based index of the last sweep performed
    (-1 if `max_iter` is 0 and no sweep ran).

    Args:
        env: environment; `observation_space.n` and `action_space.n` give
            the number of states and actions.
        discount: discount factor.
        agent: unused.
        v0: initial value estimates (array-like); not modified.
        max_iter: maximum number of sweeps.
        tol: stopping threshold on the per-sweep maximum absolute change.

    Returns:
        Float array with the final value estimates.
    """
    # Copy and force float dtype: writing into the caller's array would
    # mutate their data and, for integer arrays, round every update.
    v = np.array(v0, dtype=float)

    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # Pre-set so the report below is well defined even when no sweep runs.
    last_sweep = -1

    for last_sweep in range(max_iter):
        delta = 0
        for s in range(n_states):
            previous = v[s]

            action_values = np.zeros(n_actions)
            for a in range(n_actions):
                action_values[a] = compute_action_value(env, discount, s, a, v)

            # Back up the best action's value.
            v[s] = np.max(action_values)

            # Largest change seen so far in this sweep.
            delta = np.max([delta, np.abs(previous - v[s])])

        if delta < tol:
            break

    print(last_sweep)

    return v

In [105]:
env = gym.make('FrozenLake8x8-v1', render_mode="human")
agent = RandomAgent()
discount = 1

# Run value iteration from an all-zero initial estimate.
v0 = np.zeros(env.observation_space.n) 
v = value_iteration(env, discount, agent, v0) 
# Replace the agent's random policy with the one that is greedy w.r.t. v.
agent.probs = greedy_policy(env, discount, agent, v) 
# Show the values laid out as an 8x8 grid.
print(v.reshape(8,8)) 
# Bare expression: the notebook also displays the flat array as the cell result.
v

575
[[0.99998912 0.99998973 0.99999042 0.99999114 0.99999184 0.9999925
  0.99999308 0.99999351]
 [0.99998918 0.99998967 0.99999029 0.99999098 0.99999168 0.99999237
  0.99999308 0.99999391]
 [0.99997942 0.97818294 0.92641478 0.         0.85660997 0.94622427
  0.98207059 0.99999454]
 [0.99997068 0.93458025 0.80107148 0.4748952  0.62361428 0.
  0.94467234 0.99999539]
 [0.99996331 0.82558766 0.54221971 0.         0.53933775 0.61118505
  0.85195124 0.9999964 ]
 [0.99995756 0.         0.         0.16803827 0.38321409 0.44226632
  0.         0.99999754]
 [0.99995363 0.         0.19466435 0.12090087 0.         0.33239983
  0.         0.99999876]
 [0.99995164 0.73152158 0.46309273 0.         0.27746659 0.5549332
  0.7774666  0.        ]]


array([0.99998912, 0.99998973, 0.99999042, 0.99999114, 0.99999184,
       0.9999925 , 0.99999308, 0.99999351, 0.99998918, 0.99998967,
       0.99999029, 0.99999098, 0.99999168, 0.99999237, 0.99999308,
       0.99999391, 0.99997942, 0.97818294, 0.92641478, 0.        ,
       0.85660997, 0.94622427, 0.98207059, 0.99999454, 0.99997068,
       0.93458025, 0.80107148, 0.4748952 , 0.62361428, 0.        ,
       0.94467234, 0.99999539, 0.99996331, 0.82558766, 0.54221971,
       0.        , 0.53933775, 0.61118505, 0.85195124, 0.9999964 ,
       0.99995756, 0.        , 0.        , 0.16803827, 0.38321409,
       0.44226632, 0.        , 0.99999754, 0.99995363, 0.        ,
       0.19466435, 0.12090087, 0.        , 0.33239983, 0.        ,
       0.99999876, 0.99995164, 0.73152158, 0.46309273, 0.        ,
       0.27746659, 0.5549332 , 0.7774666 , 0.        ])

In [106]:
# Run one episode with `agent` and `env` as left by the value-iteration cell.
run_agent(env, agent)

Time step: 96
State: 63
Action: 2
Total reward: 1.0
A terminal state was reached.
