In [1]:
from Frozen_Lake import FrozenLakeEnv

In [2]:
env = FrozenLakeEnv()

In [3]:
all_states = env.get_all_states()

In [4]:
for state in all_states:
    print(state, env.get_possible_actions(state))

(0, 0) ('left', 'down', 'right', 'up')
(0, 1) ('left', 'down', 'right', 'up')
(0, 2) ('left', 'down', 'right', 'up')
(0, 3) ('left', 'down', 'right', 'up')
(1, 0) ('left', 'down', 'right', 'up')
(1, 1) ()
(1, 2) ('left', 'down', 'right', 'up')
(1, 3) ()
(2, 0) ('left', 'down', 'right', 'up')
(2, 1) ('left', 'down', 'right', 'up')
(2, 2) ('left', 'down', 'right', 'up')
(2, 3) ()
(3, 0) ()
(3, 1) ('left', 'down', 'right', 'up')
(3, 2) ('left', 'down', 'right', 'up')
(3, 3) ()


In [5]:
for state in all_states:
    print(state, env.is_terminal(state))

(0, 0) False
(0, 1) False
(0, 2) False
(0, 3) False
(1, 0) False
(1, 1) True
(1, 2) False
(1, 3) True
(2, 0) False
(2, 1) False
(2, 2) False
(2, 3) True
(3, 0) True
(3, 1) False
(3, 2) False
(3, 3) True


In [6]:
def init_policy(env):
    policy = {}
    for state in env.get_all_states():
        policy[state] = {}
        for action in env.get_possible_actions(state):
            policy[state][action] = 1 / len(env.get_possible_actions(state))
    return policy

In [7]:
policy = init_policy(env)

In [8]:
def policy_evaluation_step(policy, values, gamma):
    q_values = get_q_values(values, gamma)
    new_values = {}
    for state in env.get_all_states():
        new_values[state] = 0
        for action in env.get_possible_actions(state):
            new_values[state] += policy[state][action] * q_values[state][action]
    return new_values

In [9]:
def init_values():
    return {state: 0 for state in env.get_all_states()}

In [10]:
values = init_values()

In [11]:
def get_q_values(values, gamma):
    q_values = {}
    for state in env.get_all_states():
        q_values[state] = {}
        for action in env.get_possible_actions(state):
            q_values[state][action] = 0
            for next_state in env.get_next_states(state, action):
                reward = env.get_reward(state, action, next_state)
                transition_prob = env.get_transition_prob(state, action, next_state)
                next_value = values[next_state]
                q_values[state][action] += reward + gamma * transition_prob  * next_value
                
    return q_values

In [12]:
values = policy_evaluation_step(policy, values, gamma=0.9)
values

{(0, 0): 0.0,
 (0, 1): 0.0,
 (0, 2): 0.0,
 (0, 3): 0.0,
 (1, 0): 0.0,
 (1, 1): 0,
 (1, 2): 0.0,
 (1, 3): 0,
 (2, 0): 0.0,
 (2, 1): 0.0,
 (2, 2): 0.0,
 (2, 3): 0,
 (3, 0): 0,
 (3, 1): 0.0,
 (3, 2): 0.75,
 (3, 3): 0}

In [13]:
def policy_evaluation(policy, gamma, evaluation_step_n):
    values = init_values()
    for _ in range(evaluation_step_n):
        values = policy_evaluation_step(policy, values, gamma)
    q_values = get_q_values(values, gamma)
    return q_values

In [14]:
q_values = policy_evaluation(policy, gamma=0.9, evaluation_step_n=100)
q_values

{(0, 0): {'left': 0.01269467224209061,
  'down': 0.01686835383364247,
  'right': 0.012144295423787095,
  'up': 0.012019806754968413},
 (0, 1): {'left': 0.010810946369242779,
  'down': 0.003926884642953189,
  'right': 0.02288425734125813,
  'up': 0.013047390910454756},
 (0, 2): {'left': 0.01894863177964506,
  'down': 0.05913279233708525,
  'right': 0.018723477626703882,
  'up': 0.02399617635557818},
 (0, 3): {'left': 0.022856113072140482,
  'down': 0.0038299432715476027,
  'right': 0.010007271128880437,
  'up': 0.012725295386107992},
 (1, 0): {'left': 0.020770851485297476,
  'down': 0.04215541625130152,
  'right': 0.006251421320818339,
  'up': 0.011485811856364976},
 (1, 1): {},
 (1, 2): {'left': 0.031600450021849,
  'down': 0.23105940611697157,
  'right': 0.031600450021849,
  'up': 0.021744194057820434},
 (1, 3): {},
 (2, 0): {'left': 0.04215541625130152,
  'down': 0.020596453163186004,
  'right': 0.1262460665953063,
  'up': 0.03511588332766514},
 (2, 1): {'left': 0.07554391068366469,


In [15]:
def policy_improvement(q_values):
    new_policy = {}
    for state in env.get_all_states():
        new_policy[state] = {}
        max_action = None
        max_q_value = float('-inf')
        for action in env.get_possible_actions(state):
            if q_values[state][action] > max_q_value:
                max_q_value = q_values[state][action]
                max_action = action
        for action in env.get_possible_actions(state):
            new_policy[state][action] = 1 if action == max_action else 0
    return new_policy

In [16]:
policy_improvement(q_values)

{(0, 0): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (0, 1): {'left': 0, 'down': 0, 'right': 1, 'up': 0},
 (0, 2): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (0, 3): {'left': 1, 'down': 0, 'right': 0, 'up': 0},
 (1, 0): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (1, 1): {},
 (1, 2): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (1, 3): {},
 (2, 0): {'left': 0, 'down': 0, 'right': 1, 'up': 0},
 (2, 1): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (2, 2): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (2, 3): {},
 (3, 0): {},
 (3, 1): {'left': 0, 'down': 0, 'right': 1, 'up': 0},
 (3, 2): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (3, 3): {}}

In [17]:
epochs = 20
evaluation_step_n = 20
gamma = 0.9

policy = init_policy(env)
for epoch in range(epochs):
    q_values = policy_evaluation(policy, gamma, evaluation_step_n)
    policy = policy_improvement(q_values)

policy

{(0, 0): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (0, 1): {'left': 0, 'down': 0, 'right': 1, 'up': 0},
 (0, 2): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (0, 3): {'left': 1, 'down': 0, 'right': 0, 'up': 0},
 (1, 0): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (1, 1): {},
 (1, 2): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (1, 3): {},
 (2, 0): {'left': 0, 'down': 0, 'right': 1, 'up': 0},
 (2, 1): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (2, 2): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (2, 3): {},
 (3, 0): {},
 (3, 1): {'left': 0, 'down': 0, 'right': 1, 'up': 0},
 (3, 2): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (3, 3): {}}

In [18]:
import numpy as np
import time


total_reward = 0

state = env.reset()
for _ in range(100):
    action = np.random.choice(env.get_possible_actions(state), p=list(policy[state].values()))
    state, reward, done, _ = env.step(action)
    env.render()
    time.sleep(0.5)
    total_reward += reward
    
    if done:
        break
    
print(total_reward)

SFFF
*HFH
FFFH
HFFG

SFFF
FHFH
*FFH
HFFG

SFFF
FHFH
F*FH
HFFG

SFFF
FHFH
FFFH
H*FG

SFFF
FHFH
F*FH
HFFG

SFFF
FHFH
FFFH
H*FG

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
H*FG

SFFF
FHFH
FFFH
H*FG

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
H*FG

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HFF*

1.0
