In [37]:
from Frozen_Lake import FrozenLakeEnv

In [38]:
env = FrozenLakeEnv()
print(env.counter)

0


In [27]:
all_states = env.get_all_states()

In [28]:
def init_policy(env):
    policy = {}
    for state in env.get_all_states():
        policy[state] = {}
        for action in env.get_possible_actions(state):
            policy[state][action] = 1 / len(env.get_possible_actions(state))
    return policy

In [29]:
def policy_evaluation_step(policy, values, gamma):
    q_values = get_q_values(values, gamma)
    new_values = {}
    for state in env.get_all_states():
        new_values[state] = 0
        for action in env.get_possible_actions(state):
            new_values[state] += policy[state][action] * q_values[state][action]
    return new_values

In [30]:
def init_values():
    return {state: 0 for state in env.get_all_states()}

In [31]:
def get_q_values(values, gamma):
    q_values = {}
    for state in env.get_all_states():
        q_values[state] = {}
        for action in env.get_possible_actions(state):
            q_values[state][action] = 0
            for next_state in env.get_next_states(state, action):
                reward = env.get_reward(state, action, next_state)
                transition_prob = env.get_transition_prob(state, action, next_state)
                next_value = values[next_state]
                q_values[state][action] += reward + gamma * transition_prob  * next_value
                
    return q_values

In [32]:
def policy_evaluation(policy, gamma, evaluation_step_n):
    values = init_values()
    for _ in range(evaluation_step_n):
        values = policy_evaluation_step(policy, values, gamma)
    q_values = get_q_values(values, gamma)
    return q_values

In [33]:
def policy_improvement(q_values):
    new_policy = {}
    for state in env.get_all_states():
        new_policy[state] = {}
        max_action = None
        max_q_value = float('-inf')
        for action in env.get_possible_actions(state):
            if q_values[state][action] > max_q_value:
                max_q_value = q_values[state][action]
                max_action = action
        for action in env.get_possible_actions(state):
            new_policy[state][action] = 1 if action == max_action else 0
    return new_policy

In [34]:
print(env.counter)

1


In [39]:
epochs = 20
evaluation_step_n = 20
gamma = 0.9

policy = init_policy(env)
values = init_values()
for epoch in range(epochs):
    q_values = policy_evaluation(policy, gamma, evaluation_step_n)
    policy = policy_improvement(q_values)
    print(f'{epoch = } \t {env.counter = }')
policy

epoch = 0 	 env.counter = 16081
epoch = 1 	 env.counter = 32100
epoch = 2 	 env.counter = 48119
epoch = 3 	 env.counter = 64138
epoch = 4 	 env.counter = 80157
epoch = 5 	 env.counter = 96176
epoch = 6 	 env.counter = 112195
epoch = 7 	 env.counter = 128214
epoch = 8 	 env.counter = 144233
epoch = 9 	 env.counter = 160252
epoch = 10 	 env.counter = 176271
epoch = 11 	 env.counter = 192290
epoch = 12 	 env.counter = 208309
epoch = 13 	 env.counter = 224328
epoch = 14 	 env.counter = 240347
epoch = 15 	 env.counter = 256366
epoch = 16 	 env.counter = 272385
epoch = 17 	 env.counter = 288404
epoch = 18 	 env.counter = 304423
epoch = 19 	 env.counter = 320442


{(0, 0): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (0, 1): {'left': 0, 'down': 0, 'right': 1, 'up': 0},
 (0, 2): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (0, 3): {'left': 1, 'down': 0, 'right': 0, 'up': 0},
 (1, 0): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (1, 1): {},
 (1, 2): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (1, 3): {},
 (2, 0): {'left': 0, 'down': 0, 'right': 1, 'up': 0},
 (2, 1): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (2, 2): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (2, 3): {},
 (3, 0): {},
 (3, 1): {'left': 0, 'down': 0, 'right': 1, 'up': 0},
 (3, 2): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (3, 3): {}}

In [12]:
print(env.counter, 'for policy iteration')

320443 for policy iteration


In [18]:
import numpy as np
import time


total_reward = 0

state = env.reset()
for _ in range(100):
    action = np.random.choice(env.get_possible_actions(state), p=list(policy[state].values()))
    state, reward, done, _ = env.step(action)
    env.render()
    time.sleep(0.5)
    total_reward += reward
    
    if done:
        break
    
print(total_reward)

SFFF
*HFH
FFFH
HFFG

SFFF
FHFH
*FFH
HFFG

SFFF
FHFH
F*FH
HFFG

SFFF
FHFH
FFFH
H*FG

SFFF
FHFH
F*FH
HFFG

SFFF
FHFH
FFFH
H*FG

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
H*FG

SFFF
FHFH
FFFH
H*FG

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
H*FG

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HFF*

1.0
