In [1]:
from Frozen_Lake import FrozenLakeEnv
import numpy as np
import time
from tqdm.notebook import tqdm

In [2]:
# Build the environment used by every later cell and show its `counter`
# attribute right after construction.
# NOTE(review): `counter` presumably tallies calls into the environment's
# model methods — confirm in Frozen_Lake.
env = FrozenLakeEnv()
print(env.counter)

0


In [3]:
# Number of states reported by the environment.
states_n = len(env.get_all_states())
# NOTE(review): hard-coded action count, presumably the four grid moves —
# confirm against FrozenLakeEnv rather than relying on this constant.
total_actions_n = 4

# Display both sizes as the cell output.
states_n, total_actions_n

(16, 4)

In [4]:
def init_values(env):
    """Return a fresh value table mapping every state of ``env`` to 0.

    Args:
        env: environment exposing ``get_all_states()``.

    Returns:
        dict keyed by state, in the order the environment lists the states,
        with every value set to the integer 0.
    """
    return dict.fromkeys(env.get_all_states(), 0)

# Build a zero-initialised value table and display it as the cell output.
# value_iteration creates its own table, so this one is for inspection only.
values = init_values(env)
values

{(0, 0): 0,
 (0, 1): 0,
 (0, 2): 0,
 (0, 3): 0,
 (1, 0): 0,
 (1, 1): 0,
 (1, 2): 0,
 (1, 3): 0,
 (2, 0): 0,
 (2, 1): 0,
 (2, 2): 0,
 (2, 3): 0,
 (3, 0): 0,
 (3, 1): 0,
 (3, 2): 0,
 (3, 3): 0}

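For each state, iterate over its possible actions:
    compute the action value
    sum(transition_prob * (reward + gamma * value_next_state)) over all next states
    choose the max over actions as the new value of the state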
    

In [5]:
def value_function(env, values, gamma):
    """Run one in-place sweep of value iteration and return the action values.

    For every state and every action available in it, the action value is

        Q(s, a) = sum over s' of P(s' | s, a) * (R(s, a, s') + gamma * V(s'))

    and V(s) is then overwritten with the largest Q(s, a). The reward is
    weighted by the transition probability and only the next-state value is
    discounted.

    Args:
        env: environment exposing ``get_all_states``, ``get_possible_actions``,
            ``get_next_states``, ``get_reward`` and ``get_transition_prob``.
        values: dict mapping state -> current value estimate. Mutated in
            place; states are updated one at a time, so states visited later
            in the sweep already see the new values of earlier ones.
        gamma: discount factor applied to the value of the next state.

    Returns:
        dict mapping state -> {action: Q(s, a)}. A state for which the
        environment reports no actions maps to an empty dict and keeps its
        current value.
    """
    state_action_dict = {}
    for state in env.get_all_states():
        action_values = {}
        for action in env.get_possible_actions(state):
            q_value = 0
            for next_state in env.get_next_states(state, action):
                reward = env.get_reward(state, action, next_state)
                transition_prob = env.get_transition_prob(state, action, next_state)
                # Expected one-step return: immediate reward plus the
                # discounted value of the successor, weighted by its probability.
                q_value += transition_prob * (reward + gamma * values[next_state])
            action_values[action] = q_value
        state_action_dict[state] = action_values

        # States without actions (empty dict) keep their current value.
        if action_values:
            values[state] = max(action_values.values())

    return state_action_dict

In [6]:
def value_iteration(n_iterations, env, gamma):
    """Run value iteration for a fixed number of sweeps and extract a policy.

    Args:
        n_iterations: number of sweeps of ``value_function`` to perform.
        env: environment passed through to ``init_values`` and
            ``value_function``; must also expose ``get_all_states()`` and a
            ``counter`` attribute (printed after each sweep).
        gamma: discount factor forwarded to ``value_function``.

    Returns:
        dict mapping state -> the action with the largest action value in the
        table returned by the last sweep. States with no action values are
        omitted. With ``n_iterations == 0`` no sweep runs and the policy is
        empty.
    """
    # value_function updates this table in place on every sweep.
    values = init_values(env)

    # Start from an empty table so that zero sweeps yields an empty policy
    # instead of failing on an unbound variable.
    state_action_dict = {}
    for iteration in tqdm(range(n_iterations)):
        state_action_dict = value_function(env, values, gamma)
        print(f'{iteration = } \t {env.counter = }')

    policy = {}
    for state in env.get_all_states():
        action_values = state_action_dict.get(state)
        if action_values:
            # Greedy action: the key whose action value is largest.
            policy[state] = max(action_values, key=action_values.get)

    return policy

In [7]:
def evaluate(env, policy, vizualize=False, max_steps=100):
    """Play one episode following ``policy`` and return the episode's total reward.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` returning a
            ``(state, reward, done, info)`` tuple, and ``render()``.
        policy: dict mapping state -> action; must contain every state the
            episode visits before it ends.
        vizualize: when true, call ``env.render()`` after every step.
        max_steps: upper bound on the number of steps taken (default 100).

    Returns:
        Sum of the rewards received during the episode; 0 if no step is taken.
        When only the final step carries a reward this equals that reward.
    """
    state = env.reset()
    total_reward = 0
    for _ in range(max_steps):
        action = policy[state]
        state, reward, done, _info = env.step(action)
        total_reward += reward
        if vizualize:
            env.render()
        if done:
            break
    return total_reward

In [8]:
# Sweep count and discount factor for the value_iteration call below.
# The commented-out line is an alternative sweep count derived from the
# state and action counts.
#n_iterations = states_n ** 2 * total_actions_n
n_iterations = 4
gamma = 0.9

# Greedy policy after n_iterations sweeps; per-sweep progress is printed.
policy = value_iteration(n_iterations, env, gamma)

  0%|          | 0/4 [00:00<?, ?it/s]

iteration = 0 	 env.counter = 748
iteration = 1 	 env.counter = 1493
iteration = 2 	 env.counter = 2238
iteration = 3 	 env.counter = 2983


In [9]:
# Report the environment's counter once value iteration has finished.
print(env.counter, 'for value iteration')

2984 for value iteration


In [10]:
# Play a single episode with the computed policy; the result is the cell output.
evaluate(env, policy)

1.0

In [11]:
# Play 1000 independent episodes with the same policy and print the mean
# of the values returned by evaluate.
rewards = [evaluate(env, policy) for _ in range(1000)]
mean_reward = sum(rewards) / len(rewards)
print(mean_reward)

0.818
